In [None]:
import json
import logging
import random
import os

import pandas as pd
from tqdm import tqdm
import torch
import transformers
import numpy as np

from preprocessing import preprocessing
from tokenize_and_pad_text import *

In [None]:
# Use a single seed for every random number generator this notebook relies on.
random_seed = 42

random.seed(random_seed)
np.random.seed(random_seed)
torch.manual_seed(random_seed)
torch.cuda.manual_seed(random_seed)
# For multi-GPU runs, torch.cuda.manual_seed_all(random_seed) can be used instead.

# cuDNN settings for reproducibility: no autotuning, deterministic algorithms.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [None]:
# Pick the compute device: the GPU when CUDA is usable, the CPU otherwise.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print(f'We will use the GPU: {torch.cuda.get_device_name(0)}')

# Only let ERROR-level records through for the tokenization utilities logger.
logging.getLogger("transformers.tokenization_utils").setLevel(logging.ERROR)

# Path of the review data; the bare file name is printed for reference.
data_path = '../reviews.csv'
data_name = data_path.rsplit('/', 1)[-1]
print(f'use {data_name} data', end='\n')

In [None]:
# Pretrained checkpoint and the transformers classes used to load it.
pretrained_weights = 'bert-base-uncased'
tokenizer_class = transformers.BertTokenizer
model_class = transformers.BertModel

# Rows fed to the model per forward pass, and the length every tokenized row
# is truncated/padded to (also the number of words per chunk).
bert_batch_size = 16
max_seq = 128

In [None]:
# Load the vocabulary: the JSON document is expected to hold the sequence of
# words under the 'word' key. The encoding is given explicitly so the read does
# not depend on the platform's default locale encoding.
with open('all_word.json', 'r', encoding='utf-8') as f:
    word_dict = json.load(f)
all_word_list = word_dict['word']

# Cut the vocabulary into consecutive chunks of at most `max_seq` words; each
# chunk becomes one row of the 'review' column (the last chunk may be shorter).
pad_word_list = [
    all_word_list[idx:idx + max_seq]
    for idx in range(0, len(all_word_list), max_seq)
]

df = pd.DataFrame({'review': pad_word_list})

In [None]:
def tokenize_text(df, max_seq, tokenizer):
    """Encode every entry of ``df.review`` and cap each result at ``max_seq`` ids.

    Args:
        df: DataFrame with a ``review`` column holding the inputs to encode.
        max_seq: Maximum number of ids kept per entry (longer results are cut).
        tokenizer: Object exposing ``encode(text, add_special_tokens=True)``.

    Returns:
        A list with one (possibly truncated) id sequence per row of ``df``.
    """
    encoded_rows = []
    for text in df.review.values:
        token_ids = tokenizer.encode(text, add_special_tokens=True)
        encoded_rows.append(token_ids[:max_seq])
    return encoded_rows


def pad_text(tokenized_text, max_seq):
    """Right-pad every id list with zeros up to ``max_seq`` and stack them.

    Args:
        tokenized_text: Iterable of lists of token ids.
        max_seq: Target length of every row; lists already that long are
            left unchanged.

    Returns:
        A NumPy array built from the padded lists.
    """
    padded_rows = []
    for token_ids in tokenized_text:
        missing = max_seq - len(token_ids)
        padded_rows.append(token_ids + [0] * missing)
    return np.array(padded_rows)


def tokenize_and_pad_text(df, max_seq, tokenizer):
    """Tokenize ``df.review``, zero-pad every row to ``max_seq`` and return a tensor.

    Args:
        df: DataFrame with a ``review`` column.
        max_seq: Length every row is truncated/padded to.
        tokenizer: Tokenizer forwarded to ``tokenize_text``.

    Returns:
        A ``torch.Tensor`` built from the padded id matrix.
    """
    return torch.tensor(pad_text(tokenize_text(df, max_seq, tokenizer), max_seq))


def targets_to_tensor(df, target_columns):
    """Return the selected columns of ``df`` as a float32 tensor.

    Args:
        df: DataFrame holding the target values.
        target_columns: Column label(s) used to index ``df``.

    Returns:
        A ``torch.float32`` tensor with the values of ``df[target_columns]``.
    """
    target_values = df[target_columns].values
    return torch.tensor(target_values, dtype=torch.float32)

def tokenize_and_pad_text_bert(df, device, model_class, tokenizer_class, pretrained_weights, max_seq=128, batch_size=16, target_columns=['label']):
    """Run every row of ``df.review`` through a pretrained model, batch by batch.

    Args:
        df: DataFrame with a ``review`` column to tokenize.
        device: ``torch.device`` the model and the inputs are moved to.
        model_class: Class exposing ``from_pretrained`` that builds the model.
        tokenizer_class: Class exposing ``from_pretrained`` that builds the tokenizer.
        pretrained_weights: Checkpoint name passed to both ``from_pretrained`` calls.
        max_seq: Length every tokenized row is truncated/padded to.
        batch_size: Number of rows per forward pass.
        target_columns: Unused; kept so existing callers keep working.

    Returns:
        A tensor on ``device`` holding element 0 of the model output for every
        batch (the last hidden state for ``transformers.BertModel``),
        concatenated along dimension 0.

    Raises:
        ValueError: If ``df`` yields no rows to embed.
    """
    tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
    data_indices = tokenize_and_pad_text(df, max_seq, tokenizer)
    if len(data_indices) == 0:
        raise ValueError('df contains no rows to embed')
    data_indices = data_indices.to(device)

    bert_model = model_class.from_pretrained(pretrained_weights).to(device)
    bert_model.eval()

    # Short rows are filled with id 0 by pad_text; mask those positions so the
    # real tokens do not attend to padding.
    attention_mask = (data_indices != 0).long()

    # Collect the per-batch outputs and concatenate once at the end instead of
    # re-copying the growing result on every iteration.
    batch_outputs = []
    with torch.no_grad():
        for start in tqdm(range(0, len(data_indices), batch_size)):
            stop = start + batch_size
            model_output = bert_model(
                data_indices[start:stop],
                attention_mask=attention_mask[start:stop],
            )
            batch_outputs.append(model_output[0])

    return torch.cat(batch_outputs, 0)

In [None]:
# Embed every chunk, then merge the first two axes so that each row of x_train
# is the vector of one token position, and keep the result on the CPU.
x_train = tokenize_and_pad_text_bert(
    df, device, model_class, tokenizer_class, pretrained_weights,
    max_seq=max_seq, batch_size=bert_batch_size, target_columns=None,
)
x_train = x_train.reshape(-1, x_train.shape[2]).to('cpu')

In [None]:
# Turn each row of x_train into a column vector (shape (-1, 1)).
reshape_x_train = [emb.reshape(-1, 1) for emb in x_train]

In [None]:
from collections import defaultdict

from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances

# For every word, write a JSON file mapping each vocabulary word to
# [cosine similarity, euclidean distance] between their embedding rows.
# Words whose file already exists are skipped, so an interrupted run can resume.
#
# NOTE(review): row i of x_train is treated as the embedding of all_word_list[i].
# tokenize_text encodes with add_special_tokens=True, so if the tokenizer puts a
# special token at the start of each row the rows may be shifted relative to the
# words - TODO confirm the alignment.
# NOTE(review): each word is used verbatim as a file name; presumably the
# vocabulary contains no path separators - verify.

# Create the output folder on first use instead of failing in os.listdir.
os.makedirs('./json_folder', exist_ok=True)
file_list = os.listdir('./json_folder')
existing_files = set(file_list)  # constant-time membership test

# One row per vocabulary word; any trailing rows of x_train are not used.
word_vectors = x_train[:len(all_word_list)].detach().cpu().numpy()
assert len(word_vectors) == len(all_word_list), 'x_train has fewer rows than words'

for idx1, word1 in enumerate(tqdm(all_word_list)):
    file_name = f'{word1}.json'
    if file_name in existing_files:
        continue

    # Score this word against the whole vocabulary with one call per metric
    # rather than one call per word pair.
    query = word_vectors[idx1:idx1 + 1]
    cs_row = cosine_similarity(query, word_vectors)[0]
    ed_row = euclidean_distances(query, word_vectors)[0]

    dic = {
        word2: [float(cs), float(ed)]
        for word2, cs, ed in zip(all_word_list, cs_row, ed_row)
    }

    with open(f'./json_folder/{file_name}', 'w', encoding='utf-8') as make_file:
        json.dump({word1: dic}, make_file, indent='\t')