In [1]:
from WordTransformer import WordTransformer, InputExample

In [2]:
import json
import numpy as np
import pickle
import os
from tqdm import tqdm

In [3]:
# Restrict which GPU device CUDA exposes to this kernel. The value is a
# hard-coded device UUID (presumably an NVIDIA MIG slice, given the `MIG-`
# prefix) and is specific to the machine this notebook was run on.
# TODO: read this from configuration instead of hard-coding it.
# NOTE(review): this presumably has to run before CUDA is first initialised
# to take effect - confirm that nothing above touches the GPU.
os.environ["CUDA_VISIBLE_DEVICES"]="MIG-4c6deb33-4c3f-5990-89bf-891bd00dac17"

In [4]:
# Load the `pierluigic/xl-lexeme` WordTransformer onto device `cuda:0`.
# `model` is a notebook-level global: both `matrix` and `embeddings` below
# call `model.encode(...)` without receiving the model as an argument, so this
# cell must be executed before either of them is called.
model = WordTransformer('pierluigic/xl-lexeme', device='cuda:0')

In [5]:
def matrix(input, output):
    """Encode every record of every JSON file under `input` and pickle the results.

    The tree rooted at `input` is walked recursively. Each file is parsed as a
    single JSON document that must be an iterable of objects with the keys
    `sentence`, `first_index` and `last_index`. Every object is encoded with the
    notebook-global `model`, the embeddings are stacked row-wise with
    `np.vstack`, and the resulting array is pickled to
    `<output>/<file stem>.pickle`.

    Args:
        input: Root directory containing the JSON files.
        output: Directory that receives one `.pickle` per input file. It is
            created if it does not exist yet.

    Notes:
        * A file with no records is skipped (with a message) instead of
          aborting the whole run on `np.vstack([])`.
        * Output files are named after the input file stem only, so two input
          files with the same name in different sub-directories overwrite each
          other's pickle.
    """
    os.makedirs(output, exist_ok=True)
    total_files = sum(len(files) for _, _, files in os.walk(input))
    with tqdm(total=total_files) as pbar:
        for root, _dirs, files in os.walk(input):
            for file_name in files:
                file_path = os.path.join(root, file_name)
                # Read and close the source file before the (slow) encoding
                # step; JSON text is UTF-8, so do not rely on the locale default.
                with open(file_path, 'r', encoding='utf-8') as source:
                    json_objects = json.load(source)
                word = [
                    model.encode(
                        InputExample(
                            texts=obj['sentence'],
                            positions=[obj['first_index'], obj['last_index']],
                        )
                    )
                    for obj in json_objects
                ]
                if word:
                    embedding_matrix = np.vstack(word)
                    out_path = os.path.join(
                        output, '{}.pickle'.format(os.path.splitext(file_name)[0])
                    )
                    with open(out_path, 'wb') as sink:
                        pickle.dump(embedding_matrix, sink)
                else:
                    # np.vstack raises ValueError on an empty sequence.
                    tqdm.write('Skipping {}: no records to encode'.format(file_path))
                pbar.update(1)

In [6]:
# Encode every file under ../JSONLgood1 and write the pickled embedding
# matrices into ../embeddingsgood1. The recorded run below processed 12869
# files in about 1h37m, so this cell is expensive to re-execute.
# NOTE(review): the recorded output contains a tokenizer warning for an input
# of 901 tokens against a 512-token maximum - confirm how over-length
# sentences are handled by the model.
matrix('../JSONLgood1', '../embeddingsgood1')

  0%|          | 12/12869 [00:12<1:43:08,  2.08it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (901 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 12869/12869 [1:37:07<00:00,  2.21it/s]


In [19]:
import pandas as pd
from scipy.spatial.distance import cosine
from sklearn.metrics.pairwise import cosine_similarity
def embeddings(csv, threshold=0.5):
    """Score each sentence pair in a CSV by the cosine distance of its target-word embeddings.

    The file is read with `;` as separator and must provide the columns
    `sentence_1`, `first_index_1`, `last_index_1`, `sentence_2`,
    `first_index_2` and `last_index_2`. Both sides of every row are encoded
    with the notebook-global `model` and compared with
    `scipy.spatial.distance.cosine`.

    Args:
        csv: Path of the `;`-separated input file.
        threshold: Distance above which a pair is labelled 1. Defaults to 0.5,
            the value that was previously hard-coded.

    Returns:
        The loaded DataFrame with two added columns:
        `cosine_distance` (the distance for the row) and `cosine_label`
        (1 if the distance is strictly greater than `threshold`, else 0).
    """
    df = pd.read_csv(csv, sep=';')
    pairs = zip(
        df['sentence_1'], df['first_index_1'], df['last_index_1'],
        df['sentence_2'], df['first_index_2'], df['last_index_2'],
    )
    distances = []
    for sent_1, first_1, last_1, sent_2, first_2, last_2 in tqdm(pairs, total=len(df)):
        embedding1 = model.encode(InputExample(texts=sent_1, positions=[first_1, last_1]))
        embedding2 = model.encode(InputExample(texts=sent_2, positions=[first_2, last_2]))
        distances.append(cosine(embedding1, embedding2))
    # Assign whole columns at once so they get a numeric dtype, rather than
    # the object dtype that results from a None-initialised column filled
    # cell by cell.
    df['cosine_distance'] = distances
    # A NaN distance compares False and therefore yields label 0.
    df['cosine_label'] = [1 if distance > threshold else 0 for distance in distances]
    return df


In [20]:
# Score the annotated pairs in ../annotation.csv and save the frame, including
# the added cosine_distance / cosine_label columns, without the index column.
# Note that the input is read with ';' as separator while the output is
# written with pandas' default separator.
(embeddings('../annotation.csv')).to_csv('../annotation_with_cosine.csv', index=False)

100%|██████████| 1000/1000 [00:37<00:00, 26.52it/s]
