In [1]:
import pandas as pd

from sentence_transformers import SentenceTransformer, models, InputExample, losses
from torch import nn
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, util

In [2]:
# Read the test CSV and cast the similarity score column to float in one chain.
test = pd.read_csv(
    "/Users/gulsumbudakoglu/Desktop/working_mix/stsb_multi_mt-dataset-test.csv"
).astype({"similarity_score": float})

In [3]:
# Preview the first rows of the loaded test frame.
test.head()

Unnamed: 0,sentence1,sentence2,similarity_score
0,A girl is styling her hair.,A girl is brushing her hair.,2.5
1,A group of men play soccer on the beach.,A group of boys are playing soccer on the beach.,3.6
2,One woman is measuring another woman's ankle.,A woman measures another woman's ankle.,5.0
3,A man is cutting up a cucumber.,A man is slicing a cucumber.,4.2
4,A man is playing a harp.,A man is playing a keyboard.,1.5


In [4]:
# Load the fine-tuned SentenceTransformer model from its local directory.
model_dir = "/Users/gulsumbudakoglu/Desktop/working_mix/model"
model = SentenceTransformer(model_dir)

In [5]:
# Display the model's repr to inspect the modules it is built from.
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: DistilBertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
)

In [6]:
# Encode both sentences of every test pair; each result is a tensor with one
# row per test row, in the same order as the frame.
embeddings1, embeddings2 = (
    model.encode(test[column].to_list(), convert_to_tensor=True)
    for column in ("sentence1", "sentence2")
)

In [7]:
# Display the tensor of embeddings computed from the sentence1 column.
embeddings1

tensor([[ 0.2911, -0.4535, -0.2336,  ..., -0.5521,  0.1239, -0.1041],
        [ 0.1143, -0.1039, -0.3723,  ..., -0.4470,  0.1718, -0.2451],
        [ 0.0634, -0.3298, -0.3007,  ..., -0.4745,  0.1426, -0.0717],
        ...,
        [ 0.4151, -0.1007, -0.3630,  ..., -0.2394,  0.2557,  0.0115],
        [-0.1775, -0.1145, -0.1418,  ..., -0.5425,  0.1107, -0.1174],
        [-0.2082, -0.1785, -0.2496,  ..., -0.4536,  0.0816, -0.2233]])

In [8]:
# Display the tensor of embeddings computed from the sentence2 column.
embeddings2

tensor([[ 0.2003, -0.4308, -0.2572,  ..., -0.5210,  0.1191, -0.0970],
        [ 0.1397, -0.1387, -0.3283,  ..., -0.4520,  0.1338, -0.2724],
        [ 0.0124, -0.4089, -0.2747,  ..., -0.5099,  0.0624, -0.1058],
        ...,
        [ 0.0966, -0.1952, -0.1907,  ..., -0.4140,  0.2513, -0.0777],
        [ 0.0345, -0.1034, -0.1078,  ..., -0.4854,  0.1179, -0.1316],
        [-0.2990, -0.1659, -0.0809,  ..., -0.5120,  0.1733, -0.1698]])

In [9]:
# Store each sentence's embedding vector as a NumPy array in a new column of
# the test frame (one array per row).
# `.cpu()` is a no-op for tensors already on the CPU, but `Tensor.numpy()`
# raises for tensors on an accelerator (CUDA / MPS), so move them to host
# memory first. Converting the whole tensor once and splitting it into rows
# avoids one tensor->array conversion per row.
test["embedding_sentence1"] = list(embeddings1.cpu().numpy())
test["embedding_sentence2"] = list(embeddings2.cpu().numpy())

In [10]:
# Expand each embedding vector into one numeric column per dimension
# (sentence1_embedding_0, sentence1_embedding_1, ...) and append those columns
# to the test frame.
# Missing values are deliberately NOT filled with '': that would mix strings
# into float columns (turning them into object dtype). A NaN here would mean
# the embedding vectors have unequal lengths, which should stay visible.
df1 = pd.DataFrame(
    test['embedding_sentence1'].tolist(), index=test.index
).add_prefix('sentence1_embedding_')
df2 = pd.DataFrame(
    test['embedding_sentence2'].tolist(), index=test.index
).add_prefix('sentence2_embedding_')

# join() aligns on the index and raises if the columns already exist, so an
# accidental second run of this cell fails loudly instead of duplicating columns.
test = test.join(df1).join(df2)

In [11]:
# Score each (sentence1, sentence2) pair by the cosine similarity of its embeddings.
# util.cos_sim returns the all-vs-all similarity matrix; entry [i][i] compares
# row i's sentence1 with row i's sentence2, so the per-pair scores are the diagonal.
cosine_scores = util.cos_sim(embeddings1, embeddings2)

# Extract the diagonal in a single operation and move it to host memory.
# This gives a plain numeric array (rather than a Python list of 0-d arrays)
# and also works when the tensors live on a CUDA / MPS device, where calling
# `.numpy()` directly would raise.
cosine_similarity = cosine_scores.diagonal().cpu().numpy()
test['cosine_similarity'] = cosine_similarity

In [13]:
# List the column labels currently present on the test frame.
test.columns

Index(['sentence1', 'sentence2', 'similarity_score', 'embedding_sentence1',
       'embedding_sentence2', 'sentence1_embedding_0', 'sentence1_embedding_1',
       'sentence1_embedding_2', 'sentence1_embedding_3',
       'sentence1_embedding_4',
       ...
       'sentence2_embedding_759', 'sentence2_embedding_760',
       'sentence2_embedding_761', 'sentence2_embedding_762',
       'sentence2_embedding_763', 'sentence2_embedding_764',
       'sentence2_embedding_765', 'sentence2_embedding_766',
       'sentence2_embedding_767', 'cosine_similarity'],
      dtype='object', length=1542)