### Load the Data

In [5]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Each split is a tab-separated file; rows containing any missing value are
# discarded immediately after reading so later cells only see complete rows.
train_df = pd.read_csv('./Data/train.csv', sep='\t').dropna()
val_df = pd.read_csv('./Data/dev.csv', sep='\t').dropna()
test_df = pd.read_csv('./Data/sample_demo.csv', sep='\t').dropna()

# Sanity check: (rows, columns) of each split after dropping incomplete rows.
print(train_df.shape, val_df.shape, test_df.shape)

(5708, 3) (1468, 3) (6, 4)


### Scale the Data

In [6]:
# Map gold similarity scores into [0, 1].
#
# The scaler is fitted on the training scores only and that same mapping is
# then applied to the validation and test splits. Re-fitting on each split
# would rescale every split by its own min/max, so the same raw score could
# map to different values in different splits (especially distorting for a
# very small test split).
#
# Note: validation/test scores outside the training min/max will land
# slightly outside [0, 1]. Pearson correlation is unaffected by this affine
# rescaling.
scaler = MinMaxScaler(feature_range=(0, 1))
train_df['score'] = scaler.fit_transform(train_df[['score']]).ravel()
val_df['score'] = scaler.transform(val_df[['score']]).ravel()
test_df['score'] = scaler.transform(test_df[['score']]).ravel()

### Load the Model

In [7]:
from sentence_transformers import SentenceTransformer, util
from scipy.stats import pearsonr
from sklearn.metrics.pairwise import cosine_similarity


# Sentence-embedding checkpoint used for every similarity computation below.
# Alternatives noted by the author:
#   all-mpnet-base-v2        (large-size)
#   all-MiniLM-L12-v1        (moderate)
#   all-MiniLM-L6-v2         (fast & good & small)
#   paraphrase-mpnet-base-v2
MODEL_NAME = "all-MiniLM-L6-v2"
model = SentenceTransformer(MODEL_NAME)

### Pearson correlation on Validation set 

In [8]:
# Embed each sentence column in a single batched `encode` call rather than
# calling the model once per sentence; the library batches internally, which
# avoids thousands of tiny forward passes.
val_emb1 = model.encode(val_df['sentence1'].tolist(), convert_to_tensor=True)
val_emb2 = model.encode(val_df['sentence2'].tolist(), convert_to_tensor=True)

# Row-wise cosine similarity: entry i compares sentence1[i] with sentence2[i].
val_predicted_scores = util.pairwise_cos_sim(val_emb1, val_emb2).tolist()

# Pearson correlation between predicted similarities and the gold scores.
pearson_corr, _ = pearsonr(val_predicted_scores, val_df['score'])
print('Validation-Set Pearson Correlation:', pearson_corr)

Validation-Set Pearson Correlation: 0.8631423846336786


### Pearson correlation on Test set

In [11]:
# Embed each sentence column in a single batched `encode` call rather than
# calling the model once per sentence.
test_emb1 = model.encode(test_df['sentence1'].tolist(), convert_to_tensor=True)
test_emb2 = model.encode(test_df['sentence2'].tolist(), convert_to_tensor=True)

# Row-wise cosine similarity: entry i compares sentence1[i] with sentence2[i].
test_predicted_scores = util.pairwise_cos_sim(test_emb1, test_emb2).tolist()

# Pearson correlation between predicted similarities and the gold scores.
test_pearson_corr, _ = pearsonr(test_predicted_scores, test_df['score'])
# Original (misspelled) variable name kept as an alias in case a later cell
# reads it.
perason_corr = test_pearson_corr
print('Test-Set Pearson Correlation:', test_pearson_corr)

Test-Set Pearson Correlation: 0.982463644410194
