--------------------------
#### STS-B dataset using the datasets library
---------------------------

In [1]:
from datasets import load_dataset
import pandas as pd

In [3]:
# Load the STS-B dataset
dataset = load_dataset("glue", "stsb")

train-00000-of-00001.parquet:   0%|          | 0.00/502k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/151k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/114k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5749 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1500 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1379 [00:00<?, ? examples/s]

In [4]:
# Convert the train and validation datasets into pandas DataFrames
train_df      = pd.DataFrame(dataset['train'])
validation_df = pd.DataFrame(dataset['validation'])

In [5]:
train_df.shape, validation_df.shape

((5749, 4), (1500, 4))

In [6]:
train_df.sample(6)

Unnamed: 0,sentence1,sentence2,label,idx
1220,Two dogs in a fenced kennel look ahead.,Two woman pose in a dining room with a baby.,0.0,1220
5538,Iran's president condemns use of chemical weap...,Probe alleged use of chemical weapons in Syria,1.6,5538
495,A man and a woman are kissing each other.,A man and a woman are talking to each other.,0.6,495
2011,There's no chance of a fair trial.,there is no chance at a fair trial.,5.0,2011
5102,East Ukraine Separatists Ask to Join Russia,Crimean Parliament Votes to Join Russia,2.8,5102
4476,China stock index futures close higher -- Dec. 4,China stock index futures close lower -- Jan. 24,2.2,4476


In [8]:
# Check for repetitions of sentence1
repeated_sentences = train_df[train_df.duplicated(subset=['sentence1'], keep=False)]

In [12]:
# Sort the repeated sentences first by sentence1, then by similarity score
sorted_repeated_sentences = repeated_sentences.sort_values(by=['sentence1', 'label'], ascending=[True, False])

In [16]:
# Display sorted repeated sentence1 pairs
sorted_repeated_sentences[['sentence1', 'sentence2', 'label']]

Unnamed: 0,sentence1,sentence2,label
2035,"""Fairies don't exist"" - fine.","""Satyrs don't exist"" - fine.",1.20
2122,"""Fairies don't exist"" - fine.","""Leprechauns don't exist"" - fine.",1.00
4829,"10 dead, five injured in SW China road accident","1 dead, 39 injured in E China road accident",1.75
4980,"10 dead, five injured in SW China road accident",5 hurt in Gaza City car accident,1.00
4861,"17 killed, 133 wounded in bomb attacks in nort...","15 killed, 90 wounded in fresh attacks in Iraq",2.80
...,...,...,...
2284,to repeat: They are NOT as sovereign as they w...,They are NOT as sovereign as they were before.,4.50
1936,two puppies playing around in the grass,Two puppies play in the grass,5.00
1679,two puppies playing around in the grass,Two dogs are wrestling in the grass.,4.00
2077,you have no remains of a missile at the pentagon.,you have no witnesses for a missile at the pen...,2.60


In [28]:
from transformers import pipeline
import pandas as pd
import numpy as np

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error

from scipy.stats import pearsonr

import torch

In [29]:
# Convert the train dataset to pandas dataframe
df = pd.DataFrame(dataset['validation'])

In [30]:
# Load the pre-trained model for sentence embeddings
model_name = "sentence-transformers/bert-base-nli-mean-tokens"
tokenizer  = AutoTokenizer.from_pretrained(model_name)
model      = AutoModel.from_pretrained(model_name)

In [31]:
# Function to compute sentence embeddings
def get_sentence_embedding(sentence):
    inputs = tokenizer(sentence, return_tensors='pt', padding=True, truncation=True, max_length=128)
    
    with torch.no_grad():
        outputs = model(**inputs)
        
    sentence_embedding = outputs.last_hidden_state.mean(dim=1)
    
    return sentence_embedding

In [50]:
# Compute similarities
true_scores      = []
predicted_scores = []

In [51]:
for idx, row in df.sample(5).iterrows():
    sentence1 = row['sentence1']
    sentence2 = row['sentence2']
    
    true_score = row['label']                      # STS-B scores

    # Get embeddings
    embedding1 = get_sentence_embedding(sentence1)
    embedding2 = get_sentence_embedding(sentence2)

    # Compute cosine similarity
    similarity = cosine_similarity(embedding1.numpy(), embedding2.numpy())[0][0]

    # Normalize cosine similarity to match the STS-B scale (0–5)
    normalized_score = similarity * 5             # Map [0, 1] to [0, 5]

    true_scores.append(true_score)
    predicted_scores.append(normalized_score)
    
    print(idx)

402
1467
1479
471
663


In [52]:
true_scores, predicted_scores

([2.5999999046325684,
  0.4000000059604645,
  4.0,
  2.200000047683716,
  2.799999952316284],
 [3.9386707544326782,
  3.2423135638237,
  4.591420292854309,
  3.765115737915039,
  2.7710020542144775])

In [53]:
# Evaluation metrics
mse = mean_squared_error(true_scores, predicted_scores)
pearson_corr, _ = pearsonr(true_scores, predicted_scores)

print(f"Mean Squared Error: {mse}")
print(f"Pearson Correlation: {pearson_corr}")

Mean Squared Error: 2.5341983940040365
Pearson Correlation: 0.5546792500723023


#### Challenges

1. **Cosine Similarity Precision**  
   - If embeddings are not robust, the cosine similarity may not fully reflect semantic similarity.

2. **Human Judgment Variability**  
   - Annotators in STS-B might rate sentences subjectively, which can make model evaluation noisy.
