# Lab 2: Sentence Similarity Analysis

Lab session by:
* Daniel Hess
* Pandelis Laurens Symeonidis

### Imports

In [5]:
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.metrics.distance import jaccard_distance
from scipy.stats import pearsonr


- There are some shortcomings in this approach. For instance, tokens are compared case-sensitively, so the uppercase and lowercase versions of the same word count as two different tokens. We can therefore check whether we get better results with everything lowercased.

- What is a stopword? It is a very common word that carries little meaning on its own (e.g. "the", "and", "to"). If such words are given the same weight as the rest of the tokens, they can distort the metric, so we can try removing the stopwords (include this in the analysis).

### Data Loading

In [None]:
# Read the tab-separated sentence pairs and their gold-standard scores
dt = pd.read_csv("./STS.input.SMTeuroparl.txt", sep="\t", header=None)
gold_standard_dt = pd.read_csv("./STS.gs.SMTeuroparl.txt", sep="\t", header=None)

# Fetch the NLTK resources needed for tokenization and stopword filtering
for _resource in ("punkt_tab", "stopwords"):
    nltk.download(_resource, quiet=True)

Original correlation: 0.4504977169318684
Lowercase correlation: 0.4624951397591497
Stopword removal correlation: 0.4373710526352063
Lowercase + Stopword removal correlation: 0.4451596378377866


### Preprocessing function

In [None]:
# # Tokenize and process sentences
# dt_tokenized = dt.map(nltk.word_tokenize)
# dt_tokenized.head()

# Tokenize each sentence, optionally lowercasing tokens and/or dropping stopwords
def preprocess_sentences(sentences, lowercase=False, remove_stopwords=False):
    """Tokenize sentences with optional lowercasing and stopword removal.

    Args:
        sentences: Iterable of sentence strings.
        lowercase: If True, lowercase every token.
        remove_stopwords: If True, drop every token whose lowercase form is
            in NLTK's English stopword list.

    Returns:
        A list containing one list of tokens per input sentence, in input
        order.
    """
    # Load the stopword list only when it is actually needed, so plain
    # tokenization neither pays for it nor requires the stopwords corpus.
    sw = set(stopwords.words('english')) if remove_stopwords else None
    processed = []
    for s in sentences:
        tokens = nltk.word_tokenize(s)
        if lowercase:
            tokens = [t.lower() for t in tokens]
        if sw is not None:
            # Compare on the lowercase form so that e.g. "The" is removed
            # even when the tokens keep their original case.
            tokens = [t for t in tokens if t.lower() not in sw]
        processed.append(tokens)
    return processed

### Similarity evaluation

In [None]:
# jaccard_distances = dt_tokenized.apply(lambda row: jaccard_distance(set(row[0]), set(row[1])), axis=1)
# print(jaccard_distances.head())

# similarity_scores = 1 - jaccard_distances
# print(similarity_scores.shape)

# Score each sentence pair with Jaccard similarity and correlate the scores with the gold standard
def evaluate_similarity(dt, lowercase=False, remove_stopwords=False, gold_scores=None):
    """Correlate Jaccard token similarity with gold-standard scores.

    Args:
        dt: Object where ``dt[0]`` and ``dt[1]`` are equally long iterables
            of sentence strings (first and second sentence of each pair).
        lowercase: Forwarded to ``preprocess_sentences``.
        remove_stopwords: Forwarded to ``preprocess_sentences``.
        gold_scores: Reference scores, one per sentence pair. Defaults to
            column 0 of the module-level ``gold_standard_dt``.

    Returns:
        The Pearson correlation coefficient between the per-pair Jaccard
        similarities and ``gold_scores``.
    """
    if gold_scores is None:
        gold_scores = gold_standard_dt[0]

    sent1 = preprocess_sentences(dt[0], lowercase, remove_stopwords)
    sent2 = preprocess_sentences(dt[1], lowercase, remove_stopwords)

    similarity_scores = []
    for s1, s2 in zip(sent1, sent2):
        tokens1, tokens2 = set(s1), set(s2)
        if not tokens1 and not tokens2:
            # jaccard_distance divides by the size of the union and would
            # raise ZeroDivisionError here (possible after stopword removal).
            # Two empty token sets are identical, so score them as 1.0.
            similarity_scores.append(1.0)
        else:
            similarity_scores.append(1 - jaccard_distance(tokens1, tokens2))

    # pearsonr returns (statistic, p-value); only the coefficient is needed.
    return pearsonr(similarity_scores, gold_scores)[0]

### Run cases

In [None]:
# comparison = pearsonr(similarity_scores, gold_standard_dt[0])
# print(comparison[0])

# Evaluate all four preprocessing combinations first, then report them together
corr_original, corr_lower, corr_stop, corr_both = (
    evaluate_similarity(dt, lowercase=lower, remove_stopwords=drop_sw)
    for lower, drop_sw in ((False, False), (True, False), (False, True), (True, True))
)

# One line per scenario, formatted as "<label> <value>"
print(
    "\n".join(
        f"{label} {corr!s}"
        for label, corr in (
            ("Original correlation:", corr_original),
            ("Lowercase correlation:", corr_lower),
            ("Stopword removal correlation:", corr_stop),
            ("Lowercase + Stopword removal correlation:", corr_both),
        )
    )
)


### Analysis

The results show that lowercasing the sentences slightly improves the correlation with human similarity judgments (r = 0.462 vs. 0.450 for the baseline). This suggests that case normalization removes some superficial differences between tokens, making the matching slightly more forgiving. On the other hand, stopword removal reduced the correlation (r = 0.437), suggesting that stopwords ("the", "and", "to", etc.) still provide useful information for token-overlap similarity measures like Jaccard. Combining both methods yielded a correlation slightly below the baseline (r = 0.445), between the two individual effects. Overall, this highlights how preprocessing choices can influence evaluation results, even with a simple metric.