In [1]:
!pip install contractions

Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting textsearch>=0.0.21 (from contractions)
  Downloading textsearch-0.0.24-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting anyascii (from textsearch>=0.0.21->contractions)
  Downloading anyascii-0.3.2-py3-none-any.whl.metadata (1.5 kB)
Collecting pyahocorasick (from textsearch>=0.0.21->contractions)
  Downloading pyahocorasick-2.1.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (13 kB)
Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Downloading anyascii-0.3.2-py3-none-any.whl (289 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.9/289.9 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyahocorasick-2.1.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (110 kB)
[2K

In [2]:
import pandas as pd
import nltk
from nltk.metrics import jaccard_distance
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.util import ngrams
from scipy.stats import pearsonr
import contractions
import string
nltk.download('punkt')
nltk.download('stopwords')



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
# Load the sentence pairs into a dataframe. The file is tab-separated and has
# no header row, so the columns receive the default integer labels.
dt = pd.read_csv(
    'STS.input.SMTeuroparl.txt',
    sep='\t',
    header=None,
)
dt.head()

Unnamed: 0,0,1
0,The leaders have now been given a new chance a...,The leaders benefit aujourd' hui of a new luck...
1,Amendment No 7 proposes certain changes in the...,Amendment No 7 is proposing certain changes in...
2,Let me remind you that our allies include ferv...,I would like to remind you that among our alli...
3,The vote will take place today at 5.30 p.m.,The vote will take place at 5.30pm
4,"The fishermen are inactive, tired and disappoi...","The fishermen are inactive, tired and disappoi..."


In [4]:
# Attach the gold-standard similarity scores as a new column called `gs`.
# The score file is read without a header, so its score column is labelled 0.
# Selecting that column explicitly assigns a Series (aligned on the row index)
# instead of relying on pandas accepting a one-column DataFrame as a column value.
gold_scores = pd.read_csv('STS.gs.SMTeuroparl.txt', sep='\t', header=None)
dt['gs'] = gold_scores[0]
dt.head()


Unnamed: 0,0,1,gs
0,The leaders have now been given a new chance a...,The leaders benefit aujourd' hui of a new luck...,4.5
1,Amendment No 7 proposes certain changes in the...,Amendment No 7 is proposing certain changes in...,5.0
2,Let me remind you that our allies include ferv...,I would like to remind you that among our alli...,4.25
3,The vote will take place today at 5.30 p.m.,The vote will take place at 5.30pm,4.5
4,"The fishermen are inactive, tired and disappoi...","The fishermen are inactive, tired and disappoi...",5.0


In [5]:
# Translation table that maps every character of string.punctuation to None,
# i.e. deletes it; a prebuilt str.translate() table is slightly faster
punctuation_table = str.maketrans(dict.fromkeys(string.punctuation))

# Lowercase-only preprocessing
def lowercase_text(sentence):
    """Return `sentence` converted to lowercase."""
    return sentence.lower()

# Punctuation removal followed by lowercasing
def remove_punctuation_and_lowercase(sentence):
    """Return `sentence` with punctuation deleted and letters lowercased.

    Punctuation is removed with the module-level `punctuation_table`.
    """
    without_punctuation = sentence.translate(punctuation_table)
    return without_punctuation.lower()

# Contraction expansion followed by lowercasing
def expand_contractions_and_lowercase(sentence):
    """Return `sentence` passed through `contractions.fix` and lowercased."""
    expanded = contractions.fix(sentence)
    return expanded.lower()

# Full pipeline: contraction expansion, punctuation removal, lowercasing
def full_preprocessing(sentence):
    """Return `sentence` after all three cleaning steps.

    The steps run in this order: `contractions.fix`, deletion of the
    characters in the module-level `punctuation_table`, then lowercasing.
    """
    # Contractions are handled first, while their apostrophes are still present
    expanded = contractions.fix(sentence)
    without_punctuation = expanded.translate(punctuation_table)
    return without_punctuation.lower()

# Clean a sentence and turn it into a set of tokens
def preprocess_and_tokenize(sentence, preprocessing):
    """Apply `preprocessing` to `sentence` and return its distinct tokens.

    Args:
        sentence: the raw sentence text.
        preprocessing: callable that takes a sentence and returns the
            cleaned text.

    Returns:
        A set of the tokens produced by NLTK's `word_tokenize` on the
        cleaned text (duplicates are collapsed by the set).
    """
    cleaned = preprocessing(sentence)
    return set(word_tokenize(cleaned))

# Compute Jaccard similarity using NLTK's jaccard_distance
def compute_jaccard_similarity(sentence1, sentence2, preprocessing):
    """Return the Jaccard similarity between the token sets of two sentences.

    Args:
        sentence1: first sentence (raw text).
        sentence2: second sentence (raw text).
        preprocessing: callable applied to each sentence before tokenization.

    Returns:
        1 minus the Jaccard distance of the two token sets. If both token
        sets are empty, 1.0 is returned (the sets are identical).
    """
    tokens1 = preprocess_and_tokenize(sentence1, preprocessing)
    tokens2 = preprocess_and_tokenize(sentence2, preprocessing)

    # jaccard_distance divides by the size of the union, so two empty token
    # sets (e.g. sentences that contain only punctuation before cleaning)
    # would raise ZeroDivisionError. Treat two empty sets as identical.
    if not tokens1 and not tokens2:
        return 1.0

    # Similarity is the complement of the distance
    return 1 - jaccard_distance(tokens1, tokens2)

# Compute the Jaccard similarity for every sentence pair in the data frame
# and store the results in a new column called 'jaccard'.
def add_jaccard_to_dataframe(df, preprocessing):
    """Add a 'jaccard' column holding the similarity of columns 0 and 1.

    Args:
        df: data frame whose columns labelled 0 and 1 hold the two sentences
            of each pair. It is modified in place.
        preprocessing: callable applied to each sentence before tokenization.

    Returns:
        The same `df` object, now containing the 'jaccard' column.
    """
    # Iterate the two sentence columns directly. A row-wise apply is also a
    # Python-level loop (not a vectorized operation) but additionally builds
    # a Series for every row and depends on integer-key lookup rules.
    df['jaccard'] = [
        compute_jaccard_similarity(first, second, preprocessing)
        for first, second in zip(df[0], df[1])
    ]
    return df

# Pearson correlation between two equally long sequences
def compute_pearson_correlation(refs, tsts):
    """Return the Pearson correlation coefficient between `refs` and `tsts`.

    Args:
        refs: reference values.
        tsts: values to compare against the references.

    Returns:
        The correlation coefficient computed by scipy's `pearsonr`.

    Raises:
        ValueError: if the two inputs differ in length.
    """
    if len(refs) == len(tsts):
        # pearsonr yields (coefficient, p-value); only the coefficient is used
        coefficient, _p_value = pearsonr(refs, tsts)
        return coefficient

    # Lengths differ: the values cannot be paired up
    raise ValueError("The two input lists must have the same length.")


def process_results(dt, preprocessing, max_score=5):
    """Score every sentence pair and report the correlation with the gold standard.

    Args:
        dt: data frame with the sentence pairs in columns 0 and 1 and the
            gold-standard scores in column 'gs'. It is not modified.
        preprocessing: callable applied to each sentence before tokenization.
        max_score: value the gold standard is divided by to obtain
            'gs_normalized' (5 by default).

    Returns:
        A copy of `dt` with the added columns 'jaccard' and 'gs_normalized'.
        The Pearson correlation between these two columns is printed.
    """
    # Work on a copy so the caller's frame is left untouched
    table = add_jaccard_to_dataframe(dt.copy(), preprocessing)

    # Divide the gold standard by max_score
    table['gs_normalized'] = table['gs'] / max_score

    # Correlation between the normalized gold standard and the Jaccard scores
    correlation = compute_pearson_correlation(table['gs_normalized'], table['jaccard'])

    # Print the Pearson correlation rounded to 4 decimal places
    print(f'Pearson Correlation: {correlation:.4f}')
    return table


In [6]:
# Experiment 1: lowercasing only
table = process_results(dt, lowercase_text)
table.head()

Pearson Correlation: 0.4625


Unnamed: 0,0,1,gs,jaccard,gs_normalized
0,The leaders have now been given a new chance a...,The leaders benefit aujourd' hui of a new luck...,4.5,0.346154,0.9
1,Amendment No 7 proposes certain changes in the...,Amendment No 7 is proposing certain changes in...,5.0,0.785714,1.0
2,Let me remind you that our allies include ferv...,I would like to remind you that among our alli...,4.25,0.391304,0.85
3,The vote will take place today at 5.30 p.m.,The vote will take place at 5.30pm,4.5,0.545455,0.9
4,"The fishermen are inactive, tired and disappoi...","The fishermen are inactive, tired and disappoi...",5.0,1.0,1.0


In [7]:
# Experiment 2: punctuation removal + lowercasing
table = process_results(dt, remove_punctuation_and_lowercase)
table.head()

Pearson Correlation: 0.4822


Unnamed: 0,0,1,gs,jaccard,gs_normalized
0,The leaders have now been given a new chance a...,The leaders benefit aujourd' hui of a new luck...,4.5,0.333333,0.9
1,Amendment No 7 proposes certain changes in the...,Amendment No 7 is proposing certain changes in...,5.0,0.769231,1.0
2,Let me remind you that our allies include ferv...,I would like to remind you that among our alli...,4.25,0.380952,0.85
3,The vote will take place today at 5.30 p.m.,The vote will take place at 5.30pm,4.5,0.6,0.9
4,"The fishermen are inactive, tired and disappoi...","The fishermen are inactive, tired and disappoi...",5.0,1.0,1.0


In [8]:
# Experiment 3: contraction expansion + lowercasing
table = process_results(dt, expand_contractions_and_lowercase)
table.head()

Pearson Correlation: 0.4626


Unnamed: 0,0,1,gs,jaccard,gs_normalized
0,The leaders have now been given a new chance a...,The leaders benefit aujourd' hui of a new luck...,4.5,0.4,0.9
1,Amendment No 7 proposes certain changes in the...,Amendment No 7 is proposing certain changes in...,5.0,0.785714,1.0
2,Let me remind you that our allies include ferv...,I would like to remind you that among our alli...,4.25,0.391304,0.85
3,The vote will take place today at 5.30 p.m.,The vote will take place at 5.30pm,4.5,0.545455,0.9
4,"The fishermen are inactive, tired and disappoi...","The fishermen are inactive, tired and disappoi...",5.0,1.0,1.0


In [9]:
# Experiment 4: contraction expansion + punctuation removal + lowercasing
table = process_results(dt, full_preprocessing)
table.head()

Pearson Correlation: 0.4823


Unnamed: 0,0,1,gs,jaccard,gs_normalized
0,The leaders have now been given a new chance a...,The leaders benefit aujourd' hui of a new luck...,4.5,0.391304,0.9
1,Amendment No 7 proposes certain changes in the...,Amendment No 7 is proposing certain changes in...,5.0,0.769231,1.0
2,Let me remind you that our allies include ferv...,I would like to remind you that among our alli...,4.25,0.380952,0.85
3,The vote will take place today at 5.30 p.m.,The vote will take place at 5.30pm,4.5,0.6,0.9
4,"The fishermen are inactive, tired and disappoi...","The fishermen are inactive, tired and disappoi...",5.0,1.0,1.0


**Analysis and Conclusions**

After some experiments we reached the following conclusions:
- First, we must lowercase the text so that words with different casing are treated as the same token.
- We should expand contractions to ensure that meaning and token consistency are preserved (e.g. let's -> let us).
- We thought twice about removing punctuation, since in some cases punctuation carries meaning. In our case, however, for a task like Jaccard similarity, we concluded that punctuation does not contribute to the semantic similarity between sentences.
- We tried removing stopwords and compared the results. Since the Jaccard similarity was higher, and the correlation was even higher, without removing them, we concluded that in this case stopwords should not be removed, because they might add useful structural context.
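- To make the scores directly comparable, we divide the gold standard by 5 so that it lies between 0 and 1, the same scale as the Jaccard similarity. (Pearson correlation is invariant to this kind of linear rescaling, so the normalization makes the two columns easier to compare side by side but does not change the correlation value itself.)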

Future Work

We should use more advanced tokenization to deal with the following problems:  
- We have multilinguality problems: the first sentence pair contains French words ("aujourd' hui"). To deal with this we would have to take the following steps:
    1. Language detection
    2. Text normalization
    3. Possibly translation
    4. Multilingual tokenization
- Also, to perform better we must reduce different word forms to their base forms (lemmatization),
  e.g. "proposes" vs. "is proposing".
- Abbreviations like "p.m." are handled by punkt, but "5.30pm" in the sentence pair at index 3 is a different problem that should be solved, maybe by writing hand-crafted rules.


Correlation helps quantify how well the computed scores (Jaccard similarities) align with the human-assigned scores (gold standard). In the last case, since the value is near 0.48, we have a moderate positive correlation, which means that there is some alignment but the relationship is not perfect.