In [None]:
!pip install contractions



Please note that the comparison between our previous results and lemmatization is near the end (Step 9). The other steps are additional experiments that we conducted.

## **Step 1: Importing libraries**

In [None]:
import csv
import string

import contractions
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.metrics import jaccard_distance
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from scipy.stats import pearsonr


# Fetch the NLTK resources this notebook relies on. Packages that are already
# installed are simply reported as up-to-date.
# NOTE(review): recent NLTK releases reportedly load 'punkt_tab' and
# 'averaged_perceptron_tagger_eng' instead - confirm against the installed version.
nltk.download('stopwords')  # English stopword list read via stopwords.words('english')
nltk.download('punkt')  # tokenizer models used by word_tokenize
nltk.download('wordnet')  # WordNet data used by WordNetLemmatizer
nltk.download('omw-1.4')  # presumably needed by the WordNet reader in some NLTK versions - verify
nltk.download('averaged_perceptron_tagger')  # tagger model used by nltk.pos_tag

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

## **Step 2: Reading the data and adding the golden score**

In [None]:
# Read the sentence pairs: tab-separated, no header row, so the two sentence
# columns are labelled 0 and 1.
# quoting=csv.QUOTE_NONE keeps double quotes inside sentences as literal text.
# With the default quoting, a field that starts with '"' is parsed as a quoted
# field and can swallow the following tab/newline, silently merging rows.
dt = pd.read_csv('STS.input.SMTeuroparl.txt', sep='\t', header=None, quoting=csv.QUOTE_NONE)

In [None]:
# Load the gold-standard scores (read without a header row) into column 'gs'.
# The single score column is selected explicitly so that a Series, not a whole
# DataFrame, is assigned to the new column.
dt['gs'] = pd.read_csv('STS.gs.SMTeuroparl.txt', sep='\t', header=None)[0]

In [None]:
# Rescale the gold-standard scores to [0, 1], assuming they lie on a 0-5 scale.
# (Pearson correlation is invariant to this rescaling; it only makes the
# column directly comparable with the Jaccard similarities.)
dt['gs_normalized'] = dt['gs'].div(5)

## **Step 3: Creating the jaccard similarity function**

In [None]:
# Compute Jaccard similarity
def compute_jaccard_similarity(tokens1, tokens2):
    """Return the Jaccard similarity of two token collections.

    The similarity is ``len(A & B) / len(A | B)``, i.e. one minus the Jaccard
    distance, and always lies in ``[0.0, 1.0]``.

    Parameters
    ----------
    tokens1, tokens2 : iterable of hashable
        Token collections to compare. Sets are used as-is; any other iterable
        is converted to a set first, so duplicates are ignored.

    Returns
    -------
    float
        1.0 for identical collections, 0.0 for disjoint ones. Two empty
        collections are treated as identical and yield 1.0 instead of
        raising ``ZeroDivisionError``.
    """
    set1, set2 = set(tokens1), set(tokens2)
    union_size = len(set1 | set2)

    # Both inputs empty (e.g. every token was filtered out): define the
    # similarity as 1.0 rather than dividing by zero.
    if union_size == 0:
        return 1.0

    return len(set1 & set2) / union_size

## **Step 4: Adding Jaccard similarity after word tokenization without utilizing any preprocessing**

In [None]:
# Baseline: Jaccard similarity of the raw token sets of each sentence pair
# (columns 0 and 1), with no preprocessing at all.
dt['jaccard'] = [
    compute_jaccard_similarity(set(word_tokenize(first)), set(word_tokenize(second)))
    for first, second in zip(dt[0], dt[1])
]


## **Step 5: Adding lemmatization without utilizing any preprocessing**

In [None]:
# Initialize WordNetLemmatizer
# One module-level instance is created here and used by lemmatize() below.
wnl = WordNetLemmatizer()

# POS tag (as produced by nltk.pos_tag) -> WordNet POS code:
# 'n' noun, 'a' adjective, 'v' verb, 'r' adverb.
# Built once here instead of being re-created on every lemmatize() call.
_PENN_TO_WORDNET_POS = {
    'NN': 'n', 'NNS': 'n',
    'JJ': 'a', 'JJR': 'a', 'JJS': 'a',
    'VB': 'v', 'VBD': 'v', 'VBG': 'v',
    'VBN': 'v', 'VBP': 'v', 'VBZ': 'v',
    'RB': 'r', 'RBR': 'r', 'RBS': 'r',
}


# Lemmatization function using POS tags
def lemmatize(p):
    """Lemmatize one ``(word, pos_tag)`` pair.

    Parameters
    ----------
    p : sequence
        ``p[0]`` is the word and ``p[1]`` its POS tag.

    Returns
    -------
    str
        The WordNet lemma of the word when the tag maps to a noun, adjective,
        verb or adverb; otherwise the word is returned unchanged (this
        includes tags missing from the table, e.g. proper nouns).
    """
    word, tag = p[0], p[1]
    wordnet_pos = _PENN_TO_WORDNET_POS.get(tag)
    if wordnet_pos is None:
        # No WordNet category for this tag: leave the token as it is.
        return word
    return wnl.lemmatize(word, pos=wordnet_pos)
def tokenize_and_lemmatize(sentence):
    """Return the set of lower-cased lemmas of ``sentence``.

    The raw sentence is tokenized and POS-tagged as-is; each token is
    lower-cased only afterwards, just before it is passed to ``lemmatize``.
    """
    # Tag the original-case tokens in sentence order.
    tagged_tokens = nltk.pos_tag(word_tokenize(sentence))

    # Collect the lemmas in a set, so duplicates collapse.
    lemmas = set()
    for token, tag in tagged_tokens:
        lemmas.add(lemmatize((token.lower(), tag)))
    return lemmas


In [None]:
# Jaccard similarity of the lemma sets of each sentence pair (columns 0 and 1)
dt['jaccard_lemmatized'] = [
    compute_jaccard_similarity(tokenize_and_lemmatize(first), tokenize_and_lemmatize(second))
    for first, second in zip(dt[0], dt[1])
]

In [None]:
# Preview the first rows, including the 'jaccard' and 'jaccard_lemmatized' columns
dt.head()

Unnamed: 0,0,1,gs,gs_normalized,jaccard,jaccard_lemmatized
0,The leaders have now been given a new chance a...,The leaders benefit aujourd' hui of a new luck...,4.5,0.9,0.346154,0.346154
1,Amendment No 7 proposes certain changes in the...,Amendment No 7 is proposing certain changes in...,5.0,1.0,0.785714,0.923077
2,Let me remind you that our allies include ferv...,I would like to remind you that among our alli...,4.25,0.85,0.391304,0.391304
3,The vote will take place today at 5.30 p.m.,The vote will take place at 5.30pm,4.5,0.9,0.545455,0.545455
4,"The fishermen are inactive, tired and disappoi...","The fishermen are inactive, tired and disappoi...",5.0,1.0,1.0,1.0


## **Step 6: Computing correlation between the created column and gold standard**

In [None]:
# Pearson correlation between reference scores and test scores
def compute_pearson_correlation(refs, tsts):
    """Return the Pearson correlation coefficient between ``refs`` and ``tsts``.

    Parameters
    ----------
    refs, tsts : sequence of numbers
        Reference and test scores; both must have the same length.

    Returns
    -------
    float
        The correlation coefficient reported by ``scipy.stats.pearsonr``
        (the accompanying p-value is discarded).

    Raises
    ------
    ValueError
        If the two inputs differ in length.
    """
    if len(refs) == len(tsts):
        coefficient, _p_value = pearsonr(refs, tsts)
        return coefficient
    raise ValueError("The two input lists must have the same length.")

In [None]:
# Correlation with the gold standard for plain tokenization ...
correlation = compute_pearson_correlation(dt['gs_normalized'], dt['jaccard'])
print(f"Correlation comparing the Jaccard similarity based on tokenization:  {correlation:.4f}")

# ... and for lemmatization, both without any further preprocessing.
correlation_lemmatized = compute_pearson_correlation(
    dt['gs_normalized'],
    dt['jaccard_lemmatized'],
)
print(f"Correlation comparing the Jaccard similarity based lemmatization:  {correlation_lemmatized:.4f}")

Correlation comparing the Jaccard similarity based on tokenization:  0.4505
Correlation comparing the Jaccard similarity based lemmatization:  0.4613


### **Conclusions:**

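We can see from the results that lemmatization works slightly better than plain tokenization when we don't perform any other preprocessing (0.4613 vs. 0.4505). Note that the lemmatized variant also lowercases the tokens, so part of the gain may come from case folding rather than from lemmatization itself.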



## **Step 7: Recreating the previous lab's results**

In [None]:
# Translation table for str.translate() that deletes punctuation: every code
# point in string.punctuation maps to None, which translate() drops.
punctuation_table = dict.fromkeys(map(ord, string.punctuation))

# Text normalisation: expand contractions, strip punctuation, lowercase
def previous_preprocessing(sentence):
    """Return ``sentence`` normalised in three steps.

    1. Contractions are expanded with ``contractions.fix``.
    2. Characters listed in ``punctuation_table`` are deleted (not replaced
       by spaces, so e.g. "5.30" becomes "530").
    3. The result is lower-cased.
    """
    expanded = contractions.fix(sentence)
    without_punctuation = expanded.translate(punctuation_table)
    return without_punctuation.lower()

# Preprocess a sentence and return its set of unique tokens
def preprocess_and_tokenize(sentence, preprocessing):
    """Apply ``preprocessing`` to ``sentence`` and return the set of its tokens.

    Parameters
    ----------
    sentence : str
        Raw sentence.
    preprocessing : callable
        Function taking the sentence and returning the cleaned string.
    """
    return set(word_tokenize(preprocessing(sentence)))


In [None]:
# Jaccard similarity of the preprocessed token sets of each sentence pair
dt['previous_best_jaccard'] = [
    compute_jaccard_similarity(
        preprocess_and_tokenize(first, previous_preprocessing),
        preprocess_and_tokenize(second, previous_preprocessing),
    )
    for first, second in zip(dt[0], dt[1])
]

## **Step 8: Performing the same preprocessing steps for lemmatization process**

In [None]:
def preprocess_and_lemmatize(sentence, preprocessing):
    """Preprocess ``sentence`` and return the set of its lemmas.

    The sentence is cleaned with ``preprocessing``, tokenized and POS-tagged
    in sentence order; each ``(word, tag)`` pair is then passed to
    ``lemmatize``.
    """
    cleaned = preprocessing(sentence)
    tagged_tokens = nltk.pos_tag(word_tokenize(cleaned))

    # Collect the lemmas in a set, so duplicates collapse.
    lemmas = set()
    for token, tag in tagged_tokens:
        lemmas.add(lemmatize((token, tag)))
    return lemmas

In [None]:
# Jaccard similarity of the preprocessed-and-lemmatized sets of each sentence pair
dt['jaccard_preprocessed_lemmatized'] = [
    compute_jaccard_similarity(
        preprocess_and_lemmatize(first, previous_preprocessing),
        preprocess_and_lemmatize(second, previous_preprocessing),
    )
    for first, second in zip(dt[0], dt[1])
]

In [None]:
# Preview the first rows with all similarity columns computed so far
dt.head()

Unnamed: 0,0,1,gs,gs_normalized,jaccard,jaccard_lemmatized,previous_best_jaccard,jaccard_preprocessed_lemmatized
0,The leaders have now been given a new chance a...,The leaders benefit aujourd' hui of a new luck...,4.5,0.9,0.346154,0.346154,0.391304,0.391304
1,Amendment No 7 proposes certain changes in the...,Amendment No 7 is proposing certain changes in...,5.0,1.0,0.785714,0.923077,0.769231,0.769231
2,Let me remind you that our allies include ferv...,I would like to remind you that among our alli...,4.25,0.85,0.391304,0.391304,0.380952,0.380952
3,The vote will take place today at 5.30 p.m.,The vote will take place at 5.30pm,4.5,0.9,0.545455,0.545455,0.6,0.6
4,"The fishermen are inactive, tired and disappoi...","The fishermen are inactive, tired and disappoi...",5.0,1.0,1.0,1.0,1.0,1.0


## **Step 9: Computing correlation between preprocessed tokenization and preprocessed lemmatization**

In [None]:
# Correlation with the gold standard for preprocessed tokenization ...
correlation = compute_pearson_correlation(dt['gs_normalized'], dt['previous_best_jaccard'])
print(f"Correlation comparing the Jaccard similarity based on preprocessing tokenization:  {correlation:.4f}")

# ... and for preprocessed lemmatization.
# Both names overwrite the values computed for the unpreprocessed variants.
correlation_lemmatized = compute_pearson_correlation(
    dt['gs_normalized'],
    dt['jaccard_preprocessed_lemmatized'],
)
print(f"Correlation comparing the Jaccard similarity based on preprocessing and lemmatization:  {correlation_lemmatized:.4f}")

Correlation comparing the Jaccard similarity based on preprocessing tokenization:  0.4823
Correlation comparing the Jaccard similarity based on preprocessing and lemmatization:  0.4762


## **Step 10: Adding some more preprocessing steps to lemmatization process**

In [None]:
# English stopwords from NLTK, stored as a set for fast membership tests
# when filtering tokens.
sw = set(stopwords.words('english'))

# NOTE: this redefinition replaces the Step 8 function of the same name for
# every cell executed after it; re-running the Step 8 cells afterwards would
# use this stopword-removing version.
def preprocess_and_lemmatize(sentence, preprocessing):
    """Preprocess ``sentence`` and return the set of its non-stopword lemmas.

    Parameters
    ----------
    sentence : str
        Raw sentence.
    preprocessing : callable
        Function taking the sentence and returning the cleaned string.

    Returns
    -------
    set of str
        Lemmas of all tokens that are not in the stopword set ``sw``.
    """
    # Preprocessing the sentence
    preprocessed_sentence = preprocessing(sentence)
    # Tokenize the preprocessed sentence
    tokens = word_tokenize(preprocessed_sentence)
    # POS-tag the full token list in sentence order. Tagging an unordered set
    # with the stopwords already removed would strip the context the tagger
    # uses and make the tags depend on set iteration order.
    pos_tags = nltk.pos_tag(tokens)
    # Drop stopwords only after tagging, then lemmatize what remains
    lemmatized_tokens = {
        lemmatize((word, pos)) for word, pos in pos_tags if word not in sw
    }
    return lemmatized_tokens



In [None]:
# Jaccard similarity of the preprocessed, stopword-free lemma sets of each sentence pair
dt['jaccard_lemmatized_sw'] = [
    compute_jaccard_similarity(
        preprocess_and_lemmatize(first, previous_preprocessing),
        preprocess_and_lemmatize(second, previous_preprocessing),
    )
    for first, second in zip(dt[0], dt[1])
]

In [None]:
# Correlation with the gold standard for preprocessing + stopword removal + lemmatization
correlation_lemmatized_sw = compute_pearson_correlation(
    dt['gs_normalized'],
    dt['jaccard_lemmatized_sw'],
)
print(
    f"Correlation comparing the Jaccard similarity based on preprocessing, removing stopwords and lemmatization:  {correlation_lemmatized_sw:.4f}"
)

Correlation comparing the Jaccard similarity based on preprocessing, removing stopwords and lemmatization:  0.4872


# **Question Answers**

1) Using lemmas is preferable, and it gives a better score because it converts the tokens into their base form (propose, proposed, proposing -> propose). This avoids treating different forms of the same word as different tokens.


2) No, because lemmas perform better here, as explained above. Word forms may perform better when we do stylistic or sentiment analysis. For example:

I was sick.

I am sick.

The difference between present and past tense can indicate different sentiments. Lemmatizing the verb loses this nuance. This kind of distinction matters more in poetic texts.


## **Conclusions:**
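In our previous lab we didn't remove stopwords, so when we use the same preprocessing with lemmas we get a slightly lower correlation (0.4762 vs. 0.4823). We attribute this to the stopwords that remain in the token sets: they increase the number of unique words, which means a higher dimensionality in the feature representation. Once stopwords are also removed (Step 10), lemmatization gives the best correlation (0.4872).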