## **Import the necessary libraries**

In [3]:
import nltk
import pandas as pd
from scipy.stats import pearsonr
from nltk.metrics import jaccard_distance
import string
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')


#from google.colab import drive
#drive.mount('/content/drive')
#sys.path.insert(0,'/content/drive/My Drive/Colab Notebooks/ihlt')
#from textserver import TextServer


#Create a user in TextServer and use that credentials

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
pip install contractions



In [5]:
import contractions

## **Read the pair of sentences**

In [6]:
dt = pd.read_csv('STS.input.SMTeuroparl.txt',sep='\t',header=None)
dt['gs'] = pd.read_csv('STS.gs.SMTeuroparl.txt',sep='\t',header=None)
dt['gs_normalized'] = dt['gs'] / 5
dt.head()

Unnamed: 0,0,1,gs,gs_normalized
0,The leaders have now been given a new chance a...,The leaders benefit aujourd' hui of a new luck...,4.5,0.9
1,Amendment No 7 proposes certain changes in the...,Amendment No 7 is proposing certain changes in...,5.0,1.0
2,Let me remind you that our allies include ferv...,I would like to remind you that among our alli...,4.25,0.85
3,The vote will take place today at 5.30 p.m.,The vote will take place at 5.30pm,4.5,0.9
4,"The fishermen are inactive, tired and disappoi...","The fishermen are inactive, tired and disappoi...",5.0,1.0


## **Transform NLTK POS_Tag into Wordnet ones**

In [7]:
def convert_pos_tag(pos_tag):
    # Map specific starting characters to their corresponding simplified tags
    if pos_tag.startswith('j'):
        return 'a'  # Convert adjective tags
    elif pos_tag.startswith('rb'):
        return 'r'  # Convert adverb tags
    return pos_tag[0]  # Default to the first letter ('n' for noun, 'v' for verb)


## Tokenize a **sentence**

In [8]:
def tokenize_sentence(sentence):
    return nltk.word_tokenize(sentence)

## **Extract Lesk synstes from a sentence**

In this function :

* Step 1 : Tag each word in the sentence with a POS tag using NLTK's pos_tag function. We convert the tags to Wordnet tags using convert_pos_tag and also convert words to lowercase for consistency.
* Step 2: Filter relevant POS tags by keeping only the open class words, which are most meaningful for word sense disambiguation.
* Step 3: Apply the Lesk algorithm to extract synsets




In [9]:
def extract_lesk_synsets(sentence):
    # POS tagging
    tagged_words = [(word.lower(), convert_pos_tag(tag.lower())) for word, tag in nltk.pos_tag(sentence)]
    # Filter relevant POS tags (nouns, verbs, adjectives, adverbs)
    relevant_words = [(word, tag) for word, tag in tagged_words if tag in ['n', 'v', 'a', 'r']]

    synsets = [
        nltk.wsd.lesk(sentence, word, tag).name()
        for word, tag in relevant_words if nltk.wsd.lesk(sentence, word, tag)
    ]
    return synsets

In [10]:
dt['synsets1'] = dt.apply(lambda row: set(extract_lesk_synsets(tokenize_sentence(row[0]))), axis=1)
dt['synsets2'] = dt.apply(lambda row: set(extract_lesk_synsets(tokenize_sentence(row[1]))), axis=1)
dt.head()

Unnamed: 0,0,1,gs,gs_normalized,synsets1,synsets2
0,The leaders have now been given a new chance a...,The leaders benefit aujourd' hui of a new luck...,4.5,0.9,"{give.v.43, now.r.06, assume.v.06, leadership....","{luck.n.03, profit.v.01, leadership.n.02, impo..."
1,Amendment No 7 proposes certain changes in the...,Amendment No 7 is proposing certain changes in...,5.0,1.0,"{propose.v.05, paragraph.v.02, change.n.09, no...","{propose.v.05, be.v.12, paragraph.v.02, change..."
2,Let me remind you that our allies include ferv...,I would like to remind you that among our alli...,4.25,0.85,"{allies.n.02, remind.v.01, tax.n.01, include.v...","{allies.n.02, remind.v.01, tax.n.01, wish.v.02..."
3,The vote will take place today at 5.30 p.m.,The vote will take place at 5.30pm,4.5,0.9,"{vote.n.05, today.n.02, take.v.24, home.n.01}","{vote.n.05, take.v.24, home.n.01}"
4,"The fishermen are inactive, tired and disappoi...","The fishermen are inactive, tired and disappoi...",5.0,1.0,"{tired.a.01, exist.v.01, dormant.a.02, fisherm...","{tired.a.01, exist.v.01, dormant.a.02, fisherm..."


## **Compute Jaccard Similarity**

In [11]:
dt['jaccard_lesk'] = dt.apply(lambda row: 1 - jaccard_distance(row['synsets1'], row['synsets2']), axis=1)
dt.head()

Unnamed: 0,0,1,gs,gs_normalized,synsets1,synsets2,jaccard_lesk
0,The leaders have now been given a new chance a...,The leaders benefit aujourd' hui of a new luck...,4.5,0.9,"{give.v.43, now.r.06, assume.v.06, leadership....","{luck.n.03, profit.v.01, leadership.n.02, impo...",0.214286
1,Amendment No 7 proposes certain changes in the...,Amendment No 7 is proposing certain changes in...,5.0,1.0,"{propose.v.05, paragraph.v.02, change.n.09, no...","{propose.v.05, be.v.12, paragraph.v.02, change...",0.666667
2,Let me remind you that our allies include ferv...,I would like to remind you that among our alli...,4.25,0.85,"{allies.n.02, remind.v.01, tax.n.01, include.v...","{allies.n.02, remind.v.01, tax.n.01, wish.v.02...",0.333333
3,The vote will take place today at 5.30 p.m.,The vote will take place at 5.30pm,4.5,0.9,"{vote.n.05, today.n.02, take.v.24, home.n.01}","{vote.n.05, take.v.24, home.n.01}",0.75
4,"The fishermen are inactive, tired and disappoi...","The fishermen are inactive, tired and disappoi...",5.0,1.0,"{tired.a.01, exist.v.01, dormant.a.02, fisherm...","{tired.a.01, exist.v.01, dormant.a.02, fisherm...",1.0


## **Compute Pearson Correlation**

In [12]:
def compute_pearson_correlation(refs, tsts):
    # Check if lengths of the inputs are the same
    if len(refs) != len(tsts):
        raise ValueError("The two input lists must have the same length.")

    # Compute Pearson correlation using scipy's pearsonr
    correlation, _ = pearsonr(refs, tsts)
    return correlation

In [13]:
correlation_lesk = compute_pearson_correlation(dt['gs_normalized'], dt['jaccard_lesk'])
print(f'Pearson Correlation: {correlation_lesk:.4f}')

Pearson Correlation: 0.4203


# **Comparison Lab 2**

In [14]:
stop_words = set(stopwords.words('english'))

In [15]:
def preprocess_tokenize(sentence):
    sentence =  contractions.fix(sentence)
    # Tokenize the preprocessed sentence and then remove stopwords
    tokens = tokenize_sentence(sentence)
    preprocessed = [ token for token in tokens if (token.lower() not in stop_words) and (token.isalnum())]
    return set(preprocessed)

In [16]:
dt['tokenized_1'] = dt.apply(lambda row: preprocess_tokenize(row[0]),axis = 1)
dt['tokenized_2'] = dt.apply(lambda row: preprocess_tokenize(row[1]),axis = 1)

In [17]:
dt['jaccard_tokenized'] = dt.apply(lambda row: 1 - jaccard_distance(row['tokenized_1'], row['tokenized_2']), axis=1)


In [18]:
dt[['tokenized_1','tokenized_2','gs_normalized','jaccard_tokenized','jaccard_lesk']].head()

Unnamed: 0,tokenized_1,tokenized_2,gs_normalized,jaccard_tokenized,jaccard_lesk
0,"{us, let, seize, new, leaders, hope, given, ch...","{us, let, seize, new, leaders, luck, hui, aujo...",0.9,0.384615,0.214286
1,"{proposes, paragraphs, 7, certain, Amendment, ...","{paragraphs, 7, proposing, certain, Amendment,...",1.0,0.75,0.666667
2,"{tax, Let, fervent, allies, remind, supporters...","{strong, like, would, tax, among, allies, remind}",0.85,0.272727,0.333333
3,"{take, place, today, vote}","{take, place, vote}",0.9,0.75,0.75
4,"{tired, fishermen, inactive, disappointed}","{tired, fishermen, inactive, disappointed}",1.0,1.0,1.0


In [19]:
correlation_tokenization = compute_pearson_correlation(dt['gs_normalized'], dt['jaccard_tokenized'])
print(f'Pearson Correlation: {correlation_tokenization:.4f}')

Pearson Correlation: 0.4672


# **Comparison Lab 3**

In [20]:
def lemmatize(p):

    wnl = WordNetLemmatizer()
    d = {
        'NN': 'n', 'NNS': 'n',
        'JJ': 'a', 'JJR': 'a', 'JJS': 'a',
        'VB': 'v', 'VBD': 'v', 'VBG': 'v',
        'VBN': 'v', 'VBP': 'v', 'VBZ': 'v',
        'RB': 'r', 'RBR': 'r', 'RBS': 'r'
    }
    if p[1] in d:
        return wnl.lemmatize(p[0], pos=d[p[1]])
    return p[0]


In [21]:
def preprocess_and_lemmatize(sentence):

    sentence =  contractions.fix(sentence)
    # Tokenize the preprocessed sentence and then remove stopwords
    tokens = tokenize_sentence(sentence)
    # Perform POS tagging on the tokens
    pos_tags = [(word.lower(), convert_pos_tag(tag.lower())) for word, tag in nltk.pos_tag(tokens)]
    # Lemmatize the tokens based on their POS tags
    lemmatized_tokens = {lemmatize((word, pos)) for word, pos in pos_tags}
    # Preprocessing the sentence
    preprocessed = [ token for token in lemmatized_tokens if (token.lower() not in stop_words) and (token.isalnum())]
    # Removing the stopwords
    return set(preprocessed)

In [22]:
dt['lemmatized_1'] = dt.apply(lambda row: preprocess_and_lemmatize(row[0]),axis = 1)
dt['lemmatized_2'] = dt.apply(lambda row: preprocess_and_lemmatize(row[1]),axis = 1)

In [23]:
dt['jaccard_lemmatized'] = dt.apply(lambda row: 1 - jaccard_distance(row['lemmatized_1'], row['lemmatized_2']), axis=1)


In [24]:
dt[['lemmatized_1','lemmatized_2','gs_normalized','jaccard_lemmatized','jaccard_lesk']].head()

Unnamed: 0,lemmatized_1,lemmatized_2,gs_normalized,jaccard_lemmatized,jaccard_lesk
0,"{us, let, seize, new, leaders, hope, given, ch...","{us, let, seize, new, leaders, luck, hui, aujo...",0.9,0.384615,0.214286
1,"{proposes, paragraphs, 7, certain, changes, am...","{paragraphs, 7, proposing, certain, changes, a...",1.0,0.75,0.666667
2,"{let, tax, fervent, allies, remind, supporters...","{strong, like, tax, would, among, allies, remind}",0.85,0.272727,0.333333
3,"{take, place, today, vote}","{take, place, vote}",0.9,0.75,0.75
4,"{tired, fishermen, inactive, disappointed}","{tired, fishermen, inactive, disappointed}",1.0,1.0,1.0


In [25]:
correlation_lemmatized = compute_pearson_correlation(dt['gs_normalized'], dt['jaccard_lemmatized'])
print(f'Pearson Correlation: {correlation_lemmatized:.4f}')

Pearson Correlation: 0.4810


# **Summary**

In [26]:
dt[['gs_normalized','jaccard_tokenized','jaccard_lemmatized','jaccard_lesk']].head()

Unnamed: 0,gs_normalized,jaccard_tokenized,jaccard_lemmatized,jaccard_lesk
0,0.9,0.384615,0.384615,0.214286
1,1.0,0.75,0.75,0.666667
2,0.85,0.272727,0.272727,0.333333
3,0.9,0.75,0.75,0.75
4,1.0,1.0,1.0,1.0


In [27]:
print(f'Pearson Correlation for tokenization: {correlation_tokenization:.4f}')
print(f'Pearson Correlation for lemmatization: {correlation_lemmatized:.4f}')
print(f'Pearson Correlation for WSD: {correlation_lesk:.4f}')

Pearson Correlation for tokenization: 0.4672
Pearson Correlation for lemmatization: 0.4810
Pearson Correlation for WSD: 0.4203


## **Analysis and Conclusions**

The ***lesk algorithm*** is a knowledge-based approach for Word Sense Disambiguation, which is the task of determining the correct sense/meaning of a word based on its context in a sentence. It is based on the assumption that words in a given "neighborhood" (section of text) will tend to share a common topic.

We know that in Wordnet each word can have multiple meanings/senses and each of this senses has a defintion called gloss and most of the times it has also example sentences. The Lesk algorithm tries to find the sense of a word by comparing its gloss with the context in which the word appears in the sentence or in the text.It measures the similarity between the words in the gloss and the words in the context and the sense with the most shared words between the gloss and the context is chosen as the correct meaning.

By this explenation we can understand that Lesk algorithm is not very effective. Let's list some limitations:
* Since it relies on exact word matching between the sense definitions and the context of the word, it cannot handle synonyms or paraphrases, meaning it won't recognize similarities if different words are used to express the same idea.
* It does not deeply understand the semantics of the words, which lead to incorrect disambiguation if the word usage in the context is complex or figurative.
* The Lesk algorithm does not take into account the order of words, their relationships, or grammatical structure. It only looks at individual words. This can lead to confusion in contexts where the meaning depends on how words are arranged.

The limitations of this algorithm can also be seen our implementation too.


As you can see, the correlation when using Lesk is lower compared to using some preprocessing and tokenization. This also reinforces the notion that Lesk has several disavantages.
Lesk also underperforms compared to tokenization inasmuch as only verbs, adjectives, adverbs and nouns are taken into consideration while in case of tokenization we took into consideration the whole sentence ( except the punctuation which does not include a semantic meaning). As we see, compared to just tokenization, lematizing performs better, hence it also performs better then lesk.

