# Lab 6: Sentence Similarity Analysis with Lemmas

Lab session by:
* Daniel Hess
* Pandelis Laurens Symeonidis

### Imports

In [28]:
import pandas as pd
import nltk
from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize
from nltk.metrics.distance import jaccard_distance
from scipy.stats import pearsonr

from textserver import TextServer

from nltk.corpus import stopwords 

### Data Loading

In [40]:
nltk.download('wordnet')
nltk.download('omw-1.4')

nltk.download('stopwords', quiet=True)

# Load dataframes
dt = pd.read_csv('./STS.input.SMTeuroparl.txt', sep='\t', header=None)
gold_standard_dt = pd.read_csv('./STS.gs.SMTeuroparl.txt', sep='\t', header=None)

LIMIT = 2

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Daniel\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Daniel\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [30]:
def convert_category(word: str, penn_tag: str):
    d = {'NN': 'n', 'NNS': 'n',
          'JJ': 'a', 'JJR': 'a', 'JJS': 'a',
            'VB': 'v', 'VBD': 'v', 'VBG': 'v', 'VBN': 'v', 'VBP': 'v', 'VBZ': 'v', 
            'RB': 'r', 'RBR': 'r', 'RBS': 'r'} 
    
    if penn_tag in d: 
        return word, d[penn_tag]
    else:
        return word, None 

def sentence_preparation(sentence: str):
    # Tokenize & optional simple filtering
    tokens = word_tokenize(sentence)

    # POS-tag (Penn)
    penn_tagged = nltk.pos_tag(tokens)

    # Map to WordNet POS and WSD
    context = tokens  # Contex =  the tokenized sentence
    items = []

    for (w, p) in penn_tagged:
        w2, wn_pos = convert_category(w, p)

        key = None
        if wn_pos is not None:
        
            syn = nltk.wsd.lesk(context, w, wn_pos)
            if syn is None:
                key = w
            else:
                key = syn.name()
        else:
            key = w

        items.append(key)

    return items

def lesk_similarities(dt):

    wsd1 = dt[0].apply(sentence_preparation)
    wsd2 = dt[1].apply(sentence_preparation)

    # print(wsd1.head())
    # print(wsd2.head())

    jaccard_distances_lesk = [jaccard_distance(set(s1), set(s2)) for s1, s2 in zip(wsd1, wsd2)]
    similarities_lesk = [1 - d for d in jaccard_distances_lesk]

    return similarities_lesk

# Try 
# Lemmatize (lowers)
# remove punctuation
# regex preprocess
# lowercasing





In [41]:

def get_synset_from_textserver_result(x):
    if x[-1] == 'N/A':
        return x[0] # return token
    id = x[-1]
    pos, offset = id.split("-")
    pos = int(pos)
    synset = wn.synset_from_pos_and_offset(offset, pos)
    return synset.name()



def UKB_similarities(dt):
    ts = TextServer('pandelis2', '76Qc#iD8L', 'senses')
    dt = dt.head(LIMIT)
    dt = dt.applymap(lambda sentence: ts.senses(sentence))

    dt_synsets = dt.applymap(lambda senses: [get_synset_from_textserver_result(x) for x in senses[0]])
    
    jaccard_distances = [jaccard_distance(set(s1), set(s2)) for s1, s2 in zip(dt_synsets[0], dt_synsets[1])]
    similarities = [1 - d for d in jaccard_distances]

    return similarities
    

In [42]:
similarities_lesk = lesk_similarities(dt.copy())
similarities_UKB = UKB_similarities(dt.copy())

  dt = dt.applymap(lambda sentence: ts.senses(sentence))
  dt_synsets = dt.applymap(lambda senses: [get_synset_from_textserver_result(x) for x in senses[0]])


In [44]:
print(len(similarities_UKB))
print(len(gold_standard_dt[0]))

corr_lesk = pearsonr(similarities_lesk, gold_standard_dt[0])[0] # Calculate Pearson coefficient between similarity scores and gold standard
corr_UKB = pearsonr(similarities_UKB, gold_standard_dt[0][:2])[0] # Calculate Pearson coefficient between similarity scores and gold standard

print(corr_lesk)
print(corr_UKB)

2
459
0.39563254052490154
1.0
