In [None]:
import nltk
import pandas as pd
from nltk.metrics.distance import jaccard_distance
from scipy.stats import pearsonr
import re

# Fetch the NLTK resources this notebook relies on. The flag in each pair is
# forwarded as `quiet`, so the last two downloads produce no console output.
for resource, silent in (
    ('averaged_perceptron_tagger_eng', False),
    ('wordnet', False),
    ('omw-1.4', False),
    ('punkt_tab', True),
    ('stopwords', True),
):
    nltk.download(resource, quiet=silent)

# Both inputs are tab-separated files without a header row.
dt = pd.read_csv('./STS.input.SMTeuroparl.txt', sep='\t', header=None)
gold_standard_dt = pd.read_csv('./STS.gs.SMTeuroparl.txt', sep='\t', header=None)

# Keep a copy of the raw input for inspection.
dt.to_csv("my_dataframe_orig.csv", index=False, encoding="utf-8")


def _clean_cell(text):
    """Replace each run of characters other than letters, digits, ',' and '.'
    with a single space, then trim surrounding whitespace."""
    return re.sub(r'[^a-zA-Z0-9,.]+', ' ', text).strip()


# Element-wise cleanup of every cell in the frame.
dt_processed = dt.map(_clean_cell)
print(dt_processed)

# Tokenize every cell, then attach a POS tag to every token.
dt_tokenized = dt_processed.map(nltk.word_tokenize)
dt_postag = dt_tokenized.map(nltk.pos_tag)

# Shared lemmatizer instance used by lemmatize() below.
wnl = nltk.stem.WordNetLemmatizer()
def lemmatize(p):
    """Lemmatize one (token, tag) pair with the module-level ``wnl`` lemmatizer.

    Args:
        p: An indexable pair whose first item is the token and whose second
            item is its POS tag (e.g. ``'NN'``, ``'VBD'``).

    Returns:
        ``wnl.lemmatize(token, pos=...)`` when the tag is one of the noun,
        adjective, verb or adverb tags listed below; otherwise the token
        unchanged.
    """
    token, tag = p[0], p[1]

    # Map each supported tag to the single-letter POS code passed to
    # wnl.lemmatize: n = noun, a = adjective, v = verb, r = adverb.
    tag_to_pos = {
        tag_name: pos_code
        for pos_code, tag_names in (
            ('n', ('NN', 'NNS')),
            ('a', ('JJ', 'JJR', 'JJS')),
            ('v', ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ')),
            ('r', ('RB', 'RBR', 'RBS')),
        )
        for tag_name in tag_names
    }

    pos_code = tag_to_pos.get(tag)
    if pos_code is None:
        # Any other tag is deliberately left alone in this version.
        return token
    return wnl.lemmatize(token, pos=pos_code)

# Persist the cleaned (pre-tokenization) sentences.
dt_processed.to_csv("processed.csv", index=False, encoding="utf-8")


def _lemmatize_tagged(tagged_tokens):
    """Apply lemmatize() to every (token, tag) pair of one cell."""
    return [lemmatize(pair) for pair in tagged_tokens]


def _lowercase_tokens(tokens):
    """Lower-case every token of one cell."""
    return [token.lower() for token in tokens]


def _row_jaccard_distance(row):
    """Jaccard distance between the token sets of columns 0 and 1 of a row."""
    first_tokens = set(row[0])
    second_tokens = set(row[1])
    return jaccard_distance(first_tokens, second_tokens)


dt_lemma = dt_postag.map(_lemmatize_tagged)
print(dt.head())
dt_lower = dt_lemma.map(_lowercase_tokens)

# One distance per sentence pair (row-wise apply).
jaccard_distances = dt_lower.apply(_row_jaccard_distance, axis=1)
print(jaccard_distances.head())

# Turn distances into similarities and correlate them with the first column
# of the gold-standard frame; element 0 of the pearsonr result is kept.
similarity_scores = 1 - jaccard_distances
pearson_result = pearsonr(similarity_scores, gold_standard_dt[0])
corr = pearson_result[0]
print(corr)


# Persist the lower-cased lemma lists.
dt_lower.to_csv("my_dataframe_lower.csv", index=False, encoding="utf-8")


# When comparing approaches, apply the same preprocessing to each. Preprocessing the text before running the POS tagger is wrong because the tagger is trained on normal sentences, so order is important. To apply the same preprocessing as in the previous session, first tokenize and get the POS tags; after that, any other preprocessing can be applied.
# The list of POS tags differs per model.
# Will lemmas always be better? If we think not, give a counterexample.

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\Daniel\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Daniel\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Daniel\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


                                                     0  \
0    The leaders have now been given a new chance a...   
1    Amendment No 7 proposes certain changes in the...   
2    Let me remind you that our allies include ferv...   
3           The vote will take place today at 5 30 p m   
4    The fishermen are inactive, tired and disappoi...   
..                                                 ...   
454  It is our job to continue to support Latvia wi...   
455         The vote will take place today at 5 30 p m   
456  Neither was there a qualified majority within ...   
457  Let me remind you that our allies include ferv...   
458  We often pontificate here about being the repr...   

                                                     1  
0    The leaders benefit aujourd hui of a new luck ...  
1    Amendment No 7 is proposing certain changes in...  
2    I would like to remind you that among our alli...  
3                   The vote will take place at 5 30pm  
4    The fishermen

In [22]:
# Print column 0 of the tokenized frame, then column 0 of the lemmatized
# frame, so the two can be compared.
for frame in (dt_tokenized, dt_lemma):
    print(frame[0])

0      [The, leaders, have, now, been, given, a, new,...
1      [Amendment, No, 7, proposes, certain, changes,...
2      [Let, me, remind, you, that, our, allies, incl...
3      [The, vote, will, take, place, today, at, 5.30...
4      [The, fishermen, are, inactive, ,, tired, and,...
                             ...                        
454    [It, is, our, job, to, continue, to, support, ...
455    [The, vote, will, take, place, today, at, 5.30...
456    [Neither, was, there, a, qualified, majority, ...
457    [Let, me, remind, you, that, our, allies, incl...
458    [We, often, pontificate, here, about, being, t...
Name: 0, Length: 459, dtype: object
0      [The, leader, have, now, be, give, a, new, cha...
1      [Amendment, No, 7, propose, certain, change, i...
2      [Let, me, remind, you, that, our, ally, includ...
3      [The, vote, will, take, place, today, at, 5.30...
4      [The, fisherman, be, inactive, ,, tired, and, ...
                             ...                    