In [587]:
import re
import string
import unicodedata
from time import time

import numpy as np
import pandas as pd

from numpy.linalg import norm

import nltk
from nltk.corpus import stopwords
from nltk.corpus import brown, movie_reviews, gutenberg, treebank, inaugural 
from nltk.corpus import wordnet as wn

from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('brown')
nltk.download('wordnet')
nltk.download('treebank')
nltk.download('inaugural')
nltk.download('gutenberg')
nltk.download('stopwords')
nltk.download('movie_reviews')

import multiprocessing
from gensim.models import Word2Vec

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package inaugural to /root/nltk_data...
[nltk_data]   Unzipping corpora/inaugural.zip.
[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


In [89]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Training

## Dataset

The dataset used to train the word embeddings model using gensim from scratch consists of 5 different datasets available through *nltk*:


1.   [Brown Corpus](http://korpus.uib.no/icame/brown/bcm.html)
2.   Movie Reviews
3.   Inaugural
4.   Gutenberg Books
5.   Stanford Sentiment Treebank



In [90]:
brown_dataset = []
for sent in brown.sents():
  sentence = ' '.join(sent)
  brown_dataset.append(sentence)

In [91]:
df1 = pd.DataFrame(brown_dataset, columns = ['text'])
df1.describe()

Unnamed: 0,text
count,57340
unique,56418
top,)
freq,58


In [92]:
# eliminate empty rows
df1 = df1.dropna().reset_index(drop=True)
df1.isnull().sum()

# eliminate duplicates
df1.drop_duplicates(inplace=True)
df1

Unnamed: 0,text
0,The Fulton County Grand Jury said Friday an in...
1,The jury further said in term-end presentments...
2,The September-October term jury had been charg...
3,`` Only a relative handful of such reports was...
4,The jury said it did find that many of Georgia...
...,...
57335,S. J. Perelman
57336,revulsion in the desert
57337,"the doors of the D train slid shut , and as I ..."
57338,She was a living doll and no mistake -- the bl...


In [588]:
inaugural_dataset = []
for sent in treebank.sents():
  sentence = ' '.join(sent)
  inaugural_dataset.append(sentence)

In [589]:
df3 = pd.DataFrame(inaugural_dataset, columns = ['text'])
df3.describe()

Unnamed: 0,text
count,3914
unique,3904
top,Terms were n't disclosed *-1 .
freq,8


In [590]:
# eliminate empty rows
df3 = df5.dropna().reset_index(drop=True)
df3.isnull().sum()

# eliminate duplicates
df3.drop_duplicates(inplace=True)
df3

Unnamed: 0,text
0,"Pierre Vinken , 61 years old , will join the b..."
1,"Mr. Vinken is chairman of Elsevier N.V. , the ..."
2,"Rudolph Agnew , 55 years old and former chairm..."
3,A form of asbestos once used * * to make Kent ...
4,"The asbestos fiber , crocidolite , is unusuall..."
...,...
3899,"Moreover , they said 0 the first appropriation..."
3900,They also said that more than a dozen presiden...
3901,Sen. Kennedy said in a separate statement that...
3902,Trinity Industries Inc. said 0 it reached a pr...


In [220]:
treebank_dataset = []
for sent in treebank.sents():
  sentence = ' '.join(sent)
  treebank_dataset.append(sentence)

In [221]:
df5 = pd.DataFrame(treebank_dataset, columns = ['text'])
df5.describe()

Unnamed: 0,text
count,3914
unique,3904
top,Terms were n't disclosed *-1 .
freq,8


In [222]:
# eliminate empty rows
df5 = df5.dropna().reset_index(drop=True)
df5.isnull().sum()

# eliminate duplicates
df5.drop_duplicates(inplace=True)
df5

Unnamed: 0,text
0,"Pierre Vinken , 61 years old , will join the b..."
1,"Mr. Vinken is chairman of Elsevier N.V. , the ..."
2,"Rudolph Agnew , 55 years old and former chairm..."
3,A form of asbestos once used * * to make Kent ...
4,"The asbestos fiber , crocidolite , is unusuall..."
...,...
3908,"Moreover , they said 0 the first appropriation..."
3909,They also said that more than a dozen presiden...
3910,Sen. Kennedy said in a separate statement that...
3911,Trinity Industries Inc. said 0 it reached a pr...


In [158]:
gutenberg_dataset = []
for sent in gutenberg.sents():
  sentence = ' '.join(sent)
  gutenberg_dataset.append(sentence)

In [159]:
df4 = pd.DataFrame(gutenberg_dataset, columns = ['text'])
df4.describe()

Unnamed: 0,text
count,98552
unique,93572
top,Ham .
freq,337


In [160]:
# eliminate empty rows
df4 = df4.dropna().reset_index(drop=True)
df4.isnull().sum()

# eliminate duplicates
df4.drop_duplicates(inplace=True)
df4

Unnamed: 0,text
0,[ Emma by Jane Austen 1816 ]
1,VOLUME I
2,CHAPTER I
3,"Emma Woodhouse , handsome , clever , and rich ..."
4,She was the youngest of the two daughters of a...
...,...
98547,Now for my last -- let me look back a moment ;...
98548,"Long have we lived , joy ' d , caress ' d toge..."
98549,"Yet let me not be too hasty , Long indeed have..."
98550,May - be it is you the mortal knob really undo...


In [94]:
movie_reviews_dataset = []
for sent in movie_reviews.sents():
  sentence = ' '.join(sent)
  movie_reviews_dataset.append(sentence)

In [95]:
df2 = pd.DataFrame(movie_reviews_dataset, columns = ['text'])
df2.describe()

Unnamed: 0,text
count,71532
unique,66014
top,.
freq,3079


In [96]:
# eliminate empty rows
df2 = df2.dropna().reset_index(drop=True)
df2.isnull().sum()

# eliminate duplicates
df2.drop_duplicates(inplace=True)
df2

Unnamed: 0,text
0,"plot : two teen couples go to a church party ,..."
1,they get into an accident .
2,"one of the guys dies , but his girlfriend cont..."
3,what ' s the deal ?
4,"watch the movie and "" sorta "" find out ."
...,...
71527,"it ' s a quick , straight shot to the movie ' ..."
71528,"in terms of overall quality , i would compare ..."
71529,both films are well made with interesting stor...
71530,but neither film really felt like it capitaliz...


Before training the embedding model, we process the text. The preprocessing stage consists of the following procedure:


*   The text is decoded and then normalized, i.e., the data is transformed from complex symbols into simple characters. Characters can be subjected to various forms of encoding, such as Latin, ISO/IEC 8859-1, etc. Therefore, for better analysis, it is necessary to keep the data in a standard encoding format. For this requirement, we choose UTF-8 encoding because it is widely accepted and often recommended.
*   We eliminate all numeric and special characters.
*   Any letter repeated more than three times in a row is replaced by two repetitions of the same letter as the usual rules of English spelling forbid triple letters (for example "coooool" is replaced by "cool")
*   Split the text in smaller units (word tokens).



In [101]:
def normalize_text(x):
    x = unicodedata.normalize('NFKD', x).encode('ascii', 'ignore').decode('utf-8', 'ignore') # remove accented chars
    return x

"""
    Any letter repeated more than three times in a row is replaced by two repetitions of the same letter
"""
def remove_multiple_occurences(text):
    n = len(text)

    if n < 3:
        return text

    i, count = 0, 0
    while i < n - 1:
        i += 1
        if text[i] != text[i-1]:
            count = 0
        else:
            count += 1
            if count >= 2:
                text = text[:i] + text[i+1:]
                n -= 1
                i -= 1

    return text


In [138]:
from nltk.tokenize import word_tokenize

def preprocessing(text_dataframe, set_stopwords=None): 
  # STEP 1: Normalization
  text_dataframe['text'] = text_dataframe['text'].apply(lambda x: normalize_text(x))
  # STEP 2: Eliminate special characters
  special_characters = string.punctuation + string.digits
  text_dataframe['text'] = text_dataframe['text'].apply(lambda x : x.translate(str.maketrans(' ', ' ', special_characters)))
  # STEP 3: Replace any sequence of the same letter of length greater than 2 with a sequence of length 2
  text_dataframe['text'] = text_dataframe['text'].apply(remove_multiple_occurences)
  # STEP 4: Lowercasing
  text_dataframe['text'] = text_dataframe['text'].apply(lambda x: x.lower())
  # STEP 5: Tokenization
  text_dataframe['words'] = text_dataframe['text'].apply(lambda x: word_tokenize(x))
  # Optional: eliminate stopwords
  if set_stopwords is not None:
    text_dataframe['words'] = text_dataframe['words'].apply(lambda x: [w for w in x if w not in set_stopwords])
  
  """
    The central idea of Word2Vec is Skipgram with Negative Sampling, so it uses
    context words to learn the embeddings. If a sentence has less than 4 words,
    we will eliminate it.
  """
  text_dataframe['num_words'] = text_dataframe['words'].apply(lambda x: len(x))
  text_dataframe = text_dataframe[text_dataframe['num_words'] > 4]

  return text_dataframe

In [591]:
df = pd.concat([df1, df2, df3, df4, df5], axis=0, ignore_index=True)
df

Unnamed: 0,text
0,The Fulton County Grand Jury said Friday an in...
1,The jury further said in term-end presentments...
2,The September-October term jury had been charg...
3,`` Only a relative handful of such reports was...
4,The jury said it did find that many of Georgia...
...,...
223807,"Moreover , they said 0 the first appropriation..."
223808,They also said that more than a dozen presiden...
223809,Sen. Kennedy said in a separate statement that...
223810,Trinity Industries Inc. said 0 it reached a pr...


In [592]:
df = preprocessing(df, set_stopwords=stopwords.words('english'))
df

Unnamed: 0,text,words,num_words
0,the fulton county grand jury said friday an in...,"[fulton, county, grand, jury, said, friday, in...",16
1,the jury further said in termend presentments ...,"[jury, said, termend, presentments, city, exec...",18
2,the septemberoctober term jury had been charge...,"[septemberoctober, term, jury, charged, fulton...",20
3,only a relative handful of such reports was r...,"[relative, handful, reports, received, jury, s...",14
4,the jury said it did find that many of georgia...,"[jury, said, find, many, georgias, registratio...",12
...,...,...,...
223807,moreover they said the first appropriations ...,"[moreover, said, first, appropriations, bill, ...",15
223808,they also said that more than a dozen presiden...,"[also, said, dozen, presidents, ppa, called, l...",19
223809,sen kennedy said in a separate statement that ...,"[sen, kennedy, said, separate, statement, supp...",24
223810,trinity industries inc said it reached a prel...,"[trinity, industries, inc, said, reached, prel...",14


## Training the word embeddings from scratch

In [738]:
cores = multiprocessing.cpu_count()

In [739]:
# initialize model
model = Word2Vec(min_count=50,
                 window=15,           # Bigger window size => relatedness
                 size=150,            # We choose *size* parameter to be 150 due to our small dataset
                 sg=1,                # trainig algorithm
                #  sample=1e-7,
                #  alpha=0.015,
                #  min_alpha=0.0005,
                #  negative=7,        
                 workers=cores)

In [740]:
t = time()

model.build_vocab(df['words'], progress_per=10000)

print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

Time to build vocab: 0.05 mins


In [741]:
model.corpus_count

174621

In [742]:
t = time()

model.train(df['words'], total_examples=model.corpus_count, epochs=model.iter + 2, report_delay=1)

print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

  This is separate from the ipykernel package so we can avoid doing imports until


Time to train the model: 2.48 mins


## Exploring the model

In [743]:
print('Most similar to \'number\':\n', model.wv.most_similar(positive=["number"]))

Most similar to 'number':
 [('numbers', 0.5407553911209106), ('numbered', 0.5209192037582397), ('fifty', 0.46774619817733765), ('thousand', 0.4514497220516205), ('sum', 0.44452738761901855), ('males', 0.4422333836555481), ('nine', 0.44021376967430115), ('eight', 0.4339964687824249), ('threescore', 0.4332047402858734), ('according', 0.42926108837127686)]


In [744]:
print('Most similar to \'modern\':\n', model.wv.most_similar(positive=["modern"]))

Most similar to 'modern':
 [('culture', 0.5394097566604614), ('designs', 0.5135877132415771), ('contemporary', 0.49742984771728516), ('myth', 0.4955432415008545), ('tradition', 0.4731130599975586), ('era', 0.4685472846031189), ('greek', 0.46849197149276733), ('civilization', 0.4638400077819824), ('primitive', 0.46175244450569153), ('inspiration', 0.4594225287437439)]


In [745]:
print('Most similar to \'expenses\':\n', model.wv.most_similar(positive=["expenses"]))

Most similar to 'expenses':
 [('taxes', 0.7442578077316284), ('wages', 0.6972888708114624), ('income', 0.6941887736320496), ('revenue', 0.6820709705352783), ('estimated', 0.662767767906189), ('expense', 0.6597381234169006), ('salary', 0.639643669128418), ('excess', 0.6345015168190002), ('secured', 0.623443603515625), ('exceed', 0.622524082660675)]


# Evaluation

### WordNet coverage
Percentage of all words in your embedding space which can be found in WordNet. For each word, you will check whether he word or its lemma occurs in a synset in WordNet.

In [746]:
lemmatizer = WordNetLemmatizer()

In [747]:
vocab = list(model.wv.vocab)

In [748]:
wn_lemmas = set(wn.all_lemma_names())

count = 0
for word in vocab:
  lemmatized_word = lemmatizer.lemmatize(word)
  if word in wn_lemmas or lemmatized_word in wn_lemmas:
    count += 1

print('WordNet coverage: %.2f' % ((count / len(vocab)) * 100.))

WordNet coverage: 88.41


### Precision, Recall, F1-score

In [761]:
rnd_ind = np.random.randint(len(vocab), size=(1000))
vocab = np.array(vocab)
words_pairs = [(w1, w2) for i, w1 in enumerate(vocab[rnd_ind]) for w2 in vocab[rnd_ind][i+1:] if w1 != w2]
words_pairs

[('cops', 'associate'),
 ('cops', 'dave'),
 ('cops', 'light'),
 ('cops', 'programs'),
 ('cops', 'workers'),
 ('cops', 'live'),
 ('cops', 'able'),
 ('cops', 'tide'),
 ('cops', 'doctors'),
 ('cops', 'buster'),
 ('cops', 'opportunity'),
 ('cops', 'vile'),
 ('cops', 'thunder'),
 ('cops', 'health'),
 ('cops', 'sister'),
 ('cops', 'conceive'),
 ('cops', 'calm'),
 ('cops', 'editing'),
 ('cops', 'bleak'),
 ('cops', 'nov'),
 ('cops', 'donnell'),
 ('cops', 'dropped'),
 ('cops', 'dixon'),
 ('cops', 'magnitude'),
 ('cops', 'surprised'),
 ('cops', 'pressures'),
 ('cops', 'abilities'),
 ('cops', 'clothing'),
 ('cops', 'leo'),
 ('cops', 'attended'),
 ('cops', 'tough'),
 ('cops', 'passed'),
 ('cops', 'rarely'),
 ('cops', 'elliot'),
 ('cops', 'par'),
 ('cops', 'vehicle'),
 ('cops', 'alicia'),
 ('cops', 'realistic'),
 ('cops', 'temper'),
 ('cops', 'barred'),
 ('cops', 'always'),
 ('cops', 'protestant'),
 ('cops', 'saint'),
 ('cops', 'top'),
 ('cops', 'sweet'),
 ('cops', 'evening'),
 ('cops', 'emma'),
 (

In [762]:
cosine_threshold = 0.55

S_emb = []
for words_pair in words_pairs:
  v1 = model.wv[words_pair[0]]
  v2 = model.wv[words_pair[1]]
  cosine_similarity = np.dot(v1,v2) / (norm(v1) * norm(v2))
  if cosine_similarity > cosine_threshold:
    S_emb.append(words_pair)

In [763]:
S_emb = set(S_emb)
S_emb

{('vpon', 'tis'),
 ('ltd', 'acquisition'),
 ('hughes', 'infamous'),
 ('recovered', 'harville'),
 ('client', 'funding'),
 ('passed', 'entered'),
 ('filmmaking', 'accuracy'),
 ('divide', 'inheritance'),
 ('harville', 'leisure'),
 ('dixon', 'affectionate'),
 ('ltd', 'units'),
 ('measuring', 'height'),
 ('slapstick', 'originality'),
 ('boyfriend', 'pregnant'),
 ('boyfriend', 'connor'),
 ('nov', 'calif'),
 ('sore', 'zedekiah'),
 ('expanding', 'consumer'),
 ('heareth', 'preach'),
 ('essence', 'aspects'),
 ('turtle', 'rabbit'),
 ('regardless', 'purely'),
 ('knowest', 'didst'),
 ('multiple', 'defined'),
 ('knowest', 'snare'),
 ('frightening', 'remotely'),
 ('planning', 'defense'),
 ('creative', 'scorsese'),
 ('elinor', 'sincerity'),
 ('flood', 'pestilence'),
 ('transaction', 'units'),
 ('infamous', 'derek'),
 ('significance', 'integration'),
 ('functions', 'objective'),
 ('regardless', 'objective'),
 ('lacking', 'originality'),
 ('associate', 'student'),
 ('analysis', 'objective'),
 ('programs

In [764]:
S_wn = []
for words_pair in words_pairs:
  lemma_w1 = lemmatizer.lemmatize(words_pair[0])
  lemma_w2 = lemmatizer.lemmatize(words_pair[1])
  synsets_w1 = set(wn.synsets(lemma_w1)) 
  synsets_w2 = set(wn.synsets(lemma_w2))
  if synsets_w1.intersection(synsets_w2):
    S_wn.append(words_pair)

In [765]:
S_wn = set(S_wn)
S_wn

{('abuse', 'shouted'),
 ('achieved', 'accomplished'),
 ('acquired', 'producing'),
 ('acts', 'bit'),
 ('acts', 'moves'),
 ('acts', 'pretend'),
 ('acts', 'representing'),
 ('acts', 'work'),
 ('advanced', 'raising'),
 ('advancing', 'advanced'),
 ('advancing', 'encourage'),
 ('advancing', 'raising'),
 ('advantages', 'advantage'),
 ('advised', 'suggests'),
 ('allowed', 'grant'),
 ('allowed', 'provides'),
 ('allowed', 'reserve'),
 ('always', 'constantly'),
 ('ample', 'riches'),
 ('answering', 'resolve'),
 ('answering', 'serve'),
 ('answering', 'serving'),
 ('appoint', 'names'),
 ('approach', 'attack'),
 ('arriving', 'arrived'),
 ('aspects', 'expression'),
 ('associate', 'associated'),
 ('associate', 'associates'),
 ('associate', 'companions'),
 ('associate', 'comrades'),
 ('associate', 'familiar'),
 ('associates', 'associated'),
 ('associates', 'comrades'),
 ('assumed', 'acquired'),
 ('assured', 'promising'),
 ('assured', 'secure'),
 ('assured', 'sees'),
 ('astonishment', 'amazement'),
 ('at

In [766]:
precision = len(S_emb.intersection(S_wn)) / len(S_emb)
recall = len(S_emb.intersection(S_wn)) / len(S_wn)
f1_score = 2 * precision * recall / (precision + recall)

print('Precision: ', precision)
print('Recall: ', recall)
print('F1_score: ', f1_score)

Precision:  0.014977973568281937
Recall:  0.03195488721804511
F1_score:  0.02039592081583683


### Coverage, precisions and recall errors
Find the words pairs which do not fit the coverage, precision and recall criteria:

1.   *coverage errors*: words in the embedding space which are not covered in WordNet
2.   *precision errors*: word pairs which are considered synonyms in the embedding space but do not appear in the same synset in WordNet
3.   *recall errors*: word pairs which are not considered synonyms in the embedding space in the embedding space, but do appear in the same synset in WordNet



**Coverage errors**

In [767]:
vocabulary = set(vocab)

print(vocabulary.difference(wn_lemmas))

{'facts', 'chaldeans', 'heauen', 'permitted', 'payments', 'reporters', 'attacked', 'receives', 'carrying', 'presents', 'ceased', 'associates', 'paintings', 'started', 'susan', 'fowls', 'tunes', 'du', 'viewed', 'discovers', 'spends', 'seas', 'besought', 'founded', 'exceptions', 'manasseh', 'reveals', 'advantages', 'jon', 'rnr', 'studies', 'connor', 'harville', 'introduces', 'ratings', 'knees', 'tones', 'standards', 'began', 'differences', 'expenses', 'dashwood', 'cecilia', 'prophesied', 'files', 'forests', 'fi', 'rhode', 'ere', 'touches', 'backs', 'studios', 'lands', 'cherubims', 'evils', 'sci', 'walks', 'amorites', 'finest', 'worlds', 'followed', 'bodies', 'wrote', 'accounts', 'americans', 'buddies', 'kellynch', 'intervals', 'everybody', 'arrives', 'dwelt', 'waves', 'surprises', 'loue', 'enemies', 'locks', 'enjoying', 'shapes', 'uses', 'taxes', 'creatures', 'sends', 'cubits', 'projects', 'streets', 'deserves', 'arquette', 'lebowski', 'purposes', 'prayed', 'troopers', 'throwing', 'descr

**Precision errors**

In [768]:
print(S_emb.difference(S_wn))

{('vpon', 'tis'), ('ltd', 'acquisition'), ('hughes', 'infamous'), ('recovered', 'harville'), ('client', 'funding'), ('passed', 'entered'), ('filmmaking', 'accuracy'), ('divide', 'inheritance'), ('harville', 'leisure'), ('dixon', 'affectionate'), ('ltd', 'units'), ('measuring', 'height'), ('slapstick', 'originality'), ('boyfriend', 'pregnant'), ('boyfriend', 'connor'), ('nov', 'calif'), ('sore', 'zedekiah'), ('expanding', 'consumer'), ('heareth', 'preach'), ('essence', 'aspects'), ('turtle', 'rabbit'), ('regardless', 'purely'), ('knowest', 'didst'), ('multiple', 'defined'), ('knowest', 'snare'), ('frightening', 'remotely'), ('planning', 'defense'), ('creative', 'scorsese'), ('elinor', 'sincerity'), ('flood', 'pestilence'), ('transaction', 'units'), ('infamous', 'derek'), ('significance', 'integration'), ('functions', 'objective'), ('regardless', 'objective'), ('lacking', 'originality'), ('associate', 'student'), ('analysis', 'objective'), ('programs', 'planning'), ('jacket', 'sticking')

**Recall errors**

In [769]:
print(S_wn.difference(S_emb))

{('levels', 'degree'), ('borne', 'assumed'), ('forcing', 'drove'), ('riding', 'drive'), ('passed', 'given'), ('ate', 'feeding'), ('model', 'patterns'), ('killer', 'killers'), ('forcing', 'drives'), ('points', 'tip'), ('essence', 'centers'), ('pick', 'foot'), ('principal', 'master'), ('centers', 'meat'), ('state', 'expressed'), ('reserve', 'held'), ('pushing', 'drive'), ('serving', 'portion'), ('check', 'stops'), ('consider', 'dealt'), ('consider', 'sees'), ('directed', 'pointed'), ('flowing', 'feeding'), ('passed', 'occurs'), ('ways', 'agency'), ('talents', 'gifts'), ('allowed', 'grant'), ('choices', 'pick'), ('riding', 'drove'), ('rate', 'value'), ('passing', 'eliminate'), ('focused', 'centre'), ('shed', 'cast'), ('live', 'experienced'), ('persuade', 'carrying'), ('essence', 'substance'), ('points', 'details'), ('stir', 'raising'), ('focused', 'center'), ('points', 'directed'), ('running', 'carrying'), ('minor', 'small'), ('work', 'functions'), ('approach', 'attack'), ('shown', 'demon

Experiment with the word similarity threshold used to decide word embedding synonymy (select a sample of different values) and find the optimal one thatmaximizes the F1-score computed above.

In [770]:
thresholds = np.arange(start=0.25, stop=0.99, step=0.05)

In [771]:
optim, best_f1_score = 0., 0.
for cosine_threshold in thresholds:
  S_emb = []
  for words_pair in words_pairs:
    v1 = model.wv[words_pair[0]]
    v2 = model.wv[words_pair[1]]
    cosine_similarity = np.dot(v1,v2) / (norm(v1) * norm(v2))
    if cosine_similarity > cosine_threshold:
      S_emb.append(words_pair)
  
  S_emb = set(S_emb)

  precision = len(S_emb.intersection(S_wn)) / (len(S_emb) + 1e-7)
  recall = len(S_emb.intersection(S_wn)) / (len(S_wn) + 1e-7)
  f1_score = 2 * precision * recall / (precision + recall + 1e-7)
  if f1_score > best_f1_score:
    best_f1_score = f1_score
    optim = cosine_threshold
    

print('Optim threshold {} achieved {} F1-score'.format(optim, best_f1_score))

Optim threshold 0.5499999999999999 achieved 0.020395877355827204 F1-score
