# Word Similarity

I will be comparing my methods against a popular dataset of word similarities called WordSimilarity-353 (WordSim-353). The dataset can be downloaded <a href="http://www.cs.technion.ac.il/~gabr/resources/data/wordsim353/wordsim353.zip">here</a>. The file I will be using is called *combined.tab*. Except for the header (which should be stripped out), the file is tab-formatted, with the first two columns corresponding to two words and the third column representing a human-annotated similarity between the two words.

I will filter this dataset to generate a smaller test set where I will evaluate word similarity methods.

The first filtering is based on document frequencies in the Brown corpus, in order to remove rare words. I will treat the <i>paragraphs</i> of the Brown corpus as the "documents"; each paragraph is stored as a set of its words, which are lower-cased and lemmatized before they are added to the set. Then, using the information in this corpus, I will calculate document frequencies and remove from the test set any word pairs where at least one of the two words has a document frequency of less than 10 in this corpus.

The second filtering is based on words with highly ambiguous senses and involves using the NLTK interface to WordNet. Here, I remove any words which do not have a *single primary sense*. I define a single primary sense as either having only one sense (i.e. only one synset), or having a most common sense whose count (as provided by the WordNet `count()` method for the lemmas associated with a synset) is at least five and at least five times larger than that of the next most common sense. I also remove any words where the primary sense is not a noun (this information is also in the synset). The synset corresponding to this primary sense is reused in the next section (the code looks it up again as the first synset returned by WordNet rather than storing it in a dictionary). I will remove any word pairs from the test set where at least one of the words does not have a single primary sense or where the single primary sense is not a noun.

In [1]:
# load combined.tab into a dictionary
import csv
from nltk.corpus import wordnet as wn
import nltk
from nltk.corpus import brown
from nltk import FreqDist
import numpy as np

# Module-level placeholders; each one is overwritten further down in this cell
# once the helper functions have been defined.
# (word_1, word_2) -> human similarity score, as loaded from combined.tab
dataset={}
# `dataset` after the document-frequency filter
dataset_filtered_1={}
# `dataset_filtered_1` after the single-primary-noun-sense filter
dataset_filtered_2={}
document_sets=[]   # one set of lower-cased, lemmatized words per Brown paragraph

# path of the tab-separated word-similarity file read by getDatasetDictionary()
file_name='combined.tab'



def lemmatize_word(word):
    """Return the WordNet lemma of ``word`` treated as a noun.

    Only the noun part of speech is tried; no verb, adjective or adverb
    fallback is attempted. A fresh ``WordNetLemmatizer`` is built per call.
    """
    return nltk.stem.wordnet.WordNetLemmatizer().lemmatize(word, wn.NOUN)


def single_primary_sense(word):
    """Tell whether ``word`` has a single dominant WordNet sense.

    Returns True when WordNet lists exactly one synset for the word, or when
    the usage count of the first listed synset is at least 5 and at least
    five times the usage count of the second listed synset. Returns False
    otherwise, including when the word has no synsets at all.

    Only the first two synsets returned by ``wn.synsets`` are compared.
    NOTE(review): this assumes WordNet lists the most common senses first --
    confirm against the NLTK WordNet documentation.
    """
    senses = wn.synsets(word)
    if not senses:
        return False
    if len(senses) == 1:
        return True

    def usage_count(synset):
        # Sum the counts of this synset's lemmas whose name matches the word.
        return sum(
            lemma.count()
            for lemma in synset.lemmas()
            if lemma.name().split()[-1] == word
        )

    primary_count = usage_count(senses[0])
    runner_up_count = usage_count(senses[1])
    return primary_count >= 5 and primary_count >= 5 * runner_up_count



def getDatasetDictionary(path=None):
    """Load the word-similarity file into a dictionary.

    Args:
        path: Optional path of the tab-separated file. When omitted, the
            module-level ``file_name`` is used, as before.

    Returns:
        dict mapping ``(word_1, word_2)`` tuples (first two columns) to the
        similarity score in the third column, converted to ``float``.

    The first row is treated as a header and discarded. An empty file yields
    an empty dictionary, and completely blank rows are skipped.
    """
    source = file_name if path is None else path
    dataset_dict = {}
    # newline='' lets the csv module do its own newline handling.
    with open(source, 'r', newline='') as handle:
        combined_reader = csv.reader(handle, delimiter='\t')
        next(combined_reader, None)  # discard the header; tolerate an empty file
        for row in combined_reader:
            if not row:
                continue  # blank line, e.g. a trailing newline at end of file
            dataset_dict[row[0], row[1]] = float(row[2])

    return dataset_dict


def get_documents_set():
    """Build one word set per paragraph of the Brown corpus.

    Every token of every sentence in a paragraph is lower-cased, passed
    through ``lemmatize_word`` and collected into a set.

    Returns:
        list of sets, in the order ``brown.paras()`` yields the paragraphs.
    """
    return [
        {
            lemmatize_word(token.lower())
            for sentence in paragraph
            for token in sentence
        }
        for paragraph in brown.paras()
    ]

def has_primary_noun_synset(word):
    """Tell whether the first WordNet synset of ``word`` is a noun synset.

    Returns True when the first synset over all parts of speech equals the
    first noun-only synset; False otherwise, or when either list is empty.
    """
    all_senses = wn.synsets(word)
    noun_senses = wn.synsets(word, wn.NOUN)
    if not all_senses or not noun_senses:
        return False
    return bool(all_senses[0] == noun_senses[0])

# document frequency for a word x is a number of documents in which x appears
def get_term_document_frequency(documents_set):
    """Count, for every word, the number of documents that contain it.

    Args:
        documents_set: iterable of documents, each an iterable of hashable
            words (sets in this notebook, but lists are accepted too).

    Returns:
        dict mapping each word to its document frequency.
    """
    term_document_frequency = {}
    for document in documents_set:
        # De-duplicate so a word repeated inside one document counts once;
        # otherwise list-shaped documents would yield term frequencies.
        for word in set(document):
            term_document_frequency[word] = term_document_frequency.get(word, 0) + 1
    return term_document_frequency

def get_dataset_filtered_1(ds, term_document_frequency):
    """Keep only the word pairs whose two words are both frequent enough.

    Args:
        ds: dict mapping ``(word_1, word_2)`` to a similarity score.
        term_document_frequency: dict mapping a word to its document
            frequency; words missing from it count as frequency 0.

    Returns:
        New dict with the entries of ``ds`` where both words, lower-cased
        (but not lemmatized), have a document frequency of at least 10.
    """
    kept = {}
    for pair, score in ds.items():
        df_first = term_document_frequency.get(pair[0].lower(), 0)
        df_second = term_document_frequency.get(pair[1].lower(), 0)
        # The pair survives only if its rarer word still reaches the threshold.
        if min(df_first, df_second) >= 10:
            kept[pair] = score
    return kept

def get_dataset_filtered_2(ds):
    """Keep only the pairs whose words each have a single primary noun sense.

    Args:
        ds: dict mapping ``(word_1, word_2)`` to a similarity score.

    Returns:
        New dict with the entries of ``ds`` for which both words pass
        ``single_primary_sense`` and both pass ``has_primary_noun_synset``.
    """
    return {
        pair: score
        for pair, score in ds.items()
        if single_primary_sense(pair[0])
        and single_primary_sense(pair[1])
        and has_primary_noun_synset(pair[0])
        and has_primary_noun_synset(pair[1])
    }

# Load the gold-standard pairs: (word_1, word_2) -> human similarity score.
dataset=getDatasetDictionary()

# Build the per-paragraph word sets of the Brown corpus, derive document
# frequencies from them, then apply the two filters in sequence. The names
# assigned here are module-level and are read again by the later cells.
document_sets=get_documents_set()
term_document=get_term_document_frequency(document_sets)
dataset_filtered_1=get_dataset_filtered_1(dataset,term_document)
dataset_filtered_2=get_dataset_filtered_2(dataset_filtered_1)

# Print the number of pairs left in the filtered test set, then display the
# pairs themselves (original note: 10 < total < 50).
print(len(dataset_filtered_2))
dataset_filtered_2



38


{('aluminum', 'metal'): 7.83,
 ('baby', 'mother'): 7.85,
 ('brother', 'monk'): 6.27,
 ('canyon', 'landscape'): 7.53,
 ('car', 'automobile'): 8.94,
 ('century', 'year'): 7.59,
 ('coast', 'forest'): 3.15,
 ('coast', 'hill'): 4.38,
 ('coast', 'shore'): 9.1,
 ('computer', 'laboratory'): 6.78,
 ('doctor', 'personnel'): 5.0,
 ('drink', 'car'): 3.04,
 ('drink', 'ear'): 1.31,
 ('drink', 'mother'): 2.65,
 ('equipment', 'maker'): 5.91,
 ('hotel', 'reservation'): 8.03,
 ('journey', 'car'): 5.85,
 ('journey', 'voyage'): 9.29,
 ('luxury', 'car'): 6.47,
 ('money', 'cash'): 9.08,
 ('monk', 'slave'): 0.92,
 ('phone', 'equipment'): 7.13,
 ('planet', 'people'): 5.75,
 ('president', 'medal'): 3.0,
 ('professor', 'doctor'): 6.62,
 ('psychology', 'doctor'): 6.42,
 ('psychology', 'fear'): 6.85,
 ('psychology', 'health'): 7.23,
 ('psychology', 'mind'): 7.69,
 ('psychology', 'science'): 6.71,
 ('school', 'center'): 3.44,
 ('soap', 'opera'): 7.94,
 ('stock', 'egg'): 1.81,
 ('stock', 'phone'): 1.62,
 ('train', 

Let's create similarity scores for the pairs of words in the test set. The first of these is the Wu-Palmer score, derived from the hypernym relationships in WordNet and calculated using the primary sense of each word derived above.

In [2]:
# Wu-Palmer similarity (derived from WordNet hypernym relationships) for every
# pair of the filtered test set, computed between the first synset WordNet
# returns for each word, which this notebook treats as the primary sense.
dataset_wu_palmer = {
    pair: wn.synsets(pair[0])[0].wup_similarity(wn.synsets(pair[1])[0])
    for pair in dataset_filtered_2
}

# display the word pair -> similarity dictionary
dataset_wu_palmer

{('aluminum', 'metal'): 0.9333333333333333,
 ('baby', 'mother'): 0.5,
 ('brother', 'monk'): 0.5714285714285714,
 ('canyon', 'landscape'): 0.3333333333333333,
 ('car', 'automobile'): 1.0,
 ('century', 'year'): 0.8333333333333334,
 ('coast', 'forest'): 0.16666666666666666,
 ('coast', 'hill'): 0.6666666666666666,
 ('coast', 'shore'): 0.9090909090909091,
 ('computer', 'laboratory'): 0.35294117647058826,
 ('doctor', 'personnel'): 0.13333333333333333,
 ('drink', 'car'): 0.1111111111111111,
 ('drink', 'ear'): 0.13333333333333333,
 ('drink', 'mother'): 0.11764705882352941,
 ('equipment', 'maker'): 0.5,
 ('hotel', 'reservation'): 0.375,
 ('journey', 'car'): 0.09523809523809523,
 ('journey', 'voyage'): 0.8571428571428571,
 ('luxury', 'car'): 0.1111111111111111,
 ('money', 'cash'): 0.8,
 ('monk', 'slave'): 0.6666666666666666,
 ('phone', 'equipment'): 0.875,
 ('planet', 'people'): 0.18181818181818182,
 ('president', 'medal'): 0.11764705882352941,
 ('professor', 'doctor'): 0.5,
 ('psychology', 'doc

Now let's calculate Positive PMI (PPMI) for the word pairs using statistics derived from the Brown corpus.

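The PMI between a target word $w$ and a context $c$ is <br/><br/>
$\mathrm{pmi}(w,c) = \log_2 \frac{P(w,c)}{P(w)\,P(c)}$ <br/><br/>
and PPMI clips negative values to zero: $\mathrm{ppmi}(w,c) = \max(\mathrm{pmi}(w,c),\, 0)$.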

In [3]:
import math


def get_ppmi_score(x, y, documents=None):
    """Return the positive pointwise mutual information (PPMI) of two words.

    Probabilities are estimated as document fractions: P(x) is the share of
    documents containing ``x``, P(y) likewise, and P(x, y) is the share of
    documents containing both.

    Args:
        x: first word.
        y: second word (may equal ``x``).
        documents: optional sequence of documents, each a container of words
            supporting ``in``. Defaults to the module-level ``document_sets``.

    Returns:
        ``max(log2(P(x, y) / (P(x) * P(y))), 0)`` as a float, or ``0.0`` when
        there are no documents or any of the three probabilities is zero.
    """
    if documents is None:
        documents = document_sets

    total_documents = len(documents)
    if total_documents == 0:
        return 0.0

    count_x = 0
    count_y = 0
    count_x_y = 0
    for document in documents:
        # Membership tests count each document at most once per word and,
        # unlike an if/elif scan over the words, still count y when x == y.
        x_in_document = x in document
        y_in_document = y in document
        if x_in_document:
            count_x += 1
        if y_in_document:
            count_y += 1
        if x_in_document and y_in_document:
            count_x_y += 1

    if count_x == 0 or count_y == 0 or count_x_y == 0:
        return 0.0

    p_x_y = count_x_y / total_documents
    p_x = count_x / total_documents
    p_y = count_y / total_documents

    # A negative log means the words co-occur less often than chance; PPMI
    # clips that to zero.
    return max(math.log(p_x_y / (p_x * p_y), 2), 0.0)


# PPMI for every pair of the filtered test set. Both words are lower-cased and
# lemmatized first because the Brown documents they are matched against were
# built from lower-cased, lemmatized words.
dataset_ppmi = {
    word_pair: get_ppmi_score(
        lemmatize_word(word_pair[0].lower()),
        lemmatize_word(word_pair[1].lower()),
    )
    for word_pair in dataset_filtered_2
}

dataset_ppmi

{('aluminum', 'metal'): 5.195660721157932,
 ('baby', 'mother'): 3.1068514542000756,
 ('brother', 'monk'): 2.8992677183777067,
 ('canyon', 'landscape'): 0.0,
 ('car', 'automobile'): 3.284928059255019,
 ('century', 'year'): 0.85521193298008,
 ('coast', 'forest'): 3.0505076829814297,
 ('coast', 'hill'): 1.2130606957673897,
 ('coast', 'shore'): 4.630747773460183,
 ('computer', 'laboratory'): 0.0,
 ('doctor', 'personnel'): 2.2186218696012423,
 ('drink', 'car'): 0.8109968709226062,
 ('drink', 'ear'): 0.0,
 ('drink', 'mother'): 0.7957301142692957,
 ('equipment', 'maker'): 4.283313403192924,
 ('hotel', 'reservation'): 2.891047211572738,
 ('journey', 'car'): 0.0,
 ('journey', 'voyage'): 0.0,
 ('luxury', 'car'): 2.272328022475385,
 ('money', 'cash'): 2.5276424807092166,
 ('monk', 'slave'): 0.0,
 ('phone', 'equipment'): 0.0,
 ('planet', 'people'): 0.4092477799069862,
 ('president', 'medal'): 0.0,
 ('professor', 'doctor'): 0.0,
 ('psychology', 'doctor'): 3.5625762708186035,
 ('psychology', 'fear')

Next, I will derive similarity scores using the LSA method, i.e. apply SVD and truncate to get a dense vector for each word, and then use the cosine similarity between the two vectors of each word pair. I will use `TruncatedSVD` from scikit-learn to produce dense vectors of length 500, and then use cosine similarity to produce similarities for the word pairs.

In [4]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.decomposition import TruncatedSVD
from scipy.spatial.distance import cosine


# bag of words to use it with every document
def get_BOW(text):
    """Return a bag-of-words dict for ``text``.

    Args:
        text: iterable of word strings.

    Returns:
        dict mapping each lower-cased word to the number of times it occurs.
    """
    counts = {}
    for token in text:
        lowered = token.lower()
        try:
            counts[lowered] += 1
        except KeyError:
            counts[lowered] = 1
    return counts

# One bag-of-words dictionary per Brown paragraph.
texts = [get_BOW(document) for document in document_sets]

# Term-document matrix: fit_transform gives documents x words, so the
# transpose has one row per word.
vectorizer = DictVectorizer()
brown_matrix = vectorizer.fit_transform(texts).T

# Truncated SVD turns every word row into a dense 500-dimensional vector.
svd = TruncatedSVD(n_components=500)
brown_matrix = svd.fit_transform(brown_matrix)


def _lsa_vector(word):
    """Return the dense LSA row of ``word`` from ``brown_matrix``.

    ``vectorizer.vocabulary_`` maps a feature name straight to its index, so
    no scan over the feature names is needed. The word is lower-cased because
    get_BOW lower-cases every feature name.

    Raises:
        KeyError: if the word is not part of the fitted vocabulary.
    """
    row = vectorizer.vocabulary_.get(word.lower())
    if row is None:
        raise KeyError('word not in the LSA vocabulary: %r' % (word,))
    return brown_matrix[row]


# Cosine similarity (1 - cosine distance) for every pair of the filtered set.
dataset_cosine_similarity = {}
for key in dataset_filtered_2:
    distance = cosine(_lsa_vector(key[0]), _lsa_vector(key[1]))
    dataset_cosine_similarity[key] = 1 - distance

dataset_cosine_similarity

{('aluminum', 'metal'): 0.23649550473014069,
 ('baby', 'mother'): 0.33243164151288696,
 ('brother', 'monk'): 0.074407863741779323,
 ('canyon', 'landscape'): 0.11213145179354456,
 ('car', 'automobile'): 0.34651907546962046,
 ('century', 'year'): 0.071588465579824456,
 ('coast', 'forest'): 0.10877911187910105,
 ('coast', 'hill'): 0.19527250376223726,
 ('coast', 'shore'): 0.39241131023582776,
 ('computer', 'laboratory'): 0.12970301058889433,
 ('doctor', 'personnel'): 0.047100581556278276,
 ('drink', 'car'): 0.10412288328340535,
 ('drink', 'ear'): 0.071306873593828479,
 ('drink', 'mother'): 0.069475474873495258,
 ('equipment', 'maker'): 0.27665719864367888,
 ('hotel', 'reservation'): 0.063667638924678194,
 ('journey', 'car'): -0.018161672088413905,
 ('journey', 'voyage'): 0.13919117050178054,
 ('luxury', 'car'): 0.10307725619342645,
 ('money', 'cash'): 0.1454837975098745,
 ('monk', 'slave'): -0.044018597810947524,
 ('phone', 'equipment'): 0.011789448079946419,
 ('planet', 'people'): 0.0329

Now I will derive a similarity score from word2vec vectors, using the Gensim interface. Check the Gensim word2vec tutorial for details on the API: https://radimrehurek.com/gensim/models/word2vec.html. My vectors will have the same number of dimensions as the LSA vectors (500), and training will run for 50 iterations.

In [5]:
import gensim

# copy the filtered dataset; its values are replaced by word2vec similarities
dataset_word2vec_similarity=dataset_filtered_2.copy()

# training sentences: every Brown sentence, lower-cased and lemmatized
brown_sents = [
    [lemmatize_word(word.lower()) for word in sent]
    for sent in brown.sents()
]

# gensim 4 renamed the Word2Vec keywords `size` -> `vector_size` and
# `iter` -> `epochs`; pick the spelling the installed version accepts so the
# cell runs on both the old and the new API.
if int(gensim.__version__.split('.')[0]) >= 4:
    w2v_params = {'vector_size': 500, 'epochs': 50}
else:
    w2v_params = {'size': 500, 'iter': 50}

# train the model: 500-dimensional vectors, 50 passes over the corpus
model = gensim.models.Word2Vec(brown_sents, workers=4, **w2v_params)

# similarity of the two word vectors of every pair
for key in dataset_word2vec_similarity:
    dataset_word2vec_similarity[key]=model.wv.similarity(key[0], key[1])

# display the similarities
dataset_word2vec_similarity


{('aluminum', 'metal'): 0.61249731059222601,
 ('baby', 'mother'): 0.24806951434605501,
 ('brother', 'monk'): 0.18979339329331829,
 ('canyon', 'landscape'): 0.027469219789351452,
 ('car', 'automobile'): 0.24388802172656793,
 ('century', 'year'): 0.36964380627284538,
 ('coast', 'forest'): 0.21735449863030148,
 ('coast', 'hill'): 0.46980546329244566,
 ('coast', 'shore'): 0.43456961154981077,
 ('computer', 'laboratory'): 0.12874616994648233,
 ('doctor', 'personnel'): 0.14877272730870961,
 ('drink', 'car'): 0.17745198587180427,
 ('drink', 'ear'): 0.21360208008965711,
 ('drink', 'mother'): 0.18511475031732816,
 ('equipment', 'maker'): 0.23870496809580691,
 ('hotel', 'reservation'): 0.17949129613603371,
 ('journey', 'car'): 0.17750559599260754,
 ('journey', 'voyage'): 0.46100917017950932,
 ('luxury', 'car'): 0.10760435657073361,
 ('money', 'cash'): 0.30738483667156852,
 ('monk', 'slave'): 0.19510641745750062,
 ('phone', 'equipment'): -0.036527752741762606,
 ('planet', 'people'): 0.06288823842


Finally, let's compare all the similarities to the gold standard loaded and filtered in the first step. For this, I use the Pearson correlation coefficient (`pearsonr`), which is included in SciPy (`scipy.stats`).


In [6]:
from scipy.stats import pearsonr
import pandas as pd

# transform dict datasets to array in the same order
def get_arrays(similarity_dictionary, gold_standard=None):
    """Align gold-standard and model scores into two parallel arrays.

    Args:
        similarity_dictionary: dict mapping word pairs to a model score; it
            must contain every key of the gold standard.
        gold_standard: optional dict mapping word pairs to the reference
            score. Defaults to the module-level ``dataset_filtered_2``.

    Returns:
        ``(array_gold_std, array_similarity)``: two float arrays of equal
        length, ordered by the iteration order of the gold standard.

    Raises:
        KeyError: if a gold-standard pair is missing from
            ``similarity_dictionary``.
    """
    if gold_standard is None:
        gold_standard = dataset_filtered_2

    pairs = list(gold_standard)
    # Both arrays are sized from the gold standard, so extra keys in
    # similarity_dictionary can no longer produce arrays of different length.
    array_gold_std = np.fromiter(
        (gold_standard[pair] for pair in pairs), dtype=float, count=len(pairs)
    )
    array_similarity = np.fromiter(
        (similarity_dictionary[pair] for pair in pairs), dtype=float, count=len(pairs)
    )

    return array_gold_std, array_similarity

# get the coefficient 
def get_pearsonr(similarity_dictionary):
    """Return the Pearson correlation of a model's scores with the gold standard.

    The scores are aligned with ``get_arrays``; only the correlation
    coefficient from ``pearsonr`` is returned and the p-value is discarded.
    """
    aligned = get_arrays(similarity_dictionary)
    coefficient, _p_value = pearsonr(aligned[0], aligned[1])
    return coefficient

        
# Results table: one row per similarity method with its Pearson correlation
# against the gold standard. The frame is built in a single constructor call
# because DataFrame.append was removed in pandas 2.0.
df_sim = pd.DataFrame(
    [
        {'Model': 'Wu Plamer', 'Correlation': get_pearsonr(dataset_wu_palmer)},
        {'Model': 'PPMI', 'Correlation': get_pearsonr(dataset_ppmi)},
        {'Model': 'LSA', 'Correlation': get_pearsonr(dataset_cosine_similarity)},
        {'Model': 'Word2Vec', 'Correlation': get_pearsonr(dataset_word2vec_similarity)},
    ],
    columns=['Model', 'Correlation'],
)


df_sim


Unnamed: 0,Model,Correlation
0,Wu Plamer,0.584229
1,PPMI,0.357784
2,LSA,0.414849
3,Word2Vec,0.291747
