# Evaluating Word Representations

## Getting the pre-trained word vectors

In [5]:
import numpy as np
from scipy.stats import pearsonr
from scipy.stats import spearmanr
from sklearn.metrics.pairwise import cosine_similarity
from scipy import spatial


bow2_file = 'bow2.words'
bow5_file = 'bow5.words'
dependency_file = 'deps.words'

#read the files
with open(bow2_file) as f:
    bow2 = f.readlines()
    
with open(bow5_file) as f:
    bow5 = f.readlines()
    
with open(dependency_file) as f:
    dependency = f.readlines()
  

In [6]:
print(len(bow2))
print(len(bow5))
print(len(dependency))

183870
183870
174015


In [7]:
word2 = []
word5 = []
word_deps = []

vector2 = dict()
vector5 = dict()
vector_deps = dict()

In [8]:
#fill in the word arrays and the respective vector arrays

#bow2
for l in bow2:
    #split line
    split_line = str.split(l)
    
    #first one is the word - string
    word = split_line[0]
    word2.append(word)
    
    #remaining elements are the embedding values - float
    vector = []
    for vd in split_line[1:]:
        vector.append(float(vd))
        
    vector2[word] = vector
    

In [9]:
#bow5
for l in bow5:
    #split line
    split_line = str.split(l)
    
    #first one is the word - string
    word = split_line[0]
    word5.append(word)
    
    #remaining elements are the embedding values - float
    vector = []
    for vd in split_line[1:]:
        vector.append(float(vd))
        
    vector5[word] = vector

In [10]:
#dependency
for l in dependency:
    #split line
    split_line = str.split(l)
    
    #first one is the word - string
    word = split_line[0]
    word_deps.append(word)
    
    #remaining elements are the embedding values - float
    vector = []
    for vd in split_line[1:]:
        vector.append(float(vd))
        
    vector_deps[word] = vector

In [11]:
print(word2[0])
print(word5[0])
print(word_deps[0])


the
the
the


In [12]:
print(len(vector2['the']))
print(len(vector5['the']))
print(len(vector_deps['the']))


300
300
300


## Word Similarity Task

In [9]:
#MEN/agreement/elias-men-ratings.txt' dataset read
word_pairs = {}

filename = 'MEN/agreement/elias-men-ratings.txt'

with open(filename) as f:
    lines = f.readlines()
    
for line in lines:
    line = line.split()
    word1 = line[0]
    word2 = line[1]
    score = line[2]
    word_pairs[(word1,word2)] = score
    
keys = list(word_pairs.keys())

In [10]:
keys = list(word_pairs.keys())

In [11]:
#vector_deps and cosine similarity comparison
scores = []
cosines = []

for key in keys:
    word1 = key[0]
    word2 = key[1]
    score = word_pairs[(word1,word2)]
    score = (float(score) - 1)/6
    if word1 in vector_deps.keys() and word2 in vector_deps.keys():
        word1_embed = vector_deps[word1]
        word2_embed = vector_deps[word2]
        #cosine = cosine_similarity(word1_embed, word1_embed)
        cosine_similarity = 1 - spatial.distance.cosine(word1_embed, word2_embed)
        #print("Score : ",score)
        #print("Cosine similarity: ",cosine_similarity)
        scores.append(score)
        cosines.append(cosine_similarity)
        

In [12]:
#vector2 and cosine similarity comparison


scores = []
cosines = []

for key in keys:
    word1 = key[0]
    word2 = key[1]
    score = word_pairs[(word1,word2)]
    score = (float(score) - 1)/6 #normalization
    if word1 in vector2.keys() and word2 in vector2.keys():
        word1_embed = vector2[word1]
        word2_embed = vector2[word2]
        #cosine = cosine_similarity(word1_embed, word1_embed)
        cosine_similarity = 1 - spatial.distance.cosine(word1_embed, word2_embed)
        print("Score : ",score)
        print("Cosine similarity: ",cosine_similarity)
        print(word1, word2)
        print("\n")
        scores.append(score)
        cosines.append(cosine_similarity)

        

Score :  0.0
Cosine similary:  0.113985907431
hamster party


Score :  0.8333333333333334
Cosine similary:  0.456945987653
bed sleep


Score :  0.8333333333333334
Cosine similary:  0.617271328208
raspberry strawberry


Score :  0.6666666666666666
Cosine similary:  0.270836771023
cooking fruit


Score :  0.5
Cosine similary:  0.404629717368
downtown shopping


Score :  0.16666666666666666
Cosine similary:  0.102363906632
drug wolf


Score :  0.8333333333333334
Cosine similary:  0.263234470686
colorful outfit


Score :  0.6666666666666666
Cosine similary:  0.228039486373
burger mac


Score :  0.6666666666666666
Cosine similary:  0.316650201947
frost weather


Score :  0.16666666666666666
Cosine similary:  0.357448838882
arch concrete


Score :  0.16666666666666666
Cosine similary:  0.15165121894
cactus leg


Score :  0.8333333333333334
Cosine similary:  0.408632063131
rice sushi


Score :  0.6666666666666666
Cosine similary:  0.357273494506
ceiling room


Score :  0.0
Cosine similary:  0

Cosine similary:  0.133970138952
leather swan


Score :  0.3333333333333333
Cosine similary:  0.198486692875
ice sheep


Score :  0.5
Cosine similary:  0.447666365884
room staircase


Score :  0.3333333333333333
Cosine similary:  0.136753607691
family female


Score :  0.0
Cosine similary:  0.149805853709
flight whiskers


Score :  0.16666666666666666
Cosine similary:  0.255510601194
construction sailing


Score :  0.0
Cosine similary:  0.22872190527
rice tickets


Score :  0.0
Cosine similary:  0.22375246451
pin wing


Score :  0.3333333333333333
Cosine similary:  0.316310751724
decoration wood


Score :  0.6666666666666666
Cosine similary:  0.337596247578
feathers peacock


Score :  0.8333333333333334
Cosine similary:  0.408279221499
marble statue


Score :  0.0
Cosine similary:  0.196701162504
explosion stencil


Score :  0.0
Cosine similary:  0.117229022498
club cone


Score :  0.0
Cosine similary:  0.148193950509
handwriting pigs


Score :  1.0
Cosine similary:  0.352485336838
bar


Score :  0.5
Cosine similary:  0.39642652832
feline reptiles


Score :  0.5
Cosine similary:  0.16794070458
pool relaxed


Score :  0.6666666666666666
Cosine similary:  0.2125512077
daisy purple


Score :  0.16666666666666666
Cosine similary:  0.584044562785
fabric wool


Score :  0.5
Cosine similary:  0.356734001035
cafe dinner


Score :  0.8333333333333334
Cosine similary:  0.537866882849
flamingo stork


Score :  0.3333333333333333
Cosine similary:  0.323478337092
bike rally


Score :  0.6666666666666666
Cosine similary:  0.28919936274
blurred eyes


Score :  0.0
Cosine similary:  0.171360026174
smile view


Score :  0.0
Cosine similary:  0.241880681487
metro television


Score :  0.3333333333333333
Cosine similary:  0.204776028928
concrete pin


Score :  0.16666666666666666
Cosine similary:  0.194002604297
abandoned soldiers


Score :  1.0
Cosine similary:  0.501183330124
hair haircut


Score :  0.6666666666666666
Cosine similary:  0.183877446127
bay swim


Score :  0.666666666666

Cosine similary:  0.229770463912
bedroom feline


Score :  0.0
Cosine similary:  0.231619951178
socks white


Score :  0.0
Cosine similary:  0.14571783027
feline nuts


Score :  0.8333333333333334
Cosine similary:  0.546226979384
beef cattle


Score :  0.0
Cosine similary:  0.15276703126
alley punk


Score :  0.0
Cosine similary:  0.216229418395
happy sitting


Score :  0.3333333333333333
Cosine similary:  0.157592098104
guy sitting


Score :  0.3333333333333333
Cosine similary:  0.329597156604
air dew


Score :  0.0
Cosine similary:  0.00745840070648
ad track


Score :  0.6666666666666666
Cosine similary:  0.71777042831
guitar piano


Score :  0.0
Cosine similary:  0.195584560337
desert roof


Score :  0.0
Cosine similary:  0.115626384975
lighting person


Score :  0.16666666666666666
Cosine similary:  0.362686374002
bacon sweet


Score :  0.8333333333333334
Cosine similary:  0.244243682094
daisy plant


Score :  0.5
Cosine similary:  0.364973848899
amphibians pelican


Score :  0.333

Score :  0.0
Cosine similary:  0.0582974508148
posted tulip


Score :  0.0
Cosine similary:  0.166023643399
party scooter


Score :  0.5
Cosine similary:  0.256082435904
river scenery


Score :  0.6666666666666666
Cosine similary:  0.269184820121
footprint tiles


Score :  0.5
Cosine similary:  0.234153505163
fun night


Score :  0.8333333333333334
Cosine similary:  0.568738922916
chicken lamb


Score :  0.3333333333333333
Cosine similary:  0.2613227346
day lunch


Score :  0.0
Cosine similary:  0.338220388689
diamond stencil


Score :  0.8333333333333334
Cosine similary:  0.416192441007
guitar musicians


Score :  0.8333333333333334
Cosine similary:  0.473928650498
green violet


Score :  0.8333333333333334
Cosine similary:  0.473230600423
dessert sweet


Score :  0.6666666666666666
Cosine similary:  0.20934471255
bucket curve


Score :  0.16666666666666666
Cosine similary:  0.357237333991
green shade


Score :  0.16666666666666666
Cosine similary:  0.249313318942
bed furniture


Scor

Score :  1.0
Cosine similary:  0.678644128069
holiday vacation


Score :  0.5
Cosine similary:  0.613570003601
porch staircase


Score :  0.16666666666666666
Cosine similary:  0.309471602249
dirty smile


Score :  0.5
Cosine similary:  0.18354617737
decoration tulip


Score :  0.6666666666666666
Cosine similary:  0.434497820818
baby mother


Score :  0.0
Cosine similary:  0.209218237342
handle terrier


Score :  0.8333333333333334
Cosine similary:  0.517195534259
cigarette smoking


Score :  0.5
Cosine similary:  0.505521677996
carrots sunflower


Score :  0.3333333333333333
Cosine similary:  0.145287832406
book smoking


Score :  0.5
Cosine similary:  0.309602665846
canine pets


Score :  0.8333333333333334
Cosine similary:  0.540593314166
maple oak


Score :  0.3333333333333333
Cosine similary:  0.208415056891
grass ruins


Score :  0.0
Cosine similary:  0.13607191425
bottle construction


Score :  0.8333333333333334
Cosine similary:  0.515740788496
pink violet


Score :  0.833333333

Score :  0.5
Cosine similary:  0.156022835202
face guy


Score :  0.8333333333333334
Cosine similary:  0.397127384921
ruins stone


Score :  0.16666666666666666
Cosine similary:  0.194058097903
cute music


Score :  0.5
Cosine similary:  0.373636520793
lake shore


Score :  0.0
Cosine similary:  0.0976042920056
furniture nature


Score :  0.0
Cosine similary:  0.118423036569
boys gate


Score :  0.5
Cosine similary:  0.738119418464
reptiles rodents


Score :  0.0
Cosine similary:  0.279564539894
rope train


Score :  0.5
Cosine similary:  0.263131286009
cafe relaxed


Score :  0.6666666666666666
Cosine similary:  0.533063581689
bread salad


Score :  0.3333333333333333
Cosine similary:  0.352319114007
foliage purple


Score :  0.6666666666666666
Cosine similary:  0.440580203596
cooking soup


Score :  0.16666666666666666
Cosine similary:  0.385481625875
feathers grey


Score :  0.0
Cosine similary:  0.153913784073
port sweet


Score :  0.6666666666666666
Cosine similary:  0.51608329121

In [13]:
#vector5 and cosine similarity comparison

scores = []
cosines = []

for key in keys:
    word1 = key[0]
    word2 = key[1]
    score = word_pairs[(word1,word2)]
    score = (float(score) - 1)/6 #normalization
    if word1 in vector5.keys() and word2 in vector5.keys():
        word1_embed = vector5[word1]
        word2_embed = vector5[word2]
        #cosine = cosine_similarity(word1_embed, word1_embed)
        cosine_similarity = 1 - spatial.distance.cosine(word1_embed, word2_embed)
        #print("Score : ",score)
        #print("Cosine similarity: ",cosine_similarity)
        #print(word1, word2)
        #print("\n")
        scores.append(score)
        cosines.append(cosine_similarity)
        
print(pearsonr(scores, cosines))
print(spearmanr(scores, cosines))

(0.68735351705330894, 0.0)
SpearmanrResult(correlation=0.69935776173920328, pvalue=0.0)


In [14]:
# Simlex Dataset Read

word_pair_list = {}

filename = 'SimLex-999/SimLex-999.txt'

with open(filename) as f:
    lines = f.readlines()
    
lines = lines[1:]
for line in lines:
    line = line.split()
    word1 = line[0]
    word2 = line[1]
    score = line[3]
    word_pair_list[(word1,word2)] = score
    
keys = list(word_pair_list.keys())

In [15]:
word_pair_list[('old', 'new')]

'1.58'

In [16]:
#vector5 and cosine similarity comparison

from scipy import spatial

scores = []
cosines = []

for key in keys:
    word1 = key[0]
    word2 = key[1]
    score = word_pair_list[(word1,word2)]
    score = float(score)/10 #normalization
    if word1 in vector5.keys() and word2 in vector5.keys():
        word1_embed = vector5[word1]
        word2_embed = vector5[word2]
        #cosine = cosine_similarity(word1_embed, word1_embed)
        cosine_similarity = 1 - spatial.distance.cosine(word1_embed, word2_embed)
        #print("Score : ",score)
        #print("Cosine similary: ",cosine_similarity)
        #print(word1, word2)
        #print("\n")
        scores.append(score)
        cosines.append(cosine_similarity)
        
print(pearsonr(scores, cosines))
print(spearmanr(scores, cosines))

(0.37560059706687149, 8.6074105722983937e-35)
SpearmanrResult(correlation=0.36739613669787896, pvalue=2.9775781067162087e-33)


In [17]:
spatial.distance.cosine(vector5['old'], vector5['fresh'])

0.93695011658868554

In [19]:
d = np.asarray(vector5['dublin'])
f = np.asarray(vector5['ireland'])

offset = d-f
offset = offset/np.linalg.norm(offset)

In [20]:
athens = np.asarray(vector5['athens'])

In [24]:
greece = athens + offset

In [25]:
min_dist = 10000

closest = ''

for key in vector5.keys():
    
    word_embed = vector5[key]
    cosine_similarity = 1 - spatial.distance.cosine(greece, word_embed)

    if cosine_similarity < min_dist:
        min_dist = cosine_similarity
        closest = key


print(closest)

consequential


In [26]:
diff = np.asarray(vector5['athens']) - np.asarray(vector5['greece'])
np.linalg.norm(offset - diff)

0.95687822040077763

In [27]:
x = offset - diff
x = np.absolute(x)
sum(x)

13.289640543063458

## Word Analogy Task

In [28]:
analogy_file = 'questions-words.txt'

#read the files
with open(analogy_file) as f:
    analogies = f.readlines()
    
for a in analogies:
    if ':' in a:
        print(a)

: capital-common-countries

: capital-world

: currency

: city-in-state

: family

: gram1-adjective-to-adverb

: gram2-opposite

: gram3-comparative

: gram4-superlative

: gram5-present-participle

: gram6-nationality-adjective

: gram7-past-tense

: gram8-plural

: gram9-plural-verbs



In [55]:
# all types of analogies will be stored separately

capital_common_countries = []
capital_world = []
currency = []
city_in_state = []
family = []
gram1_adjective_to_adverb = []
gram2_opposite = []
gram3_comparative = []
gram4_superlative = []
gram5_present_participle = []
gram6_nationality_adjective = []
gram7_past_tense = []
gram8_plural = []
gram9_plural_verbs = []

#while reading into the arrays, keep the current array
current = []

for i in range(len(analogies)):
    #one line -> question-answer pair
    a = analogies[i]
    
    if ': capital-common-countries' in a:
        continue
    elif ': capital-world' in a:

        #store current array
        capital_common_countries = current

        #reset current
        current = []  
        continue
    elif ': currency' in a:

        #store current array
        capital_world = current

        #reset current
        current = []  
        continue
    elif ': city-in-state' in a:

        #store current array
        currency = current

        #reset current
        current = []  
        continue

    elif ': family' in a:

        #store current array
        city_in_state = current

        #reset current
        current = [] 
        continue

    elif ': gram1-adjective-to-adverb' in a:

        #store current array
        family = current

        #reset current
        current = [] 
        continue
    elif ': gram2-opposite' in a:

        #store current array
        gram1_adjective_to_adverb = current

        #reset current
        current = [] 
        continue
    elif ': gram3-comparative'in a:

        #store current array
        gram2_opposite = current

        #reset current
        current = [] 
        continue
    elif ': gram4-superlative' in a:

        #store current array
        gram3_comparative = current

        #reset current
        current = [] 
        continue
    elif ': gram5-present-participle' in a:

        #store current array
        gram4_superlative = current

        #reset current
        current = [] 
        continue
    elif ': gram6-nationality-adjective' in a:

        #store current array
        gram5_present_participle = current

        #reset current
        current = [] 
        continue
    elif ': gram7-past-tense' in a:

        #store current array
        gram6_nationality_adjective = current

        #reset current
        current = [] 
        continue
    elif ': gram8-plural' in a:

        #store current array
        gram7_past_tense = current

        #reset current
        current = [] 
        continue
    elif ': gram9-plural-verbs' in a:

        #store current array
        gram8_plural = current

        #reset current
        current = [] 
        continue
    current.append(a)
    
    if i == len(analogies)-1:
        #last item in the file
        gram9_plural_verbs = current
        


In [56]:
print(len(capital_common_countries))

506


In [57]:
currency[0]

'Algeria dinar Angola kwanza\n'

In [54]:
#TODO separate the question-answer pairs, then use offset and cosine sim for all
#use s.lower() 
#ax = str.split(a, " ")
#ax[3] = str.split(ax[3], "\n")[0] #get rid of newline
#print(ax)

## Clustering Word Vectors

In [13]:
from sklearn.cluster import KMeans

#read 2000 frequent nouns
freq_file = '2000_nouns_sorted.txt'

#read the files
with open(freq_file) as f:
    freq_nouns = [line.strip() for line in f]

In [26]:
len(freq_nouns)

1999

In [27]:
freq_nouns[:10]

['dollar',
 'formula',
 'pound',
 'quote',
 'ability',
 'absence',
 'abuse',
 'acceptance',
 'access',
 'accident']

In [25]:
vector_set = []

for fn in freq_nouns:
    if fn in vector2:
        vector_set.append(vector2[fn])
    else:
        print(fn)


# kmeans_model = KMeans(n_clusters=2, random_state=0) 
# kmeans_fit = kmeans_model.fit(X)

fig.


In [22]:
len(vector_set)

1998