In [1]:
#import the necessary libraries

import scipy.stats as stats
import numpy as np
import pandas as pd

from tqdm import tqdm
from typing import Callable
from numpy.linalg import norm
from heapq import nlargest, nsmallest

# Part 1

The following code is used to count the number of times that word y appears in a context window with size $w$ centered at the word $x$, using the provided wiki-1percent.txt corpus. A context window contains up to $w$
words to either side of the center word, so it contains up to $2w + 1$ words in total (including the center
word). If y appears multiple times in a single window, we count each occurrence separately. We will use the
notation $#(x, y)$ to denote the count of the tuple $⟨x, y⟩$, i.e., the number of times that word $y$ appeared
within $w$ words to the left or right of $x$. Keep in mind that the tuples are ordered: the first item in the tuple
$⟨x, y⟩$ is the center word and the second item is the context word.

In [None]:
w = int(input('Window size: '))
V = set(x for x in input('Vocabulary: ').split())
V_c = set(x for x in input('Context vocabulary: ').split())


pairs = { }

for line in (open("wiki-1percent.txt", "r+")).readlines():
    #follow each line
    a = line.split()
    for i in range(len(a)):
        window = []
        #add words within range to window if word is in Vocabulary
        if (a[i]+'\n') in V and i >= w and i < (len(a) - w):
            window = a[i - w: i + w + 1]
            window.remove(window[w])
        elif (a[i]+'\n') in V and i < w:
            window = a[0: i + w + 1]
            window.remove(window[i])
        elif (a[i]+'\n') in V and i >= (len(a) - w):
            window = a[i - w: len(a)]
            window.remove(window[w])
        #remove words not in V_c from window and add to count
        for con in window:
            if (con+'\n') in V_c:
               try:
                   pairs[(a[i], con)] += 1
               except KeyError:
                   pairs[(a[i], con)] = 1


print(pairs)


After having computed these counts, we can use them to form word vectors for the words in $V$ . The
word vector for a word $x ∈ V$ is $|V_{C}|$−dimensional; that is, it has an entry for each context word y ∈ VC
with value $#(x, y)$. Therefore, the counts $#(\cdot, \cdot)$ can be viewed as representing word vectors for the
words in $V$ . Note that zero counts do not need to be represented explicitly in the counts $#(·, ·)$. You can
just store nonzero counts and assume that all other tuples have count zero. By doing so, word vectors
can be represented sparsely, saving memory and speeding up the computation of word similarities. We implement a cosine similarity function to obtain correlations between the obtained vectors and a function to provide an evaluation metric based on counting alone, IDF, or PMI.

In [5]:
#algorithm is the same as above
V = set(x for x in (open("vocab-15kws.txt", "r+")).read().splitlines())
V_c = set(x for x in (open("vocab-5k.txt", "r+")).read().splitlines())
corpus = open("wiki-1percent.txt", "r+").readlines()
lc = len(corpus)
def counting(w):
    pairs = {}
    for line in corpus:
        a = line.split()
        for i in range(len(a)):
            window = []
            if (a[i]) in V and i >= w and i < (len(a) - w):
                window = a[i - w: i + w + 1]
                window.remove(window[w])
            elif (a[i]) in V and i < w:
                window = a[0: i + w + 1]
                window.remove(window[i])
            elif (a[i]) in V and i >= (len(a) - w):
                window = a[i - w: len(a)]
                window.remove(window[w])
            for con in window:
                if (con) in V_c:
                   try:
                       pairs[(a[i], con)] += 1
                   except KeyError:
                       pairs[(a[i], con)] = 1
    return pairs
                


In [17]:
#creating the vectors where each component corresponds to the count with respect to another word
vectors_3 = {}
for voc in V:
    vectors_3[voc] = []
    for con in V_c:
        try:
            vectors_3[voc].append(pairs_3[(voc, con)])
        except KeyError:
              vectors_3[voc].append(0)
    

In [18]:
#cosine similarity (inner product) function
def cos_sim_3(voc1, voc2):
    if (norm(vectors_3[voc1]) != 0) and (norm(vectors_3[voc2]) != 0):
        cosine = np.dot(vectors_3[voc1], vectors_3[voc2])/(norm(vectors_3[voc1])*norm(vectors_3[voc2]))
    else:
        cosine = 0
    return cosine

In [19]:
counts_3 = {}
for voc in V:
    counts_3[voc] = {}
    for con in V_c:
        try:
            counts_3[voc][con] = { con : pairs_3[(voc,con)]}
        except KeyError:
            pass

In [28]:
#EvalWS evaluation metric
def EvalWS(hum_an: pd.DataFrame, counts: dict[str, dict[str, int]], 
           metric: Callable[[any], float], name: str, **kwargs) -> float:

    voc1 = hum_an.iloc[:,0]
    voc2 = hum_an.iloc[:,1]
    sim = np.zeros(len(hum_an))
    for i in range(len(hum_an)):
        if voc1[i] not in counts or voc2[i] not in counts:
            continue
        elif name == 'count': 
            sim[i] = metric(voc1[i], voc2[i])
        elif name == 'IDF': 
            sim[i] = metric(voc1[i], voc2[i])
        elif name == 'PMI': 
            sim[i] = metric(voc1[i], voc2[i])
    hum_scores = hum_an.iloc[:,2].values
    corr = stats.spearmanr(hum_scores, sim)
    return corr
    

In [21]:
EvalWS((pd.read_csv('men.txt', sep="\t", header=None, skiprows = [0])), counts_3, cos_sim_3, 'count') 

SignificanceResult(statistic=0.2251396048448754, pvalue=8.800788745595221e-36)

In [22]:
EvalWS((pd.read_csv('simlex-999.txt', sep="\t", header=None, skiprows = [0])), counts_3, cos_sim_3, 'count') 

SignificanceResult(statistic=0.05876135331349779, pvalue=0.0633756392544004)

# Part 2

One problem with using raw counts is that they place too much weight on extremely common words.
We’ll try to address this by using inverse document frequency (IDF). IDF was developed in the context
of document retrieval, but here we’re working with sentences so we’ll be applying the idea to sentences
rather than documents.
Let $S$ denote the set of sentences in the corpus. Then, instead of defining word vector entries using counts
$#(x, y)$, we will define them as follows. The word vector for a word $x ∈ V$ has an entry for each word
$y ∈ V_{C}$ with value given by: $$ #(x, y) \times |S|/|{s ∈ S : s contains y}| $$ The first term above is the “term frequency” (TF) and the second is the inverse of the “sentence frequency” for the context word. Note that if a word vector entry was previously zero when only using
counts, it is still zero when using the above formula, so we can still use sparse data structures.

In [21]:
#function to count frequency of word within sentence
def sen_freq(word):
    val = 0
    for line in corpus:
        if word in line:
            val += 1
    return val

In [22]:
sen_freqs = {}
for con in tqdm(V_c):
    sen_freqs[con] = sen_freq(con)

100%|███████████████████████████████████████| 5000/5000 [06:38<00:00, 12.55it/s]


In [25]:
#function to compute IDF with a window size of 3
def IDF_3(voc, con):
    if sen_freqs[con] != 0:
        val = (pairs_3[(voc,con)])*(lc/(sen_freqs[con]))
    else:
        val = 0
    return val
    

In [26]:
#computing vectors obtained
idf_3_vectors = {}
for voc in tqdm(counts_3):
    idf_3_vectors[voc] = []
    for con in V_c:
        try: 
            pairs_3[(voc,con)] *= 1
            idf_3_vectors[voc].append(IDF_3(voc,con))
        except KeyError:
            idf_3_vectors[voc].append(0)

100%|████████████████████████████████████| 15228/15228 [00:31<00:00, 485.18it/s]


In [27]:
def idf_sim_3(voc1, voc2):
    if (norm(idf_3_vectors[voc1]) != 0) and (norm(idf_3_vectors[voc2]) != 0):
        cosine = np.dot(idf_3_vectors[voc1], idf_3_vectors[voc2])/(norm(idf_3_vectors[voc1])*norm(idf_3_vectors[voc2]))
    else:
        cosine = 0
    return cosine

In [28]:
EvalWS((pd.read_csv('men.txt', sep="\t", header=None, skiprows = [0])), counts_3, idf_sim_3, 'IDF') 

SignificanceResult(statistic=0.44646592054501644, pvalue=6.145013710758384e-147)

In [29]:
EvalWS((pd.read_csv('simlex-999.txt', sep="\t", header=None, skiprows = [0])), counts_3, idf_sim_3, 'IDF') 

SignificanceResult(statistic=0.16111999057244053, pvalue=3.0614957547676104e-07)

# Part 3

While IDF helps by downweighting frequent words, the IDF for a context word is computed independently of the center word it is paired with. Next, we’ll define word vectors using pointwise mutual
information (PMI), which captures some information about the relationship between the center word
and context word in a single number.

We define two random variables: $X$ is a random variable representing the center word and $Y$ is a random
variable representing the context word. The set of possible events for $X$ is the vocabulary $V$ , while the set
of possible events for $Y$ is $V_{C}$. PMI is defined for the event pair $X = x$, $Y = y$ as follows:
$$ pmi(x, y) = \log_{2}p_{X,Y} (x, y)/p_{X}(x)p_{Y}(y) $$
where $p_{X}(x)$ and $p_{Y}(y)$ are the probability mass functions for $X$ and $Y$ , and $p_{X,Y} (x, y)$ is the joint distribution. 

We will compute PMIs by using the counts $#(x, y)$ that we computed above. First, we define the total
count N:

$$ N = \sum_{x}\sum_{y}#(x, y) $$

Then, we estimate the joint probabilities as follows:

$$ p_{X,Y} (x, y) = #(x, y)/N $$

$$p_{X}(x) = \sum_{y} #(x, y)/N $$
$$ p_{Y}(y) = \sum_{x} #(x, y)/N $$


Plugging this into our formula for PMI gives:
$$ pmi(x,y) = \log_{2}\frac{#(x,y) N}{sum_{x'}#(x',y)sum_{y'}#(x,y')} $$
We implement it here:

In [None]:
N = 0
#compute N as above
for x in tqdm(V):
    for y in V_c:
        try:
            N += pairs_3[(x, y)]
        except KeyError:
            N += 0

In [31]:
#compute mass probability for x
def prob_x(voc):
    val = 0
    for con in V_c:
        try:
            val += pairs_3[(voc, con)]
        except KeyError:
            val += 0
    return val

In [32]:
#compute mass probability for y
def prob_y(con):
    val = 0
    for voc in counts_3:
        try:
            val += pairs_3[(voc, con)]
        except KeyError:
            val += 0
    return val

In [33]:
probs_y = {}
for con in tqdm(V_c):
    probs_y[con] = prob_y(con)

100%|██████████████████████████████████████| 5000/5000 [00:23<00:00, 208.65it/s]


In [34]:
probs_x = {}
for voc in tqdm(counts_3):
    probs_x[voc] = prob_x(voc)

100%|████████████████████████████████████| 15228/15228 [00:23<00:00, 635.49it/s]


In [35]:
def pmi_3(voc, con):
    if probs_x[voc] != 0 and probs_y[con] != 0:
        val = np.log2((pairs_3[(voc, con)]*N)/(probs_x[voc]*probs_y[con]))
    else:
        val = 0
    return val

In [36]:
pmi_coffee = {}
for con in V_c:
    try:
        pmi_coffee[con] = pmi_3('coffee', con)
    except KeyError:
        pmi_coffee[con] = 0

In [37]:
print({ x : pmi_coffee[x] for x in nlargest(10, pmi_coffee, key = pmi_coffee.get) })

{'tea': 8.16600126243293, 'drinking': 7.58797865873193, 'shop': 7.411693771493207, 'costa': 7.350256393786161, 'shops': 7.260751873418467, 'sugar': 6.533949521544205, 'coffee': 6.501977131805925, 'mix': 6.131195903101976, 'seattle': 5.950816325067398, 'houses': 5.868161497268183}


In [40]:
print({ x : pmi_coffee[x] for x in nsmallest(10, pmi_coffee, key = pmi_coffee.get) })

{'he': -2.26033826495274, 'be': -2.1509730526875237, 'had': -1.9875291676196303, 'this': -1.979549817934235, 'not': -1.9115928402014317, 'its': -1.839457915441101, 'after': -1.598505205571959, 'more': -1.4785257922880328, 'when': -1.4043486976803334, 'page': -1.2805627423998573}


3.2

In [39]:
pmi_3_vectors = {}
for voc in tqdm(counts_3):
    pmi_3_vectors[voc] = []
    for con in V_c:
        try: 
            pairs_3[(voc,con)] *= 1
            pmi_3_vectors[voc].append(pmi_3(voc,con))
        except KeyError:
            pmi_3_vectors[voc].append(0)

100%|████████████████████████████████████| 15228/15228 [00:33<00:00, 451.13it/s]


In [41]:
def pmi_sim_3(voc1, voc2):
    if (norm(pmi_3_vectors[voc1]) != 0) and (norm(pmi_3_vectors[voc2]) != 0):
        cosine = np.dot(pmi_3_vectors[voc1], pmi_3_vectors[voc2])/(norm(pmi_3_vectors[voc1])*norm(pmi_3_vectors[voc2]))
    else:
        cosine = 0
    return cosine

In [42]:
EvalWS((pd.read_csv('men.txt', sep="\t", header=None, skiprows = [0])), counts_3, pmi_sim_3, 'PMI') 

SignificanceResult(statistic=0.46563240836038006, pvalue=2.5050388889148127e-161)

In [43]:
EvalWS((pd.read_csv('simlex-999.txt', sep="\t", header=None, skiprows = [0])), counts_3, pmi_sim_3, 'PMI')

SignificanceResult(statistic=0.18643183126956037, pvalue=2.8976468276084516e-09)

# Part 4

Now we repeat the same experiments with a different context vocabulary and different window sizes.

w = 1

In [11]:
def counting_15(w):
    pairs = {}
    for line in corpus:
        a = line.split()
        for i in range(len(a)):
            window = []
            if (a[i]) in V and i >= w and i < (len(a) - w):
                window = a[i - w: i + w + 1]
                window.remove(window[w])
            elif (a[i]) in V and i < w:
                window = a[0: i + w + 1]
                window.remove(window[i])
            elif (a[i]) in V and i >= (len(a) - w):
                window = a[i - w: len(a)]
                window.remove(window[w])
            for con in window:
                if (con) in V:
                   try:
                       pairs[(a[i], con)] += 1
                   except KeyError:
                       pairs[(a[i], con)] = 1
    return pairs
                


In [12]:
pairs_1_15 = counting_15(1)

In [136]:
vectors_1_15 = {}
for voc in tqdm(V):
    vectors_1_15[voc] = []
    for con in V:
        try:
            vectors_1_15[voc].append(pairs_1_15[(voc, con)])
        except KeyError:
              vectors_1_15[voc].append(0)

100%|████████████████████████████████████| 15228/15228 [01:22<00:00, 184.86it/s]


In [13]:
counts_1_15 = {}
for voc in tqdm(V):
    counts_1_15[voc] = {}
    for con in V:
        try:
            counts_1_15[voc][con] = { con : pairs_1_15[(voc,con)]}
        except KeyError:
            pass

100%|████████████████████████████████████| 15228/15228 [01:12<00:00, 209.48it/s]


In [48]:
def cos_sim_1_15(voc1, voc2):
    if (norm(vectors_1_15[voc1]) != 0) and (norm(vectors_1_15[voc2]) != 0):
        cosine = np.dot(vectors_1_15[voc1], vectors_1_15[voc2])/(norm(vectors_1_15[voc1])*norm(vectors_1_15[voc2]))
    else:
        cosine = 0
    return cosine

------------------------Results------------------------------

In [49]:
EvalWS((pd.read_csv('men.txt', sep="\t", header=None, skiprows = [0])), counts_1_15, cos_sim_1_15, 'count')

SignificanceResult(statistic=0.2063976325574214, pvalue=3.196256617167331e-30)

In [50]:
EvalWS((pd.read_csv('simlex-999.txt', sep="\t", header=None, skiprows = [0])), counts_1_15, cos_sim_1_15, 'count')

SignificanceResult(statistic=0.07001369489995889, pvalue=0.026906915488976564)

---- w = 3

In [35]:
pairs_3_15 = counting_15(3)

In [150]:
vectors_3_15 = {}
for voc in tqdm(V):
    vectors_3_15[voc] = []
    for con in V:
        try:
            vectors_3_15[voc].append(pairs_3_15[(voc, con)])
        except KeyError:
              vectors_3_15[voc].append(0)

100%|████████████████████████████████████| 15228/15228 [01:42<00:00, 148.59it/s]


In [36]:
counts_3_15 = {}
for voc in tqdm(V):
    counts_3_15[voc] = {}
    for con in V:
        try:
            counts_3_15[voc][con] = { con : pairs_3_15[(voc,con)]}
        except KeyError:
            pass

100%|████████████████████████████████████| 15228/15228 [01:26<00:00, 175.93it/s]


In [151]:
def cos_sim_3_15(voc1, voc2):
    if (norm(vectors_3_15[voc1]) != 0) and (norm(vectors_3_15[voc2]) != 0):
        cosine = np.dot(vectors_3_15[voc1], vectors_3_15[voc2])/(norm(vectors_3_15[voc1])*norm(vectors_3_15[voc2]))
    else:
        cosine = 0
    return cosine

-----------------------Results---------------------------------

In [152]:
EvalWS((pd.read_csv('men.txt', sep="\t", header=None, skiprows = [0])), counts_3_15, cos_sim_3_15, 'count')

SignificanceResult(statistic=0.2207784201051131, pvalue=1.928303905976344e-34)

In [153]:
EvalWS((pd.read_csv('simlex-999.txt', sep="\t", header=None, skiprows = [0])), counts_3_15, cos_sim_3_15, 'count')

SignificanceResult(statistic=0.05714229789355398, pvalue=0.07102643005951532)

w = 6

In [32]:
pairs_6_15 = counting_15(6)

In [58]:
vectors_6_15 = {}
for voc in tqdm(V):
    vectors_6_15[voc] = []
    for con in V:
        try:
            vectors_6_15[voc].append(pairs_6_15[(voc, con)])
        except KeyError:
              vectors_6_15[voc].append(0)

100%|████████████████████████████████████| 15228/15228 [02:17<00:00, 110.77it/s]


In [33]:
counts_6_15 = {}
for voc in tqdm(V):
    counts_6_15[voc] = {}
    for con in V:
        try:
            counts_6_15[voc][con] = { con : pairs_6_15[(voc,con)]}
        except KeyError:
            pass

100%|████████████████████████████████████| 15228/15228 [01:49<00:00, 138.52it/s]


In [60]:
def cos_sim_6_15(voc1, voc2):
    if (norm(vectors_6_15[voc1]) != 0) and (norm(vectors_6_15[voc2]) != 0):
        cosine = np.dot(vectors_6_15[voc1], vectors_6_15[voc2])/(norm(vectors_6_15[voc1])*norm(vectors_6_15[voc2]))
    else:
        cosine = 0
    return cosine

-------------- Results -----------------

In [61]:
EvalWS((pd.read_csv('men.txt', sep="\t", header=None, skiprows = [0])), counts_6_15, cos_sim_6_15, 'count')

SignificanceResult(statistic=0.23691121807651294, pvalue=1.5204903371938184e-39)

In [62]:
EvalWS((pd.read_csv('simlex-999.txt', sep="\t", header=None, skiprows = [0])), counts_6_15, cos_sim_6_15, 'count')

SignificanceResult(statistic=0.04065367101089449, pvalue=0.1991905732194143)

* IDF, 15k 

w = 1

In [129]:
sen_freqs_15 = {}
for con in tqdm(V):
    sen_freqs_15[con] = sen_freq(con)

100%|█████████████████████████████████████| 15228/15228 [20:06<00:00, 12.62it/s]


In [130]:
def IDF_1_15(voc, con):
    if sen_freqs_15[con] != 0:
        val = (pairs_1_15[(voc,con)])*(lc/(sen_freqs_15[con]))
    else:
        val = 0
    return val

In [138]:
idf_1_15_vectors = {}
for voc in tqdm(counts_1_15):
    idf_1_15_vectors[voc] = []
    for con in V:
        try: 
            pairs_1_15[(voc,con)] *= 1
            idf_1_15_vectors[voc].append(IDF_1_15(voc,con))
        except KeyError:
            idf_1_15_vectors[voc].append(0)

100%|████████████████████████████████████| 15228/15228 [01:25<00:00, 178.29it/s]


In [139]:
def idf_sim_1_15(voc1, voc2):
    if (norm(idf_1_15_vectors[voc1]) != 0) and (norm(idf_1_15_vectors[voc2]) != 0):
        cosine = np.dot(idf_1_15_vectors[voc1], idf_1_15_vectors[voc2])/(norm(idf_1_15_vectors[voc1])*norm(idf_1_15_vectors[voc2]))
    else:
        cosine = 0
    return cosine

-------------- Results -------------------------

In [140]:
EvalWS((pd.read_csv('men.txt', sep="\t", header=None, skiprows = [0])), counts_1_15, idf_sim_1_15, 'IDF') 

SignificanceResult(statistic=0.35374791217129503, pvalue=3.814930609372954e-89)

In [141]:
EvalWS((pd.read_csv('simlex-999.txt', sep="\t", header=None, skiprows = [0])), counts_1_15, idf_sim_1_15, 'IDF') 

SignificanceResult(statistic=0.17607311795459427, pvalue=2.119052859623529e-08)

w = 3

In [142]:
def IDF_3_15(voc, con):
    val = (pairs_3_15[(voc,con)])*(lc/(sen_freqs_15[con]))
    return val

In [155]:
idf_3_15_vectors = {}
for voc in tqdm(counts_3_15):
    idf_3_15_vectors[voc] = []
    for con in V:
        try: 
            pairs_3_15[(voc,con)] *= 1
            idf_3_15_vectors[voc].append(IDF_3_15(voc,con))
        except KeyError:
            idf_3_15_vectors[voc].append(0)

100%|████████████████████████████████████| 15228/15228 [01:40<00:00, 152.16it/s]


In [156]:
def idf_sim_3_15(voc1, voc2):
    if (norm(idf_3_15_vectors[voc1]) != 0) and (norm(idf_3_15_vectors[voc2]) != 0):
        cosine = np.dot(idf_3_15_vectors[voc1], idf_3_15_vectors[voc2])/(norm(idf_3_15_vectors[voc1])*norm(idf_3_15_vectors[voc2]))
    else:
        cosine = 0
    return cosine

--------------------- Results ---------------------

In [157]:
EvalWS((pd.read_csv('men.txt', sep="\t", header=None, skiprows = [0])), counts_3_15, idf_sim_3_15, 'IDF') 

SignificanceResult(statistic=0.45888326548120223, pvalue=3.7255880980687796e-156)

In [158]:
EvalWS((pd.read_csv('simlex-999.txt', sep="\t", header=None, skiprows = [0])), counts_3_15, idf_sim_3_15, 'IDF') 

SignificanceResult(statistic=0.1489147540265972, pvalue=2.276020577804637e-06)

w = 6

In [159]:
def IDF_6_15(voc, con):
    if sen_freqs_15[con] != 0:
        val = (pairs_6_15[(voc,con)])*(lc/(sen_freqs_15[con]))
    else:
        val = 0
    return val

In [164]:
idf_6_15_vectors = {}
for voc in tqdm(counts_6_15):
    idf_6_15_vectors[voc] = []
    for con in V:
        try: 
            pairs_6_15[(voc,con)] *= 1
            idf_6_15_vectors[voc].append(IDF_6_15(voc,con))
        except KeyError:
            idf_6_15_vectors[voc].append(0)

100%|████████████████████████████████████| 15228/15228 [02:01<00:00, 125.19it/s]


In [165]:
def idf_sim_6_15(voc1, voc2):
    if (norm(idf_6_15_vectors[voc1]) != 0) and (norm(idf_6_15_vectors[voc2]) != 0):
        cosine = np.dot(idf_6_15_vectors[voc1], idf_6_15_vectors[voc2])/(norm(idf_6_15_vectors[voc1])*norm(idf_6_15_vectors[voc2]))
    else:
        cosine = 0
    return cosine

--------------------- Results ---------------------

In [166]:
EvalWS((pd.read_csv('men.txt', sep="\t", header=None, skiprows = [0])), counts_6_15, idf_sim_6_15, 'IDF') 

SignificanceResult(statistic=0.5056665321215367, pvalue=1.6243213643410148e-194)

In [167]:
EvalWS((pd.read_csv('simlex-999.txt', sep="\t", header=None, skiprows = [0])), counts_6_15, idf_sim_6_15, 'IDF') 

SignificanceResult(statistic=0.11788601900660632, pvalue=0.00018817645331045238)

* PMI, 15k

w = 1 

In [15]:
N_1_15 = 0
for x in tqdm(counts_1_15):
    for y in V:
        try:
            N_1_15 += pairs_1_15[(x, y)]
        except KeyError:
            N_1_15 += 0

100%|████████████████████████████████████| 15228/15228 [01:15<00:00, 202.31it/s]


In [16]:
def prob_x_1_15(voc):
    val = 0
    for con in V:
        try:
            val += pairs_1_15[(voc, con)]
        except KeyError:
            val += 0
    return val

In [17]:
def prob_y_1_15(con):
    val = 0
    for voc in counts_1_15:
        try:
            val += pairs_1_15[(voc, con)]
        except KeyError:
            val += 0
    return val

In [18]:
probs_y_1_15 = {}
for con in tqdm(V):
    probs_y_1_15[con] = prob_y_1_15(con)

100%|████████████████████████████████████| 15228/15228 [01:02<00:00, 242.87it/s]


In [20]:
probs_x_1_15 = {}
for voc in tqdm(V):
    probs_x_1_15[voc] = prob_x_1_15(voc)

100%|████████████████████████████████████| 15228/15228 [01:01<00:00, 247.14it/s]


In [23]:
def pmi_1_15(voc, con):
    if probs_x_1_15[voc] != 0 and probs_y_1_15[con] != 0:
        val = np.log2((pairs_1_15[(voc, con)]*N_1_15)/(probs_x_1_15[voc]*probs_y_1_15[con]))
    else:
        val = 0
    return val

In [24]:
pmi_1_15_vectors = {}
for voc in tqdm(counts_1_15):
    pmi_1_15_vectors[voc] = []
    for con in V:
        try: 
            pairs_1_15[(voc,con)] *= 1
            pmi_1_15_vectors[voc].append(pmi_1_15(voc,con))
        except KeyError:
            pmi_1_15_vectors[voc].append(0)

100%|████████████████████████████████████| 15228/15228 [01:22<00:00, 185.14it/s]


In [26]:
def pmi_sim_1_15(voc1, voc2):
    try:
        if (norm(pmi_1_15_vectors[voc1]) != 0) and (norm(pmi_1_15_vectors[voc2]) != 0):
            cosine = np.dot(pmi_1_15_vectors[voc1], pmi_1_15_vectors[voc2])/(norm(pmi_1_15_vectors[voc1])*norm(pmi_1_15_vectors[voc2]))
        else:
            cosine = 0
    except KeyError:
        cosine = 0
    return cosine

---------- Results -----------------

In [29]:
EvalWS((pd.read_csv('men.txt', sep="\t", header=None, skiprows = [0])), counts_1_15, pmi_sim_1_15, 'PMI') 

SignificanceResult(statistic=0.47023686998764175, pvalue=6.349687433050748e-165)

In [30]:
EvalWS((pd.read_csv('simlex-999.txt', sep="\t", header=None, skiprows = [0])), counts_1_15, pmi_sim_1_15, 'PMI') 

SignificanceResult(statistic=0.2680651460928735, pvalue=6.673823511814832e-18)

w = 3

In [37]:
N_3_15 = 0
for x in tqdm(counts_3_15):
    for y in V:
        try:
            N_3_15 += pairs_3_15[(x, y)]
        except KeyError:
            N_3_15 += 0

100%|████████████████████████████████████| 15228/15228 [01:27<00:00, 174.82it/s]


In [38]:
def prob_x_3_15(voc):
    val = 0
    for con in V:
        try:
            val += pairs_3_15[(voc, con)]
        except KeyError:
            val += 0
    return val

In [39]:
def prob_y_3_15(con):
    val = 0
    for voc in V:
        try:
            val += pairs_3_15[(voc, con)]
        except KeyError:
            val += 0
    return val

In [40]:
probs_y_3_15 = {}
for con in tqdm(V):
    probs_y_3_15[con] = prob_y_3_15(con)

100%|████████████████████████████████████| 15228/15228 [01:18<00:00, 193.73it/s]


In [41]:
probs_x_3_15 = {}
for voc in tqdm(V):
    probs_x_3_15[voc] = prob_x_3_15(voc)

100%|████████████████████████████████████| 15228/15228 [01:20<00:00, 188.07it/s]


In [42]:
def pmi_3_15(voc, con):
    if probs_x_3_15[voc] != 0 and probs_y_3_15[con] != 0:
        val = np.log2((pairs_3_15[(voc, con)]*N_3_15)/(probs_x_3_15[voc]*probs_y_3_15[con]))
    else:
        val = 0
    return val

In [43]:
pmi_3_15_vectors = {}
for voc in tqdm(V):
    pmi_3_15_vectors[voc] = []
    for con in V:
        try: 
            pairs_3_15[(voc,con)] *= 1
            pmi_3_15_vectors[voc].append(pmi_3_15(voc,con))
        except KeyError:
            pmi_3_15_vectors[voc].append(0)

100%|████████████████████████████████████| 15228/15228 [01:45<00:00, 143.93it/s]


In [44]:
def pmi_sim_3_15(voc1, voc2):
    if (norm(pmi_3_15_vectors[voc1]) != 0) and (norm(pmi_3_15_vectors[voc2]) != 0):
        cosine = np.dot(pmi_3_15_vectors[voc1], pmi_3_15_vectors[voc2])/(norm(pmi_3_15_vectors[voc1])*norm(pmi_3_15_vectors[voc2]))
    else:
        cosine = 0
    return cosine

In [45]:
EvalWS((pd.read_csv('men.txt', sep="\t", header=None, skiprows = [0])), counts_3_15, pmi_sim_3_15, 'PMI') 

SignificanceResult(statistic=0.5193931025800151, pvalue=5.9479248995790086e-207)

In [46]:
EvalWS((pd.read_csv('simlex-999.txt', sep="\t", header=None, skiprows = [0])), counts_3_15, pmi_sim_3_15, 'PMI') 

SignificanceResult(statistic=0.2122918467026247, pvalue=1.2108366674818686e-11)

w = 6

In [47]:
N_6_15 = 0
for x in tqdm(counts_6_15):
    for y in V:
        try:
            N_6_15 += pairs_6_15[(x, y)]
        except KeyError:
            N_6_15 += 0

100%|████████████████████████████████████| 15228/15228 [01:39<00:00, 152.66it/s]


In [48]:
def prob_x_6_15(voc):
    val = 0
    for con in V:
        try:
            val += pairs_6_15[(voc, con)]
        except KeyError:
            val += 0
    return val

In [49]:
def prob_y_6_15(con):
    val = 0
    for voc in V:
        try:
            val += pairs_6_15[(voc, con)]
        except KeyError:
            val += 0
    return val

In [50]:
probs_y_6_15 = {}
for con in tqdm(V):
    probs_y_6_15[con] = prob_y_6_15(con)

100%|████████████████████████████████████| 15228/15228 [01:36<00:00, 157.60it/s]


In [51]:
probs_x_6_15 = {}
for voc in tqdm(V):
    probs_x_6_15[voc] = prob_x_6_15(voc)

100%|████████████████████████████████████| 15228/15228 [01:29<00:00, 170.50it/s]


In [52]:
def pmi_6_15(voc, con):
    if probs_x_6_15[voc] != 0 and probs_y_6_15[con] != 0:
        val = np.log2((pairs_6_15[(voc, con)]*N_6_15)/(probs_x_6_15[voc]*probs_y_6_15[con]))
    else:
        val = 0
    return val

In [53]:
pmi_6_15_vectors = {}
for voc in tqdm(counts_6_15):
    pmi_6_15_vectors[voc] = []
    for con in V_c:
        try: 
            pairs_6_15[(voc,con)] *= 1
            pmi_6_15_vectors[voc].append(pmi_6_15(voc,con))
        except KeyError:
            pmi_6_15_vectors[voc].append(0)

100%|████████████████████████████████████| 15228/15228 [01:24<00:00, 179.94it/s]


In [54]:
def pmi_sim_6_15(voc1, voc2):
    if (norm(pmi_6_15_vectors[voc1]) != 0) and (norm(pmi_6_15_vectors[voc2]) != 0):
        cosine = np.dot(pmi_6_15_vectors[voc1], pmi_6_15_vectors[voc2])/(norm(pmi_6_15_vectors[voc1])*norm(pmi_6_15_vectors[voc2]))
    else:
        cosine = 0
    return cosine

In [55]:
EvalWS((pd.read_csv('men.txt', sep="\t", header=None, skiprows = [0])), counts_6_15, pmi_sim_6_15, 'PMI') 

SignificanceResult(statistic=0.4720569004178326, pvalue=2.3230119403338823e-166)

In [56]:
EvalWS((pd.read_csv('simlex-999.txt', sep="\t", header=None, skiprows = [0])), counts_6_15, pmi_sim_6_15, 'PMI') 

SignificanceResult(statistic=0.15210098244200515, pvalue=1.3683723395019382e-06)

* Counts, 5k

w = 1

In [8]:
def counting_5(w):
    pairs = {}
    for line in corpus:
        a = line.split()
        for i in range(len(a)):
            window = []
            if (a[i]) in V and i >= w and i < (len(a) - w):
                window = a[i - w: i + w + 1]
                window.remove(window[w])
            elif (a[i]) in V and i < w:
                window = a[0: i + w + 1]
                window.remove(window[i])
            elif (a[i]) in V and i >= (len(a) - w):
                window = a[i - w: len(a)]
                window.remove(window[w])
            for con in window:
                if (con) in V_c:
                   try:
                       pairs[(a[i], con)] += 1
                   except KeyError:
                       pairs[(a[i], con)] = 1
    return pairs
                


In [26]:
pairs_1_5 = counting_5(1)

In [156]:
vectors_1_5 = {}
for voc in tqdm(V):
    vectors_1_5[voc] = []
    for con in V_c:
        try:
            vectors_1_5[voc].append(pairs_1_5[(voc, con)])
        except KeyError:
              vectors_1_5[voc].append(0)

100%|████████████████████████████████████| 15228/15228 [00:27<00:00, 556.47it/s]


In [30]:
counts_1_5 = {}
for voc in tqdm(V):
    counts_1_5[voc] = {}
    for con in V_c:
        try:
            counts_1_5[voc][con] = { con : pairs_1_5[(voc,con)]}
        except KeyError:
            pass

100%|████████████████████████████████████| 15228/15228 [00:23<00:00, 635.35it/s]


In [158]:
def cos_sim_1_5(voc1, voc2):
    if (norm(vectors_1_5[voc1]) != 0) and (norm(vectors_1_5[voc2]) != 0):
        cosine = np.dot(vectors_1_5[voc1], vectors_1_5[voc2])/(norm(vectors_1_5[voc1])*norm(vectors_1_5[voc2]))
    else:
        cosine = 0
    return cosine

------------------------Results------------------------------

In [159]:
EvalWS((pd.read_csv('men.txt', sep="\t", header=None, skiprows = [0])), counts_1_5, cos_sim_1_5, 'count')

SignificanceResult(statistic=0.20909154357455798, pvalue=5.464614687617875e-31)

In [160]:
EvalWS((pd.read_csv('simlex-999.txt', sep="\t", header=None, skiprows = [0])), counts_1_5, cos_sim_1_5, 'count')

SignificanceResult(statistic=0.06778569953818134, pvalue=0.03217079362556691)

---- w = 3

In [6]:
pairs_3_5 = counting_5(3)

In [7]:
vectors_3_5 = {}
for voc in tqdm(V):
    vectors_3_5[voc] = []
    for con in V_c:
        try:
            vectors_3_5[voc].append(pairs_3_5[(voc, con)])
        except KeyError:
              vectors_3_5[voc].append(0)

100%|████████████████████████████████████| 15228/15228 [00:30<00:00, 505.06it/s]


In [8]:
counts_3_5 = {}
for voc in tqdm(V):
    counts_3_5[voc] = {}
    for con in V_c:
        try:
            counts_3_5[voc][con] = { con : pairs_3_5[(voc,con)]}
        except KeyError:
            pass

100%|████████████████████████████████████| 15228/15228 [00:27<00:00, 545.95it/s]


In [9]:
def cos_sim_3_5(voc1, voc2):
    if (norm(vectors_3_5[voc1]) != 0) and (norm(vectors_3_5[voc2]) != 0):
        cosine = np.dot(vectors_3_5[voc1], vectors_3_5[voc2])/(norm(vectors_3_5[voc1])*norm(vectors_3_5[voc2]))
    else:
        cosine = 0
    return cosine

-----------------------Results---------------------------------

In [12]:
EvalWS((pd.read_csv('men.txt', sep="\t", header=None, skiprows = [0])), counts_3_5, cos_sim_3_5, 'count')

SignificanceResult(statistic=0.2251396048448754, pvalue=8.800788745595221e-36)

In [13]:
EvalWS((pd.read_csv('simlex-999.txt', sep="\t", header=None, skiprows = [0])), counts_3_5, cos_sim_3_5, 'count')

SignificanceResult(statistic=0.05876135331349779, pvalue=0.0633756392544004)

w = 6

In [9]:
pairs_6_5 = counting_5(6)

In [15]:
vectors_6_5 = {}
for voc in tqdm(V):
    vectors_6_5[voc] = []
    for con in V_c:
        try:
            vectors_6_5[voc].append(pairs_6_5[(voc, con)])
        except KeyError:
              vectors_6_5[voc].append(0)

100%|████████████████████████████████████| 15228/15228 [00:29<00:00, 508.39it/s]


In [16]:
counts_6_5 = {}
for voc in tqdm(V):
    counts_6_5[voc] = {}
    for con in V_c:
        try:
            counts_6_5[voc][con] = { con : pairs_6_5[(voc,con)]}
        except KeyError:
            pass

100%|████████████████████████████████████| 15228/15228 [00:28<00:00, 534.33it/s]


In [17]:
def cos_sim_6_5(voc1, voc2):
    if (norm(vectors_6_5[voc1]) != 0) and (norm(vectors_6_5[voc2]) != 0):
        cosine = np.dot(vectors_6_5[voc1], vectors_6_5[voc2])/(norm(vectors_6_5[voc1])*norm(vectors_6_5[voc2]))
    else:
        cosine = 0
    return cosine

-------------- Results -----------------

In [18]:
EvalWS((pd.read_csv('men.txt', sep="\t", header=None, skiprows = [0])), counts_6_5, cos_sim_6_5, 'count')

SignificanceResult(statistic=0.24106664963215607, pvalue=6.355522116916368e-41)

In [19]:
EvalWS((pd.read_csv('simlex-999.txt', sep="\t", header=None, skiprows = [0])), counts_6_5, cos_sim_6_5, 'count')

SignificanceResult(statistic=0.04469576384051759, pvalue=0.1580577270004679)

* IDF, 5k 

w = 1

In [23]:
sen_freqs_5 = sen_freqs

In [24]:
def IDF_1_5(voc, con):
    if sen_freqs_5[con] != 0:
        val = (pairs_1_5[(voc,con)])*(lc/(sen_freqs_5[con]))
    else:
        val = 0
    return val

In [27]:
idf_1_5_vectors = {}
for voc in tqdm(V):
    idf_1_5_vectors[voc] = []
    for con in V_c:
        try: 
            pairs_1_5[(voc,con)] *= 1
            idf_1_5_vectors[voc].append(IDF_1_5(voc,con))
        except KeyError:
            idf_1_5_vectors[voc].append(0)

100%|████████████████████████████████████| 15228/15228 [00:31<00:00, 486.16it/s]


In [28]:
def idf_sim_1_5(voc1, voc2):
    if (norm(idf_1_5_vectors[voc1]) != 0) and (norm(idf_1_5_vectors[voc2]) != 0):
        cosine = np.dot(idf_1_5_vectors[voc1], idf_1_5_vectors[voc2])/(norm(idf_1_5_vectors[voc1])*norm(idf_1_5_vectors[voc2]))
    else:
        cosine = 0
    return cosine

-------------- Results -------------------------

In [32]:
EvalWS((pd.read_csv('men.txt', sep="\t", header=None, skiprows = [0])), counts_1_5, idf_sim_1_5, 'IDF') 

SignificanceResult(statistic=0.3224946913281174, pvalue=1.4500412473429356e-73)

In [34]:
EvalWS((pd.read_csv('simlex-999.txt', sep="\t", header=None, skiprows = [0])), counts_1_5, idf_sim_1_5, 'IDF') 

SignificanceResult(statistic=0.16997314465621835, pvalue=6.481392516427962e-08)

w = 3

In [37]:
def IDF_3_5(voc, con):
    val = (pairs_3_5[(voc,con)])*(lc/(sen_freqs[con]))
    return val

In [38]:
idf_3_5_vectors = {}
for voc in tqdm(V):
    idf_3_5_vectors[voc] = []
    for con in V_c:
        try: 
            pairs_3_5[(voc,con)] *= 1
            idf_3_5_vectors[voc].append(IDF_3_5(voc,con))
        except KeyError:
            idf_3_5_vectors[voc].append(0)

100%|████████████████████████████████████| 15228/15228 [00:32<00:00, 462.76it/s]


In [39]:
def idf_sim_3_5(voc1, voc2):
    if (norm(idf_3_5_vectors[voc1]) != 0) and (norm(idf_3_5_vectors[voc2]) != 0):
        cosine = np.dot(idf_3_5_vectors[voc1], idf_3_5_vectors[voc2])/(norm(idf_3_5_vectors[voc1])*norm(idf_3_5_vectors[voc2]))
    else:
        cosine = 0
    return cosine

--------------------- Results ---------------------

In [41]:
EvalWS((pd.read_csv('men.txt', sep="\t", header=None, skiprows = [0])), counts_3_5, idf_sim_3_5, 'IDF') 

SignificanceResult(statistic=0.44646592054501644, pvalue=6.145013710758384e-147)

In [42]:
EvalWS((pd.read_csv('simlex-999.txt', sep="\t", header=None, skiprows = [0])), counts_3_5, idf_sim_3_5, 'IDF') 

SignificanceResult(statistic=0.16111999057244053, pvalue=3.0614957547676104e-07)

w = 6

In [43]:
def IDF_6_5(voc, con):
    val = (pairs_6_5[(voc,con)])*(lc/(sen_freqs[con]))
    return val

In [44]:
idf_6_5_vectors = {}
for voc in tqdm(counts_6_5):
    idf_6_5_vectors[voc] = []
    for con in V_c:
        try: 
            pairs_6_5[(voc,con)] *= 1
            idf_6_5_vectors[voc].append(IDF_6_5(voc,con))
        except KeyError:
            idf_6_5_vectors[voc].append(0)

100%|████████████████████████████████████| 15228/15228 [00:41<00:00, 362.87it/s]


In [45]:
def idf_sim_6_5(voc1, voc2):
    if (norm(idf_6_5_vectors[voc1]) != 0) and (norm(idf_6_5_vectors[voc2]) != 0):
        cosine = np.dot(idf_6_5_vectors[voc1], idf_6_5_vectors[voc2])/(norm(idf_6_5_vectors[voc1])*norm(idf_6_5_vectors[voc2]))
    else:
        cosine = 0
    return cosine

--------------------- Results ---------------------

In [47]:
EvalWS((pd.read_csv('men.txt', sep="\t", header=None, skiprows = [0])), counts_6_5, idf_sim_6_5, 'IDF') 

SignificanceResult(statistic=0.48994814292760613, pvalue=5.892843252710712e-181)

In [48]:
EvalWS((pd.read_csv('simlex-999.txt', sep="\t", header=None, skiprows = [0])), counts_6_5, idf_sim_6_5, 'IDF') 

SignificanceResult(statistic=0.11291312382637779, pvalue=0.0003491403417042362)

* PMI, 5k

w = 1 

In [52]:
N_1_5 = 0
for x in tqdm(V):
    for y in V_c:
        try:
            N_1_5 += pairs_1_5[(x, y)]
        except KeyError:
            N_1_5 += 0

100%|████████████████████████████████████| 15228/15228 [00:24<00:00, 616.54it/s]


In [53]:
def prob_x_1_5(voc):
    val = 0
    for con in V_c:
        try:
            val += pairs_1_5[(voc, con)]
        except KeyError:
            val += 0
    return val

In [54]:
def prob_y_1_5(con):
    val = 0
    for voc in counts_1_5:
        try:
            val += pairs_1_5[(voc, con)]
        except KeyError:
            val += 0
    return val

In [55]:
probs_y_1_5 = {}
for con in tqdm(V_c):
    probs_y_1_5[con] = prob_y_1_5(con)

100%|██████████████████████████████████████| 5000/5000 [00:19<00:00, 255.19it/s]


In [56]:
probs_x_1_5 = {}
for voc in tqdm(counts_1_5):
    probs_x_1_5[voc] = prob_x_1_5(voc)

100%|████████████████████████████████████| 15228/15228 [00:20<00:00, 760.85it/s]


In [62]:
def pmi_1_5(voc, con):
    if probs_x_1_5[voc] != 0 and probs_y_1_5[con] != 0:
        val = np.log2((pairs_1_5[(voc, con)]*N_1_5)/(probs_x_1_5[voc]*probs_y_1_5[con]))
    else:
        val = 0
    return val

In [63]:
pmi_1_5_vectors = {}
for voc in tqdm(counts_3_5):
    pmi_1_5_vectors[voc] = []
    for con in V_c:
        try: 
            pairs_1_5[(voc,con)] *= 1
            pmi_1_5_vectors[voc].append(pmi_1_5(voc,con))
        except KeyError:
            pmi_1_5_vectors[voc].append(0)

100%|████████████████████████████████████| 15228/15228 [00:38<00:00, 398.80it/s]


In [64]:
def pmi_sim_1_5(voc1, voc2):
    if (norm(pmi_1_5_vectors[voc1]) != 0) and (norm(pmi_1_5_vectors[voc2]) != 0):
        cosine = np.dot(pmi_1_5_vectors[voc1], pmi_1_5_vectors[voc2])/(norm(pmi_1_5_vectors[voc1])*norm(pmi_1_5_vectors[voc2]))
    else:
        cosine = 0
    return cosine

---------- Results -----------------

In [65]:
EvalWS((pd.read_csv('men.txt', sep="\t", header=None, skiprows = [0])), counts_1_5, pmi_sim_1_5, 'PMI') 

SignificanceResult(statistic=0.43360312970252096, pvalue=8.727721983966118e-138)

In [66]:
EvalWS((pd.read_csv('simlex-999.txt', sep="\t", header=None, skiprows = [0])), counts_1_5, pmi_sim_1_5, 'PMI') 

SignificanceResult(statistic=0.2274977000485412, pvalue=3.409809747132032e-13)

w = 3

In [67]:
N_3_5 = 0
for x in tqdm(counts_3_5):
    for y in V_c:
        try:
            N += pairs_3_5[(x, y)]
        except KeyError:
            N += 0

100%|████████████████████████████████████| 15228/15228 [00:28<00:00, 527.64it/s]


In [68]:
def prob_x_3_5(voc):
    val = 0
    for con in V_c:
        try:
            val += pairs_3_5[(voc, con)]
        except KeyError:
            val += 0
    return val

In [69]:
def prob_y_3_5(con):
    val = 0
    for voc in counts_3_5:
        try:
            val += pairs_3_5[(voc, con)]
        except KeyError:
            val += 0
    return val

In [70]:
probs_y_3_5 = {}
for con in tqdm(V):
    probs_y_3_5[con] = prob_y_3_5(con)

100%|████████████████████████████████████| 15228/15228 [01:09<00:00, 218.91it/s]


In [71]:
probs_x_3_5 = {}
for voc in tqdm(counts_3_5):
    probs_x_3_5[voc] = prob_x_3_5(voc)

100%|████████████████████████████████████| 15228/15228 [00:24<00:00, 621.40it/s]


In [83]:
def pmi_3_5(voc, con):
    if probs_x_3_5[voc] != 0 and probs_y_3_5[con] != 0:
        val = np.log2((pairs_3_5[(voc, con)]*N_3_5)/(probs_x_3_5[voc]*probs_y_3_5[con]))
    else:
        val = 0
    return val

In [73]:
pmi_3_5_vectors = {}
for voc in tqdm(counts_3_5):
    pmi_3_5_vectors[voc] = []
    for con in V_c:
        try: 
            pairs_3_5[(voc,con)] *= 1
            pmi_3_5_vectors[voc].append(pmi_3_5(voc,con))
        except KeyError:
            pmi_3_5_vectors[voc].append(0)

100%|████████████████████████████████████| 15228/15228 [00:36<00:00, 418.96it/s]


In [84]:
def pmi_sim_3_5(voc1, voc2):
    if (norm(pmi_3_5_vectors[voc1]) != 0) and (norm(pmi_3_5_vectors[voc2]) != 0):
        cosine = np.dot(pmi_3_5_vectors[voc1], pmi_3_5_vectors[voc2])/(norm(pmi_3_5_vectors[voc1])*norm(pmi_3_5_vectors[voc2]))
    else:
        cosine = 0
    return cosine

In [85]:
EvalWS((pd.read_csv('men.txt', sep="\t", header=None, skiprows = [0])), counts_3_5, pmi_sim_3_5, 'PMI') 

SignificanceResult(statistic=0.46563240836038006, pvalue=2.5050388889148127e-161)

In [86]:
EvalWS((pd.read_csv('simlex-999.txt', sep="\t", header=None, skiprows = [0])), counts_3_5, pmi_sim_3_5, 'PMI') 

SignificanceResult(statistic=0.18643183126956037, pvalue=2.8976468276084516e-09)

w = 6

In [119]:
N_6_5 = 0
for x in tqdm(V):
    for y in V_c:
        try:
            N_6_5 += pairs_6_5[(x, y)]
        except KeyError:
            N_6_5 += 0

100%|████████████████████████████████████| 15228/15228 [00:33<00:00, 456.97it/s]


In [120]:
def prob_x_6_5(voc):
    val = 0
    for con in V_c:
        try:
            val += pairs_6_5[(voc, con)]
        except KeyError:
            val += 0
    return val

In [121]:
def prob_y_6_5(con):
    val = 0
    for voc in V:
        try:
            val += pairs_6_5[(voc, con)]
        except KeyError:
            val += 0
    return val

In [101]:
probs_y_6_5 = {}
for con in tqdm(V_c):
    probs_y_6_5[con] = prob_y_6_5(con)

100%|██████████████████████████████████████| 5000/5000 [00:31<00:00, 156.25it/s]


In [102]:
probs_x_6_5 = {}
for voc in tqdm(counts_6_5):
    probs_x_6_5[voc] = prob_x_6_5(voc)

100%|████████████████████████████████████| 15228/15228 [00:31<00:00, 491.08it/s]


In [3]:
def pmi_6_5(voc, con):
    if probs_x_6_5[voc] != 0 and probs_y_6_5[con] != 0:
        val = np.log2((pairs_6_5[(voc, con)]*N_6_5)/(probs_x_6_5[voc]*probs_y_6_5[con]))
    else:
        val = 0
    return val

In [None]:
pmi_6_5_vectors = {}
for voc in tqdm(V):
    pmi_6_5_vectors[voc] = []
    for con in V_c:
        try: 
            pairs_6_5[(voc,con)] *= 1
            pmi_6_5_vectors[voc].append(pmi_6_5(voc,con))
        except KeyError:
            pmi_6_5_vectors[voc].append(0)

In [125]:
def pmi_sim_6_5(voc1, voc2):
    if (norm(pmi_6_5_vectors[voc1]) != 0) and (norm(pmi_6_5_vectors[voc2]) != 0):
        cosine = np.dot(pmi_6_5_vectors[voc1], pmi_6_5_vectors[voc2])/(norm(pmi_6_5_vectors[voc1])*norm(pmi_6_5_vectors[voc2]))
    else:
        cosine = 0
    return cosine

In [126]:
EvalWS((pd.read_csv('men.txt', sep="\t", header=None, skiprows = [0])), counts_6_5, pmi_sim_6_5, 'PMI') 

SignificanceResult(statistic=0.472408394245907, pvalue=1.2234451339873135e-166)

In [127]:
EvalWS((pd.read_csv('simlex-999.txt', sep="\t", header=None, skiprows = [0])), counts_6_5, pmi_sim_6_5, 'PMI') 

SignificanceResult(statistic=0.15033132295378585, pvalue=1.8175892851158345e-06)

It seems like for both context vocabularies the correlation factor for Simlex decreases as the window size increases, for all three similarity metrics. The opposite is true for Men, as the correlation factor grows with the window size. In general , IDF and PMI do much better than Counts, although it is unclear from these findings which obtains a higher correlation factor between IDF and PMI. When context vocabulary changes it seems like the correlation factors decrease (i.e. the 15k context vocabulary obtains higher correlation factors), which I think makes sense since more context provides a better way of differentiating between words that are related and words that aren't (i.e. if the context vocabulary were populated mainly by words appearing in the corpus, it would be very hard to differentiate between them).

4.2. We can imagine that these trends are occuring since Men appears to contain about twice as many annotations, and as I mentioned above, having a larger sample it is easier to differentiate between the similarities between words. Moreover, the values in men.txt vary from 0 to 50 whereas those in simlex-999.txt only vary approximately between 0 and 10, so the same reasoning as above tells us that it is harder to differentiate the similarities for simlex-999.

# Part 5

5.1

w = 1

In [59]:
judges_cos_sims = {}
for y in V:
    judges_cos_sims[y] = pmi_sim_1_15('judges',y)

In [62]:
print(nlargest(11, judges_cos_sims, key = judges_cos_sims.get))

['judges', 'judge', 'justices', 'arbitrators', 'players', 'trustees', 'contestants', 'officials', 'admins', 'appeals', 'officers']


w = 6

In [63]:
judges_cos_sims_6 = {}
for y in V:
    judges_cos_sims_6[y] = pmi_sim_6_15('judges',y)

In [64]:
print(nlargest(11, judges_cos_sims_6, key = judges_cos_sims_6.get))

['judges', 'judge', 'jury', 'appeals', 'courts', 'panel', 'supreme', 'justice', 'contestants', 'candidates', 'appeal']


5.2

First we try the word 'transported':

In [65]:
transported_cos_sims = {}
for y in V:
    transported_cos_sims[y] = pmi_sim_1_15('transported',y)

In [66]:
print(nlargest(11, transported_cos_sims, key = transported_cos_sims.get))

['transported', 'marched', 'shipped', 'deported', 'reassigned', 'detected', 'relegated', 'subjected', 'vanish', 'converted', 'dragged']


In [69]:
transported_cos_sims_6 = {}
for y in V:
    transported_cos_sims_6[y] = pmi_sim_6_15('transported',y)

In [71]:
print(nlargest(11, transported_cos_sims_6, key = transported_cos_sims_6.get))

['transported', 'cargo', 'supplies', 'transport', 'carrying', 'prisoners', 'supply', 'shipping', 'passengers', 'ships', 'captured']


Now we try 'beautiful':

In [67]:
beautiful_cos_sims = {}
for y in V:
    beautiful_cos_sims[y] = pmi_sim_1_15('beautiful',y)

In [68]:
print(nlargest(11, beautiful_cos_sims, key = beautiful_cos_sims.get))

['beautiful', 'attractive', 'scenic', 'dark', 'amazing', 'whose', 'magnificent', 'quiet', 'picturesque', 'strange', 'surrounding']


In [72]:
beautiful_cos_sims_6 = {}
for y in V:
    beautiful_cos_sims_6[y] = pmi_sim_6_15('beautiful',y)

In [73]:
print(nlargest(11, beautiful_cos_sims_6, key = beautiful_cos_sims_6.get))

['beautiful', 'girl', 'dark', 'beauty', 'featuring', 'rich', 'eyes', 'herself', 'theme', 'dancing', 'moon']


Now we try 'when':

In [74]:
when_cos_sims = {}
for y in V:
    when_cos_sims[y] = pmi_sim_1_15('when',y)

In [75]:
print(nlargest(11, when_cos_sims, key = when_cos_sims.get))

['when', 'after', 'by', 'while', 'if', 'before', ';', 'during', 'and', 'where', 'had']


In [76]:
when_cos_sims_6 = {}
for y in V:
    when_cos_sims_6[y] = pmi_sim_6_15('when',y)

In [77]:
print(nlargest(11, when_cos_sims_6, key = when_cos_sims_6.get))

['when', 'after', 'then', 'but', 'she', 'he', 'had', 'him', 'her', 'before', 'they']


Now we try 'him':

In [94]:
him_cos_sims = {}
for y in V:
    him_cos_sims[y] = pmi_sim_1_15('him',y)

In [95]:
print(nlargest(11, him_cos_sims, key = him_cos_sims.get))

['him', 'them', 'me', 'her', 'himself', 'then', 'you', 'being', 'anyone', 'been', 'someone']


In [96]:
him_cos_sims_6 = {}
for y in V:
    him_cos_sims_6[y] = pmi_sim_6_15('him',y)

In [97]:
print(nlargest(11, him_cos_sims_6, key = him_cos_sims_6.get))

['him', 'her', 'his', 'them', 'he', 'she', 'who', 'i', 'they', 'but', 'you']


Now we try 'his':

In [78]:
his_cos_sims = {}
for y in V:
    his_cos_sims[y] = pmi_sim_1_15('his',y)

In [79]:
print(nlargest(11, his_cos_sims, key = his_cos_sims.get))

['his', 'her', 'their', 'its', 'my', "'s", 'your', 'it', 'this', 'own', 'one']


In [80]:
his_cos_sims_6 = {}
for y in V:
    his_cos_sims_6[y] = pmi_sim_6_15('his',y)

In [81]:
print(nlargest(11, his_cos_sims_6, key = his_cos_sims_6.get))

['his', 'he', 'her', "'s", 'who', 'she', 'him', 'after', 'had', 'later', 'when']


Now we try 'her':

In [98]:
her_cos_sims = {}
for y in V:
    her_cos_sims[y] = pmi_sim_1_15('her',y)

In [99]:
print(nlargest(11, her_cos_sims, key = her_cos_sims.get))

['her', 'his', 'their', 'him', 'them', 'my', 'its', 'your', "'s", 'me', 'she']


In [100]:
her_cos_sims_6 = {}
for y in V:
    her_cos_sims_6[y] = pmi_sim_6_15('her',y)

In [101]:
print(nlargest(11, her_cos_sims_6, key = her_cos_sims_6.get))

['her', 'his', 'she', 'him', 'he', 'who', 'after', "'s", '"', 'when', 'them']


Now we try 'them':

In [82]:
them_cos_sims = {}
for y in V:
    them_cos_sims[y] = pmi_sim_1_15('them',y)

In [83]:
print(nlargest(11, them_cos_sims, key = them_cos_sims.get))

['them', 'him', 'me', 'her', 'you', 'then', 'others', 'up', 'himself', 'people', 'themselves']


In [84]:
them_cos_sims_6 = {}
for y in V:
    them_cos_sims_6[y] = pmi_sim_6_15('them',y)

In [85]:
print(nlargest(11, them_cos_sims_6, key = them_cos_sims_6.get))

['them', 'they', 'so', 'you', 'if', 'but', 'i', 'him', 'not', 'that', 'do']


Now we try 'an':

In [86]:
an_cos_sims = {}
for y in V:
    an_cos_sims[y] = pmi_sim_1_15('an',y)

In [87]:
print(nlargest(11, an_cos_sims, key = an_cos_sims.get))

['an', 'its', 'more', 'not', 'one', 'it', 'no', 'but', 'or', 'most', 'this']


In [88]:
an_cos_sims_6 = {}
for y in V:
    an_cos_sims_6[y] = pmi_sim_6_15('an',y)

In [89]:
print(nlargest(11, an_cos_sims_6, key = an_cos_sims_6.get))

['an', 'is', 'a', 'this', 'as', 'not', 'it', 'be', 'that', 'no', '"']


Now we try 'running':

In [90]:
running_cos_sims = {}
for y in V:
    running_cos_sims[y] = pmi_sim_1_15('running',y)

In [91]:
print(nlargest(11, running_cos_sims, key = running_cos_sims.get))

['running', 'run', 'ran', 'runs', 'operating', 'connecting', 'walking', 'heading', 'moving', 'working', 'going']


In [92]:
running_cos_sims_6 = {}
for y in V:
    running_cos_sims_6[y] = pmi_sim_6_15('running',y)

In [93]:
print(nlargest(11, running_cos_sims_6, key = running_cos_sims_6.get))

['running', 'runs', 'ran', 'run', 'drive', 'moving', 'train', 'pass', 'speed', 'bus', 'operating']


According to the results above, it seems like nearest neighbors do tend to have the same POS taf as the query word, although it is clear that as the window size gets larger this is no longer the case (see for example 'transported' and 'running'). This pattern does not differ that much across different POS tags, although it holds even more often for more common POS tags such as pronouns and prepositions, and not so much for not so common tags such as conjugated verbs. In all cases we see that the nearest neighbors differ depending on the window size, where the changes occur more often for words which belongs to more than one POS tag (see for instance 'her'). Finally, since query words tend to have the same tag as their nearest neighbor, as it is expected we see that the very nearest neighbors are likely to share the same neighbors list (see 'his' and 'her').

5.3

Now we try 'bank':

In [102]:
bank_cos_sims = {}
for y in V:
    bank_cos_sims[y] = pmi_sim_1_15('bank',y)

In [103]:
print(nlargest(11, bank_cos_sims, key = bank_cos_sims.get))

['bank', 'banks', 'company', 'insurance', 'corporation', 'railway', 'government', 'banking', 'companies', 'institute', 'conference']


In [104]:
bank_cos_sims_6 = {}
for y in V:
    bank_cos_sims_6[y] = pmi_sim_6_15('bank',y)

In [105]:
print(nlargest(11, bank_cos_sims_6, key = bank_cos_sims_6.get))

['bank', 'corporation', 'capital', 'railway', 'northern', 'branch', 'valley', 'banks', 'southern', 'lake', 'centre']


Now we try 'cell':

In [106]:
cell_cos_sims = {}
for y in V:
    cell_cos_sims[y] = pmi_sim_1_15('cell',y)

In [107]:
print(nlargest(11, cell_cos_sims, key = cell_cos_sims.get))

['cell', 'cells', 'cellular', 'protein', 'tissue', 'brain', 'proteins', 'tissues', 'growth', 'human', 'enzyme']


In [108]:
cell_cos_sims_6 = {}
for y in V:
    cell_cos_sims_6[y] = pmi_sim_6_15('cell',y)

In [109]:
print(nlargest(11, cell_cos_sims_6, key = cell_cos_sims_6.get))

['cell', 'cells', 'function', 'protein', 'dna', 'surface', 'blood', 'tissue', 'membrane', 'chemical', 'flow']


Now we try 'apple':

In [110]:
apple_cos_sims = {}
for y in V:
    apple_cos_sims[y] = pmi_sim_1_15('apple',y)

In [111]:
print(nlargest(11, apple_cos_sims, key = apple_cos_sims.get))

['apple', 'cherry', 'chili', 'desktop', 'olive', 'tulip', 'orange', 'palm', 'pine', 'atari', 'wines']


In [112]:
apple_cos_sims_6 = {}
for y in V:
    apple_cos_sims_6[y] = pmi_sim_6_15('apple',y)

In [113]:
print(nlargest(11, apple_cos_sims_6, key = apple_cos_sims_6.get))

['apple', 'microsoft', 'computers', 'os', 'desktop', 'mac', 'hardware', 'macintosh', 'software', 'devices', 'windows']


Now we try 'axes':

In [114]:
axes_cos_sims = {}
for y in V:
    axes_cos_sims[y] = pmi_sim_1_15('axes',y)

In [115]:
print(nlargest(11, axes_cos_sims, key = axes_cos_sims.get))

['axes', 'phases', 'tributaries', 'qualities', 'paths', 'viewpoints', 'spells', 'sorts', 'branches', 'motifs', 'frames']


In [116]:
axes_cos_sims_6 = {}
for y in V:
    axes_cos_sims_6[y] = pmi_sim_6_15('axes',y)

In [117]:
print(nlargest(11, axes_cos_sims_6, key = axes_cos_sims_6.get))

['axes', 'parallel', 'horizontal', 'angles', 'strings', 'dimensions', 'shapes', 'axis', 'frames', 'vertical', 'vector']


Now we try 'frame':

In [118]:
frame_cos_sims = {}
for y in V:
    frame_cos_sims[y] = pmi_sim_1_15('frame',y)

In [119]:
print(nlargest(11, frame_cos_sims, key = frame_cos_sims.get))

['frame', 'brick', 'frames', 'two-story', 'rear', 'structure', 'panels', 'framed', 'storey', 'wooden', 'wheels']


In [120]:
frame_cos_sims_6 = {}
for y in V:
    frame_cos_sims_6[y] = pmi_sim_6_15('frame',y)

In [121]:
print(nlargest(11, frame_cos_sims_6, key = frame_cos_sims_6.get))

['frame', 'roof', 'rear', 'wooden', 'structure', 'brick', 'rectangular', 'wheel', 'window', 'steel', 'concrete']


Now we try 'light':

In [122]:
light_cos_sims = {}
for y in V:
    light_cos_sims[y] = pmi_sim_1_15('light',y)

In [123]:
print(nlargest(11, light_cos_sims, key = light_cos_sims.get))

['light', 'heavy', 'lights', 'water', 'dark', 'fire', 'regiment', 'division', 'force', 'large', 'pale']


In [124]:
light_cos_sims_6 = {}
for y in V:
    light_cos_sims_6[y] = pmi_sim_6_15('light',y)

In [125]:
print(nlargest(11, light_cos_sims_6, key = light_cos_sims_6.get))

['light', 'heavy', 'surface', 'dark', 'color', 'body', 'water', 'sometimes', 'low', 'blue', 'type']


Now we try 'well':

In [126]:
well_cos_sims = {}
for y in V:
    well_cos_sims[y] = pmi_sim_1_15('well',y)

In [127]:
print(nlargest(11, well_cos_sims, key = well_cos_sims.get))

['well', 'poorly', 'be', 'however', 'there', 'been', 'united', 'preserved', 'discussion', 'debate', 'list']


In [128]:
well_cos_sims_6 = {}
for y in V:
    well_cos_sims_6[y] = pmi_sim_6_15('well',y)

In [129]:
print(nlargest(11, well_cos_sims_6, key = well_cos_sims_6.get))

['well', 'such', 'other', 'many', 'most', 'are', 'some', 'like', 'have', 'more', 'all']


As we can see from this results, multisense words do tend to have much more variation in the POS tags of the words that appear in their nearest neighbors. For most, it appears to be the case that whenever the window size is of 1, the word tends to have as neighbors words with similar POS tags and which are related to it in its more traiditional sense, whereas when w = 6 the senses of the neighbors vary much more or are less traditional (see for instance 'apple', where for w = 6 all of its neighbors are related to computers). In general this shows that it is very hard to work with multisense words and determine their speech of tag based on the tags of its neighbors, but modulating the window size can help us narrow down to neighbors which have the same tag as the tag that would be assigned to the more common or traditional sense of a multisense word.