In [1]:
rg65 = ('cord', 'smile', 'rooster', 'voyage', 'noon', 'string', 'fruit', 'furnace', 'autograph', 'shore',
     'automobile', 'wizard', 'mound' ,'stove', 'grin', 'implement', 'asylum', 'fruit', 'asylum', 'monk',
     'graveyard', 'madhouse', 'glass', 'magician', 'boy', 'rooster', 'cushion','jewel', 'monk', 'slave',
     'asylum', 'cemetery', 'coast', 'forest', 'grin', 'lad', 'shore', 'woodland', 'monk', 'oracle',
     'boy', 'sage', 'automobile', 'cushion', 'mound', 'shore', 'lad', 'wizard', 'forest', 'graveyard',
     'food', 'rooster', 'cemetery', 'woodland', 'shore', 'voyage', 'bird', 'woodland', 'coast', 'hill',
     'furnace', 'implement', 'crane', 'rooster', 'hill', 'woodland', 'car', 'journey', 'cemetery', 'mound',
     'glass', 'jewel', 'magician', 'oracle', 'crane', 'implement', 'brother', 'lad', 'sage', 'wizard',
     'oracle', 'sage', 'bird', 'crane', 'bird', 'cock', 'food', 'fruit', 'brother', 'monk',
     'asylum', 'madhouse', 'furnace', 'stove', 'magician', 'wizard', 'hill', 'mound', 'cord', 'string',
     'glass', 'tumbler', 'grin', 'smile', 'serf', 'slave', 'journey', 'voyage', 'autograph', 'signature',
     'coast', 'shore', 'forest', 'woodland', 'implement', 'tool', 'cock', 'rooster', 'boy', 'lad',
     'cushion', 'pillow', 'cemetery', 'graveyard', 'automobile', 'car', 'midday', 'noon', 'gem', 'jewel')
human_similarity = (0.02, 0.04, 0.04, 0.05, 0.06, 0.11, 0.14, 0.18, 0.19, 0.39,
            0.42, 0.44, 0.44, 0.45, 0.57, 0.79, 0.85, 0.88, 0.90, 0.91,
            0.96, 0.97, 0.97, 0.99, 1.00, 1.09, 1.18, 1.22, 1.24, 1.26,
            1.37, 1.41, 1.48, 1.55, 1.69, 1.78, 1.82, 2.37, 2.41, 2.46,
            2.61, 2.63, 2.63, 2.69, 2.74, 3.04, 3.11, 3.21, 3.29, 3.41,
            3.45, 3.46, 3.46, 3.58, 3.59, 3.60, 3.65, 3.66, 3.68, 3.82,
            3.84, 3.88, 3.92, 3.94, 3.94)

In [3]:
import nltk
nltk.download('brown')
from nltk.corpus import brown
import numpy as np

[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\Mengzelev\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!


In [8]:
# Extract the 5000 most common English words (denoted by W) based on unigram frequencies in the Brown corpus
fdist = nltk.FreqDist(w.lower() for w in brown.words())
wfreq = fdist.most_common(5000)
w = list(x[0] for x in wfreq)

In [10]:
# Update W by adding rg65 words and make sure they are at 0-len
rg65_set = set(rg65)
for word in rg65_set:
    if word not in w:
        w.insert(0, word)

5031

In [5]:
# Construct bigram for the brown corpus
from collections import Counter
from nltk.util import ngrams

bigrams = ngrams(brown.words(), 2)
bigrams_freq = Counter(bigrams)

In [11]:
# Construct a word-context vector model (denoted by M1) by collecting bigram counts for words in W
m1_list= []
for i in range(0, len(w)):
    row = []
    for j in range(0, len(w)):
        row.append(bigrams_freq[(w[i], w[j])])
    m1_list.append(row)
m1 = np.array(m1_list)

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [43]:
unigram = ngrams(brown.words(), 1)
unigram_freq = Counter(unigram)

In [52]:
sum_unigram = float(sum(fdist.values()))
sum_bigram = float(sum(bigrams_freq.values()))

In [57]:
# Compute positive pointwise mutual information on M1. Denote this model as M1+
# https://stackoverflow.com/questions/22118350/python-sentiment-analysis-using-pointwise-mutual-information
import math
def ppmi(w1, w2):
    p12 = bigrams_freq[(w1, w2)] / float(sum_bigram)
    if p12 == 0:
        return 0
    p1 = fdist[w1] / float(sum_unigram)
    p2 = fdist[w2] / float(sum_unigram)
    return max(0, math.log(p12/float(p1*p2),2))

m1plus_list = []
for i in range(0, len(w)):
    row = []
    for j in range(0, len(w)):
        ans = ppmi(w[i], w[j])
        row.append(ans)
    m1plus_list.append(row)
m1plus = np.array(m1plus_list)
m1plus
        

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [74]:
# Construct a latent semantic model (denoted by M2) by applying PCA to M1+
from sklearn.decomposition import PCA
def m2_pca(dimension):
    pca = PCA(n_components=dimension)
    return np.array(pca.fit_transform(m1plus))

In [77]:
m2_10 = m2_pca(10)

array([[-1.71592548,  0.02685898,  0.66079346, ..., -0.40596109,
         0.14647425,  0.34095178],
       [-1.23321432, -0.54208781,  0.35063855, ..., -0.68710369,
        -0.38029985,  0.2840805 ],
       [-1.37749283,  0.04963463,  0.3034222 , ..., -0.98636539,
        -0.28536521,  0.54078089],
       ...,
       [-1.16153642, -0.35345861, -0.89991321, ...,  0.29710302,
         0.58288933,  0.2485601 ],
       [-1.58068971,  1.06806476, -4.79492866, ..., -4.83227689,
        -2.56303421,  0.91626869],
       [-1.83436506,  0.76338435, -3.97294754, ..., -3.77890211,
        -2.06882947,  0.48038537]])

In [78]:
m2_100 = m2_pca(100)

In [83]:
m2_300 = m2_pca(300)

In [None]:
# Find the index for each word of rg65 in w
rg65_id = {}
for i in range(0, len(rg65)):
    for j in range(0,len(w)):
        if w[j] == rg65[i]:
            rg65_id[rg65[i]] = j
            break  

In [71]:
# Calculate the Pearson correlation between cosine similarities of model and human similarities
# from scipy.spatial import distance
from scipy import stats
def pearson_cos_human(model):
    cos_sim = []
    for i in range(0, 65):
        cos_sim.append(distance.cosine(model[rg65_id[rg65[i+i]]], model[rg65_id[rg65[i+i+1]]]))
    return stats.pearsonr(cos_sim, human_similarity)

In [87]:
pearson_cos_human(m1)

(-0.05494857263247904, 0.6637493854323102)

In [86]:
pearson_cos_human(m1plus)

(-0.0030878788110033986, 0.9805237578055053)

In [73]:
pearson_cos_human(m2_10)

(-0.1959481988489057, 0.11773792935085334)

In [80]:
pearson_cos_human(m2_100)

(-0.2923124144401581, 0.018135929609137593)

In [84]:
pearson_cos_human(m2_300)

(-0.2711279136635144, 0.028919139522779563)

In [97]:
"plays" in w

True

In [None]:
# Get common test case
test = []
with open("tests/word-test-v1.txt") as f:
    for line in f:
        line = line.strip()
        words = line.split(" ")
        if len(words) == 4 and words[0] in w and words[1] in w and words[2] in w and words[3] in w:
            test.append(line)

with open("tests/word-test-common.txt", mode="w+") as f:
    for line in test:
        f.write(line + '\n')
test

In [110]:
def find_id(word):
    for i, ww in enumerate(w):
        if ww == word:
            return i
    return -1

In [115]:
from scipy import spatial
tree = spatial.KDTree(m2_300)
def analogical_reasoning(wa1, wa2, wb1):
    b2 = m2_300[find_id(wb1)] - m2_300[find_id(wa1)] + m2_300[find_id(wa2)]
    return w[tree.query(b2)[1]]

In [133]:
def analogy_test_accuracy(file):
    correct, total = 0, 0
    with open(file) as f:
        for line in f:
            total += 1
            words = line.strip().split(" ")
            res = analogical_reasoning(words[0], words[1], words[2])
            if(res == words[3]):
                correct += 1
    return correct/float(total)

In [None]:
analogy_test_accuracy("tests\word-test-semantic.txt")

In [134]:
analogy_test_accuracy("tests\word-test-syntactic.txt")

0.001004016064257028