# 1 Create distributional semantic word vectors

In [1]:
import numpy as np
import scipy as scipy

In [2]:
# Toy corpus: eleven five-token sentences, each of the form
# "the <noun> <verb> the <noun>".
sim_data = [
    "the men feed the dogs",
    "the women feed the dogs",
    "the women feed the men",
    "the men feed the men",
    "the dogs bite the men",
    "the dogs bite the women",
    "the dogs bite the dogs",
    "the dogs like the men",
    "the men like the women",
    "the women like the dogs",
    "the men like the dogs",
]

In [3]:
# Vocabulary: every distinct token of the corpus, plus lookups in both directions.
# NOTE(review): the order of `voc` is whatever order the set of strings iterates in,
# which is not guaranteed to be the same in another interpreter session. Code that
# needs the row of a particular word should go through voc_dict[word] rather than
# rely on a literal row number.
voc = list({token for sentence in sim_data for token in sentence.split()})
voc_dict = {word: position for position, word in enumerate(voc)}   # word  -> index
inv_voc_dict = dict(enumerate(voc))                                # index -> word
voc_dict

{'feed': 0, 'the': 1, 'women': 2, 'bite': 3, 'men': 4, 'like': 5, 'dogs': 6}

In [4]:
# Show the vocabulary in index order: position i here is the word that voc_dict maps to i.
voc

['feed', 'the', 'women', 'bite', 'men', 'like', 'dogs']

In [5]:
# Co-occurrence matrix: one row and one column per vocabulary word, all cells start at 0.
cm = np.zeros(shape=(len(voc), len(voc)), dtype=float)
cm.shape

(7, 7)

In [6]:
def get_indexed_occurrences(word, sentence):
    """Return every token position at which `word` occurs in `sentence`.

    The sentence is tokenised on whitespace; positions are 0-based.
    An empty list is returned when the word does not occur.
    """
    positions = []
    for position, token in enumerate(sentence.split()):
        if token == word:
            positions.append(position)
    return positions

In [7]:
def distance_of_one_exists(list1, list2):
    """Tell whether some position in `list1` is directly next to some position in `list2`.

    Every combination of one value from each list is considered, so a word that
    occurs several times in a sentence is handled correctly: in
    "the men feed the dogs" the first "the" is next to "men" and the second
    "the" is next to "dogs", and both adjacencies are found.

    Returns True as soon as a pair with absolute difference 1 exists, else False.
    """
    return any(
        abs(first - second) == 1
        for first in list1
        for second in list2
    )

In [8]:
# For every ordered pair of distinct words (w, c), count the corpus sentences in
# which some occurrence of w stands directly next to some occurrence of c.
# A sentence adds at most 1 to a pair, even when the pair is adjacent twice in it.
cm[...] = 0  # start from zero so that re-running this cell does not double the counts
for sent in sim_data:
    # positions of each distinct word in this sentence (tokenised once)
    positions_by_word = {}
    for position, token in enumerate(sent.split()):
        positions_by_word.setdefault(token, []).append(position)

    for w, w_positions in positions_by_word.items():
        for c, c_positions in positions_by_word.items():
            # skip the word paired with itself; count the pair if they are neighbours
            if w != c and distance_of_one_exists(w_positions, c_positions):
                cm[voc_dict[w], voc_dict[c]] += 1
cm

array([[0., 4., 2., 0., 2., 0., 0.],
       [4., 0., 5., 3., 7., 4., 8.],
       [2., 5., 0., 0., 0., 1., 0.],
       [0., 3., 0., 0., 0., 0., 3.],
       [2., 7., 0., 0., 0., 2., 0.],
       [0., 4., 1., 0., 2., 0., 1.],
       [0., 8., 0., 3., 0., 1., 0.]])

In [9]:
# Multiply every count by 10 and then add 1, writing the result back into the same
# array. After the +1 no cell is zero, so the log2 taken in calc_PPMI stays finite.
# NOTE(review): this modifies cm in place, so running the cell a second time
# scales and shifts the already-transformed values again.
cm[...] = 10 * cm + 1
cm

array([[ 1., 41., 21.,  1., 21.,  1.,  1.],
       [41.,  1., 51., 31., 71., 41., 81.],
       [21., 51.,  1.,  1.,  1., 11.,  1.],
       [ 1., 31.,  1.,  1.,  1.,  1., 31.],
       [21., 71.,  1.,  1.,  1., 21.,  1.],
       [ 1., 41., 11.,  1., 21.,  1., 11.],
       [ 1., 81.,  1., 31.,  1., 11.,  1.]])

In [30]:
# Euclidean distance between the co-occurrence rows of selected word pairs.
# Rows are looked up by word through voc_dict instead of by literal row number:
# the word -> row assignment comes from set iteration order, so a fixed number
# is only right for the one session in which it was read off.
women_and_men_cm = scipy.linalg.norm(cm[voc_dict["women"]] - cm[voc_dict["men"]])
women_and_dogs_cm = scipy.linalg.norm(cm[voc_dict["women"]] - cm[voc_dict["dogs"]])
men_and_dogs_cm = scipy.linalg.norm(cm[voc_dict["men"]] - cm[voc_dict["dogs"]])
feed_and_like_cm = scipy.linalg.norm(cm[voc_dict["feed"]] - cm[voc_dict["like"]])
feed_and_bite_cm = scipy.linalg.norm(cm[voc_dict["feed"]] - cm[voc_dict["bite"]])
like_and_bite_cm = scipy.linalg.norm(cm[voc_dict["like"]] - cm[voc_dict["bite"]])

print(women_and_men_cm)
print(women_and_dogs_cm)
print(men_and_dogs_cm)
print(feed_and_like_cm)
print(feed_and_bite_cm)
print(like_and_bite_cm)

22.360679774997898
46.9041575982343
38.72983346207417
14.142135623730951
42.42640687119285
31.622776601683793


In [10]:
def calc_PPMI(w, c, cooccurrence_matrix):
    """Positive pointwise mutual information of cell (w, c).

    PPMI(w, c) = max(log2( P(w, c) / (P(w) * P(c)) ), 0), where
    P(w, c) is the cell divided by the sum of the whole matrix,
    P(w) is the sum of row w divided by that total, and
    P(c) is the sum of column c divided by that total.

    Parameters
    ----------
    w, c : int
        Row and column index into `cooccurrence_matrix`.
    cooccurrence_matrix : 2-D numpy array of counts.

    Returns
    -------
    float
        The PPMI value, never negative. 0.0 is returned when the cell, its row
        sum, its column sum or the matrix total is not positive; in those cases
        the logarithm is undefined, and the unguarded formula would produce
        -inf or NaN together with a NumPy RuntimeWarning.
    """
    total = np.sum(cooccurrence_matrix)
    joint_count = cooccurrence_matrix[w, c]
    row_count = np.sum(cooccurrence_matrix[w, :])
    col_count = np.sum(cooccurrence_matrix[:, c])

    # Guard the undefined cases before dividing or taking the logarithm.
    if total <= 0 or joint_count <= 0 or row_count <= 0 or col_count <= 0:
        return 0.0

    numerator = joint_count / total
    denominator = (row_count / total) * (col_count / total)

    return max(float(np.log2(numerator / denominator)), 0.0)
        

In [11]:
# Build a matrix with the same shape as cm whose (row, col) cell is the PPMI of
# that cell of cm.
PPMI_matrix = np.zeros(cm.shape)

for row, col in np.ndindex(*cm.shape):
    PPMI_matrix[row, col] = calc_PPMI(row, col, cm)

In [12]:
# Remove the line-width limit for printed arrays so each matrix row stays on one line.
np.set_printoptions(linewidth=np.inf)

In [13]:
# Display the PPMI matrix; rows and columns are in the order given by voc_dict.
PPMI_matrix

array([[0.        , 0.40230909, 1.30247004, 0.        , 0.87504882, 0.        , 0.        ],
       [0.40230909, 0.        , 0.71718242, 0.3758077 , 0.76708298, 0.40230909, 0.83886589],
       [1.30247004, 0.71718242, 0.        , 0.        , 0.        , 0.36958424, 0.        ],
       [0.        , 0.3758077 , 0.        , 0.        , 0.        , 0.        , 1.69546204],
       [0.87504882, 0.76708298, 0.        , 0.        , 0.        , 0.87504882, 0.        ],
       [0.        , 0.40230909, 0.36958424, 0.        , 0.87504882, 0.        , 0.        ],
       [0.        , 0.83886589, 0.        , 1.69546204, 0.        , 0.        , 0.        ]])

In [15]:
# Re-display the word -> row-index mapping for reference when reading per-word rows.
voc_dict

{'feed': 0, 'the': 1, 'women': 2, 'bite': 3, 'men': 4, 'like': 5, 'dogs': 6}

In [16]:
# Euclidean distance between the PPMI rows of selected word pairs.
# Rows are looked up by word through voc_dict instead of by literal row number:
# the word -> row assignment comes from set iteration order, so a fixed number
# is only right for the one session in which it was read off.
women_and_men = scipy.linalg.norm(PPMI_matrix[voc_dict["women"]] - PPMI_matrix[voc_dict["men"]])
women_and_dogs = scipy.linalg.norm(PPMI_matrix[voc_dict["women"]] - PPMI_matrix[voc_dict["dogs"]])
men_and_dogs = scipy.linalg.norm(PPMI_matrix[voc_dict["men"]] - PPMI_matrix[voc_dict["dogs"]])
feed_and_like = scipy.linalg.norm(PPMI_matrix[voc_dict["feed"]] - PPMI_matrix[voc_dict["like"]])
feed_and_bite = scipy.linalg.norm(PPMI_matrix[voc_dict["feed"]] - PPMI_matrix[voc_dict["bite"]])
like_and_bite = scipy.linalg.norm(PPMI_matrix[voc_dict["like"]] - PPMI_matrix[voc_dict["bite"]])

print(women_and_men)
print(women_and_dogs)
print(men_and_dogs)
print(feed_and_like)
print(feed_and_bite)
print(like_and_bite)

0.6638323657475999
2.1731127708616107
2.100277406500883
0.9328858041414629
2.310288400824855
1.9436040742520886


In [17]:
# Singular value decomposition of the PPMI matrix. At this point E is the 1-D array
# of singular values (it is turned into a diagonal matrix in a later cell), and Vt
# is the transposed right factor.
U, E, Vt = scipy.linalg.svd(PPMI_matrix, full_matrices=False)

In [18]:
# Wrap the left factor of the SVD in np.matrix and display it.
# NOTE(review): NumPy's documentation no longer recommends np.matrix; a plain 2-D
# array also supports the @ products used below. Left unchanged here because later
# cells receive matrix-typed results from it.
U = np.matrix(U)
U

matrix([[-0.39948604, -0.44884398,  0.34290825,  0.36263918,  0.27086762,  0.39715228, -0.39844391],
        [-0.49350358, -0.32522982, -0.07643136, -0.02679549, -0.78340307, -0.08847937,  0.15023668],
        [-0.37985835,  0.46889795, -0.26936487,  0.30519918,  0.09215832,  0.50020364,  0.45982773],
        [-0.32171275, -0.28333444, -0.5810479 , -0.57326033,  0.37703255,  0.05347619, -0.07046403],
        [-0.37426426,  0.43553137, -0.23999154,  0.27240819,  0.01918632, -0.48815204, -0.54751804],
        [-0.26080618, -0.22080862,  0.19725361,  0.19746777,  0.40144034, -0.58471223,  0.54979854],
        [-0.37547312,  0.39265371,  0.60818842, -0.57688919,  0.02722492,  0.03460171, -0.0144996 ]])

In [19]:
# Turn the 1-D singular values into a square diagonal matrix so it can be used in
# the product U @ E @ Vt.
# NOTE(review): E is overwritten with the result, so this cell is only valid when
# run once directly after the SVD cell; running it again feeds the 2-D matrix back
# into np.diag.
E = np.matrix(np.diag(E))
E

matrix([[2.55526972, 0.        , 0.        , 0.        , 0.        , 0.        , 0.        ],
        [0.        , 1.91824751, 0.        , 0.        , 0.        , 0.        , 0.        ],
        [0.        , 0.        , 1.72522246, 0.        , 0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 1.72376078, 0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.65843145, 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.        , 0.47525179, 0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.        , 0.        , 0.45238088]])

In [20]:
# Wrap the transposed right factor of the SVD in np.matrix and display it.
Vt = np.matrix(Vt)
Vt

matrix([[-0.39948604, -0.49350358, -0.37985835, -0.32171275, -0.37426426, -0.26080618, -0.37547312],
        [ 0.44884398,  0.32522982, -0.46889795,  0.28333444, -0.43553137,  0.22080862, -0.39265371],
        [-0.34290825,  0.07643136,  0.26936487,  0.5810479 ,  0.23999154, -0.19725361, -0.60818842],
        [ 0.36263918, -0.02679549,  0.30519918, -0.57326033,  0.27240819,  0.19746777, -0.57688919],
        [-0.27086762,  0.78340307, -0.09215832, -0.37703255, -0.01918632, -0.40144034, -0.02722492],
        [ 0.39715228, -0.08847937,  0.50020364,  0.05347619, -0.48815204, -0.58471223,  0.03460171],
        [ 0.39844391, -0.15023668, -0.45982773,  0.07046403,  0.54751804, -0.54979854,  0.0144996 ]])

In [21]:
# Undo the transpose so that the right singular vectors are the columns of V.
V = Vt.T

In [22]:
# Check the shapes of the three factors before multiplying them back together.
U.shape, E.shape, Vt.shape

((7, 7), (7, 7), (7, 7))

In [23]:
# Sanity check: the product of the three factors should reproduce PPMI_matrix,
# with tiny floating-point values in place of exact zeros.
(U @ E) @ Vt

matrix([[ 1.38777878e-17,  4.02309087e-01,  1.30247004e+00,  7.16440796e-16,  8.75048816e-01,  2.63677968e-16, -7.70217223e-16],
        [ 4.02309087e-01, -2.42861287e-16,  7.17182425e-01,  3.75807699e-01,  7.67082979e-01,  4.02309087e-01,  8.38865895e-01],
        [ 1.30247004e+00,  7.17182425e-01,  3.46944695e-16, -3.52148866e-16,  2.22044605e-16,  3.69584236e-01,  3.66460334e-16],
        [ 2.06432094e-16,  3.75807699e-01, -3.36536354e-16,  4.34548231e-16, -5.37764278e-16,  2.63677968e-16,  1.69546204e+00],
        [ 8.75048816e-01,  7.67082979e-01,  6.52256027e-16,  3.71230824e-16,  8.60422844e-16,  8.75048816e-01,  7.19910243e-17],
        [-6.93889390e-17,  4.02309087e-01,  3.69584236e-01,  2.74086309e-16,  8.75048816e-01, -1.11022302e-16, -1.18394877e-16],
        [-3.52582546e-16,  8.38865895e-01,  3.47378376e-16,  1.69546204e+00,  5.25621213e-16, -2.15105711e-16,  9.59654448e-16]])

In [27]:
# Dimensionality reduction: project every word's PPMI row onto the first three
# columns of V, giving one 3-dimensional vector per word (row order is unchanged).
reduced_PPMI = PPMI_matrix @ V[:,0:3]
reduced_PPMI

matrix([[-1.02079459, -0.86099384,  0.59159301],
        [-1.26103475, -0.62387128, -0.1318611 ],
        [-0.97064055,  0.89946233, -0.46471431],
        [-0.82206286, -0.54350558, -1.00243689],
        [-0.95634612,  0.83545697, -0.4140388 ],
        [-0.66643013, -0.42356559,  0.34030635],
        [-0.95943508,  0.75320699,  1.04926033]])

In [28]:
# Euclidean distance between the 3-dimensional reduced vectors of selected word pairs.
# Rows are looked up by word through voc_dict instead of by literal row number:
# the word -> row assignment comes from set iteration order, so a fixed number
# is only right for the one session in which it was read off.
women_and_men_reduced = scipy.linalg.norm(reduced_PPMI[voc_dict["women"]] - reduced_PPMI[voc_dict["men"]])
women_and_dogs_reduced = scipy.linalg.norm(reduced_PPMI[voc_dict["women"]] - reduced_PPMI[voc_dict["dogs"]])
men_and_dogs_reduced = scipy.linalg.norm(reduced_PPMI[voc_dict["men"]] - reduced_PPMI[voc_dict["dogs"]])
feed_and_like_reduced = scipy.linalg.norm(reduced_PPMI[voc_dict["feed"]] - reduced_PPMI[voc_dict["like"]])
feed_and_bite_reduced = scipy.linalg.norm(reduced_PPMI[voc_dict["feed"]] - reduced_PPMI[voc_dict["bite"]])
like_and_bite_reduced = scipy.linalg.norm(reduced_PPMI[voc_dict["like"]] - reduced_PPMI[voc_dict["bite"]])

print(women_and_men_reduced)
print(women_and_dogs_reduced)
print(men_and_dogs_reduced)
print(feed_and_like_reduced)
print(feed_and_bite_reduced)
print(like_and_bite_reduced)


0.08287957673775763
1.5210639044886252
1.4656121329992486
0.6164921946291915
1.6374444803749935
1.357043319290905


# 2 Computing with distributional semantic word vectors

In [31]:
# Load the Google News vectors into a dict: word -> 1-D float array.
with open("GoogleNews-vectors-rcv_vocab.txt", 'r') as file:
    google_dictionary = {}

    # Skip the first line: it cannot be parsed as "<word> <number> <number> ..."
    # (presumably a header; verify against the file).
    next(file)
    for line in file:
        fields = line.strip().split()
        # first field is the word, every remaining field is one vector component
        google_dictionary[fields[0]] = np.array(fields[1:], dtype=float)

In [32]:
# Load the COMPOSES (PPMI + SVD, 500-dimension per the file name) vectors into a
# dict: word -> 1-D float array.
with open("EN-wform.w.2.ppmi.svd.500.rcv_vocab.txt", 'r') as file:
    composes_dictionary = {}

    # Skip the first line: it cannot be parsed as "<word> <number> <number> ..."
    # (presumably a header; verify against the file).
    next(file)
    for line in file:
        fields = line.strip().split()
        # first field is the word, every remaining field is one vector component
        composes_dictionary[fields[0]] = np.array(fields[1:], dtype=float)

In [60]:
# Maps the answer letter found on a question's last line to the position, inside
# that question's list of lines, of the choice it designates: line 0 is the source
# header, line 1 the stem pair, lines 2-6 the choices a-e, line 7 the answer letter.
answer_dict = {'a': 2, 'b': 3, 'c': 4, 'd': 5, 'e': 6}

file_path = 'SAT-package-V3.txt' 
current_paragraph = []
paragraphs = {}

# Split the file into questions. Blank lines and lines starting with '#' act as
# separators; every other line (stripped) belongs to the question being collected.
with open(file_path, 'r') as file:
    for line in file:
        stripped = line.strip()
        if stripped and not line.startswith('#'):
            current_paragraph.append(stripped)
        elif current_paragraph:
            # separator reached: store the finished question under the next index
            paragraphs[len(paragraphs)] = current_paragraph
            current_paragraph = []

# A file that does not end with a separator line leaves the final question still
# pending here; store it too instead of silently dropping it.
if current_paragraph:
    paragraphs[len(paragraphs)] = current_paragraph
    current_paragraph = []

# Replace each answer letter by the line index of the correct choice.
for key, paragraph in paragraphs.items():
    paragraph[7] = answer_dict[paragraph[7]]

In [61]:
# Show the first parsed question: header, stem pair, five choice pairs, and the
# answer already converted to the line index of the correct choice.
print(paragraphs[0])

['190 FROM REAL SATs', 'lull trust v:n', 'balk fortitude v:n', 'betray loyalty v:n', 'cajole compliance v:n', 'hinder destination v:n', 'soothe passion v:n', 4]


In [62]:
def cos_sim(a,b):
    """Cosine similarity of two vectors: their inner product divided by the
    product of their Euclidean norms."""
    inner_product = np.inner(a, b)
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    return inner_product / norm_product

In [63]:
def diff_of_cos_choices_compared_to_cos_of_question(paragraph, vectors):
    """Compare each choice's within-pair cosine similarity with the stem's.

    `paragraph[1]` is the stem line and `paragraph[2]` .. `paragraph[6]` are the
    five choice lines; the first two whitespace-separated tokens of each line are
    the word pair. `vectors` maps a word to its vector.

    Returns a 5-tuple (choices a-e) of
    |cos_sim(stem pair) - cos_sim(choice pair)|; smaller means the choice's two
    words are about as similar to each other as the stem's two words are.
    """
    pair_similarities = []
    for line_index in range(1, 7):
        pair = paragraph[line_index].split()
        pair_similarities.append(cos_sim(vectors[pair[0]], vectors[pair[1]]))

    stem_similarity = pair_similarities[0]
    return tuple(
        abs(stem_similarity - choice_similarity)
        for choice_similarity in pair_similarities[1:]
    )

In [64]:
def diff_of_euc_distance_of_choices_compared_to_euc_distance_of_question(paragraph, vectors):
    """Compare each choice's within-pair Euclidean distance with the stem's.

    `paragraph[1]` is the stem line and `paragraph[2]` .. `paragraph[6]` are the
    five choice lines; the first two whitespace-separated tokens of each line are
    the word pair. `vectors` maps a word to its vector.

    Returns a 5-tuple (choices a-e) of
    |distance(stem pair) - distance(choice pair)|; smaller means the choice's two
    words are about as far apart as the stem's two words are.
    """
    pair_distances = []
    for line_index in range(1, 7):
        pair = paragraph[line_index].split()
        pair_distances.append(scipy.linalg.norm(vectors[pair[0]] - vectors[pair[1]]))

    stem_distance = pair_distances[0]
    return tuple(
        abs(stem_distance - choice_distance)
        for choice_distance in pair_distances[1:]
    )

In [65]:
def diff_of_subtract_then_cos_similarity(paragraph, vectors):
    """Score each choice by how parallel its pair offset is to the stem's.

    For the stem line (`paragraph[1]`) and each of the five choice lines
    (`paragraph[2]` .. `paragraph[6]`) the vector of the second word is subtracted
    from the vector of the first word. Each choice offset is then compared with
    the stem offset by cosine similarity.

    Returns a 5-tuple (choices a-e) of cosine similarities; larger means the
    choice pair relates to each other more like the stem pair does.
    """
    offsets = []
    for line_index in range(1, 7):
        pair = paragraph[line_index].split()
        offsets.append(vectors[pair[0]] - vectors[pair[1]])

    stem_offset = offsets[0]
    return tuple(
        cos_sim(stem_offset, choice_offset)
        for choice_offset in offsets[1:]
    )

Google News vectors, best method (cosine similarity between the pair-difference vectors)

In [74]:
# Score the SAT analogy questions with the Google News vectors.
# A question is evaluated only if all twelve words (stem pair + five choice pairs)
# have a vector; other questions are skipped and count as neither right nor wrong.
right = 0
wrong = 0
num_sat_questions = len(paragraphs)

for i in range(num_sat_questions):
    question = paragraphs[i]

    # first two tokens of lines 1-6 are the words of the stem and of choices a-e
    needed_words = (word for line in question[1:7] for word in line.split()[:2])
    if not all(word in google_dictionary for word in needed_words):
        continue

    # Only the scorer that is actually used for the prediction is computed.
    # The two alternatives, diff_of_cos_choices_compared_to_cos_of_question and
    # diff_of_euc_distance_of_choices_compared_to_euc_distance_of_question, return
    # absolute differences, so with them the prediction is the position of the
    # *minimum* instead of the maximum.
    scores = list(diff_of_subtract_then_cos_similarity(question, google_dictionary))

    # +2 turns the position among the five choices into the line index used for
    # the stored answer (choice a is line 2 of the question).
    predicted_answer = scores.index(max(scores)) + 2

    correct_answer = question[7]
    if predicted_answer == correct_answer:
        right += 1
    else:
        wrong += 1

# accuracy over the evaluated questions; NaN (not an exception) if none was evaluated
answered = right + wrong
right, wrong, (right / answered if answered else float("nan"))

(114, 144, 0.4418604651162791)

COMPOSES vectors, best method (cosine similarity between the pair-difference vectors)

In [75]:
# Score the SAT analogy questions with the COMPOSES vectors.
# A question is evaluated only if all twelve words (stem pair + five choice pairs)
# have a vector; other questions are skipped and count as neither right nor wrong.
right = 0
wrong = 0
num_sat_questions = len(paragraphs)

for i in range(num_sat_questions):
    question = paragraphs[i]

    # first two tokens of lines 1-6 are the words of the stem and of choices a-e
    needed_words = (word for line in question[1:7] for word in line.split()[:2])
    if not all(word in composes_dictionary for word in needed_words):
        continue

    # Only the scorer that is actually used for the prediction is computed.
    # The two alternatives, diff_of_cos_choices_compared_to_cos_of_question and
    # diff_of_euc_distance_of_choices_compared_to_euc_distance_of_question, return
    # absolute differences, so with them the prediction is the position of the
    # *minimum* instead of the maximum.
    scores = list(diff_of_subtract_then_cos_similarity(question, composes_dictionary))

    # +2 turns the position among the five choices into the line index used for
    # the stored answer (choice a is line 2 of the question).
    predicted_answer = scores.index(max(scores)) + 2

    correct_answer = question[7]
    if predicted_answer == correct_answer:
        right += 1
    else:
        wrong += 1

# accuracy over the evaluated questions; NaN (not an exception) if none was evaluated
answered = right + wrong
right, wrong, (right / answered if answered else float("nan"))

(112, 147, 0.43243243243243246)