In [1]:
import os
import numpy as np
import scipy.linalg as scipy_linalg
from collections import defaultdict 
import pandas as pd
import random
import re
import json

#os.chdir('C://Users/wanyu/Documents/Computational Linguilistics/PA5')

In [2]:
class DSWC():
    
    def __init__(self, word_dict=None, ppmi_matrix=None):
        self.word_dict = word_dict
        self.freq_matrix = None
        self.ppmi_matrix = ppmi_matrix
        self.V = None    

        
    def PPMI(self, sentences):
        
        # Create a word dictionary
        words = []
        for sentence in sentences:
            words.extend(sentence)
        
        words = list(set(words))
        self.word_dict = {word: index for index, word in enumerate(words)}
        
        # Creating a co-occurence matrix
        self.freq_matrix = np.zeros((len(words), len(words)))
        
        for sentence in sentences:
            for index, word in enumerate(sentence):
                if index != 0: 
                    context_idx = self.word_dict[sentence[index-1]]
                    word_idx = self.word_dict[word]
                    self.freq_matrix[word_idx, context_idx] += 1

        # Repeating sentnences and Smoothing counts            
        self.freq_matrix = self.freq_matrix*10 + 1

        total = self.freq_matrix.sum().sum()
        
        # Computing PPMI
        join_prob = self.freq_matrix/total
        word_prob = self.freq_matrix.sum(axis=1)/total
        context_prob = self.freq_matrix.sum(axis=0)/total
        
        independent_prob = (word_prob[:,None]*context_prob).T
        self.ppmi_matrix = np.log(join_prob*(1/independent_prob))
        
        self.ppmi_matrix = np.maximum(self.ppmi_matrix, np.zeros((len(self.word_dict), len(self.word_dict))))

    def SVD(self):
        U, E, Vt = scipy_linalg.svd(self.ppmi_matrix, full_matrices=False)
        U = np.matrix(U) # compute U
        E = np.matrix(np.diag(E)) # compute E
        Vt = np.matrix(Vt) # compute Vt = conjugage transpose of V
        
        return Vt.T # compute V = conjugate transpose of Vt
    
    def reduced_PPMI(self, dimension):
        self.V = self.SVD()
        reduced_PPMI = self.ppmi_matrix * self.V[:, 0:dimension]
        
        return reduced_PPMI
    
    def word_vector(self, words, word_matrix=None):
        '''
        Get a word vector based on the default matrix
        '''
        
        if word_matrix is None:
            word_matrix = self.ppmi_matrix                  
        
        if type(words) == list:
            word_index = [self.word_dict[word] for word in words]
            word_vector = word_matrix[word_index].astype('float')
            
        else: 
             word_index = self.word_dict[words]
        
        word_vector = word_matrix[word_index]
        
        return word_vector
    
    
    def euclidean(self, word1, word2, word_matrix=None, v_input=False):
        
        '''
        Compute the Euclidean distance between two vectors
        '''
        
        if word_matrix is None:
            word_matrix = self.ppmi_matrix   
        
        if v_input:
            v1, v2 = word1, word2
        
        else:
            v1 = self.word_vector(word1, word_matrix)
            v2 = self.word_vector(word2, word_matrix)
        
        distance = scipy_linalg.norm(v2-v1)
        
        return distance
    
    def cosine_similarity(self, word1, word2, word_matrix=None, v_input=False):
        '''
        Compute the cosine similarity between two words or one word and several words
        '''
        
        if word_matrix is None:
            word_matrix = self.ppmi_matrix   
        
        if v_input:
            v1, v2 = word1, word2
        
        else:
            v1 = self.word_vector(word1, word_matrix)
            v2 = self.word_vector(word2, word_matrix)
            
        length_v1 = scipy_linalg.norm(v1)
        length_v2 = scipy_linalg.norm(v2, axis=1)
        denominator = length_v1 * length_v2
        numerator = np.squeeze(np.asarray(v1.dot(v2.T)))
        similarity = numerator*(1/denominator)
        
        return similarity  
   

# 1. Create distributional semantic word vectors

In [3]:
# Read files
with open('dist_sim_data.txt') as f:
    sentences = f.read().splitlines()
    sentences = [sentence.split() for sentence in sentences]

In [4]:
dswc = DSWC()        
dswc.PPMI(sentences)

dogs = pd.DataFrame({'Raw Counts': dswc.word_vector('dogs', dswc.freq_matrix).ravel(), 
                   'PPMI': dswc.word_vector('dogs').ravel()}, index=list(dswc.word_dict.keys()))

In [5]:
dogs.T.round(2)

Unnamed: 0,men,like,dogs,feed,bite,the,women
Raw Counts,1.0,1.0,1.0,1.0,1.0,91.0,1.0
PPMI,0.0,0.0,0.0,0.0,0.0,2.09,0.0


In [6]:
sim_original,sim_reduced = [], []
pairs = [['women','men'],['women','dogs'],['men','dogs'],
         ['feed','like'],['feed','bite'], ['like','bite']]
reduced_PPMI = dswc.reduced_PPMI(3)

for pair in pairs:
    original = round(float(dswc.euclidean(pair[0], pair[1])),4)
    reduced = round(float(dswc.euclidean(pair[0], pair[1], reduced_PPMI)),4)
    sim_original.append(original)
    sim_reduced.append(reduced)
    
pairs_text = ['_'.join(pair) for pair in pairs]
distance = pd.DataFrame({'pairs':pairs_text, 'compact PPMI': sim_original,
                          'reduced PPMI': sim_reduced})

In [7]:
distance.set_index('pairs')

Unnamed: 0_level_0,compact PPMI,reduced PPMI
pairs,Unnamed: 1_level_1,Unnamed: 2_level_1
women_men,0.2234,0.2234
women_dogs,0.3398,0.3398
men_dogs,0.1164,0.1164
feed_like,0.6674,0.522
feed_bite,2.1746,2.1701
like_bite,1.7205,1.6985


# 2. Synonym Detection

In [3]:
# Import classic and google word vectors
google_dict, classic_dict = defaultdict(int), defaultdict(int)

with open('EN-wform.w.2.ppmi.svd.500.rcv_vocab.txt') as f:
    classic_list = f.read().splitlines()
    
for index, vector in enumerate(classic_list):
    word_vector = vector.split(' ')
    classic_dict[word_vector[0]] = len(classic_dict)
    classic_list[index] = word_vector[1:]

with open('GoogleNews-vectors-rcv_vocab.txt') as f:
    google_list = f.read().splitlines()

for index, vector in enumerate(google_list):
    word_vector = vector.split(' ')
    google_dict[word_vector[0]] = len(google_dict)
    google_list[index] = word_vector[1:]

# Transform lists to matrices
classic_matrix = np.matrix(classic_list).astype('float')
google_matrix = np.matrix(google_list).astype('float')

In [4]:
# Define google matrix and classic matrix
google = DSWC(google_dict, google_matrix)
classic = DSWC(classic_dict, classic_matrix)

In [5]:
# Import synonym questions
questions = json.load(open("synonyms_questions.txt"))

In [6]:
# Use google matrix and classic matrix to answer a synonym for each question
accuracy_synonyms = [0] * 4

for index, values in questions.items():   
    sim_google = google.cosine_similarity(values['question'], values['choice'])
    sim_classic = classic.cosine_similarity(values['question'], values['choice'])
    dist_google = google.euclidean(values['question'], values['choice'])
    dist_classic = classic.euclidean(values['question'], values['choice'])
    ans_list = [sim_google.argmax(), sim_classic.argmax(), dist_classic.argmin(), dist_classic.argmin()]
    for index, ans in enumerate(ans_list):
        if ans == 0:
            accuracy_synonyms[index] += 1
            
synonyms_detections =  pd.DataFrame(accuracy_synonyms, columns=['Accuracy'],
                                    index=['cos_google', 'cos_classic', 'dist_google', 'dist_classic'])/1000

In [7]:
synonyms_detections

Unnamed: 0,Accuracy
cos_google,0.627
cos_classic,0.537
dist_google,1.0
dist_classic,1.0


# 3. SAT Questions

In [20]:
# Import SAT questions
answer_dict = {answer: index for index, answer in enumerate(['a','b','c','d','e'])}
sat_questions = defaultdict(dict)
with open('SAT-package-V3.txt') as f:
    sat = f.readlines()[42:]
    
for i in range(1,3364,9):
    index = len(sat_questions)
    sat_questions[index]['question'] = re.findall(r'[^\W]+', sat[i])[:2]
    choice_list = []
    for choice in sat[(i+1):(i+6)]:
        choice_list.append(re.findall(r'[^\W]+',  choice)[:2])
    sat_questions[index]['choice'] = choice_list
    sat_questions[index]['answer'] = answer_dict[sat[i+6][0]]

In [24]:
# Define the function to summarize the relation between two words
def vec_summary(v1, v2, mode):
    if mode == 0:
        vec = v1+v2
    elif mode == 1:
        vec = v1-v2
    elif mode == 2:
        vec = np.multiply(v1,v2)
    elif mode == 3:
        vec = np.multiply(v1,1/v2)
    elif mode == 4:
        vec = np.concatenate((v1,v2),axis=1)
    return vec

In [58]:
# Use cosine similarity to measure similarity
results = {'Classic':[], 'Google':[]}
for m in range(5):
    accuracy_list_google = np.array([])
    accuracy_list_classic = np.array([])

    for index, values in sat_questions.items():
        w1, w2 = values['question']
        q1 = vec_summary(classic.word_vector(w1),classic.word_vector(w2),m)
        q2 = vec_summary(google.word_vector(w1),google.word_vector(w2),m)
        sim_list_classic, sim_list_google = np.array([]), np.array([])
        for choice in values['choice']:
            w1, w2 = choice
            c1 = vec_summary(classic.word_vector(w1), classic.word_vector(w2),m)
            c2 = vec_summary(google.word_vector(w1), google.word_vector(w2),m)
            sim_list_classic = np.append(sim_list_classic, classic.cosine_similarity(q1,c1,v_input=True))
            sim_list_google = np.append(sim_list_google, google.cosine_similarity(q2,c2,v_input=True))


        accuracy_list_classic= np.append(accuracy_list_classic, sim_list_classic.argmax() == values['answer'])
        accuracy_list_google= np.append(accuracy_list_classic, sim_list_google.argmax() == values['answer'])


    results['Classic'].append(accuracy_list_classic.mean().round(4))
    results['Google'].append(accuracy_list_google.mean().round(4))

sat_cos = pd.DataFrame(results, index=['Addition', 'Subtraction', 'Multiplication', 'Division', 'Concatenation'])



In [59]:
# Use the Euclidean Distance to measure similarity
results = {'Classic':[], 'Google':[]}
for m in range(5):
    accuracy_list_google = np.array([])
    accuracy_list_classic = np.array([])

    for index, values in sat_questions.items():
        w1, w2 = values['question']
        q1 = vec_summary(classic.word_vector(w1),classic.word_vector(w2),m)
        q2 = vec_summary(google.word_vector(w1),google.word_vector(w2),m)
        sim_list_classic, sim_list_google = np.array([]), np.array([])
        for choice in values['choice']:
            w1, w2 = choice
            c1 = vec_summary(classic.word_vector(w1), classic.word_vector(w2),m)
            c2 = vec_summary(google.word_vector(w1), google.word_vector(w2),m)
            sim_list_classic = np.append(sim_list_classic, classic.euclidean(q1,c1,v_input=True))
            sim_list_google = np.append(sim_list_google, google.euclidean(q2,c2,v_input=True))


        accuracy_list_classic= np.append(accuracy_list_classic, sim_list_classic.argmin() == values['answer'])
        accuracy_list_google= np.append(accuracy_list_classic, sim_list_google.argmin() == values['answer'])


    results['Classic'].append(accuracy_list_classic.mean().round(4))
    results['Google'].append(accuracy_list_google.mean().round(4))
sat_dist = pd.DataFrame(results, index=['Addition', 'Subtraction', 'Multiplication', 'Division', 'Concatenation'])

In [51]:
sat_dist

Unnamed: 0,Classic,Google
Addition,0.3102,0.312
Subtraction,0.3824,0.384
Multiplication,0.2594,0.2587
Division,0.2032,0.2053
Concatenation,0.3904,0.392


In [82]:
# Turn the results into a table and compare them
sat_dist['Similarity'] = 'Euclidean Distance'
sat_cos['Similarity'] = 'Cosine Similarity'
sat_comparison = pd.concat([sat_dist, sat_cos]).reset_index().rename(columns={'index':'Aggregation Method'})

sat_comparison = pd.melt(sat_comparison, id_vars=['Similarity','Aggregation Method'], value_vars=['Classic', 'Google'], var_name='Word Matrix', value_name='Accuracy')
#sat_comparison.iloc[:, [2,3,0,1,4]]
#sat_comparison.groupby(['method','word_matrix','index']).mean().unstack(level=2)
sat_comparison = pd.pivot_table(sat_comparison, index=['Similarity','Word Matrix'], columns='Aggregation Method', aggfunc=np.mean)
sat_comparison.iloc[:,[2,3,0,1,4]]

Unnamed: 0_level_0,Unnamed: 1_level_0,Accuracy,Accuracy,Accuracy,Accuracy,Accuracy
Unnamed: 0_level_1,Aggregation Method,Division,Multiplication,Addition,Concatenation,Subtraction
Similarity,Word Matrix,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Cosine Similarity,Classic,0.1818,0.2353,0.3235,0.3904,0.4225
Cosine Similarity,Google,0.1813,0.2373,0.3253,0.392,0.4213
Euclidean Distance,Classic,0.2032,0.2594,0.3102,0.3904,0.3824
Euclidean Distance,Google,0.2053,0.2587,0.312,0.392,0.384
