In [21]:
import os
import sys
import csv
import random
import itertools
from operator import itemgetter
from collections import defaultdict
import numpy as np
import scipy
import scipy.spatial.distance
from numpy.linalg import svd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import utils

In [22]:
# Dimensionality of the GloVe vectors; it also selects which pre-trained
# file inside the `glove.6B` directory gets loaded.
numDimen = 300
gloveFile = 'glove.6B.{0}d.txt'.format(numDimen)
# Lookup table produced by the project-local `utils.glove2dict` helper
# (queried by word via `GLOVE.get` further down).
GLOVE = utils.glove2dict(os.path.join('glove.6B', gloveFile))

In [23]:
def randvec(n=50, lower=-0.5, upper=0.5):
    """Build a 1d numpy array holding `n` random floats.

    Every component is drawn independently with
    `random.uniform(lower, upper)`, so the result depends on the state of
    the standard-library `random` generator.
    """
    samples = []
    for _ in range(n):
        samples.append(random.uniform(lower, upper))
    return np.array(samples)

def getGloveVec(w):
    """Return `w`'s GloVe representation if available, else return
    a random vector.

    The random fallback has `numDimen` components and is only generated
    when `w` is missing from `GLOVE`; known words no longer pay for a
    throw-away random vector on every lookup.

    NOTE: the fallback is not cached, so the same unknown word receives a
    different random vector on each call.
    """
    vec = GLOVE.get(w)
    if vec is None:
        # Out-of-vocabulary word: draw a fresh random stand-in.
        vec = randvec(n=numDimen)
    return vec

def sentenceToGlove(words):
    """Return the element-wise sum of the vectors of all tokens in `words`.

    `words` is an iterable of tokens; each token is mapped to a vector with
    `getGloveVec` and the vectors are added together.

    The result is always a numpy array of length `numDimen`. For an empty
    `words` it is the all-zero vector (previously a plain Python list of
    zeros was returned in that case).
    """
    # Start from a real ndarray so the return type does not depend on
    # whether the loop body ever runs.
    total = np.zeros(numDimen)
    for word in words:
        total = total + getGloveVec(word)
    return total

def getCosineSim(u, v):
    """Cosine distance between the 1d vectors `u` and `v`.

    Both inputs must have the same dimensionality. Despite the function's
    name, the value returned is a *distance*: it is exactly what
    `scipy.spatial.distance.cosine` computes, so smaller values mean the
    vectors point in more similar directions. Returns a float.
    """
    cosineDistance = scipy.spatial.distance.cosine
    return cosineDistance(u, v)

In [25]:
# Every file is read inside a `with` block so its handle is closed even if
# parsing a line raises.

# `vocab[0]` is a placeholder: line k (1-based) of the vocabulary file is
# stored at vocab[k].
vocab = [0]
with open('data/movie_50000.txt') as f:
    for line in f:
        vocab.append(line.rstrip('\n'))

# Whitespace-tokenised source lines, keyed by 1-based line number.
with open('data/dev.src_string.txt') as f:
    sourceStrs = {i + 1: line.strip().split() for i, line in enumerate(f)}

# Whitespace-tokenised gold target lines, keyed by 1-based line number.
with open('data/dev.tgt_string.txt') as f:
    goldTargetStrs = {i + 1: line.strip().split() for i, line in enumerate(f)}

# One integer per line; entry i is later used as the key into `sourceStrs` /
# `goldTargetStrs` for the i-th entry of `targetStrs`.
with open('data/index_dev.txt') as f:
    devIndexes = [int(line.rstrip('\n')) for line in f]

# Each line is a whitespace-separated list of indices into `vocab`; translate
# every line into the corresponding list of words.
with open('data/rearank_target_dev.txt') as f:
    targetStrs = [[vocab[int(index)] for index in line.strip().split()]
                  for line in f]

In [54]:
# Number of entries of `targetStrs` that are scored and echoed.
# NOTE(review): because of this cut-off the two distance lists below hold only
# the first `previewLimit` values; raise or remove it before persisting them.
previewLimit = 4


def _scoreAgainstTargets(referenceStrs, referenceFirst):
    """Score the first `previewLimit` entries of `targetStrs` against a reference.

    For the i-th target the reference sentence is
    `referenceStrs[devIndexes[i]]`. Both sentences are turned into summed
    GloVe vectors and compared with `getCosineSim` (which returns a cosine
    distance). Each pair and its distance are printed; `referenceFirst`
    controls whether the reference or the target is printed first.

    Returns the list of distances in target order.
    """
    distances = []
    for i, target in enumerate(targetStrs[:previewLimit]):
        reference = referenceStrs[devIndexes[i]]
        distance = getCosineSim(sentenceToGlove(reference), sentenceToGlove(target))
        distances.append(distance)

        if referenceFirst:
            first, second = reference, target
        else:
            first, second = target, reference
        # Single-argument print(...) behaves the same under Python 2 and 3.
        print(first)
        print(second)
        print(distance)
    return distances


# Distance between each generated target and its source sentence.
srcGloveDistances = _scoreAgainstTargets(sourceStrs, referenceFirst=True)

# Distance between each generated target and its gold target sentence.
tgtGloveDistances = _scoreAgainstTargets(goldTargetStrs, referenceFirst=False)

['archer', ',', 'you', 'promised', 'me', 'i', 'wouldn"', 't', 'have', 'to', 'go', 'through', 'that', 'again', '.']
['l"', 'm', 'not', 'going', 'to', 'let', 'that', 'happen', '.']
0.0890196760573
['archer', ',', 'you', 'promised', 'me', 'i', 'wouldn"', 't', 'have', 'to', 'go', 'through', 'that', 'again', '.']
['you"', 're', 'not', 'going', 'to', 'believe', 'this', '.']
0.0987098357982
['archer', ',', 'you', 'promised', 'me', 'i', 'wouldn"', 't', 'have', 'to', 'go', 'through', 'that', 'again', '.']
['if', 'you"', 'il', 'excuse', 'me', ',', 'l"', 'm', 'going', 'to', 'have', 'to', 'ask', 'you', 'to', 'leave', '.']
0.0762550578551
['archer', ',', 'you', 'promised', 'me', 'i', 'wouldn"', 't', 'have', 'to', 'go', 'through', 'that', 'again', '.']
['l"', 'm', 'sorry', ',', 'captain', '.']
0.310326531587
['l"', 'm', 'not', 'going', 'to', 'let', 'that', 'happen', '.']
['l"', 'm', 'going', 'to', 'have', 'to', 'break', 'that', 'promise', '.']
0.0770285154888
['you"', 're', 'not', 'going', 'to', 'be

In [40]:
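# Disabled on purpose: uncomment to write the distance lists to disk.
# Note that the scoring cell above currently stops after the first four
# examples, so the lists must be recomputed over the full dev set first.
# np.savetxt("dev_src_glove_dist.txt", srcGloveDistances)
# np.savetxt("dev_tgt_glove_dist.txt", tgtGloveDistances)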