In [11]:
import os
import sys
import csv
import random
import itertools
from operator import itemgetter
from collections import defaultdict
import numpy as np
import scipy
import scipy.spatial.distance
from numpy.linalg import svd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import utils

In [12]:
# Width of the GloVe vectors to load; it selects which glove.6B file is read
# and is also the length used for fallback random vectors.
numDimen = 300
gloveFile = ''.join(['glove.6B.', str(numDimen), 'd.txt'])
# Parse the embedding file through the project helper. GLOVE is queried with
# `.get(word, ...)` elsewhere in this notebook.
GLOVE = utils.glove2dict(os.path.join('glove.6B', gloveFile))

In [26]:
def randvec(n=50, lower=-0.5, upper=0.5):
    """Build a random 1d np.array with `n` components.

    Every component is drawn independently with
    `random.uniform(lower, upper)`, so the result depends on the current
    state of the stdlib `random` generator.
    """
    draws = (random.uniform(lower, upper) for _ in range(n))
    return np.fromiter(draws, dtype=float, count=n)

def getGloveVec(w):
    """Return `w`'s GloVe representation if available, else return
    a random vector.

    The fallback is a fresh `randvec(n=numDimen)` draw, so a word missing
    from GLOVE gets a different vector on every call.
    """
    vec = GLOVE.get(w)
    if vec is None:
        # Draw the fallback only on a miss. Passing it as the default to
        # `dict.get` would generate `numDimen` random numbers on every
        # call, including for words that are found.
        vec = randvec(n=numDimen)
    return vec

def sentenceToGlove(words):
    """Sum the vectors of all `words` into a single sentence vector.

    `words` is an iterable of tokens; each one is looked up with
    `getGloveVec`. Returns an np.array of length `numDimen`; for an empty
    `words` this is the all-zero vector.
    """
    # Start from an ndarray rather than a Python list so the return type is
    # the same whether or not `words` is empty.
    result = np.zeros(numDimen)
    for word in words:
        result = np.add(result, getGloveVec(word))
    return result

def getCosineSim(u, v):
    """Compare two 1d np.arrays `u` and `v` of the same dimensionality.

    Despite the function's name, the value returned is the cosine
    *distance* computed by `scipy.spatial.distance.cosine`, not a
    similarity.
    """
    cosine = scipy.spatial.distance.cosine
    return cosine(u, v)

In [14]:
# Position 0 holds a placeholder, so the entry read from the first line of
# the file lands at index 1.
vocab = [0]

# `with` closes the file even if reading fails part-way through.
with open('data/movie_50000.txt') as f:
    vocab.extend(line.rstrip('\n') for line in f)

In [19]:
""" DEV SIDE """
# Every file is read inside `with`, so its handle is released even when a
# line fails to parse.

# Source sentences as token lists, keyed by 1-based line number.
with open('data/dev.src_string.txt') as f:
    sourceStrs = {lineNum: line.strip().split()
                  for lineNum, line in enumerate(f, start=1)}

# Gold target sentences as token lists, keyed by 1-based line number.
with open('data/dev.tgt_string.txt') as f:
    goldTgtStrs = {lineNum: line.strip().split()
                   for lineNum, line in enumerate(f, start=1)}

# One integer per line; entry i is used as the key into sourceStrs and
# goldTgtStrs for the i-th candidate in targetStrs.
with open('data/index_dev.txt') as f:
    devIndexes = [int(line.rstrip('\n')) for line in f]

# Candidate targets: each line is a whitespace-separated list of positions
# in `vocab`, mapped back to the corresponding vocab entries.
with open('data/rearank_target_dev.txt') as f:
    targetStrs = [[vocab[int(index)] for index in line.strip().split()]
                  for line in f]

In [20]:
# For every candidate target, record its cosine distance to the source
# sentence and to the gold target of the example it belongs to.
devSrcGloveDistances = []
devTgtGloveDistances = []

# Index of the example whose vectors are currently cached.
currentIndex = None
sourceGlove = None
goldTgtGlove = None

for i, tgtStr in enumerate(targetStrs):
    tgtStrGlove = sentenceToGlove(tgtStr)
    # Detect a new example by its index, not by comparing source tokens:
    # two different examples can share the same source text while having
    # different gold targets, and comparing text would then keep a stale
    # gold vector.
    if devIndexes[i] != currentIndex:
        currentIndex = devIndexes[i]
        source = sourceStrs[currentIndex]
        sourceGlove = sentenceToGlove(source)
        goldTgt = goldTgtStrs[currentIndex]
        goldTgtGlove = sentenceToGlove(goldTgt)
    devSrcGloveDistances.append(getCosineSim(sourceGlove, tgtStrGlove))
    devTgtGloveDistances.append(getCosineSim(goldTgtGlove, tgtStrGlove))

In [21]:
# Dump both dev distance lists to text files in fixed-point notation.
for outPath, distances in (
        ("dev_src_glove_dist.txt", devSrcGloveDistances),
        ("dev_tgt_glove_dist.txt", devTgtGloveDistances)):
    np.savetxt(outPath, distances, fmt='%f')

In [22]:
""" TEST SIDE """
# Every file is read inside `with`, so its handle is released even when a
# line fails to parse.
# NOTE(review): the string files are named `test.*` while the index and
# candidate files are named `*_valid` -- confirm these belong to the same
# split.

# Source sentences as token lists, keyed by 1-based line number.
with open('data/test.src_string.txt') as f:
    sourceStrs = {lineNum: line.strip().split()
                  for lineNum, line in enumerate(f, start=1)}

# Gold target sentences as token lists, keyed by 1-based line number.
with open('data/test.tgt_string.txt') as f:
    goldTgtStrs = {lineNum: line.strip().split()
                   for lineNum, line in enumerate(f, start=1)}

# One integer per line; entry i is used as the key into sourceStrs and
# goldTgtStrs for the i-th candidate in targetStrs.
with open('data/index_valid.txt') as f:
    testIndexes = [int(line.rstrip('\n')) for line in f]

# Candidate targets: each line is a whitespace-separated list of positions
# in `vocab`, mapped back to the corresponding vocab entries.
with open('data/rearank_target_valid.txt') as f:
    targetStrs = [[vocab[int(index)] for index in line.strip().split()]
                  for line in f]

In [27]:
# For every candidate target, record its cosine distance to the source
# sentence and to the gold target of the example it belongs to.
testSrcGloveDistances = []
testTgtGloveDistances = []

# Index of the example whose vectors are currently cached.
currentIndex = None
sourceGlove = None
goldTgtGlove = None

for i, tgtStr in enumerate(targetStrs):
    tgtStrGlove = sentenceToGlove(tgtStr)
    # Detect a new example by its index, not by comparing source tokens:
    # two different examples can share the same source text while having
    # different gold targets, and comparing text would then keep a stale
    # gold vector.
    if testIndexes[i] != currentIndex:
        currentIndex = testIndexes[i]
        source = sourceStrs[currentIndex]
        sourceGlove = sentenceToGlove(source)
        goldTgt = goldTgtStrs[currentIndex]
        goldTgtGlove = sentenceToGlove(goldTgt)
    testSrcGloveDistances.append(getCosineSim(sourceGlove, tgtStrGlove))
    testTgtGloveDistances.append(getCosineSim(goldTgtGlove, tgtStrGlove))

In [28]:
# Dump both test distance lists to text files in fixed-point notation.
for outPath, distances in (
        ("test_src_glove_dist.txt", testSrcGloveDistances),
        ("test_tgt_glove_dist.txt", testTgtGloveDistances)):
    np.savetxt(outPath, distances, fmt='%f')