Section on VSM Models and Correlation

In [66]:
# Directory holding the VSM data files; file names are joined onto it with
# os.path.join when the matrices are loaded.
vsmdata_home = "vsmdata"

In [67]:
import os
import sys
import csv
import random
import itertools
from operator import itemgetter
from collections import defaultdict
import numpy as np
import scipy
import scipy.spatial.distance
from numpy.linalg import svd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import utils

In [68]:
# Load the matrix stored in imdb-wordword.csv (presumably IMDB word-by-word
# co-occurrence data -- TODO confirm against utils.build). Later cells read
# ww[0] as the matrix and ww[1] as its row names.
ww = utils.build(os.path.join(vsmdata_home, 'imdb-wordword.csv'))

In [69]:
def cosine(u, v):
    """Return the cosine distance between the vectors `u` and `v`.

    This is a thin wrapper that delegates to
    `scipy.spatial.distance.cosine`, so the result is a distance
    (0.0 for vectors pointing the same way), not a similarity.
    """
    distance = scipy.spatial.distance.cosine(u, v)
    return distance

In [70]:
def neighbors(word, mat, rownames, distfunc=cosine):
    """Rank every row of `mat` by its distance from the row for `word`.

    Parameters
    ----------
    word : the row label to look up in `rownames`.
    mat : indexable collection of row vectors; row i is labelled
        by rownames[i].
    rownames : list of row labels (must support `in` and `.index`).
    distfunc : callable taking two row vectors and returning a
        sortable distance value.

    Returns
    -------
    list of (row label, distance) tuples sorted by ascending distance.

    Raises
    ------
    ValueError if `word` is not one of `rownames`.
    """
    if word not in rownames:
        raise ValueError('%s is not in this VSM' % word)
    target = mat[rownames.index(word)]
    ranked = []
    for idx, row in enumerate(mat):
        ranked.append((rownames[idx], distfunc(target, row)))
    # Smallest distance first; ties keep their original row order.
    ranked.sort(key=itemgetter(1))
    return ranked

In [72]:
def pmi(mat, rownames=None, positive=True):
    """Reweight a 2-D matrix by pointwise mutual information (PMI).

    Each cell (i, j) becomes log(p(i, j) / (p(i) * p(j))), where p(i, j)
    is the cell's share of the matrix total and p(i), p(j) are the row and
    column sums of that joint table. Cells whose ratio is not strictly
    positive (a zero entry, or 0/0 from an all-zero row or column) are
    set to 0.0 instead of being logged.

    Parameters
    ----------
    mat : 2-D array-like of numbers (coerced to a float array).
    rownames : returned unchanged alongside the reweighted matrix.
    positive : if True, negative PMI values are replaced by 0.0 (PPMI).

    Returns
    -------
    tuple (reweighted 2-D float array, rownames)
    """
    counts = np.asarray(mat, dtype=float)
    # All-zero rows/columns (or an all-zero matrix) yield 0/0 below. Those
    # cells are mapped to 0.0 afterwards, so the floating-point warnings
    # would only be noise.
    with np.errstate(divide='ignore', invalid='ignore'):
        # Joint probability table:
        joint = counts / np.sum(counts, axis=None)
        # Row and column marginals, shaped so their product broadcasts to
        # the full table:
        rowprobs = np.sum(joint, axis=1)[:, np.newaxis]
        colprobs = np.sum(joint, axis=0)
        ratio = joint / (rowprobs * colprobs)
        # NaN compares False here, so undefined cells stay at 0.0.
        defined = ratio > 0.0
    weighted = np.zeros(ratio.shape, dtype=float)
    weighted[defined] = np.log(ratio[defined])
    if positive:
        weighted = np.maximum(weighted, 0.0)
    return (weighted, rownames)

def _pmi_log(x, positive=True):
    val = 0.0
    if x > 0.0:
        val = np.log(x)
    if positive:
        val = max([val,0.0])
    return val

In [73]:
# Reweight ww with positive PMI. pmi returns a (matrix, rownames) tuple, so
# ww_ppmi[0] is the reweighted matrix and ww_ppmi[1] the row names.
ww_ppmi = pmi(mat=ww[0], rownames=ww[1], positive=True)

In [74]:
def correlateds(word, mat, rownames, distfunc=cosine):
    """List the entries in the row for `word`, largest value first.

    Entry i of the word's row is paired with rownames[i] for every i in
    range(len(mat)), so the columns are read as if they were indexed by
    the same labels as the rows.

    Parameters
    ----------
    word : the row label to look up in `rownames`.
    mat : indexable collection of row vectors; row i is labelled
        by rownames[i].
    rownames : list of row labels (must support `in` and `.index`).
    distfunc : accepted but never called by this function.

    Returns
    -------
    list of (row label, value) tuples sorted by descending value.

    Raises
    ------
    ValueError if `word` is not one of `rownames`.
    """
    if word not in rownames:
        raise ValueError('%s is not in this VSM' % word)
    target_row = mat[rownames.index(word)]
    scored = []
    for idx in range(len(mat)):
        scored.append((rownames[idx], target_row[idx]))
    # Largest value first; ties keep their original row order.
    scored.sort(key=itemgetter(1), reverse=True)
    return scored

In [75]:
# The 50 rows of the PPMI matrix closest to 'black' by cosine distance,
# nearest first (the query word itself leads the list at distance 0.0).
neighbors(word='black', mat=ww_ppmi[0], rownames=ww_ppmi[1], distfunc=cosine)[: 50]

[('black', 0.0),
 ('white', 0.32881567985452564),
 ('african', 0.45325001162718814),
 ('racist', 0.47732926622682514),
 ('south', 0.51259175322864359),
 ('africa', 0.55713639484889721),
 ('stereotypes', 0.5731757003996738),
 ('color', 0.58615918821489299),
 ('community', 0.58929295332708698),
 ('in', 0.59004131623416445),
 ('by', 0.59086285153193496),
 ('men', 0.59120925788245771),
 ('washington', 0.59371688522192234),
 ('female', 0.60655361671233843),
 ('american', 0.61079121267662817),
 ('youth', 0.61510447242682009),
 ('culture', 0.61778696924165444),
 ('race', 0.62196307219413705),
 ('wearing', 0.62472988868639134),
 ('a', 0.62512738146024849),
 ('southern', 0.62576460870826733),
 ('barbara', 0.62680153404529793),
 ('freedom', 0.62685769378942069),
 ('chief', 0.62733164799518293),
 ('hair', 0.62734784821295619),
 ('women', 0.62818248069905402),
 ('portrayed', 0.63484459039219243),
 ('country', 0.6372804847801401),
 ('including', 0.63751617984755016),
 ('man', 0.6392856694348128),
 

In [76]:
# The 50 largest entries in the PPMI row for 'black', largest first.
# distfunc is passed to mirror the neighbors() call; correlateds never calls it.
correlateds(word='black', mat=ww_ppmi[0], rownames=ww_ppmi[1], distfunc=cosine)[: 50]

[('black', 2.8488175927100698),
 ('racist', 2.2748441529419652),
 ('african', 2.1173502457165405),
 ('white', 2.0635351624122569),
 ('africa', 1.7916302075458261),
 ('race', 1.6793908071345509),
 ('n', 1.5945082749843766),
 ('south', 1.532390870734716),
 ('color', 1.4336240711502488),
 ('stereotypes', 1.3387090456518356),
 ('sunday', 1.3384187572482853),
 ('freedom', 1.3364695334597672),
 ('washington', 1.2941201328978293),
 ('barbara', 1.2647634283352243),
 ('mask', 1.2547888241431515),
 ('noir', 1.2176479652968502),
 ('cry', 1.1910817567999432),
 ('lugosi', 1.1864307660390141),
 ('blair', 1.1644314823191457),
 ('mexican', 1.1395988566023065),
 ('4/10', 1.1089660801738976),
 ('maria', 1.1070035331184447),
 ('culture', 1.104911932802461),
 ('foster', 1.0709601593451377),
 ('page', 1.0623088810882766),
 ('academy', 1.0549271053289986),
 ('navy', 1.0284338116087108),
 ('stereotypical', 1.0110780933407182),
 ('actresses', 0.9937533851021888),
 ('community', 0.98971572622608806),
 ('jack',