In [1]:
import random
from collections import defaultdict
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.spatial.distance
from scipy.spatial.distance import euclidean
from scipy.stats import pearsonr, t
from operator import itemgetter

from mittens.tf_mittens import Mittens, _log_of_array_ignoring_zeros

In [2]:
# Embedding dimensionality. It is interpolated into the embedding file names
# below and passed to Mittens as `n`.
GLOVE_DIM = 100

In [16]:
# Read the matrix file as integers; the file has no header row.
# Presumably the word-word co-occurrence counts (the file is named
# comatrix.dat and X is later handed to Mittens.fit) -- confirm.
comatrix_path = 'data/twitter_davidson/comatrix.dat'
X = pd.read_csv(comatrix_path, header = None, dtype = int)
# Prints the text repr of the whole frame.
print X

       0      1      2      3      4      5      6      7      8      9      \
0          0      0      0      0      0      0      0      0      0      0   
1          0      0      0      0      0      0      0      0      0      0   
2          0      0  34003  14606  12723   7134   6091  10829   5090   5221   
3          0      0  14606  29803   9786   5436   3845  10493   3707   4480   
4          0      0  12723   9786  81256   4311   4607   6114   3973   3271   
5          0      0   7134   5436   4311  13997   4238   3325   3522   2541   
6          0      0   6091   3845   4607   4238   9361   2433   3008   2018   
7          0      0  10829  10493   6114   3325   2433   8768   2185   2334   
8          0      0   5090   3707   3973   3522   3008   2185  11603   2577   
9          0      0   5221   4480   3271   2541   2018   2334   2577  10688   
10         0      0   5394   3519   3066   3240   2602   2184   2107   1705   
11         0      0   3926   3026   2572   2506   18

In [3]:
# Read the vocabulary file and keep one entry per line, with newline
# characters stripped from both ends of every line.
with open('data/twitter_davidson/vocab.dat', mode="rb") as f:
    vocab = [line.strip('\n') for line in f.readlines()]

In [4]:
# File name depends on the configured dimensionality (GLOVE_DIM).
embed_path = 'data/twitter_davidson/embeddings.%dd.dat' % GLOVE_DIM
# Parse every value as float64; the file has no header row.
embeddings = pd.read_csv(embed_path, header = None, dtype = np.float64)

In [5]:
# Swap rows and columns, then label each column with a vocabulary entry.
# NOTE: this cell rebinds `embeddings` to its own transpose, so it is not
# idempotent -- running it a second time flips the frame back before the
# column labels are assigned.
embeddings = embeddings.transpose()
embeddings.columns = vocab

In [6]:
# Column-oriented dict: every column label maps to the list of that column's
# values.
embedding_dict = embeddings.to_dict('list')
# Spot check: show the list stored under the '<user>' token.
print embedding_dict['<user>']

[0.6300600000000001, 0.65177, 0.25545, 0.018593000000000002, 0.043094, 0.047194, 0.23218000000000003, 0.11613, 0.17371, 0.40487, 0.022524000000000002, -0.076731, -2.2911, 0.094127, 0.43293000000000004, 0.041801, 0.063175, -0.64486, -0.43656999999999996, 0.024114, -0.082989, 0.21686, -0.13462000000000002, -0.22336, 0.39436, -2.1724, -0.39544, 0.16535999999999998, 0.39438, -0.35181999999999997, -0.14995999999999998, 0.10502, -0.45937, 0.27729000000000004, 0.8924, -0.042312999999999996, -0.009345000000000001, 0.5501699999999999, 0.09552100000000001, 0.070504, -1.1781, 0.013722999999999999, 0.17742, 0.74142, 0.17715999999999998, 0.038468, -0.31684, 0.08941, 0.20556999999999997, -0.34328000000000003, -0.64303, -0.878, -0.16293, -0.055925, 0.33898, 0.60664, -0.2774, 0.33626, 0.21603000000000003, -0.11051, 0.0058673, -0.64757, -0.068222, -0.7741399999999999, 0.13910999999999998, -0.15850999999999998, -0.61885, -0.10192000000000001, -0.47, 0.19787000000000002, 0.42175, -0.18458, 0.080581, -0.2

In [None]:
# Build a Mittens model with GLOVE_DIM-sized vectors and fit it on X, passing
# the vocabulary and the dict of existing vectors as the initial embedding.
# NOTE(review): this cell has no execution count (In [None]), and a later cell
# rebinds M from a file on disk -- confirm whether the fit result is the M
# that downstream cells are meant to use.
# NOTE(review): the meaning of `mittens=1` is not visible here; presumably a
# weight on staying close to the initial vectors -- verify against the library.
mittens = Mittens(n=GLOVE_DIM, max_iter=5000, mittens=1)
M = mittens.fit(X, vocab=vocab, initial_embedding_dict=embedding_dict)

In [14]:
# Load a previously saved matrix from disk as a NumPy array (the file has no
# header row). `quoting = 0` is the numeric value of csv.QUOTE_MINIMAL.
# NOTE: this rebinds M, replacing whatever the training cell assigned to it.
mittens_path = 'data/twitter_davidson/embeddings.mittens1.%dd.dat' % GLOVE_DIM
M = pd.read_csv(mittens_path, header = None, quoting = 0).values

In [10]:
def cosine(u, v):
    """Return the cosine distance between `u` and `v` as a float.

    Both arguments are 1d np.arrays and must share the same dimensionality.
    The computation is delegated to scipy.
    """
    distance = scipy.spatial.distance.cosine(u, v)
    return distance

def neighbors(word, mat, rownames, distfunc=cosine, returndist=False):
    """Rank the rows of `mat` by their distance from the row of `word`.

    Parameters
    ----------
    word : str
        The anchor word. It has to occur in `rownames`.

    mat : np.array
        The vector-space model; rows are compared with each other.

    rownames : list of str
        The names of the rows of `mat`, in row order.

    distfunc : function mapping vector pairs to floats (default: `cosine`)
        Distance measure applied to (anchor row, other row). Any distance
        between 1d vectors works, e.g. `euclidean`, `matching`, `jaccard`.

    returndist : bool (default: False)
        Whether the distances are returned along with the words.

    Raises
    ------
    ValueError
        If `word` does not occur in `rownames`.

    Returns
    -------
    list
        One entry per row of `mat` (the anchor's own row included), ordered
        from smallest to largest distance. With `returndist=True` the
        entries are (word, distance) pairs; otherwise they are the words
        alone.

    """
    if word not in rownames:
        raise ValueError('%s is not in this VSM' % word)
    anchor = mat[rownames.index(word)]
    scored = []
    for row_idx in range(len(mat)):
        scored.append((rownames[row_idx], distfunc(anchor, mat[row_idx])))
    # Ascending, stable sort: rows with equal distance keep their row order.
    scored.sort(key=lambda pair: pair[1])
    if not returndist:
        return [name for name, _ in scored]
    return scored

In [11]:
# Transpose the labelled frame and keep only the raw values as an array.
# NOTE(review): presumably this yields one row per vocabulary word, which is
# what neighbors() compares -- confirm the orientation.
G = embeddings.transpose().values

In [27]:
# First five entries of the cosine-distance ranking in G for three probe words.
# NOTE(review): the stored output shows (word, distance) pairs, but neighbors()
# as defined in this notebook returns bare words unless returndist=True -- the
# output appears to predate the current definition.
print neighbors("bitch", G, vocab, cosine)[:5]
print neighbors("hoe", G, vocab, cosine)[:5]
print neighbors("redneck", G, vocab, cosine)[:5]

[('bitch', -1.4129278413932411e-08), ('fuck', 0.13727204773927515), ('nigga', 0.14032760976063741), ('ass', 0.14142532066032187), ('bitches', 0.14673692525606929)]
[('hoe', 5.9242417527194391e-08), ('bitch', 0.16976215836040542), ('dat', 0.21639467291919789), ('chick', 0.25209907057396741), ('ass', 0.25378253085063207)]
[('redneck', -9.3922838839688438e-08), ('hillbilly', 0.17283135233412072), ('hick', 0.32069949263477471), ('rednecks', 0.33543274354219055), ('beaner', 0.35272613527234342)]


In [31]:
# Same three probe words, ranked in the matrix bound to G_.
# NOTE(review): G_ is not assigned in any cell visible in this notebook, and
# the stored output begins with a "<type 'numpy.ndarray'>" line that this cell
# does not print -- confirm where G_ comes from before relying on a fresh run.
print neighbors("bitch", G_, vocab, cosine)[:5]
print neighbors("hoe", G_, vocab, cosine)[:5]
print neighbors("redneck", G_, vocab, cosine)[:5]

<type 'numpy.ndarray'>
[('bitch', -2.2204460492503131e-16), ('ass', 0.12494736692434094), ('nigga', 0.12582509492100258), ('fuck', 0.12621232610457445), ('shit', 0.14002559766740652)]
[('hoe', -2.2204460492503131e-16), ('bitch', 0.1574867569747509), ('dat', 0.1685564922519468), ('chick', 0.2129714796372042), ('nigga', 0.23991521132589055)]
[('redneck', 0.0), ('hillbilly', 0.17286828155779077), ('hick', 0.32029168316044743), ('rednecks', 0.33512891701993541), ('beaner', 0.35247726930871182)]


In [32]:
for w in vocab[2:20]:
    print "GloVE:", neighbors(w, G, vocab, cosine)[:5]
    print "Mittens:", neighbors(w, M, vocab, cosine)[:5]

GloVE: [('<user>', 0.0), ('rt', 0.10501733915665235), ('hey', 0.28818721842568296), ('cc', 0.30024118953272783), ('bro', 0.30810150586519924)]
Mittens: [('<user>', -9.0452620504777315e-08), ('rt', 0.091155480241602693), ('<repeat>', 0.26170164632990722), ('a', 0.30732869161440635), ('hey', 0.30826986073257645)]
GloVE: [('<allcaps>', -2.2204460492503131e-16), ('<hashtag>', 0.26649973052702647), ('<url>', 0.30984813435870917), ('#', 0.34328685805574388), ('n', 0.35690622032526931)]
Mittens: [('<allcaps>', -1.9592548250457753e-08), ('<hashtag>', 0.23500529601586739), ('<url>', 0.27685880900865545), ('#', 0.33371711642786306), ('rt', 0.33414618495922355)]
GloVE: [('<sym>', -2.2204460492503131e-16), ('no-tv', 0.61755860648283067), ('(@', 0.64420567254571037), ('ebloa', 0.66548182115772203), ('kenfolk', 0.6657745220089053)]
Mittens: [('<sym>', 5.0854358057961235e-08), ('no-tv', 0.63506879668086524), ('kenfolk', 0.66257323099046794), ('(@', 0.66422520409372687), ('ebloa', 0.67649159828678906)

In [19]:
for w in ["bitch", 
          "pussy", 
          "hoe", 
          "fuck", 
          "nigga", 
          "trash", 
          "faggot", 
          "white", 
          "retarded", 
          "yellow", 
          "yankees",
          "black",
          "monkey"
         ]:
    print w, "------"
    print "GloVE:", neighbors(w, G, vocab, cosine, returndist=False)[:5]
    print "Mittens:", neighbors(w, M, vocab, cosine, returndist=False)[:5]

bitch ------
GloVE: ['bitch', 'ass', 'nigga', 'fuck', 'shit']
Mittens: ['bitch', 'like', 'you', 'a', 'i']
pussy ------
GloVE: ['pussy', 'dick', 'ass', 'cock', 'booty']
Mittens: ['pussy', 'dick', 'sex', 'hoe', 'tits']
hoe ------
GloVE: ['hoe', 'bitch', 'dat', 'chick', 'nigga']
Mittens: ['hoe', 'hoes', 'dat', 'chick', 'pussy']
fuck ------
GloVE: ['fuck', 'shit', 'hell', 'bitch', 'damn']
Mittens: ['fuck', 'hell', 'hate', 'why', 'fucking']
nigga ------
GloVE: ['nigga', 'bruh', 'niggas', 'aint', 'bitch']
Mittens: ['nigga', 'niggas', 'gotta', 'homie', 'lil']
trash ------
GloVE: ['trash', 'garbage', 'dirty', 'shit', 'nasty']
Mittens: ['trash', 'garbage', 'dirty', 'nasty', 'shit']
faggot ------
GloVE: ['faggot', 'fag', 'asshole', 'nigger', 'cunt']
Mittens: ['faggot', 'fag', 'nigger', 'asshole', 'cunt']
white ------
GloVE: ['white', 'black', 'blue', 'green', 'red']
Mittens: ['white', 'black', 'blue', 'yellow', 'green']
retarded ------
GloVE: ['retarded', 'dumb', 'retarted', 'stupid', 'dumbass']

In [38]:
for w in ["colored",
          "crazy",
          "women",
          "gay",
          "suck"
         ]:
    print w, "------"
    print "GloVE:", neighbors(w, G_, vocab, cosine, returndist=False)[:5]
    print "Mittens:", neighbors(w, G, vocab, cosine, returndist=False)[:5]

colored ------
GloVE: ['colored', 'coloured', 'colors', 'yellow', 'purple']
Mittens: ['colored', 'coloured', 'colors', 'yellow', 'purple']
crazy ------
GloVE: ['crazy', 'mad', 'shit', 'really', 'like']
Mittens: ['crazy', 'mad', 'insane', 'funny', 'really']
women ------
GloVE: ['women', 'woman', 'girls', 'men', 'ladies']
Mittens: ['women', 'woman', 'girls', 'men', 'ladies']
gay ------
GloVE: ['gay', 'homosexual', 'fake', 'gays', 'lesbian']
Mittens: ['gay', 'homosexual', 'gays', 'fake', 'lesbian']
suck ------
GloVE: ['suck', 'sucks', 'dick', 'fuck', 'either']
Mittens: ['suck', 'sucks', 'dick', 'either', 'balls']


In [42]:
# Write the matrix bound to G to CSV, without header row or index column.
# NOTE(review): the target file name says "mittens", but in this notebook G is
# built from `embeddings` (the file read via embed_path) while the Mittens
# matrix is bound to M -- confirm that G is really what should be written
# here before running, since this overwrites the file.
mittens_embed_path = 'data/twitter_davidson/embeddings.mittens.%dd.dat' % GLOVE_DIM
pd.DataFrame(G).to_csv(mittens_embed_path, header = False, index = False)