In [1]:
__author__ = "Theodora Chu, Josh Cohen, Jason Chen"
__version__ = "CS224u, Stanford, Spring 2016 term"

# Setup

In [2]:
import os
import nltk

from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.porter import PorterStemmer


In [3]:
# Info for creating VSM data
vsmdata_home = "vsmdata"
import os
import sys
import csv
import random
import itertools
from operator import itemgetter
from collections import defaultdict
import numpy as np
import scipy
import scipy.spatial.distance
from numpy.linalg import svd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import utils

# File Input
Takes in a text file and returns a list of ordered unigrams U. 
It should also consider stemming and other relevant pre-processing. Josh's note: parse "African American" as a unigram.

In [4]:
def parseTextFile(filename):
    text = open('cor-por-a/' + filename, 'r')
    for i in range(0, 10):
        print text.readline()
    text_parse = text.read().split()
    #print text_parse

    lancaster = LancasterStemmer()
#     print lancaster.stem('maximum') 

    porter = PorterStemmer()
    return text_parse
#     print porter.stem('maximum')    

#parseTextFile('TomSawyer.txt')


# Correlation Matrix
1. Parse U to create a word-word frequency matrix M, where each row represents a word and each entry x(i,j) represents the number of times word i co-occurs with word j.
2. Convert M to a new matrix M’ with some sort of correlation operation. We could use PMI, Occai (see Josh’s paper), CSA, or some other correlation structure.
3. Let row a represent the unigram “African American”. Take in that row, and output an ordered list of (this_unigram, correlation_score) pairs which represent the correlation score of this_unigram with the term “African American”
4. Produce a list L of the top 100 correlated words with the term “African American”


In [5]:
# This takes fucking forever
def createMatrix():
    # Initializes vector of terms
    u_vec = [x.lower() for x in parseTextFile('TomSawyer.txt')];
    vocab_vec = np.unique(u_vec).tolist()
    vocab_size = len(vocab_vec)
    mat = [[0 for x in range(vocab_size)] for y in range(vocab_size)]
    
    # Updates matrix, using bigrams
    for i in range(0, len(u_vec)-1):
        term_one = u_vec[i];
        term_two = u_vec[i+1];
        index_one = vocab_vec.index(term_one)
        index_two = vocab_vec.index(term_two)
        mat[index_one][index_one] += 1;
        mat[index_one][index_two] += 1;
        mat[index_two][index_one] += 1;

    last_term = u_vec[len(u_vec)-1]
    last_term_index = vocab_vec.index(last_term)
    mat[last_term_index][last_term_index] += 1
    return (mat, vocab_vec);

In [6]:
def cosine(u, v):        
    return scipy.spatial.distance.cosine(u, v)

In [7]:
def neighbors(word, mat, rownames, distfunc=cosine):
    if word not in rownames:
        raise ValueError('%s is not in this VSM' % word)
    w = mat[rownames.index(word)]
    dists = [(rownames[i], distfunc(w, mat[i])) for i in range(len(mat))]
    return sorted(dists, key=itemgetter(1), reverse=False)

In [8]:
from __future__ import division
def pmi(mat, rownames=None, positive=True):  
    # Joint probability table:
    p = mat / np.sum(mat, axis=None)
    # Pre-compute column sums:
    colprobs = np.sum(p, axis=0)
    # Vectorize this function so that it can be applied rowwise:
    np_pmi_log = np.vectorize((lambda x : _pmi_log(x, positive=positive)))
    p = np.array([np_pmi_log(row / (np.sum(row)*colprobs)) for row in p])   
    return (p, rownames)

def _pmi_log(x, positive=True):
    val = 0.0
    if x > 0.0:
        val = np.log(x)
    if positive:
        val = max([val,0.0])
    return val

In [9]:
def correlateds(word, mat, rownames, distfunc=cosine):
    if word not in rownames:
        raise ValueError('%s is not in this VSM' % word)
    w = mat[rownames.index(word)]
    dists = [(rownames[i], w[i]) for i in range(len(mat))]
    #print dists
    sorted_dists = sorted(dists, key=itemgetter(1), reverse=True)
    # print sorted_dists
    return sorted_dists

In [10]:
# The correlation list returns an ordered list of (word, correlation_score) tuples, where higher correlation_score
# means the word is more correlated. The correlation list includes all words in the vocabulary, so you can
# selectively take the first n elements if you want to use them.
def correlationList(mat_ppmi):
    return correlateds(word='colored', mat=mat_ppmi[0], rownames=mat_ppmi[1], distfunc=cosine)

In [11]:
# mat[0] refers to the matrix, mat[1] is a vector of rownames. To get the vector which corresponds to a given word,
# call mat[0][mat[1].index('my_word')]
mat = createMatrix()
mat_ppmi = pmi(mat=mat[0], rownames=mat[1], positive=True)

# The output of this shows the format of the correlation list. You 
correlationList(mat_ppmi)[:5]



The Project Gutenberg EBook of The Adventures of Tom Sawyer, Complete by

Mark Twain (Samuel Clemens)



This eBook is for the use of anyone anywhere at no cost and with almost

no restrictions whatsoever. You may copy it, give it away or re-use

it under the terms of the Project Gutenberg License included with this

eBook or online at www.gutenberg.net



Title: The Adventures of Tom Sawyer, Complete



[('colored', 9.4169795839905763),
 ('frontispiece--a', 9.4169795839905763),
 ('boy,', 6.2389257536426292),
 ('small', 5.8616315225011615),
 ('and', 1.3983541189448454)]

In [16]:
neighbors_list = neighbors(word='colored', mat=mat_ppmi[0], rownames=mat_ppmi[1], distfunc=cosine)[: 50]
print neighbors_list

def retrieve_words(tuple_list):
    words = list()
    for _tuple in tuple_list:
        words.append(_tuple[0])
    return words

neighbors_word_list = retrieve_words(neighbors_list)
print neighbors_word_list

[('colored', 1.1102230246251565e-16), ('frontispiece--a', 0.26683418138331849), ('boy,', 0.72281068148519168), ('scoldings', 0.7872725066119548), ("sister's", 0.80244142688351794), ('newcomer', 0.80692312982705072), ('reward--in', 0.80692312982705072), ("signpainter's", 0.81145390861254185), ('small', 0.8140836541431572), ('sacks', 0.81565158584186703), ('comforts', 0.81817882660553809), ('catfish--provisions', 0.82358249463107236), ('willie', 0.82874528710588957), ('\xa0i', 0.82874528710588957), ('hole,', 0.83175120585574069), ('"branch"', 0.83229509399182877), ('staff.', 0.83304774972930862), ('human', 0.83419264599984588), ('friendless', 0.8353984098379047), ('skiff', 0.8416621778249993), ('chamber,', 0.8419171116430535), ('watcher', 0.84761191403940617), ('cavern', 0.85188353729408162), ('recess', 0.85808672433960165), ('bluff', 0.86197070485873517), ('tick,', 0.8657556134858555), ('model', 0.86992459670831856), ('strain', 0.87014618555865109), ('"my', 0.88014593355008408), ('trust

# Sentiment Analysis
Takes in a list V of words and returns the average sentiment score across all terms in V as determined by freebase. Note to Jason: consider other sentiment databases

In [14]:
from nltk.corpus import sentiwordnet as swn
from __future__ import unicode_literals

def getSentiment(word):
    synset = list(swn.senti_synsets(word))
    if len(synset) > 0: #if a synset exists for this word
        synset = synset[0]
        return(synset.pos_score(), synset.neg_score(), synset.obj_score())

def is_ascii(s):
    return all(ord(c) < 128 for c in s)

V = ['good', 'bad', 'great', 'awesome', 'amazing', 'holy', 'beautiful', 'worrisome', 'stupid']
def generate_sentiment(wordList):
    totalSentiment = 0.0;
    for word in wordList:
        if is_ascii(word): #see note below for rationale
            sentiment = getSentiment(word)
            if sentiment == None:
                sentiment = 0.0
            if type(sentiment) is float: #why does this happen
                print "n/a"
            else:  
                totalSentiment += (sentiment[0] - sentiment[1]) 
                print (sentiment[0] - sentiment[1])
        #sentiwordnet generates tuples of pos, neg, and neu. currently naively choosing to consider only sum of pos and neg. 
    averageSentiment = totalSentiment/len(wordList)
    return averageSentiment

print generate_sentiment(neighbors_word_list)

0.0
n/a
n/a
-0.125
n/a
0.0
n/a
n/a
0.0
0.0
0.0
n/a
n/a
n/a
n/a
n/a
0.0
0.0
0.0
n/a
0.25
0.0
0.0
0.0
n/a
0.0
0.0
n/a
0.0
0.0
0.0
0.0
0.0
0.0
-0.5
0.0
n/a
0.0
0.0
-0.375
0.0
0.0
0.5
n/a
0.0
0.0
n/a
0.0
n/a
-0.005


In [15]:
"你好".encode('utf-8')
encode converts a unicode object to a string object. But here you have invoked it on a string object (because you don't have the u). So python has to convert the string to a unicode object first. So it does the equivalent of

"你好".decode().encode('utf-8')
But the decode fails because the string isn't valid ascii. That's why you get a complaint about not being able to decode.

SyntaxError: invalid syntax (<ipython-input-15-3e4f1338aa92>, line 2)

# XOR/AND
Takes in a dict of corpus:list of words and returns a dict of corpus:XOR words and dict of corpus:AND words.

In [21]:
toyList = ['black', 'block', 'beer']

def XOR(corpus1, corpus2):
    first = set(corpus1)
    second = set(corpus2)
    return first ^ second
def AND(corpus1, corpus2):
    first = set(corpus1)
    second = set(corpus2)
    return first & second

print 'XOR'
print XOR(toyList, neighbors_word_list)
print 'AND'
print AND(toyList, neighbors_word_list)

XOR
set(['body', 'recess', 'human', 'too.', '\xa0i', 'boy,', 'watcher', 'strain', 'frontispiece--a', 'cavern', 'boys,', 'church', "sister's", '"my', 'saw', 'sandy', 'trifle', 'friendless', 'trust', 'log', 'comfort', u'beer', 'hole,', 'scoldings', 'raft', 'willie', 'told', 'poor', 'blue', 'presently,', 'catfish--provisions', 'chamber,', 'tick,', 'hand', 'newcomer', 'bluff', 'reward--in', 'colored', 'staff.', 'sacks', 'quiet', "signpainter's", 'placed', 'skiff', 'donations', u'block', 'small', 'model', '"branch"', "can't", 'comforts'])
AND
set([u'black'])


# Word Cloud
Takes in a matrix M and correlation list L. Using t-sne, produces a word cloud which represents correlation between all terms. 

In [None]:
from sklearn.manifold import TSNE
import numpy
numpy.set_printoptions(threshold='nan')

def word_cloud_preprocessing(words, matrix=mat_ppmi):
    output = []
    for word in words:
        ind = matrix[1].index(word)
        output.append(matrix[0][ind])
    return output
processed_mat = word_cloud_preprocessing(neighbors_word_list)
print processed_mat

def word_cloud(corr_list): #i think its processed_mat / didn't tsne take in a vector of labels as well?
    model = TSNE(n_components=2, random_state=0)
    tsne_matrix = model.fit_transform(corr_list)
    
word_cloud(processed_mat)