In [None]:
__author__ = "Theodora Chu, Josh Cohen, Jason Chen"
__version__ = "CS224u, Stanford, Spring 2016 term"

# Setup

In [8]:
import os
import nltk

from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.porter import PorterStemmer


In [9]:
# Info for creating VSM data
vsmdata_home = "vsmdata"
import os
import sys
import csv
import random
import itertools
from operator import itemgetter
from collections import defaultdict
import numpy as np
import scipy
import scipy.spatial.distance
from numpy.linalg import svd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import utils

# File Input
Takes in a text file and returns a list of ordered unigrams U. 
It should also consider stemming and other relevant pre-processing. Josh's note: parse "African American" as a unigram.

In [10]:
def parseTextFile(filename):
    """Read a corpus file and return its whitespace-separated tokens.

    The file is looked up inside the ``cor-por-a/`` directory. Its first 10
    lines are echoed to stdout as a quick preview; because they are consumed
    from the same handle, they are NOT part of the returned token list.

    No stemming or other normalisation is applied here (TODO: add stemming
    and multi-word terms once the pre-processing scheme is decided).

    Args:
        filename: Name of the text file inside ``cor-por-a/``.

    Returns:
        list of str: the tokens of everything after the first 10 lines, in
        document order. Empty if the file has 10 lines or fewer.

    Raises:
        IOError / OSError: if the file cannot be opened.
    """
    # The context manager guarantees the handle is closed, even if a read
    # fails part-way through.
    with open('cor-por-a/' + filename, 'r') as text:
        for _ in range(10):
            # readline() keeps the trailing newline, so the preview is
            # printed double-spaced.
            print(text.readline())
        return text.read().split()

#parseTextFile('TomSawyer.txt')


# Correlation Matrix
1. Parse U to create a word-word frequency matrix M, where each row represents a word and each entry x(i,j) represents the number of times word i co-occurs with word j.
2. Convert M to a new matrix M’ with some sort of correlation operation. We could use PMI, Occai (see Josh’s paper), CSA, or some other correlation structure.
3. Let row a represent the unigram “African American”. Take in that row, and output an ordered list of (this_unigram, correlation_score) pairs which represent the correlation score of this_unigram with the term “African American”
4. Produce a list L of the top 100 correlated words with the term “African American”


In [11]:
# Builds the word-word co-occurrence counts for a corpus. Time is linear in
# the number of tokens; memory is still quadratic in the vocabulary size.
def createMatrix(filename='TomSawyer.txt', tokens=None):
    """Build a symmetric bigram co-occurrence matrix over a token stream.

    Tokens are lower-cased. For every adjacent pair (a, b) the cells
    [a][b] and [b][a] are incremented; in addition the diagonal cell [a][a]
    is incremented once per occurrence of a. Note that two identical
    adjacent tokens therefore add to the diagonal through both routes.

    Args:
        filename: Corpus file handed to ``parseTextFile`` when ``tokens`` is
            not given. Defaults to the file the notebook originally used.
        tokens: Optional pre-tokenised corpus (iterable of str). When
            provided, no file is read.

    Returns:
        tuple: ``(mat, vocab_vec)`` where ``vocab_vec`` is the sorted list of
        distinct lower-cased tokens and ``mat`` is a list of lists of ints
        with one row and one column per vocabulary entry. An empty corpus
        yields ``([], [])``.
    """
    if tokens is None:
        tokens = parseTextFile(filename)
    u_vec = [term.lower() for term in tokens]

    vocab_vec = sorted(set(u_vec))
    vocab_size = len(vocab_vec)
    mat = [[0] * vocab_size for _ in range(vocab_size)]
    if not u_vec:
        # Nothing to count; avoids indexing the last token of an empty list.
        return (mat, vocab_vec)

    # Resolve every token to its row once through a dict instead of calling
    # list.index() (a linear scan of the vocabulary) twice per bigram.
    index_of = dict((term, row) for row, term in enumerate(vocab_vec))
    indices = [index_of[term] for term in u_vec]

    # Updates matrix, using bigrams
    for index_one, index_two in zip(indices, indices[1:]):
        mat[index_one][index_one] += 1
        mat[index_one][index_two] += 1
        mat[index_two][index_one] += 1

    # The final token never appears as the first half of a bigram, so its
    # occurrence count is added to the diagonal separately.
    last_term_index = indices[-1]
    mat[last_term_index][last_term_index] += 1
    return (mat, vocab_vec)

In [12]:
def cosine(u, v):
    """Return the cosine distance between the vectors `u` and `v`.

    Thin wrapper that delegates to ``scipy.spatial.distance.cosine``.
    """
    distance = scipy.spatial.distance.cosine(u, v)
    return distance

In [13]:
def neighbors(word, mat, rownames, distfunc=cosine):
    """Rank every row of `mat` by its distance to the row for `word`.

    Args:
        word: Term to look up; must be an element of `rownames`.
        mat: Indexable collection of row vectors.
        rownames: List of labels, where ``rownames[i]`` names ``mat[i]``.
        distfunc: Callable taking two row vectors and returning a distance.

    Returns:
        list of ``(rowname, distance)`` tuples, sorted by ascending
        distance. The row for `word` itself is included.

    Raises:
        ValueError: if `word` is not in `rownames`.
    """
    if word not in rownames:
        raise ValueError('%s is not in this VSM' % word)
    target_row = mat[rownames.index(word)]
    scored = []
    for idx in range(len(mat)):
        scored.append((rownames[idx], distfunc(target_row, mat[idx])))
    # Closest rows first.
    scored.sort(key=itemgetter(1))
    return scored

In [14]:
from __future__ import division
def pmi(mat, rownames=None, positive=True):
    """Reweight a co-occurrence count matrix with pointwise mutual information.

    Each cell becomes ``log(P(i, j) / (P(i) * P(j)))``, where the
    probabilities are obtained by normalising `mat` by its grand total.
    Cells whose ratio is not strictly positive (zero counts, or 0/0 from an
    empty row or column) are assigned 0.0 rather than ``-inf``/``nan``.

    Args:
        mat: 2-D array-like of counts (list of lists or ndarray).
        rownames: Optional row labels; passed through unchanged.
        positive: If True, negative PMI values are clipped to 0.0 (PPMI).

    Returns:
        tuple: ``(p, rownames)`` where ``p`` is a float ndarray with the
        same shape as `mat`.
    """
    # Explicit float conversion: the result no longer depends on true
    # division being enabled for integer counts.
    counts = np.asarray(mat, dtype=float)
    # Empty rows/columns produce 0/0 below; those cells are mapped to 0.0,
    # so the floating-point warnings carry no information.
    with np.errstate(divide='ignore', invalid='ignore'):
        # Joint probability table:
        joint = counts / counts.sum()
        row_probs = joint.sum(axis=1, keepdims=True)
        col_probs = joint.sum(axis=0)
        ratio = joint / (row_probs * col_probs)
        has_mass = ratio > 0.0
        p = np.zeros_like(ratio)
        p[has_mass] = np.log(ratio[has_mass])
    if positive:
        p = np.maximum(p, 0.0)
    return (p, rownames)

def _pmi_log(x, positive=True):
    val = 0.0
    if x > 0.0:
        val = np.log(x)
    if positive:
        val = max([val,0.0])
    return val

In [15]:
def correlateds(word, mat, rownames, distfunc=cosine):
    """List the scores stored in the row for `word`, highest first.

    Unlike a distance-based lookup, this reads the values already present
    in the row of `mat` belonging to `word`: entry ``i`` of that row is
    paired with ``rownames[i]``.

    Args:
        word: Term to look up; must be an element of `rownames`.
        mat: Indexable collection of row vectors.
        rownames: List of labels, where ``rownames[i]`` names ``mat[i]``.
        distfunc: Accepted for signature parity; not used by this function.

    Returns:
        list of ``(rowname, score)`` tuples sorted by descending score.

    Raises:
        ValueError: if `word` is not in `rownames`.
    """
    if word not in rownames:
        raise ValueError('%s is not in this VSM' % word)
    target_row = mat[rownames.index(word)]
    scored = []
    for idx in range(len(mat)):
        scored.append((rownames[idx], target_row[idx]))
    # Strongest associations first.
    scored.sort(key=itemgetter(1), reverse=True)
    return scored

In [16]:
# The correlation list is an ordered list of (word, correlation_score) tuples,
# with the highest-scoring words first.
def correlationList(mat_ppmi, word='colored'):
    """Return the words most strongly associated with `word`.

    Args:
        mat_ppmi: Pair ``(matrix, rownames)``; element 0 is the reweighted
            matrix and element 1 the list of row labels.
        word: Term whose row is ranked. Defaults to ``'colored'``, the term
            this notebook originally queried.

    Returns:
        list of ``(word, correlation_score)`` tuples in descending score
        order, as produced by ``correlateds``.

    Raises:
        ValueError: if `word` is not among the row labels.
    """
    return correlateds(word=word, mat=mat_ppmi[0], rownames=mat_ppmi[1], distfunc=cosine)

In [17]:
# createMatrix() returns a pair: mat[0] is the co-occurrence matrix and mat[1] is the list of row names.
# To get the vector which corresponds to a given word, call mat[0][mat[1].index('my_word')]
mat = createMatrix()
# Reweight the raw counts with positive PMI; mat_ppmi keeps the same (matrix, rownames) layout.
mat_ppmi = pmi(mat=mat[0], rownames=mat[1], positive=True)

# Display the five highest-scoring (word, score) pairs to show the format of correlationList's output
correlationList(mat_ppmi)[:5]



The Project Gutenberg EBook of The Adventures of Tom Sawyer, Complete by

Mark Twain (Samuel Clemens)



This eBook is for the use of anyone anywhere at no cost and with almost

no restrictions whatsoever. You may copy it, give it away or re-use

it under the terms of the Project Gutenberg License included with this

eBook or online at www.gutenberg.net



Title: The Adventures of Tom Sawyer, Complete



[('colored', 9.4169795839905763),
 ('frontispiece--a', 9.4169795839905763),
 ('boy,', 6.2389257536426292),
 ('small', 5.8616315225011615),
 ('and', 1.3983541189448454)]

# Sentiment Analysis
Takes in a list V of words and returns the average sentiment score across all terms in V, as determined by Freebase. Note to Jason: consider other sentiment databases.

In [3]:
def getSentiment(word):
    """Return a placeholder sentiment score for `word`.

    The real lexicon lookup is not implemented yet; the length of the word
    stands in as a dummy score.
    """
    # TODO: replace this with a real score (e.g. from Freebase, or
    # nltk.sentiment.vader polarity scores).
    return len(word)


V = ['good', 'bad', 'great', 'worrisome', 'stupid']


def generate_sentiment(wordList):
    """Return the mean ``getSentiment`` score over `wordList`.

    Args:
        wordList: Iterable of words (a list, tuple or generator).

    Returns:
        float: the average score, or 0.0 when `wordList` is empty.
    """
    # Materialise once so generators work and the length is known.
    words = list(wordList)
    if not words:
        # An empty list has no average; return 0.0 instead of dividing by zero.
        return 0.0
    totalSentiment = sum((getSentiment(word) for word in words), 0.0)
    return totalSentiment / len(words)


print(generate_sentiment(V))

5.4


# XOR/AND
Takes in a dict of corpus:list of words and returns a dict of corpus:XOR words and dict of corpus:AND words.

In [2]:
def XOR(corpus1, corpus2):
    """Return the set of items that occur in exactly one of the two corpora."""
    return set(corpus1).symmetric_difference(corpus2)
def AND(corpus1, corpus2):
    """Return the set of items that occur in both corpora."""
    return set(corpus1).intersection(corpus2)

# Word Cloud
Takes in a matrix M and correlation list L. Using t-SNE, produces a word cloud which represents the correlation between all terms.

In [None]:
from sklearn.manifold import TSNE

def word_cloud(corr_list):
    model = TSNE(n_components=2, random_state=0)
    tsne_matrix = model.fit_transform(corr_list)