# Setup

In [2]:
import os
import nltk

from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.porter import PorterStemmer

In [3]:
# Info for creating VSM data
vsmdata_home = "vsmdata"
import os
import sys
import csv
import random
import itertools
from operator import itemgetter
from collections import defaultdict
import numpy as np
import scipy
import scipy.spatial.distance
from numpy.linalg import svd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import utils
import string

In [4]:
equivalence_set = ['african american', 'african-american', 'black', 'african', 'nigger', 'nigga']

In [5]:
seed_set2 = ['asian-american', 'african-american', 'black', 'african', 'chinese', 'japanese', 'asian', 'jewish', 'latino', 'mexican', 'russian', 'american']

# File Input
Takes in a text file and returns a list of ordered unigrams U. 
It should also consider stemming and other relevant pre-processing. Josh's note: parse "African American" as a unigram.

In [14]:
def parse_NYT_articles_seedword(num_files, root_directory='cor-por-a/2006/', file_percentage=.1):
    overallCorpus = [];
    i = 0;
    print "Parsing " + str(100*file_percentage) + "% of files. Will cap hard at " + str(num_files) + " files."
    for dirname_1 in os.listdir(root_directory):
        if (dirname_1 == '.DS_Store'):
            continue;
        print "Parsing outer directory: " + dirname_1;
        for dirname in os.listdir(root_directory + dirname_1 + '/'):
            if (dirname == '.DS_Store'):
                continue;
            print "parsing directory " + root_directory + dirname_1 + '/' + dirname;
            for filename in os.listdir(root_directory + dirname_1 + '/' + dirname + '/'):
                if random.random() <= file_percentage:
                    if (i >= num_files):
                        print 'Num files: parsed overall ' + str(i);
                        return overallCorpus
                    if (filename == '.DS_Store'):
                        continue;
                    article_file = root_directory + dirname_1 + '/' + dirname + '/' + filename;
                    article_rep = parse_NYT_article(article_file);
                    if (article_rep):
                        article_text = remove_punctuation(article_rep[1]).split(" ");
                        overallCorpus += ' ';
                        overallCorpus += article_text;
                    i = i+1;
    print "num files: " + str(i);
    return overallCorpus;

In [15]:
def parse_NYT_articles_worddoc(num_files, root_directory='cor-por-a/2006/'):
    overallCorpus = [];
    file_list = [];
    i = 0;
    for dirname_1 in os.listdir(root_directory):
        if (dirname_1 == '.DS_Store'):
            continue;
        print "Parsing outer file directory " + dirname_1;
        for dirname in os.listdir(root_directory + dirname_1 + '/'):
            if (dirname == '.DS_Store'):
                continue;
            print "parsing directory " + root_directory + dirname_1 + '/' + dirname;
            for filename in os.listdir(root_directory + dirname_1 + '/' + dirname + '/'):      
                if (i >= num_files):
                    print 'Num files parsed in total: ' + str(i);
                    return (overallCorpus, file_list)
                if (filename == '.DS_Store'):
                    continue;
                article_file = root_directory + dirname_1 + '/' + dirname + '/' + filename;
                file_list.append(filename)
                article_rep = parse_NYT_article(article_file);
                if (article_rep):
                    article_text = remove_punctuation(article_rep[1]).split(" ");
                    for word in article_text:
                        overallCorpus.append((word, filename))
                i = i+1;
    print "Num files parsed in total: " + str(i)
    return (overallCorpus, file_list);

# Correlation Matrix
1. Parse U to create a word-word frequency matrix M, where each row represents a word and each entry x(i,j) represents the number of times word i co-occurs with word j.
2. Convert M to a new matrix M’ with some sort of correlation operation. We could use PMI, Occai (see Josh’s paper), CSA, or some other correlation structure.
3. Let row a represent the unigram “African American”. Take in that row, and output an ordered list of (this_unigram, correlation_score) pairs which represent the correlation score of this_unigram with the term “African American”
4. Produce a list L of the top 100 correlated words with the term “African American”


In [16]:
# returns matrix object where mat_obj[0] refers to the seed_word matrix, where mat[1] refers
# to the vocab list, where mat[2] refers to a frequency list,
# where mat_obj[0][0] refers to the vector representing co-occurrence for first word in seed
# set, and where mat_obj[0][len(seed_set)] refers to a vector of overall counts for each term 
def createSeedWordMatrixNYT(num_files, file_year="2006", percentage_of_files=.1):
    # Initializes vector of terms
    corpus = "cor-por-a/" + file_year + "/"
    u_vec = [x.lower() for x in parse_NYT_articles_seedword(num_files, root_directory=corpus, file_percentage=percentage_of_files)];
    num_terms = len(u_vec);
    print 'num terms in corpus: ' + str(num_terms);
    vocab_vec = np.unique(u_vec).tolist()
    vocab_size = len(vocab_vec)
    print 'vocab size: ' + str(vocab_size);
    print 'matrix dimensions: ' + str(len(seed_set2)) + ' x ' + str(len(vocab_vec));
    mat = [[0 for x in range(vocab_size)] for y in range(len(seed_set2)+1)]
    frequency_vec = [0 for x in range(vocab_size)]

    index_dict = {};
    for i in range (0, len(vocab_vec)):
        index_dict[vocab_vec[i]] = i;
    print 'index_dict created!'
    
    # Updates matrix, using bigrams
    term = u_vec[0];
    term_neighbor_r = u_vec[1];
    index_term = index_dict[term];
    frequency_vec[index_term] += 1;
    if (any(seed_word == term for seed_word in seed_set2)):
            index_seed = seed_set2.index(term);
            index_neighbor_r = index_dict[term_neighbor_r];
            mat[index_seed][index_neighbor_r] += 1;
            mat[index_seed][index_term] += 1;
    for i in range(1, len(u_vec)-1):
        if (i % 1000 == 0):
            print 'parsed ' + str(i) + '/' + str(num_terms) + ' terms'
        term = u_vec[i];
        term_neighbor_l = u_vec[i-1];
        term_neighbor_r = u_vec[i+1];
        index_term = index_dict[term]
        
        frequency_vec[index_term] += 1;
        if (any(seed_word == term for seed_word in seed_set2)):
            index_seed = seed_set2.index(term);
            index_neighbor_l = index_dict[term_neighbor_l]
            index_neighbor_r = index_dict[term_neighbor_r]
            
            mat[index_seed][index_neighbor_l] += 1;
            mat[index_seed][index_neighbor_r] += 1;
            mat[index_seed][index_term] += 1;
    term = u_vec[len(u_vec)-1];
    term_neighbor_l = u_vec[len(u_vec)-2];
    index_term = index_dict[term]
    
    frequency_vec[index_term] += 1;
    if (any(seed_word == term for seed_word in seed_set2)):
            index_seed = seed_set2.index(term);
            index_neighbor_l = index_dict[term];
            mat[index_seed][index_neighbor_l] += 1;
            mat[index_seed][index_term] += 1;
    

    #   Filter the matrix by removing all words with frequency less than cutoff_freq
    mat = np.transpose(mat)
    indicies = []
    cutoff_freq = 5
    for i in range(0, len(frequency_vec)):
        if frequency_vec[i] > cutoff_freq:
            indicies.append(i) #keep track of indices of all words with frequency < cutoff_freq
    print 'Parsed ' + str(num_terms) + '/' + str(num_terms) + ' terms';
    #update mat, freq_vec, and vocab_vec to include only the indices we saved
    mat = mat[np.array(indicies)]
    frequency_vec = np.array(frequency_vec)
    frequency_vec = frequency_vec[np.array(indicies)]
    temp_vocab = []
    for index in indicies:
        temp_vocab.append(vocab_vec[index])
    vocab_vec = temp_vocab
    #transpose mat back to original shape
    mat = np.transpose(mat)
    print("New vocab size: " + str(len(mat[0])))
    return (mat, vocab_vec, frequency_vec);

In [70]:
# Returns a word, correlation list tuple for each seed in seed_set2
from nltk.corpus import stopwords
def getCorrelationLists(mat_obj, rem_stopwords=1):
    tupleArr = []
    # Word lists for each word
    for j in range(len(seed_set2)):
        w = mat_obj[0][j]
        unfiltered_dists = [(mat_obj[1][i], w[i]) for i in range(len(w))]
        
        # Without stop words
        dists = [];
        if rem_stopwords:
            dists = [(word, frequency) for (word, frequency) in unfiltered_dists if word not in stopwords.words('english')]
        else:
            dists = unfiltered_dists
        sorted_dists = sorted(dists, key=itemgetter(1), reverse=True)
        print "Correlation list for word: " + seed_set2[j] + "; " + str(sorted_dists[:20])
        tupleArr.append((seed_set2[j], sorted_dists));

    #frequency list for each word:
    w = mat_obj[2]
    dists = [(mat_obj[1][i], w[i]) for i in range(len(w))]
    sorted_dists = sorted(dists, key=itemgetter(1), reverse=True)
    print "Frequency list: " + str(sorted_dists[:20])
    tupleArr.append((seed_set2[j], sorted_dists));
    
    return tupleArr;

In [71]:
def cosine(u, v):        
    return scipy.spatial.distance.cosine(u, v)

In [72]:
def neighbors(word, mat, rownames, distfunc=cosine):
    if word not in rownames:
        raise ValueError('%s is not in this VSM' % word)
    w = mat[rownames.index(word)]
    dists = [(rownames[i], distfunc(w, mat[i])) for i in range(len(mat))]
    return sorted(dists, key=itemgetter(1), reverse=False)

In [73]:
from __future__ import division
# PMI log: p(x, y)/ p(x)p(y)
def pmi_seed(mat_obj, rownames=None, positive=True):  
    rownames = mat_obj[1];
    frequencies = mat_obj[2];
    word_count = np.sum(frequencies, axis=None)
    
    # Joint probability table:
    p = mat_obj[0] / word_count;
    colprobs = frequencies/word_count;
    sum_of_colprobs = np.sum(colprobs)
    
    
    np_pmi_log = np.vectorize((lambda x : _pmi_log(x, positive=positive)))    
    mat_ppmi = [];
    for row in p:
        if np.sum(row) > 0:
            mat_ppmi.append(np_pmi_log(row / (np.sum(row)*colprobs)));
        else:
            mat_ppmi.append([0 for x in row])
    return (mat_ppmi, rownames, frequencies)

In [74]:
def _pmi_log(x, positive=True):
    val = 0.0
    if x > 0.0:
        val = np.log(x)
    if positive:
        val = max([val,0.0])
    return val

# Tools

### Saving a matrix ###

In [17]:
# import os
# if not os.path.exists('my_file'): numpy.savetxt('my_file', my_array)

#this will save the result of our matrix into a human-readable text file, and the original array is easily
#recreated using loadtxt.

# np.savetxt("mat_features", mat[0])
# np.savetxt("mat_labels", mat[1])
# np.loadtxt("mat_labels")

   ### tool to remove punctuation from a text ###

In [18]:
s = "string. With. Punctuation?" # Sample string
def remove_punctuation(text):
    for c in string.punctuation:
        if c != '-': #excluding - because we want to preserve african-american as a token
            text = text.replace(c," ")
    return text

print(remove_punctuation(s))


#period, question mark, exclamation point, comma, semicolon, colon, dash, 
#hyphen, parentheses, brackets, braces, apostrophe, quotation marks, and ellipses

string  With  Punctuation 


### tools to parse out text from ntif/xml document for NYT articles ###

In [19]:
# http://docs.python-guide.org/en/latest/scenarios/xml/
# http://stackoverflow.com/questions/1912434/how-do-i-parse-xml-in-python
import xml.etree.ElementTree as ET

def parse_NYT_article(xmlFile): 
    tree = ET.parse(xmlFile)
    root = tree.getroot()
    year = ''
    article_text = '';
    for child in root:
        if child.tag == 'head':
            for subchild in child:
                if 'name' in subchild.attrib:
                    if subchild.attrib['name'] == 'publication_year':
                        year = subchild.attrib['content']
        if child.tag == 'body':
            body = child
    for child in body:
        if child.tag == 'body.content':
            content = child
    for child in content:
        if child.attrib == {'class': 'full_text'}:
            for paragraph in child:
                article_text += paragraph.text
            return (year, article_text)
                
parse_NYT_article('nyt_sample_2.xml')

('2007',
 'A doctor who works at a clinic in Jamaica has been charged with insurance fraud, accused of billing insurance companies for tests that were never performed on victims of motor vehicle accidents, prosecutors said yesterday. The doctor, Alexander Israeli, 53, of Middle Village, was arraigned in Criminal Court on Monday night on charges of grand larceny and insurance fraud, said Richard A. Brown, the Queens district attorney. Mr. Brown said that Dr. Israeli billed insurance companies last year for $21,000 worth of neurological tests that were not performed. He faces loss of his medical license and up to seven years in prison if convicted, prosecutors said.')

# Creating Seed Word Matrix

In [20]:
all_years = ["1987", "1988", "1989", "1990", "1991", "1992", "1993", "1994", "1995", "1996", "1997", "1998", "1999", "2000", "2001", "2002", "2003", "2004", "2005", "2006", "2007"]

In [22]:
for this_file_year in all_years:
    print this_file_year
    mat_obj = createSeedWordMatrixNYT(num_files=10, file_year=this_file_year)
    np.save("testm/" + this_file_year + "_seedword", mat_obj)

1987
Parsing 10.0% of files. Will cap hard at 10 files.
Parsing outer directory: 01
parsing directory cor-por-a/1987/01/01
Num files: parsed overall 10
num terms in corpus: 6094
vocab size: 1818
matrix dimensions: 12 x 1818
index_dict created!
parsed 1000/6094 terms
parsed 2000/6094 terms
parsed 3000/6094 terms
parsed 4000/6094 terms
parsed 5000/6094 terms
parsed 6000/6094 terms
Parsed 6094/6094 terms
New vocab size: 128
1988
Parsing 10.0% of files. Will cap hard at 10 files.
Parsing outer directory: 01
parsing directory cor-por-a/1988/01/01
Num files: parsed overall 10
num terms in corpus: 6563
vocab size: 1810
matrix dimensions: 12 x 1810
index_dict created!
parsed 1000/6563 terms
parsed 2000/6563 terms
parsed 3000/6563 terms
parsed 4000/6563 terms
parsed 5000/6563 terms
parsed 6000/6563 terms
Parsed 6563/6563 terms
New vocab size: 148
1989
Parsing 10.0% of files. Will cap hard at 10 files.
Parsing outer directory: 01
parsing directory cor-por-a/1989/01/01
Num files: parsed overall 1

In [156]:
this_file_year = "2006"

In [157]:
mat_obj = createSeedWordMatrixNYT(num_files=100, file_year=this_file_year)

Parsing outer directory: 01
parsing directory cor-por-a/2006/01/01
Num files: parsed overall 100
num terms in corpus: 102381
vocab size: 14051
matrix dimensions: 10 x 14051
index_dict created!
parsed 1000/102381 terms
parsed 2000/102381 terms
parsed 3000/102381 terms
parsed 4000/102381 terms
parsed 5000/102381 terms
parsed 6000/102381 terms
parsed 7000/102381 terms
parsed 8000/102381 terms
parsed 9000/102381 terms
parsed 10000/102381 terms
parsed 11000/102381 terms
parsed 12000/102381 terms
parsed 13000/102381 terms
parsed 14000/102381 terms
parsed 15000/102381 terms
parsed 16000/102381 terms
parsed 17000/102381 terms
parsed 18000/102381 terms
parsed 19000/102381 terms
parsed 20000/102381 terms
parsed 21000/102381 terms
parsed 22000/102381 terms
parsed 23000/102381 terms
parsed 24000/102381 terms
parsed 25000/102381 terms
parsed 26000/102381 terms
parsed 27000/102381 terms
parsed 28000/102381 terms
parsed 29000/102381 terms
parsed 30000/102381 terms
parsed 31000/102381 terms
parsed 320

In [161]:
np.save(this_file_year + "_seedword", mat_obj)

# Creating Correlation Lists
Using the functions above, creates seed-word matrix with a user-specified number of files, performs pmi on that matrix, and computes a resulting correlation list for each seed word.

In order to use more files, update the num_files variable. In order to update the seed set, update the seed_set2 global variable to include more words.

Note: Creating these correlation lists at scale is very slow. Start off by processing about 10 files, and scale up. 

In [78]:
rehydrate_file_year = "2006";

In [79]:
mat_rehydrate = np.load('matrices/' + rehydrate_file_year + "_seedword.npy");

In [80]:
mat_rehydrate_ppmi = pmi_seed(mat_rehydrate);

In [81]:
correlation_list = getCorrelationLists(mat_rehydrate_ppmi, rem_stopwords=0);

Correlation list for word: asian-american; [(u'asian-american', 12.153338576082547), (u'cowed', 8.5424206634383228), (u'walk-on', 8.4644591219686109), (u'whiz', 7.7966297493929559), (u'receptionist', 7.7344979682859494), (u'spotlights', 7.6100437938125438), (u'assisting', 7.0719342110980845), (u'35-year-old', 7.0058440992690949), (u'29th', 6.6520803655378211), (u'actresses', 6.5402104696944772), (u'classmates', 6.2425419320420206), (u'turnout', 6.041871236579869), (u'best-known', 5.9169689858788432), (u'affecting', 5.8397905298054527), (u'studies', 5.737014873759767), (u'69', 5.2366235557289391), (u'themes', 5.1686222559642827), (u'supports', 5.1006175268502254), (u'population', 5.0653472467974776), (u'advisory', 5.0632617403064559)]
Correlation list for word: african-american; [(u'african-american', 9.6691920935679452), (u'jive-talking', 7.8774326243398898), (u'complainants', 7.4719675162317252), (u'manservant', 7.4719675162317252), (u'foodways', 7.3666070005738993), (u'predominately'

In [138]:
correlation_list_raw = getCorrelationLists(mat_rehydrate, rem_stopwords=1);

Correlation list for word: asian-american; [(u'asian-american', 110), (u'', 17), (u'first', 11), (u'studies', 8), (u'art', 6), (u'percent', 5), (u'one', 4), (u'population', 4), (u'voters', 4), (u'women', 4), (u'four', 3), (u'woman', 3), (u'baby', 2), (u'children', 2), (u'community', 2), (u'experience', 2), (u'films', 2), (u'high', 2), (u'men', 2), (u'network', 2)]
Correlation list for word: african-american; [(u'african-american', 1320), (u'', 212), (u'first', 80), (u'women', 38), (u'artists', 36), (u'woman', 36), (u'community', 35), (u'history', 28), (u'studies', 28), (u'experience', 21), (u'men', 19), (u'percent', 18), (u'young', 18), (u'students', 14), (u'culture', 12), (u'family', 12), (u'life', 12), (u'three', 12), (u'two', 11), (u'children', 10)]
Correlation list for word: black; [(u'black', 14854), (u'', 3048), (u'man', 234), (u'men', 223), (u'first', 219), (u'pepper', 206), (u'ground', 198), (u'women', 174), (u'woman', 165), (u'people', 148), (u'young', 127), (u'voters', 120), 

# Word Document Matrix
Creates a word document matrix for use by Theo and her LDA work!

In [90]:
from nltk.corpus import stopwords

def createWordDocumentMatrixNYT(num_files):
    result = parse_NYT_articles_worddoc(num_files)
    word_file_vec = [(x[0].lower(),x[1]) for x in result[0]]
    word_vec = [x[0] for x in word_file_vec]
    vocab_vec = np.unique(word_vec).tolist()
    file_vec = result[1]
    print 'num terms in corpus: ' + str(len(word_vec))
    print 'vocab size: ' + str(len(vocab_vec))
    print 'matrix dimensions: ' + str(len(vocab_vec)) + ' x ' + str(len(file_vec))
    mat = [[0 for x in range(len(file_vec))] for y in range(len(vocab_vec))]
    
    index_dict = {};
    for i in range (0, len(vocab_vec)):
        index_dict[vocab_vec[i]] = i
    print 'index_dict created!'
    
    file_index_dict = {};
    for i in range (0, len(file_vec)):
        file_index_dict[file_vec[i]] = i
    print 'file_index_dict created!'
    
    i = 0;
    for word_file_tuple in word_file_vec:
        if (i % 1000 == 0):
            print 'parsed ' + str(i) + '/' + str(len(word_vec)) + ' terms'
        word = word_file_tuple[0]
        file_name = word_file_tuple[1]
        
        # CHANGED
        #index_word = vocab_vec.index(word);
        index_word = index_dict[word]  
        #index_file = file_vec.index(file_name);
        index_file = file_index_dict[file_name]
        
        mat[index_word][index_file] +=1
        i = i+1
    print 'Parsed all terms'
    keep = []
    stop = stopwords.words('english')
    updated_vocab_vec = []
    for ind, word in enumerate(vocab_vec):
        if word not in stop:
            keep.append(ind)
            updated_vocab_vec.append(word)
    keep = np.array(mat)[keep]
            
    return (keep, updated_vocab_vec)

In [92]:
(mat, lda_vocab) = createWordDocumentMatrixNYT(num_files=100)

parsing outer file directory 01
parsing directory cor-por-a/2006/01/01
Num files: 100
num terms in corpus: 102283
vocab size: 14050
matrix dimensions: 14050 x 100
index_dict created!
file_index_dict created!
parsed 0/102283 terms
parsed 1000/102283 terms
parsed 2000/102283 terms
parsed 3000/102283 terms
parsed 4000/102283 terms
parsed 5000/102283 terms
parsed 6000/102283 terms
parsed 7000/102283 terms
parsed 8000/102283 terms
parsed 9000/102283 terms
parsed 10000/102283 terms
parsed 11000/102283 terms
parsed 12000/102283 terms
parsed 13000/102283 terms
parsed 14000/102283 terms
parsed 15000/102283 terms
parsed 16000/102283 terms
parsed 17000/102283 terms
parsed 18000/102283 terms
parsed 19000/102283 terms
parsed 20000/102283 terms
parsed 21000/102283 terms
parsed 22000/102283 terms
parsed 23000/102283 terms
parsed 24000/102283 terms
parsed 25000/102283 terms
parsed 26000/102283 terms
parsed 27000/102283 terms
parsed 28000/102283 terms
parsed 29000/102283 terms
parsed 30000/102283 terms

# Sentiment Analysis
Takes in a list V of words and returns the average sentiment score across all terms in V as determined by freebase. Note to Jason: consider other sentiment databases

In [34]:
from nltk.corpus import sentiwordnet as swn
from __future__ import unicode_literals

def getSentiment(word):
    synset = list(swn.senti_synsets(word))
    if len(synset) > 0: #if a synset exists for this word
        synset = synset[0]
        return(synset.pos_score(), synset.neg_score(), synset.obj_score())

def is_ascii(s):
    return all(ord(c) < 128 for c in s)

V = ['good', 'bad', 'great', 'awesome', 'amazing', 'holy', 'beautiful', 'worrisome', 'stupid']
def generate_sentiment(wordList):
    totalSentiment = 0.0;
    for word in wordList:
        if is_ascii(word): #see note below for rationale
            sentiment = getSentiment(word)
            if sentiment == None:
                sentiment = 0.0
            if type(sentiment) is float: #why does this happen
                print "n/a"
            else:  
                totalSentiment += (sentiment[0] - sentiment[1]) 
                print (sentiment[0] - sentiment[1])
        #sentiwordnet generates tuples of pos, neg, and neu. currently naively choosing to consider only sum of pos and neg. 
    averageSentiment = totalSentiment/len(wordList)
    return averageSentiment

def generate_sentiment_2(wordTupleList):
    reader = csv.reader(open('sentiment_words.txt', 'rb'))
    sentiment_words = dict(reader)
    sentiment_score = 0
    for wordTuple in wordTupleList:
        word = wordTuple[0]
        score = 1/wordTuple[1] #inverse of distance
        if word in sentiment_words:
            if sentiment_words[word] == 'pos':
                print word + " +" + str(score)
                sentiment_score += score
            if sentiment_words[word] == 'neg':
                print word + " -" + str(score)
                sentiment_score -= score
    return sentiment_score

#print generate_sentiment_2(neighbors_list)
print generate_sentiment_2(correlated_words)

NameError: name 'correlated_words' is not defined

### gensim example ###

In [None]:
def parser_wrapper(year):
    import time
    num_files = 1000000000 #all files
    root_directory = 'cor-por-a/'+ str(year) + '/'
    
    start = time.clock()
    word2vec = parse_NYT_articles_word2vec(num_files, root_directory)
    np.save('parses/' + str(year) + '_' + 'word2vec_parse', word2vec)
    end = time.clock()
    print "word2vec parse time: " + str(end-start)
    
    start = time.clock()
    seedword = parse_NYT_articles_seedword(num_files, root_directory)
    np.save('parses/' + str(yea
    end = time.clock()
    print "seedword parse time: " + str(end-start)
    
    start = time.clock()
    worddoc = parse_NYT_articles_worddoc(num_files, root_directory)
    np.save('parses/' + str(year) + '_' + 'worddoc_parse', worddoc)
    end = time.clock()
    print "worddoc parse time: " + str(end-start)
    
    #long-term goal: this is dumb. we should probably rewrite the functions to create all three types of parses at the same time
    
    return (word2vec, seedword, worddoc)

def process_year(year):
    parses = parser_wrapper(year) #[0]=word2vec, [1]=seedword, [2]=worddoc
    #make matrices
    #get correlation lists
    #make word2vec VSM
    #get sentiment of correlation list
    #get lda topics 
    #print out and save / year
    #graph across years
parser_wrapper(2006)

KeyboardInterrupt: 

In [110]:
import nltk.data

def parse_NYT_articles_word2vec(num_files, root_directory='cor-por-a/2006/'):
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle') #pretrained probabilistic model for parsing sentences
    
    sentenceList = [];
    i = 0;
    numSentences = 0;
    for dirname_1 in os.listdir(root_directory):
        if (dirname_1 == '.DS_Store'):
            continue;
        print "parsing outer file directory " + dirname_1;
        for dirname in os.listdir(root_directory + dirname_1 + '/'):
            if (dirname == '.DS_Store'):
                continue;
            print "parsing directory " + root_directory + dirname_1 + '/' + dirname;
            for filename in os.listdir(root_directory + dirname_1 + '/' + dirname + '/'):                
                if (i >= num_files):
                    print 'Num files: ' + str(i);
                    print "num sentences: " + str(numSentences)
                    return sentenceList
                if (filename == '.DS_Store'):
                    continue;
                article_file = root_directory + dirname_1 + '/' + dirname + '/' + filename;
                #print article_file
                article_rep = parse_NYT_article(article_file);
                if (article_rep):
                    article = article_rep[1];
                    for sentence in sent_detector.tokenize(article.strip()):
                        tokenized_sentence = []
                        sentence = remove_punctuation(sentence.lower())
                        for word in sentence.split():
                            tokenized_sentence.append(word)
                        sentenceList.append(tokenized_sentence)
                        numSentences += 1
                i = i+1;
    print "num files: " + str(i);
    print "num sentences: " + str(numSentences)
    return sentenceList;

In [82]:
from gensim.models import Word2Vec
from nltk.corpus import brown, movie_reviews, treebank
import time

# start = time.clock()
# parse = parse_NYT_articles_word2vec(10000000000)
# end = time.clock()
# print "parse time: " + str(end-start)

# start = time.clock()
# ours = Word2Vec(parse, min_count=5)
# end = time.clock()
# print "Word2Vec model generation time: " + str(end-start)

In [172]:
#train gensim to generate a vector representation of all words in vocabulary
#create new matrix by getting ours['vocab_word'] for all vocab words
#pass that new matrix into the semantic orientation model to get sentiment scores.

In [142]:
#ours.save("2006_word2vec_model")


In [84]:
word2vec_model = Word2Vec.load("matrices/2006_word2vec_model")

In [85]:
print(word2vec_model.most_similar('gold', topn=5))

[('bronze', 0.7129561901092529), (u'silver', 0.7019717693328857), ('pendants', 0.6369662880897522), ('necklaces', 0.6366876363754272), (u'platinum', 0.627060055732727)]


In [86]:
def create_word2vec_mat(word2vec_model, mat_ppmi_obj):
    #mat_obj_ppmi -> mat_ppmi, rownames, frequencies
    #get vocabulary list
    vocab_list = mat_ppmi_obj[1]
    #create new vocab list
    new_vocab_list = []
    #create new mat
    new_vsm = []
    #for each word in vocab list
    for word in vocab_list:
    #get its raw vector and append to new mat, append word to new vocab list
        try:
            new_vocab_list.append(word)
            new_vsm.append(word2vec_model[word])
        except KeyError, e:
            continue
    return (new_vsm, new_vocab_list)

In [87]:
def cosine(u, v):        
    """Cosine distance between 1d np.arrays `u` and `v`, which must have 
    the same dimensionality. Returns a float."""
    # Use scipy's method:
    return scipy.spatial.distance.cosine(u, v)
    # Or define it yourself:
    # return 1.0 - (np.dot(u, v) / (vector_length(u) * vector_length(v)))

In [88]:
def semantic_orientation(
        mat, 
        rownames,
        seeds1=('bad', 'nasty', 'poor', 'negative', 'unfortunate', 'wrong', 'inferior'),
        seeds2=('good', 'nice', 'excellent', 'positive', 'fortunate', 'correct', 'superior'),
        distfunc=cosine):    
    """No frills implementation of the semantic Orientation (SO) method of 
    Turney and Littman. seeds1 and seeds2 should be representative members 
    of two intutively opposing semantic classes. The method will then try 
    to rank the vocabulary by its relative association with each seed set.
        
    Parameters
    ----------
    mat : 2d np.array
        The matrix used to derive the SO ranking.
        
    rownames : list of str
        The names of the rows of `mat` (the vocabulary).
        
    seeds1 : tuple of str
        The default is the negative seed set of Turney and Littman.
        
    seeds2 : tuple of str
        The default is the positive seed set of Turney and Littman.
        
    distfunc : function mapping vector pairs to floats (default: `cosine`)
        The measure of distance between vectors. Can also be `euclidean`, 
        `matching`, `jaccard`, as well as any other distance measure 
        between 1d vectors. 
    
    Returns
    -------    
    list of tuples
        The vocabulary ranked according to the SO method, with words 
        closest to `seeds1` at the top and words closest to `seeds2` at the 
        bottom. Each member of the list is a (word, score) pair.
    
    """    
    sm1 = _so_seed_matrix(seeds1, mat, rownames)
    sm2 = _so_seed_matrix(seeds2, mat, rownames)
    scores = [(rownames[i], _so_row_func(mat[i], sm1, sm2, distfunc)) for i in xrange(len(mat))]
    return sorted(scores, key=itemgetter(1), reverse=False)

def _so_seed_matrix(seeds, mat, rownames):
    indices = [rownames.index(word) for word in seeds if word in rownames]
    if not indices:
        raise ValueError('The matrix contains no members of the seed set: %s' % ",".join(seeds))
    #print indices
    #print np.array(indices)
    return mat[np.array(indices)]
    
def _so_row_func(row, sm1, sm2, distfunc):
    val1 = np.sum([distfunc(row, srow) for srow in sm1])
    val2 = np.sum([distfunc(row, srow) for srow in sm2])
    return val1 - val2    

In [89]:
word2vec_mat = create_word2vec_mat(word2vec_model, mat_rehydrate_ppmi)

In [90]:
so = semantic_orientation(mat=np.array(word2vec_mat[0]), rownames=word2vec_mat[1])

#convert from list to easily searchable hashmap
word_scores = dict()
for tup in so:
    word_scores[tup[0]] = tup[1]

In [91]:
print "bottom 5"
print(so[:5])
print "top 5"
print(so[-5:])

bottom 5
[(u'inferior', -1.8694573761411135), (u'ancients', -1.6513550385244073), (u'tussling', -1.5947713050262005), (u'biederman', -1.5865474415596097), (u'bi\xe8re', -1.5822854371790678)]
top 5
[(u'gossypol', 1.2959164292257039), (u'stung', 1.2967596300897988), (u'surpluses', 1.3591987337890927), (u'talkative', 1.3674854875844398), (u'draft-day', 1.4666110179148397)]


In [156]:
def get_semantic_score(word_list):
    score = 0
    for word in word_list:
        if word in word_scores:
            score += (word_scores[word])
        else:
            print 'not in vocab'
    return score

In [157]:
seeds_neg = ['bad', 'nasty', 'poor', 'negative', 'unfortunate', 'wrong', 'inferior']
seeds_pos = ['good', 'nice', 'excellent', 'positive', 'fortunate', 'correct', 'superior']
pos_score = get_semantic_score(seeds_pos)
neg_score = get_semantic_score(seeds_neg)
midline = (pos_score + neg_score) / 2.0
print "reference semantic scores: "
print "positive seed set: " + str(pos_score)
print "negative seed set: " + str(neg_score)
print "average score (midline): " + str(midline)




reference semantic scores: 
positive seed set: 3.12092685277
negative seed set: -8.00491091215
average score (midline): -2.44199202969


In [171]:
#iterate through the first index to get all of our categories
#for each category, add [1] to get the correlation list
#for each word in correlation list, get them all by iterating and getting 
#print lists2[1][1][0][0]
#print lists2[category][1][tuple][0]

top_ten = []
category = 1
counter = 0
print("category: " + correlation_list_raw[category][0]) #raw is raw counts, not ppmi
print("words:")
for tup in correlation_list_raw[category][1]: #the actual list part of the category
    top_ten.append(tup[0])
    print(tup[0] + ": " + str(word_scores[tup[0]]))
    counter += 1
    if counter > 10:
        break
        
# print(get_semantic_score(['stupid', 'bad', 'gold']))
# print(get_semantic_score(['stupid', 'bad', 'worst']))
print(get_semantic_score(top_ten))

category: african-american
words:
african-american: -0.743690223597
: -0.831533828512
first: -0.686429540454
women: -0.0857264773776
artists: -0.628374555718
woman: -0.413207287792
community: -0.966005089778
history: -0.217846123311
studies: -0.485555302138
experience: -0.0320994491377
men: -0.316791278551
-5.40725915637


# XOR/AND
Takes in a dict of corpus:list of words and returns a dict of corpus:XOR words and dict of corpus:AND words.

In [None]:
toyList = ['black', 'block', 'beer']

def XOR(corpus1, corpus2):
    first = set(corpus1)
    second = set(corpus2)
    return first ^ second
def AND(corpus1, corpus2):
    first = set(corpus1)
    second = set(corpus2)
    return first & second

print 'XOR'
print XOR(toyList, neighbors_word_list)
print 'AND'
print AND(toyList, neighbors_word_list)

# Word Cloud
Takes in a matrix M and correlation list L. Using t-sne, produces a word cloud which represents correlation between all terms. 

In [None]:
from sklearn.manifold import TSNE
import numpy
numpy.set_printoptions(threshold='nan')

def word_cloud_preprocessing(words, matrix=mat_ppmi):
    output = []
    for word in words:
        ind = matrix[1].index(word)
        output.append(matrix[0][ind])
    return output
processed_mat = word_cloud_preprocessing(neighbors_word_list)
print processed_mat

def word_cloud(corr_list): #i think its processed_mat / didn't tsne take in a vector of labels as well?
    model = TSNE(n_components=2, random_state=0)
    tsne_matrix = model.fit_transform(corr_list)
    
word_cloud(processed_mat)

# Latent Dirichlet Allocation

topic modeling, currently using dummy data from lda.datasets

NOTE: rerunning can cause relabeling, which means that topic 0 in the first run might now be topic 15 in the next run, so don't be worried if the topic numbers change from run to run

run this on the command line first: pip install --user lda

https://pypi.python.org/pypi/lda

In [28]:
# from sklearn.feature_extraction.text import CountVectorizer
# vec = CountVectorizer(stop_words='english')
# data = vec.fit_transform(wdmat_files)

In [29]:
# #imports

# from __future__ import division, print_function

# np.set_printoptions(threshold='nan')

# #use pip show lda to find the path of where it's installed for you and modify the path append line below with your location
# import sys
# sys.path.append('/Users/theodorachu/.local/lib/python2.7/site-packages')

import lda
# import lda.datasets

In [30]:
# document-term matrix
X = mat.transpose()
print("type(X): {}".format(type(X)))
print("shape: {}\n".format(X.shape))

# the vocab
vocab = tuple(lda_vocab)
print("type(vocab): {}".format(type(vocab)))
print("len(vocab): {}\n".format(len(vocab)))

# titles for each story
# titles = lda.datasets.load_reuters_titles()
# print("type(titles): {}".format(type(titles)))
# print("len(titles): {}\n".format(len(titles)))

type(X): <type 'numpy.ndarray'>
shape: (1000, 38772)

type(vocab): <type 'tuple'>
len(vocab): 38772



In [31]:
# input is the equivalence set
# output is doc-term matrix of just relevant docs
def find_word(input):
    indices = [i for i, x in enumerate(lda_vocab) if x in input]
    mod_mat = (X.transpose()[indices]).transpose()
    mod_mat_indices = mod_mat.sum(axis = 1) != 0
    return X[mod_mat_indices]
rel_X = find_word(equivalence_set)
print("type(rel_X): {}".format(type(rel_X)))
print("shape: {}\n".format(rel_X.shape))
X = rel_X

type(rel_X): <type 'numpy.ndarray'>
shape: (120, 38772)



In [32]:
#example print statements
#gets word 3117 from document 0

doc_id = 0
word_id = 1

print("doc id: {} word id: {}".format(doc_id, word_id))
print("-- count: {}".format(X[doc_id, word_id]))
print("-- word : {}".format(vocab[word_id]))
#print("-- doc  : {}".format(titles[doc_id]))

doc id: 0 word id: 1
-- count: 0
-- word : -


In [33]:
#fitting the model

model = lda.LDA(n_topics=20, n_iter=500, random_state=1)
model.fit_transform(X)

array([[  1.35389610e-01,   1.19047619e-03,   1.08225108e-04, ...,
          2.39177489e-02,   3.28030303e-01,   3.35497835e-03],
       [  8.07502842e-02,   7.99545282e-03,   1.55740811e-02, ...,
          6.10079576e-03,   3.42591891e-01,   1.84956423e-01],
       [  3.70300752e-02,   6.26566416e-05,   2.32456140e-02, ...,
          6.26566416e-05,   3.13972431e-01,   6.26566416e-05],
       ..., 
       [  3.97590361e-02,   1.72117040e-04,   1.72117040e-04, ...,
          1.89328744e-03,   4.08089501e-01,   1.72117040e-04],
       [  1.02798507e-01,   1.51119403e-02,   3.91791045e-03, ...,
          1.86567164e-04,   3.45335821e-01,   1.86567164e-04],
       [  1.53894737e-01,   2.10526316e-04,   2.31578947e-03, ...,
          8.63157895e-03,   2.90736842e-01,   2.31578947e-03]])

In [34]:
#topic-word probabilities
#shape: (num topics, num words)

topic_word = model.topic_word_
print("type(topic_word): {}".format(type(topic_word)))
print("shape: {}".format(topic_word.shape))

type(topic_word): <type 'numpy.ndarray'>
shape: (20, 38772)


In [35]:
for n in range(5):
    sum_pr = sum(topic_word[n,:])
    print("topic: {} sum: {}".format(n, sum_pr))

topic: 0 sum: 1.0
topic: 1 sum: 1.0
topic: 2 sum: 1.0
topic: 3 sum: 1.0
topic: 4 sum: 1.0


In [36]:
#spits out top n words for each topic by probability

n = 10
for i, topic_dist in enumerate(topic_word):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n+1):-1]
    print('*Topic {}\n- {}'.format(i, ' '.join(topic_words)))

*Topic 0
- one -- two also people last years work still first
*Topic 1
- said police team game football brown carroll c fans coach
*Topic 2
- said mr ms room christmas york hotel square sticky apartment
*Topic 3
- city -- ny1 new mayor first giants york world game
*Topic 4
- list lists magazine bar ms fashion lunch top hot like
*Topic 5
- school ms schools said mother dr children johnson family choir
*Topic 6
- mr years died president state first law court department member
*Topic 7
- book black collection life museum norris photography books history writing
*Topic 8
- ms judge alito said party nelson house nicolas republican estate
*Topic 9
- p -- street 5 10 203 7 jan 1 saturday
*Topic 10
- -- one another good make less many place among something
*Topic 11
- cheese restaurant chicken food restaurants wine dishes sauce 1 street
*Topic 12
- oil water tv percent use 99 7 home set money
*Topic 13
- women world people cultural men human culture bark find century
*Topic 14
- quagga rau ski

In [37]:
#document-topic probabilities
#shape: (num documents, num topics)

doc_topic = model.doc_topic_
print("type(doc_topic): {}".format(type(doc_topic)))
print("shape: {}".format(doc_topic.shape))

type(doc_topic): <type 'numpy.ndarray'>
shape: (120, 20)


In [38]:
dist = {}
for i in range(115):
    topic = doc_topic[i].argmax()
    if topic in dist:
        dist[topic] += 1
    else:
        dist[topic] = 1
    #print("{} (top topic: {})".format(i, doc_topic[i].argmax()))
dist

{1: 3, 3: 4, 4: 1, 6: 3, 8: 1, 11: 5, 12: 3, 13: 1, 15: 5, 18: 88, 19: 1}

In [46]:
#visualizing the inference - matlab setup/imports
import matplotlib.pyplot as plt

# use matplotlib style sheet
try:
    plt.style.use('ggplot')
except:
    # version of matplotlib might not be recent
    pass

right now the plots don't print? it just throws the notebook into busy mode for a very long time so not sure if something is off

In [None]:
#stem plots - height of each stem reflects the probability of the word in the focus topic

f, ax= plt.subplots(5, 1, figsize=(8, 6), sharex=True)
for i, k in enumerate([0, 5, 9, 14, 19]):
    ax[i].stem(topic_word[k,:], linefmt='b-',
               markerfmt='bo', basefmt='w-')
    ax[i].set_xlim(-50,4350)
    ax[i].set_ylim(0, 0.08)
    ax[i].set_ylabel("Prob")
    ax[i].set_title("topic {}".format(k))

ax[4].set_xlabel("word")

plt.draw()
plt.tight_layout()
plt.show()



In [None]:
#topic distribution - probability of each of the 20 topics for every document
f, ax= plt.subplots(5, 1, figsize=(8, 6), sharex=True)
for i, k in enumerate([1, 3, 4, 8, 9]): #only plotting these specified topics
    ax[i].stem(doc_topic[k,:], linefmt='r-',
               markerfmt='ro', basefmt='w-')
    ax[i].set_xlim(-1, 21)
    ax[i].set_ylim(0, 1)
    ax[i].set_ylabel("Prob")
    ax[i].set_title("Document {}".format(k))

ax[4].set_xlabel("Topic")

plt.tight_layout()
plt.show()

# Deprecated Methods

In [None]:
def parseTextFile(filename):
    text = open('cor-por-a/' + filename, 'r')
    for i in range(0, 10):
        print text.readline()
    text_parse = text.read().split()
    #print text_parse

    lancaster = LancasterStemmer()
#     print lancaster.stem('maximum') 

    porter = PorterStemmer()
    return text_parse
#     print porter.stem('maximum')    

#parseTextFile('TomSawyer.txt')

In [None]:
# This takes fucking forever
def createMatrix(): ###DEPRECATED###
    # Initializes vector of terms
    u_vec = [x.lower() for x in parseTextFile('TomSawyer.txt')];
    vocab_vec = np.unique(u_vec).tolist()
    vocab_size = len(vocab_vec)
    mat = [[0 for x in range(vocab_size)] for y in range(vocab_size)]
    
    # Updates matrix, using bigrams
    for i in range(0, len(u_vec)-1):
        term_one = u_vec[i];
        term_two = u_vec[i+1];
        index_one = vocab_vec.index(term_one)
        index_two = vocab_vec.index(term_two)
        mat[index_one][index_one] += 1;
        mat[index_one][index_two] += 1;
        mat[index_two][index_one] += 1;

    last_term = u_vec[len(u_vec)-1]
    last_term_index = vocab_vec.index(last_term)
    mat[last_term_index][last_term_index] += 1
    return (mat, vocab_vec);

In [None]:
from __future__ import division
def pmi(mat, rownames=None, positive=True):  
    # Joint probability table:
    p = mat / np.sum(mat, axis=None)
    # Pre-compute column sums:
    colprobs = np.sum(p, axis=0)
    # Vectorize this function so that it can be applied rowwise:
    np_pmi_log = np.vectorize((lambda x : _pmi_log(x, positive=positive)))
    p = np.array([np_pmi_log(row / (np.sum(row)*colprobs)) for row in p])   
    return (p, rownames)

In [None]:
def correlateds(word, mat, rownames, distfunc=cosine):
    if word not in rownames:
        raise ValueError('%s is not in this VSM' % word)
    w = mat[rownames.index(word)]
    dists = [(rownames[i], w[i]) for i in range(len(mat))]
    #print dists
    sorted_dists = sorted(dists, key=itemgetter(1), reverse=True)
    # print sorted_dists
    return sorted_dists

In [None]:
# The correlation list returns an ordered list of (word, correlation_score) tuples, where higher correlation_score
# means the word is more correlated. The correlation list includes all words in the vocabulary, so you can
# selectively take the first n elements if you want to use them.
def correlationList(mat_ppmi):
    return correlateds(word='colored', mat=mat_ppmi[0], rownames=mat_ppmi[1], distfunc=cosine)

In [None]:
###DEPRECATED###
# neighbors_list = neighbors(word='colored', mat=mat_ppmi[0], rownames=mat_ppmi[1], distfunc=cosine)[: 50]
# print neighbors_list

# def retrieve_words(tuple_list):
#     words = list()
#     for _tuple in tuple_list:
#         words.append(_tuple[0])
#     return words

# neighbors_word_list = retrieve_words(neighbors_list)
# print neighbors_word_list