# Setup

In [1]:
import os
import nltk

from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.porter import PorterStemmer


In [2]:
# Info for creating VSM data
vsmdata_home = "vsmdata"
import os
import sys
import csv
import random
import itertools
from operator import itemgetter
from collections import defaultdict
import numpy as np
import scipy
import scipy.spatial.distance
from numpy.linalg import svd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import utils
import string

In [3]:
equivalence_set = ['African American', 'African-American', 'black']

In [4]:
seed_set2 = ['african-american', 'black', 'african', 'happy']

# File Input
Takes in a text file and returns a list of ordered unigrams U. 
It should also consider stemming and other relevant pre-processing. Josh's note: parse "African American" as a unigram.

In [53]:
def parseTextFile(filename):
    text = open('cor-por-a/' + filename, 'r')
    for i in range(0, 10):
        print text.readline()
    text_parse = text.read().split()
    #print text_parse

    lancaster = LancasterStemmer()
#     print lancaster.stem('maximum') 

    porter = PorterStemmer()
    return text_parse
#     print porter.stem('maximum')    

#parseTextFile('TomSawyer.txt')


In [69]:
def parse_NYT_articles_seedword(num_files, root_directory='cor-por-a/2006/'):
    overallCorpus = [];
    i = 0;
    for dirname_1 in os.listdir(root_directory):
        if (dirname_1 == '.DS_Store'):
            continue;
        print "parsing outer file directory " + dirname_1;
        for dirname in os.listdir(root_directory + dirname_1 + '/'):
            if (dirname == '.DS_Store'):
                continue;
            print "parsing directory " + root_directory + dirname_1 + '/' + dirname;
            for filename in os.listdir(root_directory + dirname_1 + '/' + dirname + '/'):                
                if (i >= num_files):
                    print 'Num files: ' + str(i);
                    return overallCorpus
                if (filename == '.DS_Store'):
                    continue;
                article_file = root_directory + dirname_1 + '/' + dirname + '/' + filename;
                #print article_file
                article_rep = parse_NYT_article(article_file);
                if (article_rep):
                    article_text = remove_punctuation(article_rep[1]).split(" ");
                    overallCorpus += ' ';
                    overallCorpus += article_text;
                i = i+1;
    print "num files: " + str(i);
    print 'hey!'
    return overallCorpus;

In [70]:
def parse_NYT_articles_worddoc(num_files, root_directory='cor-por-a/2006/'):
    overallCorpus = [];
    file_list = [];
    i = 0;
    for dirname_1 in os.listdir(root_directory):
        if (dirname_1 == '.DS_Store'):
            continue;
        print "parsing outer file directory " + dirname_1;
        for dirname in os.listdir(root_directory + dirname_1 + '/'):
            if (dirname == '.DS_Store'):
                continue;
            print "parsing directory " + root_directory + dirname_1 + '/' + dirname;
            for filename in os.listdir(root_directory + dirname_1 + '/' + dirname + '/'):      
                if (i >= num_files):
                    print 'Num files: ' + str(i);
                    return (overallCorpus, file_list)
                if (filename == '.DS_Store'):
                    continue;
                article_file = root_directory + dirname_1 + '/' + dirname + '/' + filename;
                file_list.append(filename)
                article_rep = parse_NYT_article(article_file);
                if (article_rep):
                    article_text = remove_punctuation(article_rep[1]).split(" ");
                    for word in article_text:
                        overallCorpus.append((word, filename))
                i = i+1;
    print "num files: " + str(i)
    return (overallCorpus, file_list);

# Correlation Matrix
1. Parse U to create a word-word frequency matrix M, where each row represents a word and each entry x(i,j) represents the number of times word i co-occurs with word j.
2. Convert M to a new matrix M’ with some sort of correlation operation. We could use PMI, Occai (see Josh’s paper), CSA, or some other correlation structure.
3. Let row a represent the unigram “African American”. Take in that row, and output an ordered list of (this_unigram, correlation_score) pairs which represent the correlation score of this_unigram with the term “African American”
4. Produce a list L of the top 100 correlated words with the term “African American”


In [8]:
# returns matrix object where mat_obj[0] refers to the seed_word matrix, where mat[1] refers
# to the vocab list, where mat[2] refers to a frequency list,
# where mat_obj[0][0] refers to the vector representing co-occurrence for first word in seed
# set, and where mat_obj[0][len(seed_set)] refers to a vector of overall counts for each term 
def createSeedWordMatrixNYT(num_files):
    # Initializes vector of terms
    u_vec = [x.lower() for x in parse_NYT_articles_seedword(num_files)];
    num_terms = len(u_vec);
    print 'num terms in corpus: ' + str(num_terms);
    vocab_vec = np.unique(u_vec).tolist()
    vocab_size = len(vocab_vec)
    print 'vocab size: ' + str(vocab_size);
    print 'matrix dimensions: ' + str(len(seed_set2)) + ' x ' + str(len(vocab_vec));
    mat = [[0 for x in range(vocab_size)] for y in range(len(seed_set2)+1)]
    frequency_vec = [0 for x in range(vocab_size)]

    index_dict = {};
    for i in range (0, len(vocab_vec)):
        index_dict[vocab_vec[i]] = i;
    print 'index_dict created!'
    
    # Updates matrix, using bigrams
    term = u_vec[0];
    term_neighbor_r = u_vec[1];
    
    # CHANGED
    #index_term = vocab_vec.index(term)
    index_term = index_dict[term];
    
    frequency_vec[index_term] += 1;
    if (any(seed_word == term for seed_word in seed_set2)):
            index_seed = seed_set2.index(term);
            
            # CHANGED
            #index_neighbor_r = vocab_vec.index(term_neighbor_r);
            index_neighbor_r = index_dict[term_neighbor_r];
            
            mat[index_seed][index_neighbor_r] += 1;
            mat[index_seed][index_term] += 1;
    for i in range(1, len(u_vec)-1):
        if (i % 1000 == 0):
            print 'parsed ' + str(i) + '/' + str(num_terms) + ' terms'
        term = u_vec[i];
        term_neighbor_l = u_vec[i-1];
        term_neighbor_r = u_vec[i+1];
        
        # CHANGED
        #index_term = vocab_vec.index(term)
        index_term = index_dict[term]
        
        frequency_vec[index_term] += 1;
        if (any(seed_word == term for seed_word in seed_set2)):
            index_seed = seed_set2.index(term);
            
            # CHANGED
            #index_neighbor_l = vocab_vec.index(term_neighbor_l);
            #index_neighbor_r = vocab_vec.index(term_neighbor_r);
            index_neighbor_l = index_dict[term_neighbor_l]
            index_neighbor_r = index_dict[term_neighbor_r]
            
            mat[index_seed][index_neighbor_l] += 1;
            mat[index_seed][index_neighbor_r] += 1;
            mat[index_seed][index_term] += 1;
    term = u_vec[len(u_vec)-1];
    term_neighbor_l = u_vec[len(u_vec)-2];
    
    # CHANGED
    #index_term = vocab_vec.index(term)
    index_term = index_dict[term]
    
    frequency_vec[index_term] += 1;
    if (any(seed_word == term for seed_word in seed_set2)):
            index_seed = seed_set2.index(term);
            
            # CHANGED
            #index_neighbor_l = vocab_vec.index(term_neighbor_l);
            index_neighbor_l = index_dict[term];
            
            mat[index_seed][index_neighbor_l] += 1;
            mat[index_seed][index_term] += 1;
    print 'Parsed ' + str(num_terms) + '/' + str(num_terms) + ' terms';
    return (mat, vocab_vec, frequency_vec);

In [9]:
# Returns a word, correlation list tuple for each seed in seed_set2
def getCorrelationLists(mat_obj):
    tupleArr = []
    # Word lists for each word
    for j in range(len(seed_set2)):
        w = mat_obj[0][j]
        dists = [(mat_obj[1][i], w[i]) for i in range(len(w))]
        sorted_dists = sorted(dists, key=itemgetter(1), reverse=True)
        print "PMI list for word: " + seed_set2[j] + "; " + str(sorted_dists[:5])
        tupleArr.append((seed_set2[j], sorted_dists));
    
    #frequency list for each word:
    w = mat_obj[2]
    dists = [(mat_obj[1][i], w[i]) for i in range(len(w))]
    sorted_dists = sorted(dists, key=itemgetter(1), reverse=True)
    print "Frequency list: " + str(sorted_dists[:5])
    tupleArr.append((seed_set2[j], sorted_dists));
    
    return tupleArr;

In [10]:
# This takes fucking forever
def createMatrix():
    # Initializes vector of terms
    u_vec = [x.lower() for x in parseTextFile('TomSawyer.txt')];
    vocab_vec = np.unique(u_vec).tolist()
    vocab_size = len(vocab_vec)
    mat = [[0 for x in range(vocab_size)] for y in range(vocab_size)]
    
    # Updates matrix, using bigrams
    for i in range(0, len(u_vec)-1):
        term_one = u_vec[i];
        term_two = u_vec[i+1];
        index_one = vocab_vec.index(term_one)
        index_two = vocab_vec.index(term_two)
        mat[index_one][index_one] += 1;
        mat[index_one][index_two] += 1;
        mat[index_two][index_one] += 1;

    last_term = u_vec[len(u_vec)-1]
    last_term_index = vocab_vec.index(last_term)
    mat[last_term_index][last_term_index] += 1
    return (mat, vocab_vec);

In [11]:
def cosine(u, v):        
    return scipy.spatial.distance.cosine(u, v)

In [12]:
def neighbors(word, mat, rownames, distfunc=cosine):
    if word not in rownames:
        raise ValueError('%s is not in this VSM' % word)
    w = mat[rownames.index(word)]
    dists = [(rownames[i], distfunc(w, mat[i])) for i in range(len(mat))]
    return sorted(dists, key=itemgetter(1), reverse=False)

In [13]:
from __future__ import division
# PMI log: p(x, y)/ p(x)p(y)
def pmi_seed(mat_obj, rownames=None, positive=True):  
    rownames = mat_obj[1];
    frequencies = mat_obj[2];
    word_count = np.sum(frequencies, axis=None)
    
    # Joint probability table:
    p = mat_obj[0] / word_count;
    colprobs = frequencies/word_count;
    sum_of_colprobs = np.sum(colprobs)
    
    
    np_pmi_log = np.vectorize((lambda x : _pmi_log(x, positive=positive)))    
    mat_ppmi = [];
    for row in p:
        if np.sum(row) > 0:
            mat_ppmi.append(np_pmi_log(row / (np.sum(row)*colprobs)));
        else:
            mat_ppmi.append([0 for x in row])
    return (mat_ppmi, rownames, frequencies)

In [14]:
from __future__ import division
def pmi(mat, rownames=None, positive=True):  
    # Joint probability table:
    p = mat / np.sum(mat, axis=None)
    # Pre-compute column sums:
    colprobs = np.sum(p, axis=0)
    # Vectorize this function so that it can be applied rowwise:
    np_pmi_log = np.vectorize((lambda x : _pmi_log(x, positive=positive)))
    p = np.array([np_pmi_log(row / (np.sum(row)*colprobs)) for row in p])   
    return (p, rownames)

def _pmi_log(x, positive=True):
    val = 0.0
    if x > 0.0:
        val = np.log(x)
    if positive:
        val = max([val,0.0])
    return val

In [15]:
def correlateds(word, mat, rownames, distfunc=cosine):
    if word not in rownames:
        raise ValueError('%s is not in this VSM' % word)
    w = mat[rownames.index(word)]
    dists = [(rownames[i], w[i]) for i in range(len(mat))]
    #print dists
    sorted_dists = sorted(dists, key=itemgetter(1), reverse=True)
    # print sorted_dists
    return sorted_dists

In [16]:
# The correlation list returns an ordered list of (word, correlation_score) tuples, where higher correlation_score
# means the word is more correlated. The correlation list includes all words in the vocabulary, so you can
# selectively take the first n elements if you want to use them.
def correlationList(mat_ppmi):
    return correlateds(word='colored', mat=mat_ppmi[0], rownames=mat_ppmi[1], distfunc=cosine)

### tools to save result of mat calculations ###

In [17]:
# import os
# if not os.path.exists('my_file'): numpy.savetxt('my_file', my_array)

#this will save the result of our matrix into a human-readable text file, and the original array is easily
#recreated using loadtxt.

np.savetxt("mat_features", mat[0])
np.savetxt("mat_labels", mat[1])
np.loadtxt("mat_labels")

NameError: name 'mat' is not defined

   ### tool to remove punctuation from a text ###

In [18]:
s = "string. With. Punctuation?" # Sample string
def remove_punctuation(text):
    for c in string.punctuation:
        text = text.replace(c,"")
    return text

print(remove_punctuation(s))


#period, question mark, exclamation point, comma, semicolon, colon, dash, 
#hyphen, parentheses, brackets, braces, apostrophe, quotation marks, and ellipses

string With Punctuation


### tools to parse out text from ntif/xml document for NYT articles ###

In [19]:
# http://docs.python-guide.org/en/latest/scenarios/xml/
# http://stackoverflow.com/questions/1912434/how-do-i-parse-xml-in-python
import xml.etree.ElementTree as ET

def parse_NYT_article(xmlFile): 
    tree = ET.parse(xmlFile)
    root = tree.getroot()
    year = ''
    article_text = '';
    for child in root:
        if child.tag == 'head':
            for subchild in child:
                if 'name' in subchild.attrib:
                    if subchild.attrib['name'] == 'publication_year':
                        year = subchild.attrib['content']
        if child.tag == 'body':
            body = child
    for child in body:
        if child.tag == 'body.content':
            content = child
    for child in content:
        if child.attrib == {'class': 'full_text'}:
            for paragraph in child:
                article_text += paragraph.text
            return (year, article_text)
                
parse_NYT_article('nyt_sample_2.xml')


('2007',
 'A doctor who works at a clinic in Jamaica has been charged with insurance fraud, accused of billing insurance companies for tests that were never performed on victims of motor vehicle accidents, prosecutors said yesterday. The doctor, Alexander Israeli, 53, of Middle Village, was arraigned in Criminal Court on Monday night on charges of grand larceny and insurance fraud, said Richard A. Brown, the Queens district attorney. Mr. Brown said that Dr. Israeli billed insurance companies last year for $21,000 worth of neurological tests that were not performed. He faces loss of his medical license and up to seven years in prison if convicted, prosecutors said.')

In [20]:
neighbors_list = neighbors(word='colored', mat=mat_ppmi[0], rownames=mat_ppmi[1], distfunc=cosine)[: 50]
print neighbors_list

def retrieve_words(tuple_list):
    words = list()
    for _tuple in tuple_list:
        words.append(_tuple[0])
    return words

neighbors_word_list = retrieve_words(neighbors_list)
print neighbors_word_list

NameError: name 'mat_ppmi' is not defined

# Creating Correlation Lists
Using the functions above, creates seed-word matrix with a user-specified number of files, performs pmi on that matrix, and computes a resulting correlation list for each seed word.

In order to use more files, update the num_files variable. In order to update the seed set, update the seed_set2 global variable to include more words.

Note: Creating these correlation lists at scale is very slow. Start off by processing about 10 files, and scale up. 

In [75]:
mat_obj = createSeedWordMatrixNYT(num_files=1000)

parsing outer file directory 01
parsing directory cor-por-a/2006/01/01
parsing directory cor-por-a/2006/01/02
parsing directory cor-por-a/2006/01/03
parsing directory cor-por-a/2006/01/04
parsing directory cor-por-a/2006/01/05
Num files: 1000
num terms in corpus: 594391
vocab size: 50181
matrix dimensions: 4 x 50181
index_dict created!
parsed 1000/594391 terms
parsed 2000/594391 terms
parsed 3000/594391 terms
parsed 4000/594391 terms
parsed 5000/594391 terms
parsed 6000/594391 terms
parsed 7000/594391 terms
parsed 8000/594391 terms
parsed 9000/594391 terms
parsed 10000/594391 terms
parsed 11000/594391 terms
parsed 12000/594391 terms
parsed 13000/594391 terms
parsed 14000/594391 terms
parsed 15000/594391 terms
parsed 16000/594391 terms
parsed 17000/594391 terms
parsed 18000/594391 terms
parsed 19000/594391 terms
parsed 20000/594391 terms
parsed 21000/594391 terms
parsed 22000/594391 terms
parsed 23000/594391 terms
parsed 24000/594391 terms
parsed 25000/594391 terms
parsed 26000/594391 t

In [76]:
mat_obj_ppmi = pmi_seed(mat_obj)

In [77]:
lists = getCorrelationLists(mat_obj_ppmi)

PMI list for word: african-american; [(u'', 0), (u' ', 0), (u'0', 0), (u'00', 0), (u'007', 0)]
PMI list for word: black; [(u'abacus', 6.7940029604116896), (u'black', 6.7940029604116896), (u'adaptable', 6.7940029604116887), (u'amex', 6.7940029604116887), (u'beansbut', 6.7940029604116887)]
PMI list for word: african; [(u'african', 8.6131614038278581), (u'fauna', 8.6131614038278581), (u'yam', 8.6131614038278581), (u'antelope', 8.3254793313760782), (u'apartheid', 7.514549115159749)]
PMI list for word: happy; [(u'2006granted', 7.9340004652426535), (u'35th', 7.9340004652426535), (u'endingms', 7.9340004652426535), (u'happy', 7.9340004652426535), (u'memoriesthe', 7.9340004652426535)]
Frequency list: [(u'the', 34376), (u'of', 16569), (u'a', 15669), (u'and', 15245), (u'to', 15180)]


# Word Document Matrix
Creates a word document matrix for use by Theo and her LDA work!

In [78]:
def createWordDocumentMatrixNYT(num_files):
    result = parse_NYT_articles_worddoc(num_files);
    word_file_vec = [(x[0].lower(),x[1]) for x in result[0]];
    word_vec = [x[0] for x in word_file_vec];
    vocab_vec = np.unique(word_vec).tolist()
    file_vec = result[1];
    print 'num terms in corpus: ' + str(len(word_vec))
    print 'vocab size: ' + str(len(vocab_vec))
    print 'matrix dimensions: ' + str(len(vocab_vec)) + ' x ' + str(len(file_vec));
    mat = [[0 for x in range(len(file_vec))] for y in range(len(vocab_vec))]
    
    index_dict = {};
    for i in range (0, len(vocab_vec)):
        index_dict[vocab_vec[i]] = i;
    print 'index_dict created!'
    
    file_index_dict = {};
    for i in range (0, len(file_vec)):
        file_index_dict[file_vec[i]] = i;
    print 'file_index_dict created!'
    
    i = 0;
    for word_file_tuple in word_file_vec:
        if (i % 1000 == 0):
            print 'parsed ' + str(i) + '/' + str(len(word_vec)) + ' terms'
        word = word_file_tuple[0]
        file_name = word_file_tuple[1]
        
        # CHANGED
        #index_word = vocab_vec.index(word);
        index_word = index_dict[word]  
        #index_file = file_vec.index(file_name);
        index_file = file_index_dict[file_name];
        
        mat[index_word][index_file] +=1;
        i = i+1;
    print 'Parsed all terms'
    return mat

In [81]:
mat = createWordDocumentMatrixNYT(num_files=100)

parsing outer file directory 01
parsing directory cor-por-a/2006/01/01
Num files: 100
num terms in corpus: 83029
vocab size: 15218
matrix dimensions: 15218 x 100
index_dict created!
file_index_dict created!
parsed 0/83029 terms
parsed 1000/83029 terms
parsed 2000/83029 terms
parsed 3000/83029 terms
parsed 4000/83029 terms
parsed 5000/83029 terms
parsed 6000/83029 terms
parsed 7000/83029 terms
parsed 8000/83029 terms
parsed 9000/83029 terms
parsed 10000/83029 terms
parsed 11000/83029 terms
parsed 12000/83029 terms
parsed 13000/83029 terms
parsed 14000/83029 terms
parsed 15000/83029 terms
parsed 16000/83029 terms
parsed 17000/83029 terms
parsed 18000/83029 terms
parsed 19000/83029 terms
parsed 20000/83029 terms
parsed 21000/83029 terms
parsed 22000/83029 terms
parsed 23000/83029 terms
parsed 24000/83029 terms
parsed 25000/83029 terms
parsed 26000/83029 terms
parsed 27000/83029 terms
parsed 28000/83029 terms
parsed 29000/83029 terms
parsed 30000/83029 terms
parsed 31000/83029 terms
parsed

# Sentiment Analysis
Takes in a list V of words and returns the average sentiment score across all terms in V as determined by freebase. Note to Jason: consider other sentiment databases

In [None]:
from nltk.corpus import sentiwordnet as swn
from __future__ import unicode_literals

def getSentiment(word):
    synset = list(swn.senti_synsets(word))
    if len(synset) > 0: #if a synset exists for this word
        synset = synset[0]
        return(synset.pos_score(), synset.neg_score(), synset.obj_score())

def is_ascii(s):
    return all(ord(c) < 128 for c in s)

V = ['good', 'bad', 'great', 'awesome', 'amazing', 'holy', 'beautiful', 'worrisome', 'stupid']
def generate_sentiment(wordList):
    totalSentiment = 0.0;
    for word in wordList:
        if is_ascii(word): #see note below for rationale
            sentiment = getSentiment(word)
            if sentiment == None:
                sentiment = 0.0
            if type(sentiment) is float: #why does this happen
                print "n/a"
            else:  
                totalSentiment += (sentiment[0] - sentiment[1]) 
                print (sentiment[0] - sentiment[1])
        #sentiwordnet generates tuples of pos, neg, and neu. currently naively choosing to consider only sum of pos and neg. 
    averageSentiment = totalSentiment/len(wordList)
    return averageSentiment

def generate_sentiment_2(wordTupleList):
    reader = csv.reader(open('sentiment_words.txt', 'rb'))
    sentiment_words = dict(reader)
    sentiment_score = 0
    for wordTuple in wordTupleList:
        word = wordTuple[0]
        score = 1/wordTuple[1] #inverse of distance
        if word in sentiment_words:
            if sentiment_words[word] == 'pos':
                print word + " +" + str(score)
                sentiment_score += score
            if sentiment_words[word] == 'neg':
                print word + " -" + str(score)
                sentiment_score -= score
    return sentiment_score

print generate_sentiment_2(neighbors_list)
#print generate_sentiment_2(correlated_words)

In [None]:
def cosine(u, v):        
    """Cosine distance between 1d np.arrays `u` and `v`, which must have 
    the same dimensionality. Returns a float."""
    # Use scipy's method:
    return scipy.spatial.distance.cosine(u, v)
    # Or define it yourself:
    # return 1.0 - (np.dot(u, v) / (vector_length(u) * vector_length(v)))

In [None]:
def semantic_orientation(
        mat, 
        rownames,
        seeds1=('bad', 'nasty', 'poor', 'negative', 'unfortunate', 'wrong', 'inferior'),
        seeds2=('good', 'nice', 'excellent', 'positive', 'fortunate', 'correct', 'superior'),
        distfunc=cosine):    
    """No frills implementation of the semantic Orientation (SO) method of 
    Turney and Littman. seeds1 and seeds2 should be representative members 
    of two intutively opposing semantic classes. The method will then try 
    to rank the vocabulary by its relative association with each seed set.
        
    Parameters
    ----------
    mat : 2d np.array
        The matrix used to derive the SO ranking.
        
    rownames : list of str
        The names of the rows of `mat` (the vocabulary).
        
    seeds1 : tuple of str
        The default is the negative seed set of Turney and Littman.
        
    seeds2 : tuple of str
        The default is the positive seed set of Turney and Littman.
        
    distfunc : function mapping vector pairs to floats (default: `cosine`)
        The measure of distance between vectors. Can also be `euclidean`, 
        `matching`, `jaccard`, as well as any other distance measure 
        between 1d vectors. 
    
    Returns
    -------    
    list of tuples
        The vocabulary ranked according to the SO method, with words 
        closest to `seeds1` at the top and words closest to `seeds2` at the 
        bottom. Each member of the list is a (word, score) pair.
    
    """    
    sm1 = _so_seed_matrix(seeds1, mat, rownames)
    sm2 = _so_seed_matrix(seeds2, mat, rownames)
    scores = [(rownames[i], _so_row_func(mat[i], sm1, sm2, distfunc)) for i in xrange(len(mat))]
    return sorted(scores, key=itemgetter(1), reverse=False)

def _so_seed_matrix(seeds, mat, rownames):
    indices = [rownames.index(word) for word in seeds if word in rownames]
    if not indices:
        raise ValueError('The matrix contains no members of the seed set: %s' % ",".join(seeds))
    print indices
    print np.array(indices)
    return mat[np.array(indices)]
    
def _so_row_func(row, sm1, sm2, distfunc):
    val1 = np.sum([distfunc(row, srow) for srow in sm1])
    val2 = np.sum([distfunc(row, srow) for srow in sm2])
    return val1 - val2    

In [None]:
print mat[1]

How do we tokenize ignoring punctuation?

Solution: for each word, look at last letter, if it is in a set of punctuation, remove that punctuation. Or, just strip away punctuation from the entire text in the very beginning. We're losing some degree of information but it is essentially a way of "normalizing" the words. 

In [None]:
print mat[0][:2]

In [None]:
temp_rowshit = [u'!', u'):', u');', u'1', u'1/10', u'1/2', u'10', u'10/10', u'100', u'11', u'12', u'13', u'14', u'15', u'17', u'1950', u'1950s', u'1970', u'1980', u'2', u'20', u'2000', u'25', u'3', u'3/10', u'30', u'4', u'4/10', u'40', u'5', u'50', u'6', u'60', u'60s', u'7', u'7/10', u'70', u'70s', u'8', u'8/', u'80', u'80s', u'9', u'90', u':)', u'?', u'a', u'abandoned', u'ability', u'able', u'about', u'above', u'absolute', u'absolutely', u'absurd', u'abuse', u'academy', u'accent', u'accents', u'accept', u'accident', u'accidentally', u'according', u'account', u'accurate', u'achieve', u'across', u'act', u'acted', u'acting', u'action', u'actions', u'actor', u'actors', u'actress', u'actresses', u'acts', u'actual', u'actually', u'adam', u'adaptation', u'add', u'added', u'adding', u'addition', u'adds', u'admit', u'adult', u'adults', u'adventure', u'adventures', u'advice', u'affair', u'afraid', u'africa', u'african', u'after', u'afternoon', u'again', u'against', u'age', u'agent', u'ages', u'ago', u'agree', u'ahead', u"ain't", u'air', u'aka', u'al', u'alan', u'alas', u'albert', u'alex', u'alice', u'alien', u'aliens', u'alive', u'all', u'allen', u'allow', u'allowed', u'allows', u'almost', u'alone', u'along', u'already', u'alright', u'also', u'although', u'always', u'am', u'amateur', u'amateurish', u'amazed', u'amazing', u'amazingly', u'america', u'american', u'americans', u'among', u'amongst', u'amount', u'amusing', u'an', u'ancient', u'and', u'anderson', u'andy', u'angel', u'angels', u'anger', u'angle', u'angles', u'angry', u'animal', u'animals', u'animated', u'animation', u'anime', u'ann', u'anna', u'anne', u'annoying', u'another', u'answer', u'answers', u'anthony', u'any', u'anybody', u'anymore', u'anyone', u'anything', u'anyway', u'anywhere', u'apart', u'apartment', u'apparent', u'apparently', u'appeal', u'appealing', u'appear', u'appearance', u'appeared', u'appears', u'appreciate', u'appreciated', u'approach', u'appropriate', u'are', u'area', u"aren't", u'arms', u'army', u'around', u'arrives', u'art', u'arthur', u'artist', u'artistic', u'artists', u'arts', u'as', u'ashamed', u'asian', u'aside', u'ask', u'asked', u'asking', u'asks', u'asleep', u'aspect', u'aspects', u'ass', u'assume', u'at', u'atmosphere', u'atrocious', u'attack', u'attacked', u'attacks', u'attempt', u'attempting', u'attempts', u'attention', u'attitude', u'attractive', u'audience', u'audiences', u'aunt', u'australian', u'authentic', u'author', u'available', u'average', u'avoid', u'award', u'awards', u'aware', u'away', u'awesome', u'awful', u'awkward', u'b', u'b-movie', u'baby', u'back', u'background', u'bad', u'badly', u'balance', u'ball', u'band', u'bank', u'bar', u'barbara', u'barely', u'base', u'baseball', u'based', u'basic', u'basically', u'basis', u'batman', u'battle', u'bbc', u'be', u'beach', u'bear', u'beast', u'beat', u'beautiful', u'beautifully', u'beauty', u'became', u'because', u'become', u'becomes', u'becoming', u'bed', u'been', u'before', u'began', u'begin', u'beginning', u'begins', u'behavior', u'behind', u'being', u'belief', u'believable', u'believe', u'believed', u'believes', u'beloved', u'below', u'ben', u'besides', u'best', u'bet', u'better', u'between', u'beyond', u'big', u'bigger', u'biggest', u'bill', u'billy', u'birth', u'bit', u'bits', u'bizarre', u'black', u'blah', u'blair', u'blame', u'bland', u'blind', u'blockbuster', u'blonde', u'blood', u'bloody', u'blow', u'blown', u'blue', u'board', u'boat', u'bob', u'bodies', u'body', u'bollywood', u'bomb', u'bond', u'book', u'books', u'bore', u'bored', u'boring', u'born', u'boss', u'both', u'bother', u'bothered', u'bottom', u'bought', u'bourne', u'box', u'boy', u'boyfriend', u'boys', u'brad', u'brain', u'brave', u'break', u'breaking', u'breaks', u'breath', u'breathtaking', u'brian', u'brief', u'bright', u'brilliant', u'brilliantly', u'bring', u'bringing', u'brings', u'british', u'broadway', u'broken', u'brooks', u'brother', u'brothers', u'brought', u'brown', u'bruce', u'brutal', u'buddy', u'budget', u'build', u'building', u'built', u'bunch', u'burns', u'burt', u'bus', u'business', u'busy', u'but', u'buy', u'buying', u'by', u'c', u'cabin', u'cable', u'cage', u'caine', u'california', u'call', u'called', u'calling', u'calls', u'came', u'cameo', u'camera', u'camp', u'campy', u'can', u"can't", u'canadian', u'candy', u'cannot', u'cant', u'capable', u'captain', u'capture', u'captured', u'captures', u'car', u'care', u'career', u'cares', u'caring', u'carried', u'carries', u'carry', u'carrying', u'cars', u'cartoon', u'cartoons', u'case', u'cases', u'cash', u'cast', u'casting', u'castle', u'cat', u'catch', u'category', u'caught', u'cause', u'caused', u'causes', u'cell', u'center', u'central', u'century', u'certain', u'certainly', u'cgi', u'challenge', u'chance', u'change', u'changed', u'changes', u'changing', u'channel', u'character', u"character's", u'characters', u'charge', u'charles', u'charlie', u'charm', u'charming', u'chase', u'che', u'cheap', u'check', u'cheesy', u'chemistry', u'chick', u'chief', u'child', u'childhood', u'children', u"children's", u'chilling', u'china', u'chinese', u'choice', u'choices', u'choose', u'chose', u'chosen', u'chris', u'christian', u'christmas', u'christopher', u'church', u'cinderella', u'cinema', u'cinematic', u'cinematography', u'circumstances', u'city', u'claim', u'claims', u'claire', u'clark', u'class', u'classic', u'classics', u'clean', u'clear', u'clearly', u'clever', u'clich', u'climax', u'clips', u'close', u'closer', u'closing', u'clothes', u'club', u'clue', u'code', u'cold', u'collection', u'college', u'color', u'colors', u'columbo', u'combination', u'combined', u'come', u'comedic', u'comedies', u'comedy', u'comes', u'comic', u'comical', u'coming', u'comment', u'commentary', u'comments', u'commercial', u'committed', u'common', u'community', u'company', u'compare', u'compared', u'comparison', u'compelling', u'complete', u'completely', u'complex', u'complicated', u'computer', u'concept', u'concerned', u'conclusion', u'conflict', u'confused', u'confusing', u'confusion', u'connection', u'consider', u'considered', u'considering', u'constant', u'constantly', u'contain', u'contains', u'contemporary', u'content', u'context', u'continue', u'continues', u'continuity', u'contrast', u'contrived', u'control', u'conversation', u'convey', u'convince', u'convinced', u'convincing', u'cool', u'cop', u'cops', u'copy', u'core', u'corny', u'correct', u'cost', u'costs', u'costume', u'costumes', u'could', u"could've", u"couldn't", u'count', u'country', u'couple', u'course', u'court', u'cover', u'covered', u'cowboy', u'crap', u'crappy', u'crash', u'crazy', u'create', u'created', u'creates', u'creating', u'creative', u'creature', u'creatures', u'credit', u'credits', u'creepy', u'crew', u'crime', u'criminal', u'criminals', u'critical', u'criticism', u'critics', u'cross', u'crowd', u'crude', u'cruel', u'cry', u'crying', u'cult', u'cultural', u'culture', u'curious', u'current', u'cut', u'cute', u'cuts', u'cutting', u'd', u'dad', u'daily', u'damn', u'dan', u'dance', u'dancing', u'danger', u'dangerous', u'daniel', u'danny', u'dark', u'darkness', u'date', u'dated', u'daughter', u'daughters', u'david', u'davis', u'day', u'days', u'de', u'dead', u'deadly', u'deal', u'dealing', u'deals', u'dean', u'death', u'deaths', u'debut', u'decade', u'decades', u'decent', u'decide', u'decided', u'decides', u'decision', u'deep', u'deeper', u'deeply', u'definitely', u'degree', u'delight', u'delightful', u'deliver', u'delivered', u'delivers', u'delivery', u'demon', u'demons', u'dennis', u'department', u'depicted', u'depiction', u'depressing', u'depth', u'describe', u'described', u'description', u'desert', u'deserve', u'deserved', u'deserves', u'design', u'designed', u'desire', u'desperate', u'desperately', u'despite', u'destroy', u'destroyed', u'detail', u'details', u'detective', u'determined', u'develop', u'developed', u'development', u'device', u'devil', u'dialog', u'dialogue', u'dick', u'did', u"didn't", u'die', u'died', u'dies', u'difference', u'different', u'difficult', u'direct', u'directed', u'directing', u'direction', u'directly', u'director', u"director's", u'directors', u'dirty', u'disappointed', u'disappointing', u'disappointment', u'disaster', u'disbelief', u'discover', u'discovered', u'discovers', u'disgusting', u'disney', u'display', u'disturbing', u'do', u'doctor', u'documentary', u'does', u"doesn't", u'dog', u'dogs', u'doing', u'dollar', u'dollars', u'don', u"don't", u'donald', u'done', u'door', u'double', u'doubt', u'douglas', u'down', u'downright', u'dozen', u'dr', u'drag', u'dragon', u'drama', u'dramatic', u'draw', u'drawn', u'dreadful', u'dream', u'dreams', u'dress', u'dressed', u'drew', u'drinking', u'drive', u'driven', u'driver', u'driving', u'drop', u'drug', u'drugs', u'drunk', u'dry', u'dubbed', u'dude', u'due', u'dull', u'dumb', u'during', u'dvd', u'dying', u'e', u'each', u'earlier', u'early', u'earth', u'easily', u'easy', u'eat', u'eating', u'ed', u'eddie', u'edge', u'edited', u'editing', u'edward', u'effect', u'effective', u'effectively', u'effects', u'effort', u'efforts', u'eight', u'either', u'element', u'elements', u'elizabeth', u'else', u'embarrassed', u'embarrassing', u'emma', u'emotion', u'emotional', u'emotionally', u'emotions', u'empty', u'encounter', u'end', u'ended', u'ending', u'endless', u'ends', u'enemy', u'energy', u'engaging', u'england', u'english', u'enjoy', u'enjoyable', u'enjoyed', u'enjoying', u'enough', u'enter', u'entertain', u'entertained', u'entertaining', u'entertainment', u'entire', u'entirely', u'environment', u'epic', u'episode', u'episodes', u'equally', u'era', u'eric', u'erotic', u'escape', u'escapes', u'especially', u'essential', u'essentially', u'established', u'etc', u'europe', u'european', u'even', u'evening', u'event', u'events', u'eventually', u'ever', u'every', u'everybody', u'everyday', u'everyone', u'everything', u'everywhere', u'evidence', u'evil', u'exact', u'exactly', u'example', u'examples', u'excellent', u'except', u'exception', u'excited', u'excitement', u'exciting', u'excuse', u'executed', u'execution', u'exist', u'existence', u'exists', u'expect', u'expectations', u'expected', u'expecting', u'experience', u'experienced', u'experiences', u'experiment', u'expert', u'explain', u'explained', u'explains', u'explanation', u'exploitation', u'express', u'expression', u'extent', u'extra', u'extraordinary', u'extras', u'extreme', u'extremely', u'eye', u'eyes', u'f', u'fabulous', u'face', u'faces', u'facial', u'fact', u'factor', u'facts', u'fail', u'failed', u'fails', u'failure', u'fair', u'fairly', u'fairy', u'faith', u'faithful', u'fake', u'fall', u'fallen', u'falling', u'falls', u'false', u'fame', u'familiar', u'families', u'family', u'famous', u'fan', u'fans', u'fantastic', u'fantasy', u'far', u'fare', u'fascinating', u'fashion', u'fast', u'fat', u'fate', u'father', u"father's", u'fault', u'favor', u'favorite', u'favorites', u'favourite', u'fear', u'feature', u'featured', u'features', u'featuring', u'feel', u'feeling', u'feelings', u'feels', u'feet', u'fell', u'fellow', u'felt', u'female', u'festival', u'few', u'fiction', u'fictional', u'field', u'fight', u'fighting', u'fights', u'figure', u'figured', u'figures', u'fill', u'filled', u'film', u"film's", u'film-making', u'filmed', u'filming', u'filmmaker', u'filmmakers', u'films', u'final', u'finale', u'finally', u'find', u'finding', u'finds', u'fine', u'finest', u'finish', u'finished', u'fire', u'first', u'fit', u'fits', u'five', u'flashback', u'flashbacks', u'flat', u'flaws', u'flesh', u'flick', u'flicks', u'flight', u'floor', u'flow', u'fly', u'flying', u'focus', u'focused', u'focuses', u'folks', u'follow', u'followed', u'following', u'follows', u'food', u'fool', u'foot', u'footage', u'football', u'for', u'force', u'forced', u'forces', u'ford', u'foreign', u'forest', u'forever', u'forget', u'forgettable', u'forgot', u'forgotten', u'form', u'format', u'former', u'formula', u'forth', u'fortunately', u'forward', u'foster', u'found', u'four', u'fourth', u'fox', u'frame', u'france', u'frank', u'frankly', u'fred', u'freddy', u'free', u'freedom', u'freeman', u'french', u'frequently', u'fresh', u'friday', u'friend', u'friendly', u'friends', u'friendship', u'frightening', u'from', u'front', u'fu', u'full', u'fully', u'fun', u'funnier', u'funniest', u'funny', u'further', u'future', u'g', u'gags', u'game', u'games', u'gang', u'gangster', u'garbage', u'gary', u'gas', u'gave', u'gay', u'gem', u'gene', u'general', u'generally', u'generation', u'genius', u'genre', u'genuine', u'genuinely', u'george', u'german', u'germany', u'get', u'gets', u'getting', u'ghost', u'ghosts', u'giant', u'girl', u'girlfriend', u'girls', u'give', u'given', u'gives', u'giving', u'glad', u'go', u'god', u'goes', u'going', u'gold', u'golden', u'gone', u'gonna', u'good', u'goofy', u'gordon', u'gore', u'gorgeous', u'gory', u'got', u'gotten', u'government', u'grace', u'grade', u'grand', u'grant', u'granted', u'graphic', u'graphics', u'gratuitous', u'grave', u'great', u'greater', u'greatest', u'green', u'grew', u'grim', u'gritty', u'ground', u'group', u'grow', u'growing', u'grown', u'gruesome', u'guard', u'guess', u'guilty', u'gun', u'guns', u'guy', u'guys', u'h', u'ha', u'had', u"hadn't", u'hair', u'half', u'halfway', u'hall', u'halloween', u'hand', u'handle', u'handled', u'hands', u'handsome', u'hanging', u'happen', u'happened', u'happening', u'happens', u'happiness', u'happy', u'hard', u'hardly', u'hardy', u'harris', u'harry', u'harsh', u'has', u"hasn't", u'hat', u'hate', u'hated', u'haunted', u'haunting', u'have', u"haven't", u'having', u'he', u"he'd", u"he's", u'head', u'heads', u'hear', u'heard', u'hearing', u'heart', u'heaven', u'heavily', u'heavy', u'heck', u'held', u'hell', u'help', u'helped', u'helping', u'helps', u'henry', u'her', u'here', u"here's", u'hero', u'heroes', u'heroine', u'herself', u'hey', u'hidden', u'hide', u'high', u'higher', u'highlight', u'highly', u'hilarious', u'hill', u'him', u'himself', u'hired', u'his', u'historical', u'history', u'hit', u'hitchcock', u'hitler', u'hits', u'hoffman', u'hold', u'holding', u'holds', u'holes', u'hollywood', u'home', u'honest', u'honestly', u'hong', u'honor', u'hope', u'hopefully', u'hopes', u'hoping', u'horrible', u'horribly', u'horrific', u'horror', u'horse', u'hospital', u'hot', u'hotel', u'hour', u'hours', u'house', u'how', u'howard', u'however', u'huge', u'human', u'humanity', u'humans', u'humor', u'humorous', u'humour', u'hunt', u'hunter', u'hurt', u'husband', u'i', u"i'd", u"i'll", u"i'm", u"i've", u'ice', u'idea', u'ideas', u'identity', u'idiot', u'if', u'ignore', u'ii', u'ill', u'image', u'imagery', u'images', u'imagination', u'imagine', u'imdb', u'immediately', u'impact', u'important', u'impossible', u'impressed', u'impression', u'impressive', u'in', u'include', u'included', u'includes', u'including', u'incredible', u'incredibly', u'indeed', u'independent', u'india', u'indian', u'indie', u'individual', u'industry', u'inept', u'influence', u'information', u'initial', u'initially', u'inner', u'innocent', u'insane', u'inside', u'insight', u'inspector', u'inspiration', u'inspired', u'instance', u'instead', u'insult', u'intellectual', u'intelligence', u'intelligent', u'intended', u'intense', u'intensity', u'intentions', u'interest', u'interested', u'interesting', u'international', u'interpretation', u'interview', u'interviews', u'into', u'intriguing', u'introduced', u'introduction', u'invisible', u'involved', u'involves', u'involving', u'irish', u'ironic', u'irritating', u'is', u'island', u"isn't", u'issue', u'issues', u'it', u"it's", u'italian', u'its', u'itself', u'j', u'jack', u'jackie', u'jackson', u'jail', u'james', u'jane', u'japan', u'japanese', u'jason', u'jean', u'jeff', u'jennifer', u'jerry', u'jessica', u'jesus', u'jewish', u'jim', u'jimmy', u'joan', u'job', u'jobs', u'joe', u'john', u'johnny', u'johnson', u'join', u'joke', u'jokes', u'jon', u'jones', u'joseph', u'journey', u'joy', u'jr', u'judge', u'julia', u'julie', u'jump', u'jumps', u'jungle', u'junk', u'just', u'justice', u'k', u'kate', u'keaton', u'keep', u'keeping', u'keeps', u'kelly', u'kept', u'kevin', u'key', u'kick', u'kid', u'kids', u'kill', u'killed', u'killer', u'killers', u'killing', u'kills', u'kim', u'kind', u'kinda', u'kinds', u'king', u'kiss', u'knew', u'know', u'knowing', u'knowledge', u'known', u'knows', u'kong', u'kung', u'l', u'la', u'lack', u'lacking', u'lacks', u'ladies', u'lady', u'lake', u'lame', u'land', u'lane', u'language', u'large', u'largely', u'larry', u'last', u'late', u'later', u'latest', u'latter', u'laugh', u'laughable', u'laughed', u'laughing', u'laughs', u'laughter', u'laura', u'law', u'lawyer', u'lazy', u'lead', u'leader', u'leading', u'leads', u'league', u'learn', u'learned', u'learning', u'learns', u'least', u'leave', u'leaves', u'leaving', u'led', u'lee', u'left', u'legend', u'legendary', u'legs', u'length', u'lesbian', u'leslie', u'less', u'lesson', u'let', u"let's", u'lets', u'level', u'levels', u'lewis', u'lie', u'lies', u'life', u'lifetime', u'light', u'lighting', u'lights', u'likable', u'like', u'liked', u'likely', u'likes', u'limited', u'line', u'lines', u'lisa', u'list', u'listen', u'listening', u'literally', u'little', u'live', u'lived', u'lives', u'living', u'local', u'location', u'locations', u'locked', u'logic', u'london', u'lonely', u'long', u'longer', u'look', u'looked', u'looking', u'looks', u'loose', u'lord', u'lose', u'loses', u'losing', u'loss', u'lost', u'lot', u'lots', u'loud', u'louis', u'lousy', u'love', u'loved', u'lovely', u'lover', u'lovers', u'loves', u'loving', u'low', u'low-budget', u'lower', u'luck', u'lucky', u'ludicrous', u'lugosi', u'luke', u'lynch', u'm', u'machine', u'mad', u'made', u'madness', u'magic', u'magical', u'magnificent', u'main', u'mainly', u'mainstream', u'major', u'majority', u'make', u'make-up', u'makers', u'makes', u'makeup', u'making', u'male', u'man', u"man's", u'manage', u'managed', u'manager', u'manages', u'manner', u'mansion', u'many', u'maria', u'marie', u'mark', u'market', u'marriage', u'married', u'marry', u'martial', u'martin', u'marvelous', u'mary', u'mask', u'massive', u'master', u'masterpiece', u'match', u'material', u'matrix', u'matt', u'matter', u'matters', u'mature', u'max', u'may', u'maybe', u'me', u'mean', u'meaning', u'means', u'meant', u'meanwhile', u'media', u'mediocre', u'meet', u'meeting', u'meets', u'melodrama', u'member', u'members', u'memorable', u'memories', u'memory', u'men', u'mental', u'mention', u'mentioned', u'mere', u'merely', u'mess', u'message', u'met', u'metal', u'mexican', u'mexico', u'mgm', u'michael', u'michelle', u'mid', u'middle', u'midnight', u'might', u'mike', u'mildly', u'miles', u'military', u'million', u'mind', u'minds', u'mine', u'minor', u'minute', u'minutes', u'mirror', u'miss', u'missed', u'missing', u'mission', u'mistake', u'mistakes', u'mix', u'mixed', u'model', u'modern', u'mom', u'moment', u'moments', u'money', u'monster', u'monsters', u'months', u'mood', u'moon', u'moore', u'moral', u'more', u'morgan', u'morning', u'most', u'mostly', u'mother', u'motion', u'mountain', u'mouth', u'move', u'moved', u'movement', u'moves', u'movie', u"movie's", u'movies', u'moving', u'mr', u'mrs', u'ms', u'mst', u'much', u'multiple', u'murder', u'murdered', u'murderer', u'murders', u'murphy', u'music', u'musical', u'musicals', u'must', u'my', u'myself', u'mysterious', u'mystery', u'n', u'naive', u'naked', u'name', u'named', u'names', u'nancy', u'narration', u'narrative', u'nasty', u'nation', u'national', u'native', u'natural', u'naturally', u'nature', u'navy', u'near', u'nearly', u'necessarily', u'necessary', u'ned', u'need', u'needed', u'needless', u'needs', u'negative', u'neither', u'network', u'never', u'nevertheless', u'new', u'news', u'next', u'nice', u'nicely', u'nick', u'night', u'nightmare', u'no', u'nobody', u'noir', u'nominated', u'none', u'nonetheless', u'nonsense', u'nor', u'normal', u'normally', u'north', u'not', u'notable', u'note', u'nothing', u'notice', u'noticed', u'notorious', u'novel', u'novels', u'now', u'nowadays', u'nowhere', u'nude', u'nudity', u'number', u'numbers', u'numerous', u'o', u'obnoxious', u'obsessed', u'obsession', u'obvious', u'obviously', u'occasional', u'occasionally', u'odd', u'oddly', u'of', u'off', u'offensive', u'offer', u'offered', u'offers', u'office', u'officer', u'often', u'oh', u'ok', u'okay', u'old', u'older', u'oliver', u'on', u'once', u'one', u"one's", u'ones', u'only', u'onto', u'open', u'opening', u'opens', u'opera', u'opinion', u'opportunity', u'opposite', u'or', u'order', u'ordinary', u'original', u'originality', u'originally', u'oscar', u'other', u'others', u'otherwise', u'our', u'out', u'outside', u'outstanding', u'over', u'over-the-top', u'overall', u'overly', u'own', u'owner', u'p', u'pace', u'paced', u'pacing', u'pacino', u'page', u'paid', u'pain', u'painful', u'painfully', u'paint', u'pair', u'paper', u'par', u'parents', u'paris', u'park', u'parker', u'parody', u'part', u'particular', u'particularly', u'partner', u'parts', u'party', u'pass', u'passed', u'passing', u'passion', u'past', u'path', u'pathetic', u'patrick', u'paul', u'pay', u'paying', u'peace', u'people', u"people's", u'perfect', u'perfectly', u'performance', u'performances', u'performed', u'perhaps', u'period', u'person', u'personal', u'personalities', u'personality', u'personally', u'perspective', u'pet', u'peter', u'phone', u'photography', u'physical', u'pick', u'picked', u'picks', u'picture', u'pictures', u'piece', u'pieces', u'pile', u'pilot', u'pitt', u'pity', u'place', u'placed', u'places', u'plain', u'plan', u'plane', u'planet', u'plans', u'play', u'played', u'player', u'players', u'playing', u'plays', u'pleasant', u'please', u'pleasure', u'plenty', u'plot', u'plots', u'plus', u'poignant', u'point', u'pointless', u'points', u'police', u'political', u'politics', u'poor', u'poorly', u'pop', u'popular', u'porn', u'portray', u'portrayal', u'portrayed', u'portraying', u'portrays', u'position', u'positive', u'possible', u'possibly', u'post', u'potential', u'powell', u'power', u'powerful', u'powers', u'practically', u'praise', u'predictable', u'prefer', u'pregnant', u'premise', u'prepared', u'presence', u'present', u'presentation', u'presented', u'presents', u'president', u'pretentious', u'pretty', u'previous', u'previously', u'price', u'priest', u'prime', u'prince', u'princess', u'print', u'prior', u'prison', u'private', u'probably', u'problem', u'problems', u'process', u'produce', u'produced', u'producer', u'producers', u'product', u'production', u'productions', u'professional', u'professor', u'program', u'project', u'promise', u'promising', u'propaganda', u'proper', u'properly', u'protagonist', u'protect', u'proud', u'prove', u'proved', u'proves', u'provide', u'provided', u'provides', u'psycho', u'psychological', u'public', u'pull', u'pulled', u'pulls', u'punch', u'pure', u'purely', u'purpose', u'put', u'puts', u'putting', u'qualities', u'quality', u'queen', u'quest', u'question', u'questions', u'quick', u'quickly', u'quiet', u'quirky', u'quite', u'r', u'race', u'rachel', u'racist', u'radio', u'rain', u'raise', u'raised', u'ran', u'random', u'range', u'rape', u'rare', u'rarely', u'rate', u'rated', u'rather', u'rating', u'ratings', u'raw', u'ray', u'reach', u'reaction', u'read', u'reading', u'ready', u'real', u'realism', u'realistic', u'reality', u'realize', u'realized', u'realizes', u'really', u'reason', u'reasons', u'recall', u'received', u'recent', u'recently', u'recognize', u'recommend', u'recommended', u'record', u'red', u'redeeming', u'reference', u'references', u'refreshing', u'regard', u'regarding', u'regret', u'regular', u'relate', u'related', u'relationship', u'relationships', u'relatively', u'release', u'released', u'relief', u'religion', u'religious', u'remain', u'remains', u'remake', u'remarkable', u'remember', u'remembered', u'remind', u'reminded', u'reminds', u'reminiscent', u'remote', u'remotely', u'rent', u'rental', u'rented', u'renting', u'repeated', u'replaced', u'reporter', u'reputation', u'required', u'rescue', u'research', u'respect', u'responsible', u'rest', u'result', u'results', u'retarded', u'return', u'returns', u'reveal', u'revealed', u'reveals', u'revenge', u'review', u'reviewer', u'reviewers', u'reviews', u'revolution', u'rich', u'richard', u'ride', u'ridiculous', u'right', u'rights', u'ring', u'rings', u'rip', u'rise', u'risk', u'rival', u'river', u'road', u'rob', u'robert', u'robin', u'robot', u'rock', u'roger', u'rogers', u'role', u'roles', u'roll', u'rolling', u'romance', u'romantic', u'ron', u'room', u'rose', u'rough', u'round', u'routine', u'roy', u'rubbish', u'ruin', u'ruined', u'rule', u'rules', u'run', u'running', u'runs', u'russell', u'russian', u'ryan', u's', u'sad', u'sadly', u'safe', u'said', u'sake', u'sam', u'same', u'san', u'santa', u'sarah', u'sat', u'satire', u'satisfying', u'saturday', u'save', u'saved', u'saving', u'saw', u'say', u'saying', u'says', u'scale', u'scare', u'scared', u'scares', u'scary', u'scenario', u'scene', u'scenery', u'scenes', u'school', u'sci-fi', u'science', u'scientist', u'score', u'scott', u'scream', u'screaming', u'screen', u'screening', u'screenplay', u'script', u'sea', u'sean', u'search', u'season', u'seasons', u'seat', u'second', u'seconds', u'secret', u'section', u'security', u'see', u'seeing', u'seek', u'seem', u'seemed', u'seemingly', u'seems', u'seen', u'sees', u'segment', u'self', u'sell', u'send', u'sense', u'sensitive', u'sent', u'sequel', u'sequels', u'sequence', u'sequences', u'serial', u'series', u'serious', u'seriously', u'serve', u'served', u'serves', u'service', u'set', u'sets', u'setting', u'settings', u'seven', u'several', u'sex', u'sexual', u'sexy', u'shadow', u'shakespeare', u'shallow', u'shame', u'share', u'sharp', u'she', u"she's", u'sheer', u'sheriff', u'ship', u'shock', u'shocked', u'shocking', u'shoot', u'shooting', u'shop', u'short', u'shot', u'shots', u'should', u"shouldn't", u'show', u'showed', u'shower', u'showing', u'shown', u'shows', u'shut', u'sick', u'side', u'sides', u'sidney', u'sight', u'sign', u'significant', u'silent', u'silly', u'similar', u'simon', u'simple', u'simply', u'sinatra', u'since', u'sing', u'singer', u'singing', u'single', u'sinister', u'sir', u'sister', u'sisters', u'sit', u'sitcom', u'site', u'sitting', u'situation', u'situations', u'six', u'skill', u'skills', u'skin', u'skip', u'sky', u'slapstick', u'slasher', u'sleazy', u'sleep', u'sleeping', u'slightly', u'slow', u'slowly', u'small', u'smart', u'smile', u'smith', u'so', u'so-called', u'soap', u'social', u'society', u'soft', u'sold', u'soldier', u'soldiers', u'solid', u'some', u'somebody', u'somehow', u'someone', u'something', u'sometimes', u'somewhat', u'somewhere', u'son', u'song', u'songs', u'soon', u'sorry', u'sort', u'sorts', u'soul', u'sound', u'sounded', u'sounds', u'soundtrack', u'source', u'south', u'southern', u'space', u'spanish', u'speak', u'speaking', u'speaks', u'special', u'spectacular', u'speech', u'speed', u'spend', u'spends', u'spent', u'spirit', u'spite', u'spoil', u'spoiler', u'spoilers', u'spoof', u'sports', u'spot', u'spy', u'stage', u'stand', u'standard', u'standards', u'standing', u'stands', u'stanley', u'star', u'starred', u'starring', u'stars', u'start', u'started', u'starting', u'starts', u'state', u'statement', u'states', u'station', u'status', u'stay', u'stayed', u'stays', u'steal', u'steals', u'step', u'stephen', u'stereotypes', u'stereotypical', u'steve', u'steven', u'stewart', u'stick', u'still', u'stock', u'stolen', u'stomach', u'stone', u'stop', u'stopped', u'stops', u'store', u'stories', u'story', u'storyline', u'storytelling', u'straight', u'strange', u'strangely', u'street', u'streets', u'strength', u'strong', u'strongly', u'structure', u'struggle', u'struggling', u'stuck', u'student', u'students', u'studio', u'studios', u'study', u'stuff', u'stunning', u'stupid', u'stupidity', u'style', u'stylish', u'subject', u'substance', u'subtitles', u'subtle', u'succeeds', u'success', u'successful', u'successfully', u'such', u'suck', u'sucked', u'sucks', u'sudden', u'suddenly', u'suffer', u'suffering', u'suffers', u'suggest', u'suicide', u'suit', u'summary', u'summer', u'sun', u'sunday', u'super', u'superb', u'superior', u'superman', u'supernatural', u'support', u'supporting', u'suppose', u'supposed', u'supposedly', u'sure', u'surely', u'surface', u'surprise', u'surprised', u'surprises', u'surprising', u'surprisingly', u'surreal', u'survive', u'susan', u'suspect', u'suspects', u'suspense', u'suspenseful', u'sweet', u'sword', u'sympathetic', u'sympathy', u'system', u't', u'table', u'take', u'taken', u'takes', u'taking', u'tale', u'talent', u'talented', u'talents', u'tales', u'talk', u'talking', u'talks', u'tape', u'target', u'tarzan', u'task', u'taste', u'taylor', u'teacher', u'team', u'tears', u'technical', u'technically', u'technology', u'ted', u'tedious', u'teen', u'teenage', u'teenager', u'teenagers', u'teens', u'teeth', u'television', u'tell', u'telling', u'tells', u'ten', u'tend', u'tension', u'term', u'terms', u'terrible', u'terribly', u'terrific', u'terror', u'test', u'texas', u'than', u'thank', u'thankfully', u'thanks', u'that', u"that's", u'thats', u'the', u'theater', u'theaters', u'theatre', u'theatrical', u'their', u'them', u'theme', u'themes', u'themselves', u'then', u'theory', u'there', u"there's", u'therefore', u'these', u'they', u"they're", u"they've", u'thin', u'thing', u'things', u'think', u'thinking', u'thinks', u'third', u'this', u'thomas', u'thoroughly', u'those', u'though', u'thought', u'thoughts', u'three', u'thriller', u'thrilling', u'through', u'throughout', u'throw', u'throwing', u'thrown', u'throws', u'thus', u'tight', u'till', u'tim', u'time', u'times', u'timing', u'tiny', u'tired', u'titanic', u'title', u'titles', u'to', u'today', u"today's", u'together', u'told', u'tom', u'tone', u'tony', u'too', u'took', u'top', u'topic', u'torture', u'total', u'totally', u'touch', u'touched', u'touches', u'touching', u'tough', u'toward', u'towards', u'town', u'track', u'tradition', u'traditional', u'tragedy', u'tragic', u'trailer', u'train', u'training', u'trapped', u'trash', u'travel', u'treasure', u'treat', u'treated', u'treatment', u'tree', u'trek', u'trick', u'tried', u'tries', u'trilogy', u'trip', u'trouble', u'truck', u'true', u'truly', u'trust', u'truth', u'try', u'trying', u'turkey', u'turn', u'turned', u'turning', u'turns', u'tv', u'twenty', u'twice', u'twist', u'twisted', u'twists', u'two', u'type', u'types', u'typical', u'u', u'ugly', u'uk', u'ultimate', u'ultimately', u'unable', u'unbelievable', u'uncle', u'unconvincing', u'under', u'underground', u'underrated', u'understand', u'understanding', u'understood', u'unexpected', u'unfortunate', u'unfortunately', u'unfunny', u'uninteresting', u'unique', u'united', u'universal', u'universe', u'unknown', u'unless', u'unlike', u'unlikely', u'unnecessary', u'unrealistic', u'until', u'unusual', u'up', u'upon', u'urban', u'us', u'usa', u'use', u'used', u'uses', u'using', u'usual', u'usually', u'utter', u'utterly', u'v', u'vacation', u'value', u'values', u'vampire', u'vampires', u'van', u'variety', u'various', u'vehicle', u'version', u'versions', u'very', u'veteran', u'vhs', u'via', u'victim', u'victims', u'victor', u'victoria', u'video', u'vietnam', u'view', u'viewed', u'viewer', u'viewers', u'viewing', u'views', u'village', u'villain', u'villains', u'violence', u'violent', u'virtually', u'vision', u'visit', u'visual', u'visually', u'visuals', u'voice', u'voices', u'von', u'vote', u'vs', u'w', u'wait', u'waiting', u'walk', u'walked', u'walking', u'walks', u'wall', u'walter', u'want', u'wanted', u'wanting', u'wants', u'war', u'warm', u'warned', u'warner', u'warning', u'wars', u'was', u'washington', u"wasn't", u'waste', u'wasted', u'watch', u'watchable', u'watched', u'watching', u'water', u'wave', u'way', u'wayne', u'ways', u'we', u"we're", u"we've", u'weak', u'weapons', u'wear', u'wearing', u'wears', u'wedding', u'week', u'weekend', u'weeks', u'weird', u'welcome', u'well', u'welles', u'went', u'were', u"weren't", u'werewolf', u'west', u'western', u'westerns', u'what', u"what's", u'whatever', u'whatsoever', u'when', u'whenever', u'where', u'whether', u'which', u'while', u'whilst', u'white', u'who', u"who's", u'whoever', u'whole', u'whom', u'whose', u'why', u'wide', u'wife', u'wild', u'will', u'william', u'williams', u'willing', u'wilson', u'win', u'wind', u'window', u'winner', u'winning', u'wins', u'wise', u'wish', u'wit', u'witch', u'with', u'within', u'without', u'witness', u'witty', u'woman', u'women', u'won', u"won't", u'wonder', u'wonderful', u'wonderfully', u'wondering', u'wood', u'wooden', u'woods', u'woody', u'word', u'words', u'work', u'worked', u'working', u'works', u'world', u'worse', u'worst', u'worth', u'worthwhile', u'worthy', u'would', u"would've", u"wouldn't", u'wow', u'write', u'writer', u'writers', u'writing', u'written', u'wrong', u'wrote', u'x', u'yeah', u'year', u'year-old', u'years', u'yes', u'yet', u'york', u'you', u"you'd", u"you'll", u"you're", u"you've", u'young', u'younger', u'your', u'yourself', u'youth', u'zero', u'zombie', u'zombies']

In [None]:
print np.array(mat[0])
print np.array(temp_rowshit)

In [None]:
temp_rownames = ['hello', 'test', 'pie', 'dirty', 'bad', 'good']
so = semantic_orientation(mat=np.array(mat[0]), rownames=mat[1])
so[:5]
so[-5:]

In [None]:
so[:5]

In [None]:
#convert from list to easily searchable hashmap
word_scores = dict()
for tup in so:
    word_scores[tup[0]] = tup[1]
    
def get_semantic_score(word_list):
    score = 0
    for word in word_list:
        if word in word_scores:
            score += word_scores[word]
        else:
            print 'not in vocab'
    return score

get_semantic_score(['good', 'bad', 'john'])

In [None]:
"你好".encode('utf-8')
encode converts a unicode object to a string object. But here you have invoked it on a string object (because you don't have the u). So python has to convert the string to a unicode object first. So it does the equivalent of

"你好".decode().encode('utf-8')
But the decode fails because the string isn't valid ascii. That's why you get a complaint about not being able to decode.

# XOR/AND
Takes in a dict of corpus:list of words and returns a dict of corpus:XOR words and dict of corpus:AND words.

In [None]:
toyList = ['black', 'block', 'beer']

def XOR(corpus1, corpus2):
    first = set(corpus1)
    second = set(corpus2)
    return first ^ second
def AND(corpus1, corpus2):
    first = set(corpus1)
    second = set(corpus2)
    return first & second

print 'XOR'
print XOR(toyList, neighbors_word_list)
print 'AND'
print AND(toyList, neighbors_word_list)

# Word Cloud
Takes in a matrix M and correlation list L. Using t-sne, produces a word cloud which represents correlation between all terms. 

In [None]:
from sklearn.manifold import TSNE
import numpy
numpy.set_printoptions(threshold='nan')

def word_cloud_preprocessing(words, matrix=mat_ppmi):
    output = []
    for word in words:
        ind = matrix[1].index(word)
        output.append(matrix[0][ind])
    return output
processed_mat = word_cloud_preprocessing(neighbors_word_list)
print processed_mat

def word_cloud(corr_list): #i think its processed_mat / didn't tsne take in a vector of labels as well?
    model = TSNE(n_components=2, random_state=0)
    tsne_matrix = model.fit_transform(corr_list)
    
word_cloud(processed_mat)

# Latent Dirichlet Allocation

topic modeling, currently using dummy data from lda.datasets

NOTE: rerunning can cause relabeling, which means that topic 0 in the first run might now be topic 15 in the next run, so don't be worried if the topic numbers change from run to run

run this on the command line first: pip install --user lda

https://pypi.python.org/pypi/lda

In [None]:
#imports

from __future__ import division, print_function

import numpy as np

#use pip show lda to find the path of where it's installed for you and modify the path append line below with your location
import sys
sys.path.append('/Users/theodorachu/.local/lib/python2.7/site-packages')

import lda
import lda.datasets

In [None]:
# document-term matrix
X = lda.datasets.load_reuters()
print("type(X): {}".format(type(X)))
print("shape: {}\n".format(X.shape))

# the vocab
vocab = lda.datasets.load_reuters_vocab()
print("type(vocab): {}".format(type(vocab)))
print("len(vocab): {}\n".format(len(vocab)))

# titles for each story
titles = lda.datasets.load_reuters_titles()
print("type(titles): {}".format(type(titles)))
print("len(titles): {}\n".format(len(titles)))

In [None]:
#example print statements
#gets word 3117 from document 0

doc_id = 0
word_id = 3117

print("doc id: {} word id: {}".format(doc_id, word_id))
print("-- count: {}".format(X[doc_id, word_id]))
print("-- word : {}".format(vocab[word_id]))
print("-- doc  : {}".format(titles[doc_id]))

In [None]:
#fitting the model

model = lda.LDA(n_topics=20, n_iter=500, random_state=1)
model.fit(X)

In [None]:
#topic-word probabilities
#shape: (num topics, num words)

topic_word = model.topic_word_
print("type(topic_word): {}".format(type(topic_word)))
print("shape: {}".format(topic_word.shape))

In [None]:
#spits out top n words for each topic by probability

n = 5
for i, topic_dist in enumerate(topic_word):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n+1):-1]
    print('*Topic {}\n- {}'.format(i, ' '.join(topic_words)))

In [None]:
#document-topic probabilities
#shape: (num documents, num topics)

doc_topic = model.doc_topic_
print("type(doc_topic): {}".format(type(doc_topic)))
print("shape: {}".format(doc_topic.shape))

In [None]:
for i in range(10):
    print("{} (top topic: {})".format(titles[i], doc_topic[i].argmax()))

In [None]:
#visualizing the inference - matlab setup/imports
import matplotlib.pyplot as plt

# use matplotlib style sheet
try:
    plt.style.use('ggplot')
except:
    # version of matplotlib might not be recent
    pass

right now the plots don't print? it just throws the notebook into busy mode for a very long time so not sure if something is off

In [None]:
#stem plots - height of each stem reflects the probability of the word in the focus topic

f, ax= plt.subplots(5, 1, figsize=(8, 6), sharex=True)
for i, k in enumerate([0, 5, 9, 14, 19]):
    ax[i].stem(topic_word[k,:], linefmt='b-',
               markerfmt='bo', basefmt='w-')
    ax[i].set_xlim(-50,4350)
    ax[i].set_ylim(0, 0.08)
    ax[i].set_ylabel("Prob")
    ax[i].set_title("topic {}".format(k))

ax[4].set_xlabel("word")

plt.tight_layout()
plt.show()

In [None]:
#topic distribution - probability of each of the 20 topics for every document
f, ax= plt.subplots(5, 1, figsize=(8, 6), sharex=True)
for i, k in enumerate([1, 3, 4, 8, 9]): #only plotting these specified topics
    ax[i].stem(doc_topic[k,:], linefmt='r-',
               markerfmt='ro', basefmt='w-')
    ax[i].set_xlim(-1, 21)
    ax[i].set_ylim(0, 1)
    ax[i].set_ylabel("Prob")
    ax[i].set_title("Document {}".format(k))

ax[4].set_xlabel("Topic")

plt.tight_layout()
plt.show()