# PS2

In [1]:
# QUESTION ONE
import numpy as np
from nltk.tokenize import wordpunct_tokenize
from nltk import PorterStemmer
from collections import Counter
"""
This is a class sherlock. 
Notice how it is defined with the keyword `class` and a name that begins with a capital letter
"""
class Document():
    
    """ The Doc class rpresents a class of individul documents
    
    """
    
    def __init__(self, speech_year, speech_pres, speech_text):
        """
        The __init__ method is called everytime an object is instantiated.
        This is where you will define all the properties of the object that it must have
        when it is `born`.
        """
        
        #These are data members
        self.year = speech_year
        self.pres = speech_pres
        self.text = speech_text.lower()
        self.tokens = np.array(wordpunct_tokenize(self.text))
        
        
        
    def token_clean(self,length):

        """ 
        description: strip out non-alpha tokens and tokens of length > 'length'
        input: length: cut off length 
        """

        self.tokens = np.array([t for t in self.tokens if (t.isalpha() and len(t) > length)])


    def stopword_remove(self, stopwords):

        """
        description: Remove stopwords from tokens.
        input: stopwords: a suitable list of stopwords
        """

        
        self.tokens = np.array([t for t in self.tokens if t not in stopwords])


    def stem(self):

        """
        description: Stem tokens with Porter Stemmer.
        """
        
        self.tokens = np.array([PorterStemmer().stem(t) for t in self.tokens])
        

        
    def demo_self():
        print 'this will error out'

In [54]:
import numpy as np
import codecs
import nltk
import math
import re
from nltk.tokenize import wordpunct_tokenize
from nltk import PorterStemmer
from __future__ import division
from collections import Counter
import copy

class Corpus():
    
    """ 
    The Corpus class represents a document collection
     
    """
    def __init__(self, doc_data, stopword_file, clean_length):
        """
        Notice that the __init__ method is invoked everytime an object of the class
        is instantiated
        """
        
        #Initialise documents by invoking the appropriate class
        self.docs = [Document(doc[0], doc[1], doc[2]) for doc in doc_data] 
        
        self.N = len(self.docs)
        self.clean_length = clean_length
        
        #get a list of stopwords
        self.create_stopwords(stopword_file, clean_length)
        
        #stopword removal, token cleaning and stemming to docs
        self.clean_docs(2)
        
        #create vocabulary
        self.corpus_tokens()
        
        self.document_term_matrix1()
        
        #self.tf_idf1()
        
        self.tf_idf2()
        
    def clean_docs(self, length):
        """ 
        Applies stopword removal, token cleaning and stemming to docs
        """
        for doc in self.docs:
            doc.token_clean(length)
            doc.stopword_remove(self.stopwords)
            doc.stem()        
    
    def create_stopwords(self, stopword_file, length):
        """
        description: parses a file of stowords, removes words of length 'length' and 
        stems it
        input: length: cutoff length for words
               stopword_file: stopwords file to parse
        """
        
        with codecs.open(stopword_file,'r','utf-8') as f: raw = f.read()
        
        self.stopwords = (np.array([PorterStemmer().stem(word) 
                                    for word in list(raw.splitlines()) if len(word) > length]))
        
     
    def corpus_tokens(self):
        """
        description: create a set of all all tokens or in other words a vocabulary
        """
        
        #initialise an empty set
        self.token_set = set()
        for doc in self.docs:
            self.token_set = self.token_set.union(doc.tokens) 
            
    def document_term_matrix(self):
        """
        description: create a matrix listing the number of times each vocab term appears in each doc
        """
        
        # get v, the number of tokens
        v = len(self.token_set)
        
        
        doc_term_matrix = []
        for doc in self.docs: 

            # create an empty dictionary of words, add the counts
            wordlist_dict = {}
            word_list = doc.tokens
            for i in self.token_set:
                wordlist_dict[i] = 0.0
            for word in word_list:
                wordlist_dict[word] += 1
            # for each doc, we append its list of word counts to the doc term matrix    
            doc_term_matrix.append(wordlist_dict.values())
        
        # turn the resulting list into a Dxv matrix
        self.doc_term_matrix = np.array(doc_term_matrix)
        self.doc_term_matrix = self.doc_term_matrix.reshape((self.N, v))
        
        
    def tf_idf(self):
        # call document_term_matrix
        self.document_term_matrix()
        
        tf = np.copy(self.doc_term_matrix)
        df = [0] * len(self.token_set)
        
        # create the tf and df values
        for i in range(len(tf)):
            for j in range(len(tf[i])):
                if tf[i][j] > 0:
                    df[j] += 1
                    tf[i][j] = 1 + math.log(tf[i][j])
        
        for j in range(len(df)):
            df[j] = math.log(self.N/df[j])
        
        # put it together into a tf-idf matrix
        for i in range(len(tf)):
            for j in range(len(tf[i])):
                tf[i][j] = tf[i][j] * df[j]
         

        self.tfidf = tf
    
    def dict_rank(self, dictionary, representation, n):
        
        self.tf_idf()

        # Choose whether to rely on the doc term matrix of the tf-idf matrix
        if representation == "doc-term":
            compare_docs = copy.copy(self.doc_term_matrix)
        elif representation == "tf-idf":     
            compare_docs = copy.copy(self.tfidf)

        # Prepare the variables for use later
        docs = copy.copy(self.docs)
        doclist = []
        weights = [0] * self.N
        j = 0
        
        # Collect the weights of each document
        for doc in docs:
            i = 0
            for token in iter(self.token_set):
                if token in dictionary:
                    weights[j] += compare_docs[j][i]
                i += 1
            j += 1
        
        # take the documents with the n largest weights
        for i in range(n):
            whichmax = weights.index(max(weights))
            docmax = docs[whichmax]
            doclist.append(docmax)
            weights.remove(max(weights))
            docs.remove(docmax)
        
        # Add it to self
        self.dictrank = doclist 

        
    ################## dom's version      
    def document_term_matrix1(self):
        import pandas as pd
        srted = sorted(self.token_set)
        """ return a D by V array of frequency counts """
        self.array = pd.DataFrame(columns=srted)
        for doc in self.docs:
            ls = doc.tokens
            unique, counts = np.unique(ls, return_counts=True)
            dc = dict(zip(unique,counts))
            big = {key: 0 for key in srted}
            big.update(dc)
            row = big.values()
            self.array.loc[doc]=row
            
        self.array.index = range(1,len(self.docs)+1)
        
    """ this is using double normalization 0.5 instead of log normalization   
    def tf_idf1(self):
        num_per_doc = self.array.astype(bool).sum(axis=0)
        
        idf_all = np.log(self.N/num_per_doc)
        
        import pandas as pd
        srted = sorted(self.token_set)
        self.tfidf1 = pd.DataFrame(columns=srted)

        for i in (range(self.N+1)[1:]):
            self.tfidf1.loc[i] = (self.array.xs(i)/(2*max(self.array.xs(i)))+0.5)*idf_all
    """
    
    def tf_idf2(self):
        num_per_doc = self.array.astype(bool).sum(axis=0)
        
        idf_all = np.log(self.N/num_per_doc)
        
        import pandas as pd
        srted = sorted(self.token_set)
        self.tfidf2 = pd.DataFrame(columns=srted)
        
        def log_norm(x):
            if x==0:
                return 0
            else:
                return(1+np.log(x))
        
        array1 = self.array.applymap(log_norm)

        for i in (range(self.N+1)[1:]):
            self.tfidf2.loc[i] = array1.xs(i)*idf_all

                     
    def dict_rank1(self, dictionary, representation, n=10):
        import pandas as pd
        srted = sorted(self.token_set)
        dict_intersection = list(set(srted) & set(dictionary))
        if representation == "doc-term":
            rowsums = np.sum(self.array[dict_intersection],axis=1)
            b=pd.DataFrame(rowsums, columns=["frequency"])
            b=b.sort_values(by="frequency",ascending=0)
            self.dictrank1 = b[:n]
        elif representation == "tf-idf":     
            rowsums = np.sum(self.tfidf1[dict_intersection],axis=1)
            b=pd.DataFrame(rowsums, columns=["frequency"])
            b=b.sort_values(by="frequency",ascending=0)
            self.dictrank1 = b[:n]
            
            

In [55]:
def parse_text(textraw, regex):
    """takes raw string and performs two operations
    1. Breaks text into a list of speech, president and speech
    2. breaks speech into paragraphs
    """
    prs_yr_spch_reg = re.compile(regex, re.MULTILINE|re.DOTALL)
    
    #Each tuple contains the year, last ane of the president and the speech text
    prs_yr_spch = prs_yr_spch_reg.findall(textraw)
    
    #convert immutabe tuple to mutable list
    prs_yr_spch = [list(tup) for tup in prs_yr_spch]
    
    for i in range(len(prs_yr_spch)):
        prs_yr_spch[i][2] = prs_yr_spch[i][2].replace('\n', '')
    
    #sort
    prs_yr_spch.sort()
    
    return(prs_yr_spch)

# Question 1

In [56]:
text = open('../week0/sou_all.txt', 'r').read()
regex = "_(\d{4}).*?_[a-zA-Z]+.*?_[a-zA-Z]+.*?_([a-zA-Z]+)_\*+(\\n{2}.*?)\\n{3}"
pres_speech_list = parse_text(text, regex)

corpus = Corpus(pres_speech_list, '../data/stopwords/stopwords.txt', 2)


AttributeError: Corpus instance has no attribute 'tf_idf1'

In [7]:
U, s, V = np.linalg.svd(corpus.tfidf2, full_matrices=True)

In [8]:
# low rank approximation
S = np.zeros((236,13568), dtype=complex)
S[:100, :100] = np.diag(s[:100])
X_hat = np.dot(U, np.dot(S, V))

In [9]:
doc_magnitude = corpus.tfidf2.apply(np.linalg.norm, axis=1).values.reshape((236,1))

cos_sim_mat = np.true_divide(corpus.tfidf2.dot(corpus.tfidf2.T),np.dot(doc_magnitude,doc_magnitude.T))

In [10]:
doc_magnitude_X_hat = np.apply_along_axis(np.linalg.norm, 1, X_hat).reshape((236,1))

cos_sim_mat_X_hat = np.true_divide(X_hat.dot(X_hat.T),np.dot(doc_magnitude_X_hat,doc_magnitude_X_hat.T))

import pandas as pd
cos_sim_mat_X_hat = pd.DataFrame(cos_sim_mat_X_hat)
cos_sim_mat_X_hat.columns = range(1,237)
cos_sim_mat_X_hat.index = range(1,237)

In [21]:
party=[]
import csv
with open('/home/fizlaz/bgse/14D010_Text_Mining/text_mining_DUB/pres_party_236.csv', mode='r') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        #print(row[0], row[1])
        #party.append([row[0], row[1]])
        party.append(row[1])
        
party

['None, Federalist',
 'None, Federalist',
 'None, Federalist',
 'None, Federalist',
 'None, Federalist',
 'None, Federalist',
 'None, Federalist',
 'None, Federalist',
 'Federalist',
 'Federalist',
 'Federalist',
 'Federalist',
 'Democratic-Republican',
 'Democratic-Republican',
 'Democratic-Republican',
 'Democratic-Republican',
 'Democratic-Republican',
 'Democratic-Republican',
 'Democratic-Republican',
 'Democratic-Republican',
 'Democratic-Republican',
 'Democratic-Republican',
 'Democratic-Republican',
 'Democratic-Republican',
 'Democratic-Republican',
 'Democratic-Republican',
 'Democratic-Republican',
 'Democratic-Republican',
 'Democratic-Republican',
 'Democratic-Republican',
 'Democratic-Republican',
 'Democratic-Republican',
 'Democratic-Republican',
 'Democratic-Republican',
 'Democratic-Republican',
 'Democratic-Republican',
 'Democratic-Republican',
 'Democratic-Republican',
 'Democratic-Republican',
 'Democratic-Republican',
 'Democrat',
 'Democrat',
 'Democrat',
 'Dem

In [33]:
import numpy as np

democrat = np.array([i for i, x in enumerate(party) if x == "Democrat"])+1
republican = np.array([i for i, x in enumerate(party) if x == "Republican"])+1

array([ 41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,  61,
        69,  70,  71,  72,  73,  74,  75,  76, 105, 106, 107, 108, 113,
       114, 115, 116, 117, 118, 119, 120, 135, 136, 137, 138, 139, 140,
       165, 166, 176, 177, 178, 179, 180, 181, 182, 183, 184, 196, 197,
       198, 199, 200, 201, 202, 215, 216, 217, 218, 219, 220, 221, 222,
       231, 232, 233, 234, 235, 236])

In [36]:
cos_sim_mat_X_hat.loc[democrat,republican]

Unnamed: 0,77,78,79,80,89,90,91,92,93,94,...,213,214,223,224,225,226,227,228,229,230
41,(0.999999424995+0j),(0.999999497161+0j),(0.999997350774+0j),(0.999994631577+0j),(0.999994400349+0j),(0.999993511271+0j),(0.999998488488+0j),(0.999999113747+0j),(0.999998701749+0j),(0.999998818674+0j),...,(0.999988587732+0j),(0.999996505718+0j),(0.999992318858+0j),(0.999988773754+0j),(0.999978884808+0j),(0.999983860585+0j),(0.999994258574+0j),(0.999995889967+0j),(0.999990666001+0j),(0.999985909952+0j)
42,(0.9999994046+0j),(0.999999469465+0j),(0.999997197741+0j),(0.999994615417+0j),(0.999994028261+0j),(0.999993339932+0j),(0.99999826904+0j),(0.999998955294+0j),(0.999998625195+0j),(0.999998785754+0j),...,(0.999988663037+0j),(0.999996551268+0j),(0.999992224308+0j),(0.999988789964+0j),(0.999979044016+0j),(0.999983835717+0j),(0.999994192458+0j),(0.999995846264+0j),(0.999990646906+0j),(0.99998594848+0j)
43,(0.999999332305+0j),(0.999999431985+0j),(0.99999687195+0j),(0.999994434439+0j),(0.999993744147+0j),(0.999993252534+0j),(0.999998054771+0j),(0.999998954437+0j),(0.99999856435+0j),(0.999998635062+0j),...,(0.999988419383+0j),(0.999996391711+0j),(0.999992288772+0j),(0.999988700146+0j),(0.999978929551+0j),(0.999983827536+0j),(0.999994215518+0j),(0.999995859699+0j),(0.999990609192+0j),(0.999986025663+0j)
44,(0.999999381007+0j),(0.999999469084+0j),(0.999997039067+0j),(0.999994479606+0j),(0.999993984673+0j),(0.999993445579+0j),(0.999998304524+0j),(0.999998991018+0j),(0.999998599353+0j),(0.999998717465+0j),...,(0.999988632917+0j),(0.999996461523+0j),(0.999992221921+0j),(0.99998874216+0j),(0.999978984551+0j),(0.999983828547+0j),(0.99999422304+0j),(0.999995858433+0j),(0.999990623067+0j),(0.999985923304+0j)
45,(0.999999214809+0j),(0.999999354832+0j),(0.99999672032+0j),(0.999994472122+0j),(0.999993374207+0j),(0.999993345547+0j),(0.999998064949+0j),(0.999998765419+0j),(0.999998578121+0j),(0.999998683764+0j),...,(0.999988517873+0j),(0.99999647853+0j),(0.999992218796+0j),(0.999988476187+0j),(0.999978806638+0j),(0.999983738906+0j),(0.999994026644+0j),(0.999995713919+0j),(0.999990567748+0j),(0.999985786546+0j)
46,(0.999998273368+0j),(0.999998522848+0j),(0.999996130836+0j),(0.999993614249+0j),(0.999992782873+0j),(0.999992433098+0j),(0.999997428973+0j),(0.99999786555+0j),(0.999997662701+0j),(0.999997673905+0j),...,(0.999987420188+0j),(0.999995605168+0j),(0.9999913002+0j),(0.999987335101+0j),(0.999977859372+0j),(0.999982849468+0j),(0.999992939519+0j),(0.999994680156+0j),(0.999989531309+0j),(0.999984828064+0j)
47,(0.999997434238+0j),(0.999997645556+0j),(0.999995548655+0j),(0.999992979113+0j),(0.999991782936+0j),(0.999990986864+0j),(0.999996284892+0j),(0.999996824468+0j),(0.999996580265+0j),(0.999996601365+0j),...,(0.999986598132+0j),(0.999994954685+0j),(0.999990422435+0j),(0.999986564857+0j),(0.999977114668+0j),(0.999982078475+0j),(0.999992235909+0j),(0.999993914982+0j),(0.99998868313+0j),(0.999984075762+0j)
48,(0.999998632305+0j),(0.999998650539+0j),(0.999997746535+0j),(0.999995706774+0j),(0.999994091969+0j),(0.999992702882+0j),(0.999997810971+0j),(0.999998400954+0j),(0.999998222197+0j),(0.999998311526+0j),...,(0.999987808841+0j),(0.999995794499+0j),(0.999991511926+0j),(0.999987858749+0j),(0.999978116305+0j),(0.999982921821+0j),(0.999993378851+0j),(0.999994984735+0j),(0.999989872134+0j),(0.999985114803+0j)
49,(0.999999258284+0j),(0.999999370447+0j),(0.999997096917+0j),(0.999994682336+0j),(0.99999403476+0j),(0.999993368593+0j),(0.999998388577+0j),(0.999998981226+0j),(0.999998707785+0j),(0.999998788572+0j),...,(0.999988414941+0j),(0.9999963385+0j),(0.999992084739+0j),(0.999988507833+0j),(0.999978674192+0j),(0.999983513795+0j),(0.999994055016+0j),(0.999995654155+0j),(0.999990396145+0j),(0.999985739395+0j)
50,(0.999999491029+0j),(0.999999530405+0j),(0.999997406716+0j),(0.999994854788+0j),(0.999994243721+0j),(0.999993890162+0j),(0.999998503013+0j),(0.99999916909+0j),(0.999998763856+0j),(0.999998782989+0j),...,(0.999988867936+0j),(0.999996604411+0j),(0.999992334878+0j),(0.999988858983+0j),(0.999978991154+0j),(0.99998391361+0j),(0.999994377072+0j),(0.999995997107+0j),(0.999990747375+0j),(0.999985957999+0j)


In [43]:
a=(np.sum(cos_sim_mat_X_hat.loc[democrat,republican],axis=1)/len(republican))
dem_rep_X_hat = sum(a)/len(a)

b=(np.sum(cos_sim_mat_X_hat.loc[democrat,democrat],axis=1)/len(democrat))
dem_dem_X_hat = sum(b)/len(b)

c=(np.sum(cos_sim_mat_X_hat.loc[republican,republican],axis=1)/len(republican))
rep_rep_X_hat = sum(c)/len(c)

d=(np.sum(cos_sim_mat.loc[democrat,republican],axis=1)/len(republican))
dem_rep = sum(d)/len(d)

e=(np.sum(cos_sim_mat.loc[democrat,democrat],axis=1)/len(democrat))
dem_dem = sum(e)/len(e)

f=(np.sum(cos_sim_mat.loc[republican,republican],axis=1)/len(republican))
rep_rep = sum(f)/len(f)

In [47]:
print dem_rep_X_hat

print dem_dem_X_hat

print rep_rep_X_hat

print dem_rep

print dem_dem

print rep_rep

(0.999991205944+0j)
(0.999993439877+0j)
(0.999989422866+0j)
0.999987867539
0.99998996642
0.999986262528


In [49]:
cos_sim_mat

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,227,228,229,230,231,232,233,234,235,236
1,1.000000,0.999987,0.999987,0.999986,0.999989,0.999978,0.999983,0.999984,0.999986,0.999982,...,0.999982,0.999984,0.999979,0.999976,0.999987,0.999981,0.999981,0.999978,0.999979,0.999973
2,0.999987,1.000000,0.999991,0.999991,0.999993,0.999982,0.999987,0.999989,0.999990,0.999987,...,0.999987,0.999989,0.999984,0.999980,0.999992,0.999986,0.999986,0.999983,0.999984,0.999978
3,0.999987,0.999991,1.000000,0.999990,0.999992,0.999981,0.999987,0.999987,0.999989,0.999985,...,0.999986,0.999987,0.999983,0.999979,0.999990,0.999984,0.999984,0.999981,0.999982,0.999976
4,0.999986,0.999991,0.999990,1.000000,0.999992,0.999981,0.999986,0.999987,0.999989,0.999985,...,0.999986,0.999987,0.999983,0.999979,0.999990,0.999985,0.999984,0.999981,0.999982,0.999976
5,0.999989,0.999993,0.999992,0.999992,1.000000,0.999984,0.999988,0.999990,0.999992,0.999988,...,0.999989,0.999991,0.999986,0.999982,0.999994,0.999988,0.999988,0.999985,0.999986,0.999979
6,0.999978,0.999982,0.999981,0.999981,0.999984,1.000000,0.999979,0.999979,0.999980,0.999977,...,0.999977,0.999979,0.999974,0.999970,0.999982,0.999976,0.999976,0.999972,0.999974,0.999967
7,0.999983,0.999987,0.999987,0.999986,0.999988,0.999979,1.000000,0.999985,0.999986,0.999982,...,0.999982,0.999984,0.999979,0.999975,0.999986,0.999981,0.999980,0.999977,0.999978,0.999972
8,0.999984,0.999989,0.999987,0.999987,0.999990,0.999979,0.999985,1.000000,0.999989,0.999986,...,0.999984,0.999986,0.999981,0.999977,0.999989,0.999983,0.999983,0.999980,0.999981,0.999975
9,0.999986,0.999990,0.999989,0.999989,0.999992,0.999980,0.999986,0.999989,1.000000,0.999989,...,0.999986,0.999988,0.999983,0.999979,0.999991,0.999985,0.999984,0.999981,0.999982,0.999976
10,0.999982,0.999987,0.999985,0.999985,0.999988,0.999977,0.999982,0.999986,0.999989,1.000000,...,0.999982,0.999984,0.999979,0.999975,0.999987,0.999981,0.999981,0.999978,0.999979,0.999972


# Question 2