# PS2

In [1]:
# QUESTION ONE
import numpy as np
from nltk.tokenize import wordpunct_tokenize
from nltk import PorterStemmer
from collections import Counter
"""
This is a class sherlock. 
Notice how it is defined with the keyword `class` and a name that begins with a capital letter
"""
class Document():

    """Represent one document (a single speech) and its token stream.

    Attributes:
        year: the value passed as ``speech_year``.
        pres: the value passed as ``speech_pres``.
        text: the lower-cased speech text.
        tokens: numpy array of tokens; rewritten in place by the
            cleaning methods below.
    """

    def __init__(self, speech_year, speech_pres, speech_text):
        """
        Store the metadata, lower-case the text and tokenize it.

        input: speech_year: year label of the speech
               speech_pres: president label of the speech
               speech_text: raw text of the speech (must support .lower())
        """

        # These are data members
        self.year = speech_year
        self.pres = speech_pres
        self.text = speech_text.lower()
        self.tokens = np.array(wordpunct_tokenize(self.text))

    def token_clean(self, length):

        """
        description: keep only purely alphabetic tokens that are strictly
        longer than 'length'; everything else is dropped
        input: length: cut off length (tokens with len <= length are removed)
        """

        self.tokens = np.array([t for t in self.tokens if (t.isalpha() and len(t) > length)])

    def stopword_remove(self, stopwords):

        """
        description: Remove stopwords from tokens.
        input: stopwords: an iterable of stopwords (list, set, numpy array...)
        """

        # Build the lookup once: membership in a set is O(1), whereas testing
        # against a list/array rescans every stopword for every token.
        blocked = set(stopwords)
        self.tokens = np.array([t for t in self.tokens if t not in blocked])

    def stem(self):

        """
        description: Stem tokens with Porter Stemmer.
        """

        # One stemmer instance is enough for the whole document.
        stemmer = PorterStemmer()
        self.tokens = np.array([stemmer.stem(t) for t in self.tokens])

    def demo_self():
        # Deliberately declared without `self`: calling it on an instance
        # raises TypeError, which is the point of the demonstration.
        print('this will error out')

In [57]:
import numpy as np
import codecs
import nltk
import math
import re
from nltk.tokenize import wordpunct_tokenize
from nltk import PorterStemmer
from __future__ import division
from collections import Counter
import copy

class Corpus():

    """
    The Corpus class represents a document collection.

    Attributes created by the constructor:
        docs: list of Document objects
        N: number of documents
        clean_length: token/stopword length cut-off
        stopwords: numpy array of stemmed stopwords
        token_set: set of all tokens in the cleaned corpus (the vocabulary)
        array: pandas DataFrame of raw counts, rows labelled 1..N,
            columns are the sorted vocabulary
        tfidf2: pandas DataFrame of log-normalised tf-idf weights with the
            same labels as ``array``

    Attributes created on demand:
        doc_term_matrix, tfidf: numpy arrays (see document_term_matrix, tf_idf)
        dictrank: list of Documents (see dict_rank)
        dictrank1: DataFrame of scores (see dict_rank1)
    """

    def __init__(self, doc_data, stopword_file, clean_length):
        """
        Build the documents, clean them and compute the count and tf-idf
        tables.

        input: doc_data: iterable of records whose items 0, 1 and 2 are
                   passed to Document as year, president and text
               stopword_file: path of a UTF-8 stopword file, one word per line
               clean_length: tokens and stopwords must be longer than this
        """

        # Initialise documents by invoking the appropriate class
        self.docs = [Document(doc[0], doc[1], doc[2]) for doc in doc_data]

        self.N = len(self.docs)
        self.clean_length = clean_length

        # get a list of stopwords
        self.create_stopwords(stopword_file, clean_length)

        # stopword removal, token cleaning and stemming to docs.
        # The cut-off used to be hard-coded to 2 here, silently ignoring
        # the constructor argument.
        self.clean_docs(clean_length)

        # create vocabulary
        self.corpus_tokens()

        self.document_term_matrix1()

        self.tf_idf2()

    # ------------------------------------------------------------------
    # private helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _log_normalise(counts):
        """Return 1 + ln(count) where count > 0 and 0 elsewhere."""
        counts = np.asarray(counts, dtype=float)
        weights = np.zeros_like(counts)
        positive = counts > 0
        weights[positive] = 1.0 + np.log(counts[positive])
        return weights

    def _inverse_document_frequency(self, doc_freq):
        """Return ln(N / df) for an array of document frequencies."""
        # float() keeps this a true division even without
        # `from __future__ import division` on Python 2.
        return np.log(float(self.N) / np.asarray(doc_freq))

    # ------------------------------------------------------------------
    # corpus preparation
    # ------------------------------------------------------------------
    def clean_docs(self, length):
        """
        Applies token cleaning, stopword removal and stemming to docs

        input: length: cut off length handed to Document.token_clean
        """
        # NOTE(review): stopwords are stemmed (see create_stopwords) but are
        # matched against tokens *before* the tokens are stemmed, so a
        # stopword whose stem differs from its surface form is not removed
        # here. Left unchanged because it alters the resulting vocabulary.
        for doc in self.docs:
            doc.token_clean(length)
            doc.stopword_remove(self.stopwords)
            doc.stem()

    def create_stopwords(self, stopword_file, length):
        """
        description: parses a file of stopwords, keeps the words strictly
        longer than 'length' and stems them; the result is stored in
        self.stopwords
        input: length: cutoff length for words
               stopword_file: stopwords file to parse
        """

        with codecs.open(stopword_file, 'r', 'utf-8') as f:
            raw = f.read()

        # a single stemmer instance serves every word
        stemmer = PorterStemmer()
        self.stopwords = np.array([stemmer.stem(word)
                                   for word in raw.splitlines()
                                   if len(word) > length])

    def corpus_tokens(self):
        """
        description: create the set of all tokens, in other words the
        vocabulary, and store it in self.token_set
        """
        self.token_set = set().union(*[doc.tokens for doc in self.docs])

    # ------------------------------------------------------------------
    # numpy based representation
    # ------------------------------------------------------------------
    def document_term_matrix(self):
        """
        description: create a matrix listing the number of times each vocab
        term appears in each doc. The result (self.doc_term_matrix) is an
        N x v float array whose columns follow the sorted vocabulary.
        """
        # A fixed, sorted column order replaces the former reliance on
        # dict.values() ordering, which is not tied to the vocabulary order.
        vocab = sorted(self.token_set)
        column_of = dict((token, j) for j, token in enumerate(vocab))

        counts = np.zeros((self.N, len(vocab)))
        for i, doc in enumerate(self.docs):
            for word in doc.tokens:
                counts[i, column_of[word]] += 1

        self.doc_term_matrix = counts

    def tf_idf(self):
        """
        description: rebuild self.doc_term_matrix and derive self.tfidf from
        it, using tf = 1 + ln(count) (0 for absent terms) and
        idf = ln(N / document frequency)
        """
        # call document_term_matrix
        self.document_term_matrix()

        counts = self.doc_term_matrix
        doc_freq = (counts > 0).sum(axis=0)

        self.tfidf = (self._log_normalise(counts)
                      * self._inverse_document_frequency(doc_freq))

    def dict_rank(self, dictionary, representation, n):
        """
        description: score every document by the summed weight of the
        dictionary terms and store the n best Documents, best first, in
        self.dictrank. Ties keep corpus order.
        input: dictionary: collection of (stemmed) terms
               representation: "doc-term" for raw counts, "tf-idf" for tf-idf
               n: number of documents to keep
        raises: ValueError for any other representation
        """
        self.tf_idf()

        # Choose whether to rely on the doc term matrix or the tf-idf matrix
        if representation == "doc-term":
            compare_docs = self.doc_term_matrix
        elif representation == "tf-idf":
            compare_docs = self.tfidf
        else:
            raise ValueError("representation must be 'doc-term' or 'tf-idf'")

        # Same sorted column order as document_term_matrix
        wanted = set(dictionary)
        columns = [j for j, token in enumerate(sorted(self.token_set))
                   if token in wanted]
        weights = compare_docs[:, columns].sum(axis=1)

        # sorted() is stable, so equal weights stay in corpus order, which
        # matches repeatedly picking the first maximum.
        order = sorted(range(len(weights)), key=lambda i: weights[i],
                       reverse=True)

        self.dictrank = [self.docs[i] for i in order[:max(n, 0)]]

    # ------------------------------------------------------------------
    # pandas based representation (dom's version)
    # ------------------------------------------------------------------
    def document_term_matrix1(self):
        """
        description: store in self.array a D by V DataFrame of frequency
        counts; rows are labelled 1..D, columns are the sorted vocabulary
        """
        import pandas as pd

        vocab = sorted(self.token_set)
        column_of = dict((token, j) for j, token in enumerate(vocab))

        # Fill a preallocated array by explicit column index: counts cannot
        # end up under the wrong term, and the frame is built in one go
        # instead of being grown row by row.
        counts = np.zeros((len(self.docs), len(vocab)), dtype=int)
        for i, doc in enumerate(self.docs):
            unique, freq = np.unique(doc.tokens, return_counts=True)
            counts[i, [column_of[token] for token in unique]] = freq

        self.array = pd.DataFrame(counts, columns=vocab,
                                  index=range(1, len(self.docs) + 1))

    # A "double normalization 0.5" variant (tf_idf1) was disabled in the
    # original code; it weighted each count as
    # (count / (2 * max count in the document) + 0.5) * idf
    # and would have stored the result in self.tfidf1.

    def tf_idf2(self):
        """
        description: store in self.tfidf2 the log-normalised tf-idf weights
        computed from self.array: tf = 1 + ln(count) (0 for absent terms),
        idf = ln(N / number of documents containing the term)
        """
        import pandas as pd

        counts = self.array.values.astype(float)
        num_per_doc = (counts > 0).sum(axis=0)

        weights = (self._log_normalise(counts)
                   * self._inverse_document_frequency(num_per_doc))

        self.tfidf2 = pd.DataFrame(weights, index=self.array.index,
                                   columns=self.array.columns)

    def dict_rank1(self, dictionary, representation, n=10):
        """
        description: sum the weights of the dictionary terms per document
        and store the n highest-scoring rows, best first, in self.dictrank1
        (a DataFrame with one column, "frequency", indexed by document
        number)
        input: dictionary: collection of (stemmed) terms
               representation: "doc-term" for raw counts, "tf-idf" for tf-idf
               n: number of documents to keep
        raises: ValueError for any other representation
        """
        if representation == "doc-term":
            source = self.array
        elif representation == "tf-idf":
            # This used to read self.tfidf1, which is never created;
            # tf_idf2 is the table the constructor actually builds.
            source = self.tfidf2
        else:
            raise ValueError("representation must be 'doc-term' or 'tf-idf'")

        # only terms that actually occur in the corpus have a column
        dict_intersection = sorted(set(self.token_set) & set(dictionary))

        scores = source[dict_intersection].sum(axis=1).to_frame(name="frequency")
        scores = scores.sort_values(by="frequency", ascending=False)
        self.dictrank1 = scores.iloc[:n]
            

In [58]:
def parse_text(textraw, regex):
    """Extract speech records from a raw text dump.

    Every match of ``regex`` (compiled with MULTILINE and DOTALL) becomes a
    mutable list of its captured groups. Newline characters are stripped
    from the third group, and the records are returned in sorted order.

    input: textraw: the raw text to search
           regex: pattern with at least three capture groups
    returns: sorted list of lists, one per match
    """
    pattern = re.compile(regex, re.MULTILINE|re.DOTALL)

    records = []
    for match in pattern.findall(textraw):
        # tuples are immutable, so work on a list copy of the groups
        fields = list(match)
        fields[2] = fields[2].replace('\n', '')
        records.append(fields)

    return sorted(records)

# Question 1

In [59]:
# Read the whole speech file; the context manager closes the handle, which
# the bare open(...).read() never did.
with open('../week0/sou_all.txt', 'r') as sou_file:
    text = sou_file.read()

# Three capture groups: a four-digit year, the last "_name_" field before the
# row of asterisks, and the body that starts with two newlines and stops
# before three consecutive newlines. Written as a raw string so the regex
# escapes (\d, \*, \n) are not treated as string escapes; the pattern text
# is unchanged.
regex = r"_(\d{4}).*?_[a-zA-Z]+.*?_[a-zA-Z]+.*?_([a-zA-Z]+)_\*+(\n{2}.*?)\n{3}"
pres_speech_list = parse_text(text, regex)

corpus = Corpus(pres_speech_list, '../data/stopwords/stopwords.txt', 2)


In [60]:
# Full singular value decomposition of the tf-idf table. `s` holds the
# singular values; the low-rank cell below multiplies U, a padded diagonal
# matrix and V in that order, i.e. V is used as the already-transposed right
# factor. full_matrices=True makes both U and V square.
U, s, V = np.linalg.svd(corpus.tfidf2, full_matrices=True)

In [73]:
# low rank approximation
# Keep only the largest singular values and rebuild the matrix from them.
# The padded diagonal matrix takes its shape from the SVD factors instead of
# hard-coded corpus dimensions, and the rank is capped at the number of
# singular values available.
svd_rank = min(100, len(s))
S = np.zeros((U.shape[0], V.shape[0]))
S[:svd_rank, :svd_rank] = np.diag(s[:svd_rank])
X_hat = np.dot(U, np.dot(S, V))

In [75]:
# Euclidean norm of every document row, as a column vector. reshape((-1, 1))
# adapts to the number of documents instead of assuming 236.
doc_magnitude = corpus.tfidf2.apply(np.linalg.norm, axis=1).values.reshape((-1, 1))

# Cosine similarity: pairwise dot products divided by the product of norms.
# A document with an all-zero row would give a 0/0 entry here.
cos_sim_mat = np.true_divide(corpus.tfidf2.dot(corpus.tfidf2.T),np.dot(doc_magnitude,doc_magnitude.T))

In [84]:
# Row norms of the low-rank matrix as a column vector; sized from X_hat
# itself rather than the hard-coded 236.
doc_magnitude_X_hat = np.linalg.norm(X_hat, axis=1).reshape((-1, 1))

# Cosine similarity between the rows of the low-rank approximation.
cos_sim_mat_X_hat = np.true_divide(X_hat.dot(X_hat.T),np.dot(doc_magnitude_X_hat,doc_magnitude_X_hat.T))

import pandas as pd
# Label rows and columns 1..number of documents so that the 1-based document
# positions used elsewhere can index this table with .loc.
doc_labels = range(1, X_hat.shape[0] + 1)
cos_sim_mat_X_hat = pd.DataFrame(cos_sim_mat_X_hat, index=doc_labels, columns=doc_labels)

In [85]:
import csv

# Collect the second column of every CSV row, in file order.
party = []
with open('/home/fizlaz/bgse/14D010_Text_Mining/text_mining_DUB/pres_party_236.csv', mode='r') as csvfile:
    party.extend(record[1] for record in csv.reader(csvfile))
        

In [86]:
import numpy as np

# 1-based positions of the entries of `party` carrying each label
democrat = np.array([pos for pos, label in enumerate(party, 1) if label == "Democrat"])
republican = np.array([pos for pos, label in enumerate(party, 1) if label == "Republican"])

In [87]:
def _mean_similarity(sim, rows, cols):
    """Average the block sim.loc[rows, cols].

    Returns a pair: the per-row averages over `cols`, and the average of
    those per-row values.
    """
    per_row = np.sum(sim.loc[rows, cols], axis=1) / len(cols)
    return per_row, sum(per_row) / len(per_row)


# NOTE(review): the same-party blocks (democrat x democrat, republican x
# republican) contain the diagonal, so each speech's similarity with itself
# is part of those averages.

# low-rank (X_hat) similarities
a, dem_rep_X_hat = _mean_similarity(cos_sim_mat_X_hat, democrat, republican)
b, dem_dem_X_hat = _mean_similarity(cos_sim_mat_X_hat, democrat, democrat)
c, rep_rep_X_hat = _mean_similarity(cos_sim_mat_X_hat, republican, republican)

# tf-idf similarities
d, dem_rep = _mean_similarity(cos_sim_mat, democrat, republican)
e, dem_dem = _mean_similarity(cos_sim_mat, democrat, democrat)
f, rep_rep = _mean_similarity(cos_sim_mat, republican, republican)

In [89]:
# Each pair prints the low-rank (X_hat) average first, then the tf-idf one.
# Single-argument print(...) produces the same output under Python 2 and
# also runs under Python 3.
print(dem_rep_X_hat)
print(dem_rep)

print(dem_dem_X_hat)
print(dem_dem)

print(rep_rep_X_hat)
print(rep_rep)

0.258515324543
0.124664376825
0.248789881752
0.155281703464
0.325924856127
0.12818336127


# Question 2