In [1]:
# QUESTION ONE
import numpy as np
from nltk.tokenize import wordpunct_tokenize
from nltk import PorterStemmer
from collections import Counter
"""
This is a class sherlock. 
Notice how it is defined with the keyword `class` and a name that begins with a capital letter
"""
class Document():
    
    """ The Doc class rpresents a class of individul documents
    
    """
    
    def __init__(self, speech_year, speech_pres, speech_text):
        """
        The __init__ method is called everytime an object is instantiated.
        This is where you will define all the properties of the object that it must have
        when it is `born`.
        """
        
        #These are data members
        self.year = speech_year
        self.pres = speech_pres
        self.text = speech_text.lower()
        self.tokens = np.array(wordpunct_tokenize(self.text))
        
        
        
    def token_clean(self,length):

        """ 
        description: strip out non-alpha tokens and tokens of length > 'length'
        input: length: cut off length 
        """

        self.tokens = np.array([t for t in self.tokens if (t.isalpha() and len(t) > length)])


    def stopword_remove(self, stopwords):

        """
        description: Remove stopwords from tokens.
        input: stopwords: a suitable list of stopwords
        """

        
        self.tokens = np.array([t for t in self.tokens if t not in stopwords])


    def stem(self):

        """
        description: Stem tokens with Porter Stemmer.
        """
        
        self.tokens = np.array([PorterStemmer().stem(t) for t in self.tokens])
        

        
    def demo_self():
        print 'this will error out'

In [2]:
import numpy as np
import codecs
import nltk
import math
import re
from nltk.tokenize import wordpunct_tokenize
from nltk import PorterStemmer
from __future__ import division
from collections import Counter
import copy

class Corpus():
    
    """ 
    The Corpus class represents a document collection
     
    """
    def __init__(self, doc_data, stopword_file, clean_length):
        """
        Notice that the __init__ method is invoked everytime an object of the class
        is instantiated
        """
        
        #Initialise documents by invoking the appropriate class
        self.docs = [Document(doc[0], doc[1], doc[2]) for doc in doc_data] 
        
        self.N = len(self.docs)
        self.clean_length = clean_length
        
        #get a list of stopwords
        self.create_stopwords(stopword_file, clean_length)
        
        #stopword removal, token cleaning and stemming to docs
        self.clean_docs(2)
        
        #create vocabulary
        self.corpus_tokens()
        
        self.document_term_matrix1()
        
        self.tf_idf1()
        
        self.tf_idf2()
        
    def clean_docs(self, length):
        """ 
        Applies stopword removal, token cleaning and stemming to docs
        """
        for doc in self.docs:
            doc.token_clean(length)
            doc.stopword_remove(self.stopwords)
            doc.stem()        
    
    def create_stopwords(self, stopword_file, length):
        """
        description: parses a file of stowords, removes words of length 'length' and 
        stems it
        input: length: cutoff length for words
               stopword_file: stopwords file to parse
        """
        
        with codecs.open(stopword_file,'r','utf-8') as f: raw = f.read()
        
        self.stopwords = (np.array([PorterStemmer().stem(word) 
                                    for word in list(raw.splitlines()) if len(word) > length]))
        
     
    def corpus_tokens(self):
        """
        description: create a set of all all tokens or in other words a vocabulary
        """
        
        #initialise an empty set
        self.token_set = set()
        for doc in self.docs:
            self.token_set = self.token_set.union(doc.tokens) 
            
    def document_term_matrix(self):
        """
        description: create a matrix listing the number of times each vocab term appears in each doc
        """
        
        # get v, the number of tokens
        v = len(self.token_set)
        
        
        doc_term_matrix = []
        for doc in self.docs: 

            # create an empty dictionary of words, add the counts
            wordlist_dict = {}
            word_list = doc.tokens
            for i in self.token_set:
                wordlist_dict[i] = 0.0
            for word in word_list:
                wordlist_dict[word] += 1
            # for each doc, we append its list of word counts to the doc term matrix    
            doc_term_matrix.append(wordlist_dict.values())
        
        # turn the resulting list into a Dxv matrix
        self.doc_term_matrix = np.array(doc_term_matrix)
        self.doc_term_matrix = self.doc_term_matrix.reshape((self.N, v))
        
        
    def tf_idf(self):
        # call document_term_matrix
        self.document_term_matrix()
        
        tf = np.copy(self.doc_term_matrix)
        df = [0] * len(self.token_set)
        
        # create the tf and df values
        for i in range(len(tf)):
            for j in range(len(tf[i])):
                if tf[i][j] > 0:
                    df[j] += 1
                    tf[i][j] = 1 + math.log(tf[i][j])
        
        for j in range(len(df)):
            df[j] = math.log(self.N/df[j])
        
        # put it together into a tf-idf matrix
        for i in range(len(tf)):
            for j in range(len(tf[i])):
                tf[i][j] = tf[i][j] * df[j]
         

        self.tfidf = tf
    
    def dict_rank(self, dictionary, representation, n):
        
        self.tf_idf()

        # Choose whether to rely on the doc term matrix of the tf-idf matrix
        if representation == "doc-term":
            compare_docs = copy.copy(self.doc_term_matrix)
        elif representation == "tf-idf":     
            compare_docs = copy.copy(self.tfidf)

        # Prepare the variables for use later
        docs = copy.copy(self.docs)
        doclist = []
        weights = [0] * self.N
        j = 0
        
        # Collect the weights of each document
        for doc in docs:
            i = 0
            for token in iter(self.token_set):
                if token in dictionary:
                    weights[j] += compare_docs[j][i]
                i += 1
            j += 1
        
        # take the documents with the n largest weights
        for i in range(n):
            whichmax = weights.index(max(weights))
            docmax = docs[whichmax]
            doclist.append(docmax)
            weights.remove(max(weights))
            docs.remove(docmax)
        
        # Add it to self
        self.dictrank = doclist 

        
    ################## dom's version      
    def document_term_matrix1(self):
        import pandas as pd
        srted = sorted(self.token_set)
        """ return a D by V array of frequency counts """
        self.array = pd.DataFrame(columns=srted)
        for doc in self.docs:
            ls = doc.tokens
            unique, counts = np.unique(ls, return_counts=True)
            dc = dict(zip(unique,counts))
            big = {key: 0 for key in srted}
            big.update(dc)
            row = big.values()
            self.array.loc[doc]=row
            
        self.array.index = range(1,len(self.docs)+1)
        
    """ this is using double normalization 0.5 instead of log normalization   
    def tf_idf1(self):
        num_per_doc = self.array.astype(bool).sum(axis=0)
        
        idf_all = np.log(self.N/num_per_doc)
        
        import pandas as pd
        srted = sorted(self.token_set)
        self.tfidf1 = pd.DataFrame(columns=srted)

        for i in (range(self.N+1)[1:]):
            self.tfidf1.loc[i] = (self.array.xs(i)/(2*max(self.array.xs(i)))+0.5)*idf_all
    """
    
    def tf_idf2(self):
        num_per_doc = self.array.astype(bool).sum(axis=0)
        
        idf_all = np.log(self.N/num_per_doc)
        
        import pandas as pd
        srted = sorted(self.token_set)
        self.tfidf2 = pd.DataFrame(columns=srted)
        
        def log_norm(x):
            if x==0:
                return 0
            else:
                return(1+np.log(x))
        
        array1 = self.array.applymap(log_norm)

        for i in (range(self.N+1)[1:]):
            self.tfidf2.loc[i] = array1.xs(i)*idf_all

                     
    def dict_rank1(self, dictionary, representation, n=10):
        import pandas as pd
        srted = sorted(self.token_set)
        dict_intersection = list(set(srted) & set(dictionary))
        if representation == "doc-term":
            rowsums = np.sum(self.array[dict_intersection],axis=1)
            b=pd.DataFrame(rowsums, columns=["frequency"])
            b=b.sort_values(by="frequency",ascending=0)
            self.dictrank1 = b[:n]
        elif representation == "tf-idf":     
            rowsums = np.sum(self.tfidf1[dict_intersection],axis=1)
            b=pd.DataFrame(rowsums, columns=["frequency"])
            b=b.sort_values(by="frequency",ascending=0)
            self.dictrank1 = b[:n]
            
            

In [3]:
import re
def parse_text(textraw, regex):
    """takes raw string and performs two operations
    1. Breaks text into a list of speech, president and speech
    2. breaks speech into paragraphs
    """
    prs_yr_spch_reg = re.compile(regex, re.MULTILINE|re.DOTALL)
    
    #Each tuple contains the year, last ane of the president and the speech text
    prs_yr_spch = prs_yr_spch_reg.findall(textraw)
    
    #convert immutabe tuple to mutable list
    prs_yr_spch = [list(tup) for tup in prs_yr_spch]
    
    for i in range(len(prs_yr_spch)):
        prs_yr_spch[i][2] = prs_yr_spch[i][2].replace('\n', '')
    
    #sort
    prs_yr_spch.sort()
    
    return(prs_yr_spch)        

text = open('../week0/sou_all.txt', 'r').read()
regex = "_(\d{4}).*?_[a-zA-Z]+.*?_[a-zA-Z]+.*?_([a-zA-Z]+)_\*+(\\n{2}.*?)\\n{3}"
pres_speech_list = parse_text(text, regex)


#Instantite the corpus class
corpus = Corpus(pres_speech_list, '../data/stopwords.txt', 2)
#print corpus.docs[0].text


# Question 2

In [9]:
import numpy as np
import pandas as pd
# Load in the data
harvard = pd.read_table('../data/inquirerbasic.csv', encoding = "utf-8", sep = ',')

### dom cleaning
df = harvard.parse()

a=df["Entry"].tolist()

regex = re.compile('[^a-zA-Z]')

for i in range(len(a)):
    a[i] = str(a[i])
    
for i in range(len(a)):
    a[i] = regex.sub('', a[i])
    
for i in range(len(a)):
    a[i] = a[i].lower()
    
terms = a
###

# We are going to look to see which of the speeches is most positive
# Create a dictionary of the 
#terms = harvard['Entry']
positive = harvard['Positiv']
pos_terms = dict(zip(terms, positive))

corpus.document_term_matrix()

# Create a score based on the document term matrix
harvard_score = [0] * len(corpus.docs)
for doc in range(len(corpus.docs)):
    tokencount = 0
    for token in iter(corpus.token_set):
        if token.upper() in pos_terms:
            harvard_score[doc] += corpus.doc_term_matrix[doc][tokencount]
        tokencount += 1
harvard_score

# we collect a list of decades so as to compare the speeches
decades = []
for doc in range(len(corpus.docs)):
    decades.append(int(corpus.docs[doc].year) // 10)
    
# collect the total positivity score of each decade, and the total number of speeches for that decade
tot_score_decade = dict(zip(set(decades), [0] * len(set(decades)) ))
tot_num_decade = dict(zip(set(decades), [0] * len(set(decades)) ))
# Arguably the worst code ever written
for decade in iter(set(decades)):
    for i in range(len(decades)):
        if decades[i] == decade:
            tot_score_decade[decade] += harvard_score[i]
            tot_num_decade[decade] += 1
            
# Find the avg
avg_score_decade = dict(zip(set(decades), np.array(tot_score_decade.values()) / np.array(tot_num_decade.values())))
avg_score_decade

  data = self._reader.read(nrows)


[127.0,
 98.0,
 184.0,
 176.0,
 167.0,
 260.0,
 200.0,
 263.0,
 181.0,
 223.0,
 146.0,
 131.0,
 282.0,
 209.0,
 200.0,
 164.0,
 254.0,
 287.0,
 214.0,
 274.0,
 172.0,
 237.0,
 216.0,
 310.0,
 302.0,
 214.0,
 285.0,
 307.0,
 400.0,
 432.0,
 438.0,
 358.0,
 517.0,
 476.0,
 607.0,
 852.0,
 784.0,
 717.0,
 581.0,
 703.0,
 980.0,
 1447.0,
 610.0,
 719.0,
 720.0,
 1197.0,
 984.0,
 1134.0,
 1027.0,
 1053.0,
 1181.0,
 873.0,
 783.0,
 787.0,
 695.0,
 848.0,
 1483.0,
 1533.0,
 1489.0,
 1894.0,
 716.0,
 768.0,
 1312.0,
 973.0,
 927.0,
 1003.0,
 1140.0,
 1030.0,
 1290.0,
 1600.0,
 1283.0,
 1362.0,
 634.0,
 778.0,
 581.0,
 576.0,
 832.0,
 671.0,
 1115.0,
 903.0,
 679.0,
 889.0,
 601.0,
 952.0,
 1012.0,
 906.0,
 1106.0,
 631.0,
 983.0,
 770.0,
 1108.0,
 1335.0,
 1275.0,
 1026.0,
 829.0,
 860.0,
 1915.0,
 1468.0,
 483.0,
 1252.0,
 1222.0,
 1041.0,
 1558.0,
 1358.0,
 1283.0,
 1655.0,
 1392.0,
 1477.0,
 1189.0,
 1947.0,
 2207.0,
 1880.0,
 1935.0,
 906.0,
 1558.0,
 1705.0,
 2455.0,
 2231.0,
 2573.0,
 19

In [13]:
# We now repeat the exercise, but using the tf-idf matrix instead
corpus.tf_idf()

# Create a score based on the document term matrix
harvard_score = [0] * len(corpus.docs)
for doc in range(len(corpus.docs)):
    tokencount = 0
    for token in iter(corpus.token_set):
        if token.upper() in pos_terms:
            harvard_score[doc] += corpus.tfidf[doc][tokencount]
        tokencount += 1
harvard_score

# we collect a list of decades so as to compare the speeches
decades = []
for doc in range(len(corpus.docs)):
    decades.append(int(corpus.docs[doc].year) // 10)
    
# collect the total positivity score of each decade, and the total number of speeches for that decade
tot_score_decade = dict(zip(set(decades), [0] * len(set(decades)) ))
tot_num_decade = dict(zip(set(decades), [0] * len(set(decades)) ))
# Arguably the worst code ever written
for decade in iter(set(decades)):
    for i in range(len(decades)):
        if decades[i] == decade:
            tot_score_decade[decade] += harvard_score[i]
            tot_num_decade[decade] += 1
            
# Find the avg
avg_score_decade = dict(zip(set(decades), np.array(tot_score_decade.values()) / np.array(tot_num_decade.values())))
avg_score_decade

{179: 75.529720245783082,
 180: 79.796968611307989,
 181: 117.00793291445818,
 182: 221.13446114888447,
 183: 319.40272820869586,
 184: 360.25412714809283,
 185: 406.06344332756385,
 186: 311.76881045773371,
 187: 311.58545862814077,
 188: 452.15421507681685,
 189: 590.51799758266691,
 190: 710.27910581935157,
 191: 355.90895194198595,
 192: 245.76912290548117,
 193: 131.27552590543115,
 194: 221.05955380046828,
 195: 203.23708865967771,
 196: 237.08213357469759,
 197: 271.28890522217705,
 198: 369.79087368573096,
 199: 297.39829052601192,
 200: 284.2533123055922,
 201: 353.29554321530941}

# Question 3

In [10]:
afinn = dict(map(lambda (k,v): (k,int(v)), 
                     [ line.split('\t') for line in open('../data/AFINN-111.txt') ]))



# Question 4

In [18]:
# WE STILL HAVEN'T DONE ANYTHING ABOUT THE FACT THAT THE AFINN IS NOT TOKENISED

sent_score = [0] * len(corpus.docs)
for doc in range(len(corpus.docs)):
    tokencount = 0
    for token in iter(corpus.token_set):
        try:
            sent_score[doc] += afinn[token] * corpus.doc_term_matrix[doc][tokencount]
        except:
            None
        tokencount += 1
sent_score

[-22.0,
 10.0,
 -38.0,
 -22.0,
 -21.0,
 -10.0,
 -19.0,
 -28.0,
 -31.0,
 -11.0,
 -5.0,
 -9.0,
 -19.0,
 -42.0,
 -42.0,
 -25.0,
 -52.0,
 -22.0,
 -21.0,
 -25.0,
 -1.0,
 11.0,
 -31.0,
 -20.0,
 -11.0,
 -13.0,
 -37.0,
 -16.0,
 -28.0,
 -33.0,
 -1.0,
 14.0,
 -51.0,
 -14.0,
 -41.0,
 -42.0,
 -56.0,
 -14.0,
 -58.0,
 -54.0,
 -61.0,
 -121.0,
 -99.0,
 -123.0,
 -84.0,
 -114.0,
 -98.0,
 -82.0,
 -125.0,
 -112.0,
 -146.0,
 -87.0,
 -72.0,
 -94.0,
 -55.0,
 -39.0,
 -112.0,
 10.0,
 -75.0,
 -134.0,
 -101.0,
 -39.0,
 -176.0,
 -130.0,
 -97.0,
 -144.0,
 -74.0,
 -84.0,
 2.0,
 -186.0,
 -62.0,
 -85.0,
 -43.0,
 -107.0,
 -67.0,
 -51.0,
 -19.0,
 -75.0,
 -179.0,
 -97.0,
 -62.0,
 -47.0,
 -38.0,
 -118.0,
 -50.0,
 -95.0,
 -71.0,
 -3.0,
 -208.0,
 -161.0,
 -123.0,
 -173.0,
 -217.0,
 -136.0,
 -105.0,
 -102.0,
 -205.0,
 -175.0,
 -78.0,
 -202.0,
 -165.0,
 -85.0,
 -181.0,
 -97.0,
 -179.0,
 -237.0,
 -212.0,
 -188.0,
 -130.0,
 -168.0,
 -148.0,
 -140.0,
 -33.0,
 -88.0,
 -171.0,
 -113.0,
 -128.0,
 -51.0,
 -20.0,
 -137.0,
 -163.0,
 