In [6]:
# QUESTION ONE
import numpy as np
from nltk.tokenize import wordpunct_tokenize
from nltk import PorterStemmer
from collections import Counter
"""
This is a class sherlock. 
Notice how it is defined with the keyword `class` and a name that begins with a capital letter
"""
class Document():
    
    """ The Doc class rpresents a class of individul documents
    
    """
    
    def __init__(self, speech_year, speech_pres, speech_text):
        """
        The __init__ method is called everytime an object is instantiated.
        This is where you will define all the properties of the object that it must have
        when it is `born`.
        """
        
        #These are data members
        self.year = speech_year
        self.pres = speech_pres
        self.text = speech_text.lower()
        self.tokens = np.array(wordpunct_tokenize(self.text))
        
        
        
    def token_clean(self,length):

        """ 
        description: strip out non-alpha tokens and tokens of length > 'length'
        input: length: cut off length 
        """

        self.tokens = np.array([t for t in self.tokens if (t.isalpha() and len(t) > length)])


    def stopword_remove(self, stopwords):

        """
        description: Remove stopwords from tokens.
        input: stopwords: a suitable list of stopwords
        """

        
        self.tokens = np.array([t for t in self.tokens if t not in stopwords])


    def stem(self):

        """
        description: Stem tokens with Porter Stemmer.
        """
        
        self.tokens = np.array([PorterStemmer().stem(t) for t in self.tokens])
        

        
    def demo_self():
        print 'this will error out'

In [8]:
import numpy as np
import codecs
import nltk
import math
import re
from nltk.tokenize import wordpunct_tokenize
from nltk import PorterStemmer
from collections import Counter
import copy

class Corpus():
    
    """ 
    The Corpus class represents a document collection
     
    """
    def __init__(self, doc_data, stopword_file, clean_length):
        """
        Notice that the __init__ method is invoked everytime an object of the class
        is instantiated
        """
        
        #Initialise documents by invoking the appropriate class
        self.docs = [Document(doc[0], doc[1], doc[2]) for doc in doc_data] 
        
        self.N = len(self.docs)
        self.clean_length = clean_length
        
        #get a list of stopwords
        self.create_stopwords(stopword_file, clean_length)
        
        #stopword removal, token cleaning and stemming to docs
        self.clean_docs(2)
        
        #create vocabulary
        self.corpus_tokens()
        
    def clean_docs(self, length):
        """ 
        Applies stopword removal, token cleaning and stemming to docs
        """
        for doc in self.docs:
            doc.token_clean(length)
            doc.stopword_remove(self.stopwords)
            doc.stem()        
    
    def create_stopwords(self, stopword_file, length):
        """
        description: parses a file of stowords, removes words of length 'length' and 
        stems it
        input: length: cutoff length for words
               stopword_file: stopwords file to parse
        """
        
        with codecs.open(stopword_file,'r','utf-8') as f: raw = f.read()
        
        self.stopwords = (np.array([PorterStemmer().stem(word) 
                                    for word in list(raw.splitlines()) if len(word) > length]))
        
     
    def corpus_tokens(self):
        """
        description: create a set of all all tokens or in other words a vocabulary
        """
        
        #initialise an empty set
        self.token_set = set()
        for doc in self.docs:
            self.token_set = self.token_set.union(doc.tokens) 
            
    def document_term_matrix(self):
        """
        description: create a matrix listing the number of times each vocab term appears in each doc
        """
        
        # get v, the number of tokens
        v = len(self.token_set)
        
        
        doc_term_matrix = []
        for doc in self.docs: 

            # create an empty dictionary of words
            wordlist_dict = {}
            word_list = doc.tokens
            for i in self.token_set:
                wordlist_dict[i] = 0.0
            for word in word_list:
                wordlist_dict[word] += 1
                
            doc_term_matrix.append(wordlist_dict.values())
            
        self.doc_term_matrix = np.array(doc_term_matrix)
        self.doc_term_matrix = self.doc_term_matrix.reshape((self.N, v))
        
    def tf_idf(self):
        # call document_term_matrix
        self.document_term_matrix()
        
        tf = np.copy(self.doc_term_matrix)
        df = [0] * len(self.token_set)
        
        for i in range(len(tf)):
            for j in range(len(tf[i])):
                if tf[i][j] > 0:
                    df[j] += 1
                    tf[i][j] = 1 + math.log(tf[i][j])
        
        for j in range(len(df)):
            df[j] = math.log(self.N/df[j])
        
        for i in range(len(tf)):
            for j in range(len(tf[i])):
                tf[i][j] = tf[i][j] * df[j]
         

        self.tfidf = tf
    
    def dict_rank(self, dictionary, representation, n):
        
        self.tf_idf()

        # Choose whether to rely on the doc term matrix of the tf-idf matrix
        if representation == "doc-term":
            compare_docs = copy.copy(self.doc_term_matrix)
        elif representation == "tf-idf":     
            compare_docs = copy.copy(self.tfidf)

        # Prepare the variables for use later
        docs = copy.copy(self.docs)
        doclist = []
        weights = [0] * self.N
        j = 0
        
        # Collect the weights of each document
        for doc in docs:
            i = 0
            for token in iter(self.token_set):
                if token in dictionary:
                    weights[j] += compare_docs[j][i]
                i += 1
            j += 1
        
        # take the documents with the n largest weights
        for i in range(n):
            whichmax = weights.index(max(weights))
            docmax = docs[whichmax]
            doclist.append(docmax)
            weights.remove(max(weights))
            docs.remove(docmax)
        
        # Add it to self
        self.dictrank = doclist 
            

            

In [9]:
import re
def parse_text(textraw, regex):
    """takes raw string and performs two operations
    1. Breaks text into a list of speech, president and speech
    2. breaks speech into paragraphs
    """
    prs_yr_spch_reg = re.compile(regex, re.MULTILINE|re.DOTALL)
    
    #Each tuple contains the year, last ane of the president and the speech text
    prs_yr_spch = prs_yr_spch_reg.findall(textraw)
    
    #convert immutabe tuple to mutable list
    prs_yr_spch = [list(tup) for tup in prs_yr_spch]
    
    for i in range(len(prs_yr_spch)):
        prs_yr_spch[i][2] = prs_yr_spch[i][2].replace('\n', '')
    
    #sort
    prs_yr_spch.sort()
    
    return(prs_yr_spch)        

text = open('../week0/sou_all.txt', 'r').read()
regex = "_(\d{4}).*?_[a-zA-Z]+.*?_[a-zA-Z]+.*?_([a-zA-Z]+)_\*+(\\n{2}.*?)\\n{3}"
pres_speech_list = parse_text(text, regex)


#Instantite the corpus class
corpus = Corpus(pres_speech_list, '../data/stopwords.txt', 2)
#print corpus.docs[0].text


# Question 2

In [5]:
import numpy as np

harvard = np.loadtxt('../data/inquirerbasic.csv')
#harvard[600000]
len(harvard)

ValueError: could not convert string to float: Entry,Source,Positiv,Negativ,Pstv,Affil,Ngtv,Hostile,Strong,Power,Weak,Submit,Active,Passive,Pleasur,Pain,Feel,Arousal,EMOT,Virtue,Vice,Ovrst,Undrst,Academ,Doctrin,Econ@,Exch,ECON,Exprsv,Legal,Milit,P

# Question 3

In [10]:
afinn = dict(map(lambda (k,v): (k,int(v)), 
                     [ line.split('\t') for line in open('../data/AFINN-111.txt') ]))



In [18]:
# WE STILL HAVEN'T DONE ANYTHING ABOUT THE FACT THAT THE AFINN IS NOT TOKENISED

sent_score = [0] * len(corpus.docs)
for doc in range(len(corpus.docs)):
    tokencount = 0
    for token in iter(corpus.token_set):
        try:
            sent_score[doc] += afinn[token] * corpus.doc_term_matrix[doc][tokencount]
        except:
            None
        tokencount += 1
sent_score

[-22.0,
 10.0,
 -38.0,
 -22.0,
 -21.0,
 -10.0,
 -19.0,
 -28.0,
 -31.0,
 -11.0,
 -5.0,
 -9.0,
 -19.0,
 -42.0,
 -42.0,
 -25.0,
 -52.0,
 -22.0,
 -21.0,
 -25.0,
 -1.0,
 11.0,
 -31.0,
 -20.0,
 -11.0,
 -13.0,
 -37.0,
 -16.0,
 -28.0,
 -33.0,
 -1.0,
 14.0,
 -51.0,
 -14.0,
 -41.0,
 -42.0,
 -56.0,
 -14.0,
 -58.0,
 -54.0,
 -61.0,
 -121.0,
 -99.0,
 -123.0,
 -84.0,
 -114.0,
 -98.0,
 -82.0,
 -125.0,
 -112.0,
 -146.0,
 -87.0,
 -72.0,
 -94.0,
 -55.0,
 -39.0,
 -112.0,
 10.0,
 -75.0,
 -134.0,
 -101.0,
 -39.0,
 -176.0,
 -130.0,
 -97.0,
 -144.0,
 -74.0,
 -84.0,
 2.0,
 -186.0,
 -62.0,
 -85.0,
 -43.0,
 -107.0,
 -67.0,
 -51.0,
 -19.0,
 -75.0,
 -179.0,
 -97.0,
 -62.0,
 -47.0,
 -38.0,
 -118.0,
 -50.0,
 -95.0,
 -71.0,
 -3.0,
 -208.0,
 -161.0,
 -123.0,
 -173.0,
 -217.0,
 -136.0,
 -105.0,
 -102.0,
 -205.0,
 -175.0,
 -78.0,
 -202.0,
 -165.0,
 -85.0,
 -181.0,
 -97.0,
 -179.0,
 -237.0,
 -212.0,
 -188.0,
 -130.0,
 -168.0,
 -148.0,
 -140.0,
 -33.0,
 -88.0,
 -171.0,
 -113.0,
 -128.0,
 -51.0,
 -20.0,
 -137.0,
 -163.0,
 