In [2]:
import numpy as np
import pandas as pd
import re
import nltk 
from sklearn.datasets import load_files
nltk.download('stopwords')
import pickle
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import textstat as ts
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
#Probably won't use all of these, but I'll just import them all here anyway

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jrnoo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Looking at the features we wanted, one of them was the length of the speech. textstat already provides that: `lexicon_count` gives the number of words and `sentence_count` gives the number of sentences. Building on those, I can define a few other simple functions:

In [115]:
def mreplace(text,list1,list2):
    """Apply a series of substring replacements to ``text``.

    The i-th string of ``list1`` is replaced by the i-th string of ``list2``
    via ``str.replace``. Pairs are applied one after another in list order,
    so a later pair also acts on the output of the earlier ones.

    Returns the text after every replacement has been applied.
    """
    for position, old in enumerate(list1):
        text = text.replace(old, list2[position])
    return text

def remover(text,removees):
    """Delete every occurrence of each string in ``removees`` from ``text``."""
    blanks = [""] * len(removees)  # one empty replacement per string to delete
    return mreplace(text, removees, blanks)

def plaintxt(text,nospace=True):
    """Strip line breaks and a fixed set of punctuation marks from ``text``.

    Characters removed: newline . , ? ( ) ! / : ; - ' " [ ]
    With ``nospace=True`` (the default) spaces are removed as well; pass
    ``nospace=False`` to keep them, e.g. when the result still has to be
    split into words.
    """
    space_chars = [" "] if nospace else []
    removees = ["\n",".",",","?","(",")","!","/",":",";","-","'",'"',"[","]"] + space_chars
    return remover(text, removees)

def sentlength(text):
    """Average sentence length of ``text``: textstat word count divided by textstat sentence count."""
    n_words = ts.lexicon_count(text)
    n_sentences = ts.sentence_count(text)
    return n_words / n_sentences

def wordlength(text):
    """Average number of characters per word in ``text``.

    The numerator is the length of ``plaintxt(text)``, i.e. every character
    left after spaces, line breaks and the punctuation listed in ``plaintxt``
    are stripped (digits and other symbols still count). The denominator is
    textstat's word count.

    Returns:
        float: characters per word, or NaN when the text contains no words
        (previously this case raised ZeroDivisionError).
    """
    n_words = ts.lexicon_count(text)
    if n_words == 0:
        # Nothing to average over; NaN lets pandas treat the value as missing.
        return float("nan")
    return len(plaintxt(text)) / n_words

def wordsyll(text):
    """Average number of syllables per word in ``text``.

    Both counts come from textstat (``syllable_count`` / ``lexicon_count``).

    Returns:
        float: syllables per word, or NaN when the text contains no words
        (previously this case raised ZeroDivisionError).
    """
    n_words = ts.lexicon_count(text)
    if n_words == 0:
        # Nothing to average over; NaN lets pandas treat the value as missing.
        return float("nan")
    return ts.syllable_count(text) / n_words

We can also get the reading level from textstat. The best one to use is probably the `text_standard` function with `float_output=True`.

In [4]:
# English stop-word list from NLTK, held as a set for membership tests.
allSW = set(stopwords.words('english')) #set of all stop words
# Quick check that a common word is present in the set.
'a' in allSW 

True

In [5]:
def stopwordprop(text):
    """Proportion of the words in ``text`` that are stop words.

    The text is stripped of line breaks and punctuation with
    ``plaintxt(text, nospace=False)``, lower-cased and split on whitespace;
    each resulting word is then looked up in ``allSW``.

    Returns:
        float: stop words / total words, or NaN when the text contains no
        words (previously this case raised ZeroDivisionError).
    """
    # NOTE(review): plaintxt strips apostrophes, so "don't" is looked up as
    # "dont" -- verify that allSW matches such contractions as intended.
    nopunc = plaintxt(text, nospace=False)
    words = nopunc.lower().split()
    if not words:
        # No words to classify; NaN lets pandas treat the value as missing.
        return float("nan")
    stop_total = sum(word in allSW for word in words)
    return stop_total / len(words)

In [19]:
# Load the CSV (path is relative to the working directory) and keep only the "Transcript" column.
speeches = pd.read_csv("archive/presidential_speeches.csv")["Transcript"]
# Convert every entry to str before the text functions are applied.
# NOTE(review): a missing transcript would become the literal string "nan" here -- confirm the column has no NaNs.
speeches=speeches.apply(str)
# Average sentence length (in words) of each speech.
speeches.apply(sentlength)

0       59.750000
1      108.750000
2       40.380952
3       35.025000
4       29.208333
          ...    
987     17.755814
988     18.097222
989     11.971831
990     17.610092
991     12.823183
Name: Transcript, Length: 992, dtype: float64

In [7]:
# Average characters per word for each speech.
speeches.apply(wordlength)

0      4.943515
1      4.751724
2      5.128538
3      4.895075
4      4.517832
         ...   
987    4.801572
988    4.722947
989    4.185647
990    4.936963
991    4.419641
Name: Transcript, Length: 992, dtype: float64

In [8]:
# Proportion of stop words in each speech.
speeches.apply(stopwordprop)

0      0.544630
1      0.528736
2      0.534198
3      0.548111
4      0.543324
         ...   
987    0.460535
988    0.444530
989    0.516137
990    0.454262
991    0.500536
Name: Transcript, Length: 992, dtype: float64

Idea for a function that counts two- and three-word phrases: 
Create a list of all words in the plain text, as before. Then build a list of the form `[(concatenation of words i through i+n-1) for i in range(word_count-n+1)]`, where n is the number of words in the phrases you are looking for. Then use the vectorizer.

In [9]:
def phrases(text,n):
    """List every run of ``n`` consecutive words in ``text``.

    The text is cleaned with ``plaintxt(text, nospace=False)``, lower-cased
    and split on whitespace. Each phrase is its words joined by a single
    space, and phrases are returned in order of appearance (duplicates kept).
    """
    # TODO (carried over from the original): sentence endings are not handled,
    # so phrases can span two sentences.
    cleaned = plaintxt(text, nospace=False)
    tokens = cleaned.lower().split()
    grams = []
    for start in range(len(tokens) - n + 1):
        # The " " separator may need to become "" when used with the word vectorizer.
        grams.append(" ".join(tokens[start:start + n]))
    return grams

In [10]:
# Pick the speech with index label 0 as a worked example and display it.
speech = speeches[0]
speech

'Fellow Citizens of the Senate and the House of Representatives: Among the vicissitudes incident to life, no event could have filled me with greater anxieties than that of which the notification was transmitted by your order, and received on the fourteenth day of the present month. On the one hand, I was summoned by my Country, whose voice I can never hear but with veneration and love, from a retreat which I had chosen with the fondest predilection, and, in my flattering hopes, with an immutable decision, as the asylum of my declining years: a retreat which was rendered every day more necessary as well as more dear to me, by the addition of habit to inclination, and of frequent interruptions in my health to the gradual waste committed on it by time. On the other hand, the magnitude and difficulty of the trust to which the voice of my Country called me, being sufficient to awaken in the wisest and most experienced of her citizens, a distrustful scrutiny into his qualification, could not

In [11]:
# All two-word phrases of the example speech, in order of appearance.
phrases(speech,2)

['fellow citizens',
 'citizens of',
 'of the',
 'the senate',
 'senate and',
 'and the',
 'the house',
 'house of',
 'of representatives',
 'representatives among',
 'among the',
 'the vicissitudes',
 'vicissitudes incident',
 'incident to',
 'to life',
 'life no',
 'no event',
 'event could',
 'could have',
 'have filled',
 'filled me',
 'me with',
 'with greater',
 'greater anxieties',
 'anxieties than',
 'than that',
 'that of',
 'of which',
 'which the',
 'the notification',
 'notification was',
 'was transmitted',
 'transmitted by',
 'by your',
 'your order',
 'order and',
 'and received',
 'received on',
 'on the',
 'the fourteenth',
 'fourteenth day',
 'day of',
 'of the',
 'the present',
 'present month',
 'month on',
 'on the',
 'the one',
 'one hand',
 'hand i',
 'i was',
 'was summoned',
 'summoned by',
 'by my',
 'my country',
 'country whose',
 'whose voice',
 'voice i',
 'i can',
 'can never',
 'never hear',
 'hear but',
 'but with',
 'with veneration',
 'veneration and',

In [55]:
def phrase_count(text,n=1,minnum=0):
    """Count how often each ``n``-word phrase occurs in ``text``.

    Args:
        text: the text to analyse; it is split into phrases by ``phrases``.
        n: number of words per phrase (default 1, i.e. single words).
        minnum: phrases occurring fewer than ``minnum`` times are left out.

    Returns:
        dict mapping each phrase (plain ``str``) to its count, with keys in
        sorted order -- the same order the earlier ``np.unique``-based version
        produced.
    """
    # Function-scope import so the cell runs without touching the import cell.
    from collections import Counter

    # One pass over the phrase list instead of a full list.count() scan per
    # unique phrase, which was quadratic on long speeches.
    tallies = Counter(phrases(text, n))
    return {gram: num for gram, num in sorted(tallies.items()) if num >= minnum}

In [90]:
def dataframeizer(functions):
    """Lift per-text functions to functions that act on a whole Series/DataFrame.

    ``functions`` is a list of callables. The result is a pandas Series of the
    same length whose i-th element is a function ``g(df)`` returning
    ``df.apply(functions[i])``. The Series can be unpacked into individual names.
    """
    def lift(f):
        # Each call gets its own ``f``, so every wrapper keeps its own function.
        def apply_to(df):
            return df.apply(f)
        return apply_to

    # Build the Series of originals first, then map each one to its wrapper.
    return pd.Series(functions).apply(lift)

In [58]:
def RL(text):
    """Reading level of ``text``: textstat's ``text_standard`` score, requested as a float."""
    return ts.text_standard(text, float_output=True)

In [103]:
# Series-level versions of the per-text metrics. The names on the left are
# matched to the list entries by position, so keep both in the same order.
# Note: readlvl wraps ts.flesch_reading_ease, not RL / ts.text_standard.
(sentlen, wordlen, avesylls, SWprop,
 readlvl, sentcount, wordcount) = dataframeizer([
    sentlength,
    wordlength,
    wordsyll,
    stopwordprop,
    ts.flesch_reading_ease,
    ts.sentence_count,
    ts.lexicon_count,
])

In [98]:
# Wrapper built by dataframeizer; compare with the direct apply of wordlength in the next cell.
wordlen(speeches).head()

0    4.943515
1    4.751724
2    5.128538
3    4.895075
4    4.517832
Name: Transcript, dtype: float64

In [99]:
# Direct apply of wordlength, for comparison with wordlen(speeches).
speeches.apply(wordlength).head()

0    4.943515
1    4.751724
2    5.128538
3    4.895075
4    4.517832
Name: Transcript, dtype: float64

In [100]:
# Word count per speech through the dataframeizer wrapper.
wordcount(speeches).head()

0    1434
1     435
2     848
3    1401
4    1402
Name: Transcript, dtype: int64

In [101]:
# Word count per speech by applying ts.lexicon_count directly.
speeches.apply(ts.lexicon_count)

0      1434
1       435
2       848
3      1401
4      1402
       ... 
987    1527
988    5212
989    8500
990    3839
991    6527
Name: Transcript, Length: 992, dtype: int64

In [104]:
# Flesch reading-ease score per speech, applied directly.
speeches.apply(ts.flesch_reading_ease).head()

0    10.78
1   -30.51
2    22.01
3    44.41
4    58.76
Name: Transcript, dtype: float64

In [105]:
# readlvl wraps ts.flesch_reading_ease, so these are Flesch reading-ease scores for every speech.
readlvl(speeches)

0      10.78
1     -30.51
2      22.01
3      44.41
4      58.76
       ...  
987    61.87
988    61.56
989    84.68
990    62.07
991    75.40
Name: Transcript, Length: 992, dtype: float64

In [108]:
def Ngrams(df,n,mini=0):
    """Apply ``phrase_count`` to every entry of ``df``.

    ``n`` is the phrase length in words and ``mini`` the minimum count a
    phrase needs to be kept (passed on as ``minnum``). Returns whatever
    ``df.apply`` produces: one phrase-count dict per entry.
    """
    def count_ngrams(text):
        return phrase_count(text, n=n, minnum=mini)

    return df.apply(count_ngrams)

In [109]:
# Two-word phrase counts, one dict per speech.
# NOTE(review): the recorded output contains "[" and "]" tokens although plaintxt now strips them;
# this cell (In [109]) appears to have run before plaintxt was last redefined (In [115]) -- re-run to refresh.
Ngrams(speeches,2)

0      {'a call': 1, 'a distrustful': 1, 'a form': 1,...
1      {'26th day': 1, 'a blessing': 1, 'a day': 1, '...
2      {'a communication': 1, 'a competent': 1, 'a di...
3      {'[ the': 1, '] growing': 1, 'a considerable':...
4      {'1784 excepting': 1, '22d of': 1, '[ and': 1,...
                             ...                        
987    {'$ 57': 1, '$ 700': 1, '$ 800': 1, '$ 805': 1...
988    {'$ 100': 1, '$ 250': 1, '$ 40000': 1, '$ 500'...
989    {'$ 14': 2, '$ 15': 1, '$ 2': 2, '$ 20': 1, '$...
990    {'$ 15': 1, '$ 500': 1, '$ 87': 1, '10 hours':...
991    {'1 committed': 1, '1 doing': 1, '1 first': 1,...
Name: Transcript, Length: 992, dtype: object

In [118]:
# Two-word phrases that occur at least twice in the speech with index label 3.
phrase_count(speeches[3],2,minnum=2)

{'a war': 2,
 'all the': 2,
 'and house': 2,
 'and i': 2,
 'and more': 3,
 'and particularly': 2,
 'and the': 3,
 'as it': 3,
 'as the': 3,
 'as well': 4,
 'be made': 3,
 'be not': 2,
 'before you': 2,
 'belongs to': 2,
 'but as': 2,
 'by a': 3,
 'by such': 2,
 'by the': 4,
 'by which': 2,
 'citizens of': 2,
 'effect the': 2,
 'establishment of': 2,
 'expedient to': 2,
 'far and': 2,
 'fellow citizens': 3,
 'for the': 5,
 'for their': 2,
 'for this': 2,
 'from the': 2,
 'gentlemen of': 2,
 'has been': 5,
 'have been': 3,
 'hope that': 2,
 'house of': 3,
 'i have': 2,
 'in certain': 2,
 'in that': 2,
 'in the': 2,
 'in their': 2,
 'is not': 2,
 'it has': 3,
 'it is': 3,
 'it will': 3,
 'made on': 2,
 'may be': 2,
 'more and': 2,
 'not less': 2,
 'of a': 6,
 'of europe': 2,
 'of kentucky': 2,
 'of our': 8,
 'of representatives': 3,
 'of that': 2,
 'of the': 34,
 'of their': 3,
 'on the': 3,
 'on this': 2,
 'our commerce': 2,
 'our fellow': 2,
 'our own': 2,
 'part of': 3,
 'representativ