In [92]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
from glob import glob
import re
import spacy
import pickle
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
%matplotlib inline

In [9]:
PATH = os.path.abspath("../Thinkful Projects/Corpus of Presential Speeches")

In [10]:
file_list = glob(os.path.join(PATH, "**/*.txt"), recursive=True)

In [11]:
president_files = {}
for president in os.listdir(PATH):
    SOURCE = os.path.join(PATH, president)
    for file in os.listdir(SOURCE):
        if president in president_files:
            p = os.path.join(SOURCE, file)
            president_files[president].append(p)
        else:
            p = os.path.join(SOURCE, file)
            president_files[president] = [p]

In [12]:
reg_ex = re.compile('<.*?>')
pres_texts = {}
for pres, file_list in president_files.items():
    pres_text = ""
    for file in file_list:
        f = open(file, 'r', encoding='utf8')
        text = f.read()
        cleaned = reg_ex.sub('', text).strip()
        pres_text = pres_text + " " + cleaned
        f.close()
    pres_texts[pres] = pres_text

In [13]:
#nlp = spacy.load('en', disable=['tagger', 'ner'])
#nlp.max_length = 1500000 
#docs = [[pres, nlp(text)] for pres, text in pres_texts.items()]
#pickle.dump(docs, open("parsed_docs", "wb"))

docs = pickle.load(open("parsed_docs", "rb"))

In [14]:
sent_and_pres = [[sent, pres] for pres, doc in docs for sent in doc.sents]

In [15]:
df_sent = pd.DataFrame(sent_and_pres)

In [17]:
# Utility function to create a list of the 2000 most common words.
def bag_of_words(text):
    
    # Filter out punctuation and stop words.
    allwords = [token.lemma_
                for token in text
                if not token.is_punct
                and not token.is_stop]

    # Return the most common words.
    return [item[0] for item in Counter(allwords).most_common(2000)]
    

# Creates a data frame with features for each word in our common word set.
# Each value is the count of the times the word appears in each sentence.
def bow_features(sentences, common_words):
    
    # Scaffold the data frame and initialize counts to zero.
    df = pd.DataFrame()
    for word in common_words:
        df[word] = pd.Series(data=[0] * len(sentences),
            index=pd.RangeIndex(len(sentences)), 
            dtype=np.uint8)
    df['text_sentence'] = sentences[0]
    df['text_source'] = sentences[1].astype('category')
    
    # Process each row, counting the occurrence of words in each sentence.
    for i, sentence in enumerate(df['text_sentence']):
    
        # Convert the sentence to lemmas, then filter out punctuation,
        # stop words, and uncommon words.
        words = [token.lemma_
                 for token in sentence
                 if (
                     not token.is_punct
                     and not token.is_stop
                     and token.lemma_ in common_words
                 )]
        
        # Populate the row with word counts.
        for word in words:
            df.at[i, word] += 1
        
        # This counter is just to make sure the kernel didn't hang.
        if i % 10000 == 0:
            print("Processing row {}".format(i))
            
    return df

# Set up the bags.
bags = [bag_of_words(doc) for pres, doc in docs]

common_words_bag = set([word for bag in bags for word in bag])

In [18]:
word_counts = bow_features(df_sent, common_words_bag)

Processing row 0
Processing row 10000
Processing row 20000
Processing row 30000
Processing row 40000
Processing row 50000
Processing row 60000
Processing row 70000
Processing row 80000
Processing row 90000
Processing row 100000
Processing row 110000
Processing row 120000
Processing row 130000


In [19]:
word_counts.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 137048 entries, 0 to 137047
Columns: 9820 entries, Formosa to text_source
dtypes: category(1), object(1), uint16(9818)
memory usage: 2.5+ GB


In [20]:
word_counts.head()

Unnamed: 0,Formosa,Palestinian,strike,central,consolation,slide,pork,Manufacturers,33,debtor,...,fitness,emphasis,Rev.,GI,nonproductive,overall,Alzheimer,inch,text_sentence,text_source
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"( , The, personal, inconveniences, to, the, me...",adams
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(It, would, have, afforded, me, the, highest, ...",adams
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(While, other, states, are, desolated, with, f...",adams
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(It, is, with, extreme, regret, that, I, shall...",adams
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(But, if, the, tide, of, our, prosperity, is, ...",adams


In [28]:
#word_counts.dtypes.to_dict()
#pickle.dump(word_counts.dtypes.to_dict(), open("word_counts_dtypes", "wb"))

word_count_dtypes = pickle.load(open("word_counts_dtypes", "rb"))

In [119]:
#word_counts.to_csv("psa_word_counts.csv")

In [33]:
#df_chunks = []
#for chunk in pd.read_csv("psa_word_counts.csv", 
#                         dtype=word_count_dtypes,
#                        chunksize=5000):
#    df_chunks.append(chunk)
#    break

In [67]:
last_10_pres = ["obama", "gwbush", "bush", "clinton", "reagan", 
                "carter", "ford", "nixon", "johnson", "kennedy"]

pres_data = word_counts[word_counts["text_source"].isin(last_10_pres)]

In [71]:
occurs = pres_data.drop(["text_sentence", "text_source"],axis=1).sum()
occurs = occurs[(occurs > 10)].index.append(pd.Index(["text_source"]))

In [82]:
slice_pres_data = pres_data.filter(items=occurs, axis=1)
#for column in slice_pres_data.columns:
#    if slice_pres_data[column].dtype == np.uint16:
#        slice_pres_data[column] = slice_pres_data[column].astype(np.uint8)

In [85]:
sample = slice_pres_data.sample(frac=0.2)

In [105]:
pres_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 55068 entries, 6933 to 118376
Columns: 9820 entries, Formosa to text_source
dtypes: category(1), object(1), uint16(9818)
memory usage: 1.0+ GB


In [99]:
X = slice_pres_data.drop(['text_source'], axis=1)
Y = slice_pres_data['text_source'].cat.remove_unused_categories()

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    Y,
                                                    test_size=0.25,
                                                    random_state=42)

In [102]:
Y.value_counts() / len(slice_pres_data)

obama      0.190492
reagan     0.181721
clinton    0.132908
gwbush     0.109501
kennedy    0.101565
bush       0.078557
carter     0.063394
nixon      0.053570
johnson    0.050792
ford       0.037499
Name: text_source, dtype: float64

In [None]:
# baseline model would guess Obama for every speech for accuracy of .19 .

In [103]:
lr = LogisticRegression()
train = lr.fit(X_train, y_train)
print(X_train.shape, y_train.shape)
print('Training set score:', lr.score(X_train, y_train))
print('\nTest set score:', lr.score(X_test, y_test))

(41301, 4561) (41301,)
Training set score: 0.6400813539623738

Test set score: 0.4860899251834096


In [107]:
X = pres_data.drop(['text_source', 'text_sentence'], axis=1)
Y = pres_data['text_source'].cat.remove_unused_categories()

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    Y,
                                                    test_size=0.25,
                                                    random_state=42)

lr = LogisticRegression()
train = lr.fit(X_train, y_train)
print(X_train.shape, y_train.shape)
print('Training set score:', lr.score(X_train, y_train))
print('\nTest set score:', lr.score(X_test, y_test))

obama      0.190492
reagan     0.181721
clinton    0.132908
gwbush     0.109501
kennedy    0.101565
bush       0.078557
carter     0.063394
nixon      0.053570
johnson    0.050792
ford       0.037499
Name: text_source, dtype: float64
(41301, 9818) (41301,)
Training set score: 0.6721144766470546

Test set score: 0.493862134088763


In [127]:
vectorizer = TfidfVectorizer(max_df=0.5,
                             min_df=10, 
                             stop_words='english', 
                             lowercase=True, 
                             use_idf=True,
                             norm=u'l2',
                             smooth_idf=True)

In [137]:
#sent_and_pres = [[sent, pres] for pres, doc in docs for sent in doc.sents]
sents = [sent.text for sent, pres in sent_and_pres if pres in last_10_pres]
pres_for_sents = [pres for sent, pres in sent_and_pres if pres in last_10_pres]

In [129]:
tfidf = vectorizer.fit_transform(sents)

In [133]:
pres_data_tfidf = pd.DataFrame(tfidf.toarray())

In [139]:
pres_data_tfidf['pres'] = pres_for_sents

In [143]:
X = pres_data_tfidf.drop(['pres'], axis=1)
Y = pres_data_tfidf['pres']

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    Y,
                                                    test_size=0.25,
                                                    random_state=42)

In [144]:
lr = LogisticRegression()
train = lr.fit(X_train, y_train)
print(X_train.shape, y_train.shape)
print('Training set score:', lr.score(X_train, y_train))
print('\nTest set score:', lr.score(X_test, y_test))

(41301, 5779) (41301,)
Training set score: 0.573230672380814

Test set score: 0.45659911382290985
