In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
from glob import glob
import re
import spacy
import pickle
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
%matplotlib inline

In [3]:
PATH = os.path.abspath("../Thinkful Projects/Corpus of Presential Speeches")

In [4]:
# Recursively collect every .txt file found anywhere under PATH.
# NOTE(review): the list built here is not read by any later visible cell;
# the per-president file lists are built separately with os.listdir.
file_list = glob(os.path.join(PATH, "**/*.txt"), recursive=True)

In [5]:
# Map each president (a sub-directory name under PATH) to the paths of the
# files stored in that sub-directory.
president_files = {}
# sorted() makes the traversal order deterministic; os.listdir() returns
# entries in arbitrary order.
for president in sorted(os.listdir(PATH)):
    SOURCE = os.path.join(PATH, president)
    # Skip stray non-directory entries: calling os.listdir() on a regular
    # file raises NotADirectoryError.
    if not os.path.isdir(SOURCE):
        continue
    speech_paths = [os.path.join(SOURCE, file)
                    for file in sorted(os.listdir(SOURCE))]
    # As before, a president only gets an entry when at least one file exists.
    if speech_paths:
        president_files[president] = speech_paths

In [6]:
# Remove anything of the form <...> (the non-greedy pattern does not span
# newlines) from every file and join each president's files into one string.
reg_ex = re.compile('<.*?>')
pres_texts = {}
for pres, speech_paths in president_files.items():
    cleaned_speeches = []
    for file in speech_paths:
        # The context manager closes the handle even if read() raises.
        with open(file, 'r', encoding='utf8') as f:
            text = f.read()
        cleaned_speeches.append(reg_ex.sub('', text).strip())
    # Every speech is preceded by a single space, exactly as the previous
    # incremental concatenation did (a non-empty result starts with " ").
    pres_texts[pres] = "".join(" " + cleaned for cleaned in cleaned_speeches)

In [8]:
# The parsed documents are loaded from a pickle file. The commented-out code
# shows how that file can be regenerated from pres_texts; it writes to the
# same file name that is read below.
# nlp = spacy.load('en', disable=['tagger', 'ner'])
# nlp.max_length = 1500000
# docs = [[pres, nlp(text)] for pres, text in pres_texts.items()]
# with open("parsed_docs.data", "wb") as fh:
#     pickle.dump(docs, fh)

# NOTE: unpickling can execute arbitrary code; only load a file you created.
with open("parsed_docs.data", "rb") as fh:
    docs = pickle.load(fh)

In [18]:
# Flatten the parsed documents into one [sentence, president] pair per
# sentence; each element of `docs` is a (president, parsed document) pair.
sent_and_pres = [[sent, pres] for pres, doc in docs for sent in doc.sents]

In [19]:
# Two-column frame with default integer column names: column 0 holds the
# sentence and column 1 the president label (bow_features reads them as
# sentences[0] and sentences[1]).
df_sent = pd.DataFrame(sent_and_pres)

In [20]:
# Utility function to create a list of the most common words (2000 by default).
def bag_of_words(text, n_words=2000):
    """Return the most frequent lemmas found in ``text``.

    Parameters
    ----------
    text : iterable of tokens
        Every token must expose ``lemma_``, ``is_punct`` and ``is_stop``.
    n_words : int, optional
        Maximum number of lemmas to return. Defaults to 2000, the value that
        used to be hard-coded.

    Returns
    -------
    list
        Lemmas of the tokens that are neither punctuation nor stop words,
        ordered from most to least frequent and truncated to ``n_words``.
    """
    # Count lemmas directly from a generator; punctuation and stop words
    # never reach the counter.
    lemma_counts = Counter(
        token.lemma_
        for token in text
        if not (token.is_punct or token.is_stop)
    )

    # most_common() yields (lemma, count) pairs; only the lemma is kept.
    return [lemma for lemma, _ in lemma_counts.most_common(n_words)]
    

# Creates a data frame with features for each word in our common word set.
# Each value is the count of the times the word appears in each sentence.
def bow_features(sentences, common_words):
    """Build a bag-of-words count table, one row per sentence.

    Parameters
    ----------
    sentences : pandas.DataFrame
        Column ``0`` holds the sentences (each an iterable of tokens exposing
        ``lemma_``, ``is_punct`` and ``is_stop``); column ``1`` holds the
        source label of each sentence.
    common_words : iterable
        Words that become count columns. Duplicates are collapsed; the column
        order follows the iteration order of ``common_words``.

    Returns
    -------
    pandas.DataFrame
        One ``uint8`` column per word, followed by ``text_sentence`` (the
        sentence objects) and ``text_source`` (the labels as a categorical).
        Rows are indexed 0..n-1 in the order of ``sentences``.

    Notes
    -----
    Counts are stored as ``uint8``, so a lemma occurring more than 255 times
    in a single sentence would wrap around.
    """
    n_rows = len(sentences)

    # Each distinct word maps to exactly one column position.
    columns = list(dict.fromkeys(common_words))
    column_of = {word: position for position, word in enumerate(columns)}

    # Count into one pre-allocated array instead of inserting every column
    # into the frame separately and updating single cells through df.at.
    counts = np.zeros((n_rows, len(columns)), dtype=np.uint8)

    # Positional index so that rows line up with `counts` even when the
    # caller passes a frame whose index is not 0..n-1.
    text_sentence = sentences[0].reset_index(drop=True)
    text_source = sentences[1].reset_index(drop=True)

    # Process each row, counting the occurrence of words in each sentence.
    for i, sentence in enumerate(text_sentence):
        for token in sentence:
            # Filter out punctuation and stop words.
            if token.is_punct or token.is_stop:
                continue
            # Uncommon words have no column and are ignored.
            position = column_of.get(token.lemma_)
            if position is not None:
                counts[i, position] += 1

        # This counter is just to make sure the kernel didn't hang.
        if i % 10000 == 0:
            print("Processing row {}".format(i))

    df = pd.DataFrame(counts, columns=columns, index=pd.RangeIndex(n_rows))
    df['text_sentence'] = text_sentence
    df['text_source'] = text_source.astype('category')
    return df

# Build one bag of common lemmas per parsed document, then merge all bags
# into a single de-duplicated vocabulary.
bags = [bag_of_words(parsed) for _, parsed in docs]

common_words_bag = {lemma for bag in bags for lemma in bag}

In [21]:
word_counts = bow_features(df_sent, common_words_bag)

Processing row 0
Processing row 10000
Processing row 20000
Processing row 30000
Processing row 40000
Processing row 50000
Processing row 60000
Processing row 70000
Processing row 80000
Processing row 90000
Processing row 100000
Processing row 110000
Processing row 120000
Processing row 130000


In [23]:
# The column dtypes of `word_counts` are reloaded from a pickle file. The
# commented-out code shows how that file can be regenerated; it writes to the
# same file name that is read below.
#with open("word_counts_dtypes.data", "wb") as fh:
#    pickle.dump(word_counts.dtypes.to_dict(), fh)

# NOTE: unpickling can execute arbitrary code; only load a file you created.
with open("word_counts_dtypes.data", "rb") as fh:
    word_count_dtypes = pickle.load(fh)

In [24]:
# Source labels of the ten presidents kept for modelling.
last_10_pres = [
    "obama", "gwbush", "bush", "clinton", "reagan",
    "carter", "ford", "nixon", "johnson", "kennedy",
]

# Keep only the sentence rows whose source is one of those labels.
pres_data = word_counts.loc[word_counts["text_source"].isin(last_10_pres)]

In [25]:
# Total count of every word column over the selected sentences ...
occurs = pres_data.drop(["text_sentence", "text_source"], axis="columns").sum()
# ... then keep the names of the words seen more than 10 times and append the
# label column so it survives the column filter applied next.
occurs = occurs.loc[occurs.gt(10)].index.append(pd.Index(["text_source"]))

In [26]:
# Restrict the frame to the frequent word columns plus the label column.
slice_pres_data = pres_data.filter(items=list(occurs), axis="columns")

In [27]:
# Draw a 20% random sample of the rows; the seed makes a re-run pick the
# same rows.
sample = slice_pres_data.sample(frac=0.2, random_state=42)

In [28]:
# TF-IDF settings used for the sentence-level vectors.
vectorizer = TfidfVectorizer(
    max_df=0.5,            # ignore terms found in more than half of the documents
    min_df=10,             # ignore terms found in fewer than 10 documents
    stop_words='english',  # drop the built-in English stop-word list
    lowercase=True,
    use_idf=True,
    norm='l2',             # scale every row to unit Euclidean length
    smooth_idf=True,
)

In [130]:
# Plain-text sentences (surrounding whitespace stripped, newlines flattened to
# spaces) and their labels, restricted to the labels in last_10_pres.
# Both lists are filtered the same way, so they stay aligned element by element.
sents = [
    sent.text.strip().replace('\n', ' ')
    for sent, pres in sent_and_pres
    if pres in last_10_pres
]
pres_for_sents = [label for _, label in sent_and_pres if label in last_10_pres]

In [30]:
tfidf = vectorizer.fit_transform(sents)

In [46]:
X_train_tfidf, X_test_tfidf= train_test_split(tfidf, 
                                              test_size=0.2, 
                                              random_state=0)

In [31]:
# Dense DataFrame copy of the TF-IDF matrix (one row per entry of `sents`).
# NOTE(review): toarray() materialises every zero of the sparse matrix, which
# can take a lot of memory for a large vocabulary.
pres_data_tfidf = pd.DataFrame(tfidf.toarray())

In [32]:
pres_data_tfidf['pres'] = pres_for_sents

In [33]:
print(tfidf.get_shape()[1])

5779


In [34]:
# Features: every TF-IDF column; target: the 'pres' label column.
X = pres_data_tfidf.drop(['pres'], axis=1)
Y = pres_data_tfidf['pres']

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    Y,
                                                    test_size=0.2,
                                                    random_state=42)

In [102]:
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

# Our SVD data reducer. We reduce the TF-IDF feature space (one column per
# vocabulary term) to 250 components and then re-normalise every row.
# random_state pins the randomized SVD solver so that re-running the cell
# yields the same components.
svd = TruncatedSVD(250, random_state=42)
lsa = make_pipeline(svd, Normalizer(copy=False))
# Run SVD on the training data, then project the training data.
X_train_lsa = lsa.fit_transform(X_train_tfidf)

variance_explained = svd.explained_variance_ratio_
total_variance = variance_explained.sum()
print("Percent variance captured by all components:", total_variance*100)

Percent variance captured by all components: 30.736752385937326


In [103]:
# Split the raw sentences and their president labels 80/20.
# NOTE(review): test_size and random_state match the earlier split of `tfidf`
# (0.2 and 0); presumably that is what keeps these sentences in the same row
# order as X_train_lsa - confirm before changing either split.
# NOTE(review): this rebinds X_train / X_test, which an earlier cell assigned
# to the TF-IDF feature frames.
X_train, X_test, Y_train, Y_test = train_test_split(sents, pres_for_sents,
                                   test_size=0.2, 
                                   random_state=0)

In [104]:
# Index the LSA projection by the training sentences, then show the ten
# sentences with the highest value on each of the first five components.
paras_by_component = pd.DataFrame(X_train_lsa, index=X_train)
for component in range(5):
    print('Component {}:'.format(component))
    print(paras_by_component[component].sort_values(ascending=False).iloc[:10])

Component 0:
But my colleagues, this I believe: With the help of God, who has blessed this land so richly, with the cooperation of the Congress, and with the support of the American people, we can and we will make the year 1974 a year of unprecedented progress toward our goal of building a structure of lasting peace in the world and a new prosperity without war in the United States of America.    0.482979
Let us not unnecessarily disturb the commerce and credit and industry of the country by declaring to the American people and to the world that the United States are still in a condition of civil war.                                                                                                                                                                                            0.467002
By your gracious cooperation in the transition process, you have shown a watching world that we are a united people pledged to maintaining a political system which guarantees individual liberty t

In [53]:
from sklearn.cluster import KMeans
from sklearn import metrics

In [58]:
def eval_cluster_model(model, pca_df, samples=5, frac=0.25, random_state=None):
    """Refit ``model`` on random subsamples and report how stable they are.

    In each of ``samples`` rounds a fraction ``frac`` of the rows of
    ``pca_df`` is drawn, ``model`` is refit on it, and the Calinski-Harabasz
    score of ``model.labels_`` is recorded together with the mean, median and
    standard deviation of columns ``0`` and ``1`` of the subsample. The mean
    score and the spread of those statistics across rounds are printed.

    Parameters
    ----------
    model : estimator
        Object with ``fit(X)`` that sets ``labels_`` (e.g. ``KMeans``).
    pca_df : pandas.DataFrame
        Numeric feature frame; must contain columns labelled ``0`` and ``1``.
    samples : int, optional
        Number of subsampling rounds (default 5).
    frac : float, optional
        Fraction of rows drawn per round (default 0.25, the value that used
        to be hard-coded).
    random_state : int, numpy.random.RandomState or None, optional
        Seed for the subsampling. ``None`` (default) keeps the previous
        unseeded behaviour.

    Returns
    -------
    tuple
        ``(c1_deviation, c2_deviation, model)``. Each deviation is an array
        of shape (1, 3) holding the standard deviation across rounds of the
        subsample means, medians and standard deviations of column 0
        (respectively column 1). ``model`` is left fitted on the last
        subsample.

    Raises
    ------
    ValueError
        If ``samples`` is smaller than 1.
    """
    if samples < 1:
        raise ValueError("samples must be at least 1")

    # scikit-learn renamed calinski_harabaz_score to calinski_harabasz_score
    # and dropped the old spelling in later releases; use whichever exists.
    score_fn = getattr(metrics, "calinski_harabasz_score", None)
    if score_fn is None:
        score_fn = metrics.calinski_harabaz_score

    # One RandomState shared by all rounds gives different but repeatable
    # subsamples; passing the same int to every .sample() call would draw the
    # same rows in every round.
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    scores, means, medians, stds = [], [], [], []
    for sample in range(samples):
        print(sample)
        pca_df_samp = pca_df.sample(frac=frac, random_state=rng)
        model.fit(pca_df_samp)
        scores.append(score_fn(pca_df_samp, model.labels_))
        first, second = pca_df_samp[0], pca_df_samp[1]
        means.append((first.mean(), second.mean()))
        medians.append((first.median(), second.median()))
        stds.append((first.std(), second.std()))

    print("Score: ", np.mean(scores))
    means, stds, medians = np.array(means), np.array(stds), np.array(medians)
    print()
    print("Cross Sample Comparisons")
    print("------------------------")

    deviations = []
    for heading, column in (("First Component", 0), ("Second Component", 1)):
        spread = (means[:, column].std(),
                  medians[:, column].std(),
                  stds[:, column].std())
        print(heading)
        print("Means Deviation:   ", spread[0])
        print("Medians Deviation: ", spread[1])
        print("Stds Deviation:    ", spread[2])
        # Wrapped in a list so the result keeps its (1, 3) shape.
        deviations.append(np.array([spread]))
    print()

    c1_deviation, c2_deviation = deviations
    return c1_deviation, c2_deviation, model

In [62]:
# Evaluate a 2-cluster k-means on the LSA component columns only. The
# 'cluster' and 'pres' columns that other cells add to paras_by_component are
# dropped if present, so the cell also works when it is re-run after them.
kmeans2 = KMeans(n_clusters=2, random_state=42)
eval_cluster_model(
    kmeans2,
    paras_by_component.drop(['cluster', 'pres'], axis=1, errors='ignore'))

0
1
2
3
4
Score:  82.19154892788875

Cross Sample Comparisons
------------------------
First Component
Means Deviation:    0.0004993584483313611
Medians Deviation:  0.0005051109431081177
Stds Deviation:     0.000301062458312
Second Component
Means Deviation:    0.00044409508838346465
Medians Deviation:  0.00015022819571535457
Stds Deviation:     0.002294595474630467



(array([[0.00049936, 0.00050511, 0.00030106]]),
 array([[0.0004441 , 0.00015023, 0.0022946 ]]),
 KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
     n_clusters=2, n_init=10, n_jobs=1, precompute_distances='auto',
     random_state=42, tol=0.0001, verbose=0))

In [76]:
# Evaluate a 3-cluster k-means on the LSA component columns only. The
# 'cluster' and 'pres' columns that other cells add to paras_by_component are
# dropped if present; otherwise a re-run would cluster on a previous
# 'cluster' assignment (or fail on the string labels).
kmeans3 = KMeans(n_clusters=3, random_state=42)
eval_cluster_model(
    kmeans3,
    paras_by_component.drop(['cluster', 'pres'], axis=1, errors='ignore'))

0
1
2
3
4
Score:  3016.814656705914

Cross Sample Comparisons
------------------------
First Component
Means Deviation:    0.0005175594854425628
Medians Deviation:  0.00047365553414805637
Stds Deviation:     0.00011871073052252841
Second Component
Means Deviation:    0.0004745662039104361
Medians Deviation:  0.00021644013100946866
Stds Deviation:     0.0012189524662681531



(array([[0.00051756, 0.00047366, 0.00011871]]),
 array([[0.00047457, 0.00021644, 0.00121895]]),
 KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
     n_clusters=3, n_init=10, n_jobs=1, precompute_distances='auto',
     random_state=42, tol=0.0001, verbose=0))

In [77]:
# Label every training sentence with its kmeans3 cluster. Prediction uses the
# component columns only, so re-running the cell after 'cluster' / 'pres'
# exist does not change the number of features passed to the model.
paras_by_component['cluster'] = kmeans3.predict(
    paras_by_component.drop(['cluster', 'pres'], axis=1, errors='ignore'))

In [78]:
paras_by_component['pres'] = Y_train

In [89]:
# Percentage of each president's training sentences that falls in each
# cluster. transform('sum') broadcasts every president's total back onto the
# (pres, cluster) index, which keeps the index unchanged; groupby().apply()
# prepends a duplicate 'pres' level in newer pandas versions.
cluster_sizes = paras_by_component.groupby(['pres', 'cluster']).size()
100 * cluster_sizes / cluster_sizes.groupby(level=0).transform('sum')

pres     cluster
bush     0          81.469741
         1          14.495677
         2           4.034582
carter   0          80.780142
         1          16.453901
         2           2.765957
clinton  0          80.660620
         1          17.645045
         2           1.694335
ford     0          78.216019
         1          15.958738
         2           5.825243
gwbush   0          79.790080
         1          17.143445
         2           3.066475
johnson  0          72.386059
         1           7.059875
         2          20.554066
kennedy  0          82.348948
         1          11.699480
         2           5.951573
nixon    0          81.539108
         1          14.550042
         2           3.910849
obama    0          84.044328
         1          13.512869
         2           2.442803
reagan   0          82.959810
         1          14.260674
         2           2.779517
dtype: float64

In [64]:
# Evaluate a 4-cluster k-means on the LSA component columns only. Earlier
# cells add 'cluster' and 'pres' columns to paras_by_component; they are
# dropped here because k-means cannot be fitted on the string labels and must
# not see a previous cluster assignment.
kmeans4 = KMeans(n_clusters=4, random_state=42)
eval_cluster_model(
    kmeans4,
    paras_by_component.drop(['cluster', 'pres'], axis=1, errors='ignore'))

0
1
2
3
4
Score:  74.1318100883419

Cross Sample Comparisons
------------------------
First Component
Means Deviation:    0.0003195443964970745
Medians Deviation:  0.0004897681953063193
Stds Deviation:     0.00027709867919151757
Second Component
Means Deviation:    0.00043728259717957857
Medians Deviation:  0.0001611502341979518
Stds Deviation:     0.0015803762771032187



(array([[0.00031954, 0.00048977, 0.0002771 ]]),
 array([[0.00043728, 0.00016115, 0.00158038]]),
 KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
     n_clusters=4, n_init=10, n_jobs=1, precompute_distances='auto',
     random_state=42, tol=0.0001, verbose=0))

In [65]:
# Evaluate a 5-cluster k-means on the LSA component columns only. Earlier
# cells add 'cluster' and 'pres' columns to paras_by_component; they are
# dropped here because k-means cannot be fitted on the string labels and must
# not see a previous cluster assignment.
kmeans5 = KMeans(n_clusters=5, random_state=42)
eval_cluster_model(
    kmeans5,
    paras_by_component.drop(['cluster', 'pres'], axis=1, errors='ignore'))

0
1
2
3
4
Score:  72.47847583701427

Cross Sample Comparisons
------------------------
First Component
Means Deviation:    0.0005144291973786782
Medians Deviation:  0.0005808498132999947
Stds Deviation:     0.0006082240667711229
Second Component
Means Deviation:    0.000783549400072541
Medians Deviation:  0.00012246910426393857
Stds Deviation:     0.0035295661971701036



(array([[0.00051443, 0.00058085, 0.00060822]]),
 array([[0.00078355, 0.00012247, 0.00352957]]),
 KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
     n_clusters=5, n_init=10, n_jobs=1, precompute_distances='auto',
     random_state=42, tol=0.0001, verbose=0))

In [None]:
# c1_deviation / c2_deviation were never assigned at notebook level (the
# return values of eval_cluster_model were only displayed), so they are
# collected here: one row per number of clusters.
component_frame = paras_by_component.drop(['cluster', 'pres'], axis=1,
                                          errors='ignore')
cluster_counts = [2, 3, 4, 5]
c1_rows, c2_rows = [], []
for k in cluster_counts:
    c1_k, c2_k, _ = eval_cluster_model(KMeans(n_clusters=k, random_state=42),
                                       component_frame)
    c1_rows.append(c1_k)
    c2_rows.append(c2_k)
c1_deviation = np.vstack(c1_rows)
c2_deviation = np.vstack(c2_rows)

# Three stacked panels (3 rows x 1 column): first component, second
# component, and their sum. Columns of each array are the deviation of the
# subsample means, medians and standard deviations.
fig, axs = plt.subplots(3, 1, figsize=(10, 10))
panels = [
    ("First component", c1_deviation),
    ("Second component", c2_deviation),
    ("Sum of both components", c1_deviation + c2_deviation),
]
for ax, (title, deviation) in zip(axs, panels):
    ax.plot(cluster_counts, deviation[:, 0], label="Mean Deviation")
    ax.plot(cluster_counts, deviation[:, 1], label="Median Deviation")
    ax.plot(cluster_counts, deviation[:, 2], label="Stds Deviation")
    ax.set_title(title)
    ax.set_xlabel("n_clusters")
    ax.legend(loc='upper right', bbox_to_anchor=(1.25, 1.055))

In [143]:
# Features: every TF-IDF column; target: the 'pres' label column.
X = pres_data_tfidf.drop(['pres'], axis=1)
Y = pres_data_tfidf['pres']

# 75/25 train/test split with a fixed seed.
# NOTE(review): rebinds the shared X_train / X_test / y_train / y_test names.
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    Y,
                                                    test_size=0.25,
                                                    random_state=42)

In [99]:
# Features: the frequent-word count columns of slice_pres_data.
X = slice_pres_data.drop(['text_source'], axis=1)
# Target: the source label, with category values that no longer occur removed.
Y = slice_pres_data['text_source'].cat.remove_unused_categories()

# 75/25 train/test split with a fixed seed.
# NOTE(review): rebinds the shared X_train / X_test / y_train / y_test names.
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    Y,
                                                    test_size=0.25,
                                                    random_state=42)

In [102]:
Y.value_counts() / len(slice_pres_data)

obama      0.190492
reagan     0.181721
clinton    0.132908
gwbush     0.109501
kennedy    0.101565
bush       0.078557
carter     0.063394
nixon      0.053570
johnson    0.050792
ford       0.037499
Name: text_source, dtype: float64

In [None]:
# Baseline: a model that always predicts the most frequent class (obama) would reach an accuracy of about 0.19 on these sentences.

In [103]:
# Fit a logistic regression with default settings and print the accuracy on
# the training and test halves.
# NOTE(review): X_train / y_train / X_test / y_test are whatever the most
# recently executed split cell assigned.
lr = LogisticRegression()
train = lr.fit(X_train, y_train)
print(X_train.shape, y_train.shape)
print('Training set score:', lr.score(X_train, y_train))
print('\nTest set score:', lr.score(X_test, y_test))

(41301, 4561) (41301,)
Training set score: 0.6400813539623738

Test set score: 0.4860899251834096


In [107]:
# Features: every word-count column of pres_data (the label and the sentence
# object columns are removed).
X = pres_data.drop(['text_source', 'text_sentence'], axis=1)
# Target: the source label, with category values that no longer occur removed.
Y = pres_data['text_source'].cat.remove_unused_categories()

# 75/25 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    Y,
                                                    test_size=0.25,
                                                    random_state=42)

# Default logistic regression; accuracy is printed for both halves.
lr = LogisticRegression()
train = lr.fit(X_train, y_train)
print(X_train.shape, y_train.shape)
print('Training set score:', lr.score(X_train, y_train))
print('\nTest set score:', lr.score(X_test, y_test))

obama      0.190492
reagan     0.181721
clinton    0.132908
gwbush     0.109501
kennedy    0.101565
bush       0.078557
carter     0.063394
nixon      0.053570
johnson    0.050792
ford       0.037499
Name: text_source, dtype: float64
(41301, 9818) (41301,)
Training set score: 0.6721144766470546

Test set score: 0.493862134088763


In [144]:
# Logistic regression on the TF-IDF features. The split is done in this cell
# so the result does not depend on which earlier cell last assigned
# X_train / y_train (run top to bottom, the previous assignment holds the
# bag-of-words counts, not the TF-IDF features).
X = pres_data_tfidf.drop(['pres'], axis=1)
Y = pres_data_tfidf['pres']

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.25,
                                                    random_state=42)

lr = LogisticRegression()
train = lr.fit(X_train, y_train)
print(X_train.shape, y_train.shape)
print('Training set score:', lr.score(X_train, y_train))
print('\nTest set score:', lr.score(X_test, y_test))

(41301, 5779) (41301,)
Training set score: 0.573230672380814

Test set score: 0.45659911382290985


In [90]:
paras_by_component

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,492,493,494,495,496,497,498,499,cluster,pres
"As far as I am concerned, I think that was probably a proper decision.\n\n\n\n",0.072008,-0.021567,-0.041884,0.024466,0.007493,-0.039871,-0.016474,-0.054569,0.180727,-0.031634,...,0.004809,0.086113,-0.003693,-0.004461,0.019847,-0.081307,0.015685,0.064727,0,nixon
"With unemployment rising, our nation needs more small businesses to open, more companies to invest and expand, more employers to put up the sign that says, ""Help Wanted.""\n",0.105195,-0.028224,0.005151,-0.016230,0.021816,-0.002699,-0.051872,-0.018513,-0.012896,0.006272,...,-0.030241,-0.064863,0.061728,-0.055119,-0.001629,-0.028984,-0.005444,-0.024338,0,gwbush
The West Indians.,0.017847,-0.005429,0.006440,-0.005599,0.003109,-0.004244,-0.003104,0.014833,-0.000289,-0.000197,...,0.021347,0.029266,0.079516,0.060627,0.000189,-0.020626,-0.004543,0.062206,0,obama
"It is a system which has conscripted vast human and material resources into the building of a tightly knit, highly efficient machine that combines military, diplomatic, intelligence, economic, scientific and political operations.\n",0.085096,-0.024363,0.024938,-0.013992,0.010321,-0.013858,-0.034225,0.031936,-0.005478,-0.008566,...,0.033661,0.006004,-0.095146,-0.014838,0.012997,0.041346,-0.052919,-0.011706,0,kennedy
"We'll invest in innovative programs that are already helping schools meet high standards and close achievement gaps, and we will expand our commitment to charter schools.\n",0.069592,-0.018597,-0.003409,-0.004732,0.019970,-0.006942,-0.027858,-0.013760,-0.010097,0.000084,...,-0.008438,0.016843,0.025247,-0.011309,-0.007954,-0.021554,-0.060723,0.008634,0,obama
"We will begin the long, necessary effort to clean up a productive recreational area and a special national resource -- the Chesapeake Bay.\n",0.073063,-0.019906,0.005159,-0.007113,0.035736,-0.018716,-0.026228,0.004233,-0.011581,0.013563,...,0.051917,-0.052712,-0.033917,0.020945,-0.059048,0.049563,-0.033167,0.072816,0,reagan
The applicant must swear or affirm as follows:\n,0.049040,-0.014520,0.024483,-0.006802,0.016525,-0.037322,0.031003,0.019538,-0.007317,0.029494,...,-0.015696,-0.026948,0.028373,-0.119458,0.019690,-0.003035,0.073343,-0.055413,0,johnson
"But we shouldn't cut people off just because they're poor, they're young, or even because they're unmarried.",0.203814,-0.058322,-0.182432,-0.143201,-0.096874,0.080831,0.059336,-0.028415,-0.012772,-0.017854,...,-0.050182,0.008994,0.020168,-0.004266,-0.024812,0.029884,-0.027413,0.074437,1,clinton
He had earned their respect because he went to places most leaders never visit and listened to people most leaders never hear and spoke simple truth most leaders never speak.\n,0.095900,-0.025564,-0.063451,-0.066611,-0.052058,0.032212,0.030421,0.011706,-0.009871,0.002342,...,0.027061,0.022770,-0.018266,-0.000187,-0.000157,-0.032680,-0.019215,-0.028915,0,clinton
"Its true strength consists in leaving individuals and States as much as possible to themselves; in making itself felt, not in its power, but in its beneficence; not in its control, but in its protection; not in binding the States more closely to the center, but leaving each to move unobstructed in its proper constitutional orbit.""",0.151013,-0.038621,0.166758,-0.058694,-0.034406,-0.202307,0.175155,0.031512,0.016131,-0.014718,...,0.007333,-0.006254,0.035383,0.025546,0.004555,0.003237,0.058368,-0.078023,2,johnson


In [95]:
paras_by_component.drop(['cluster', 'pres'], axis=1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,490,491,492,493,494,495,496,497,498,499
"As far as I am concerned, I think that was probably a proper decision.\n\n\n\n",0.072008,-0.021567,-0.041884,0.024466,0.007493,-0.039871,-0.016474,-0.054569,0.180727,-0.031634,...,-0.042177,-0.024589,0.004809,0.086113,-0.003693,-0.004461,0.019847,-0.081307,0.015685,0.064727
"With unemployment rising, our nation needs more small businesses to open, more companies to invest and expand, more employers to put up the sign that says, ""Help Wanted.""\n",0.105195,-0.028224,0.005151,-0.016230,0.021816,-0.002699,-0.051872,-0.018513,-0.012896,0.006272,...,-0.015237,0.011586,-0.030241,-0.064863,0.061728,-0.055119,-0.001629,-0.028984,-0.005444,-0.024338
The West Indians.,0.017847,-0.005429,0.006440,-0.005599,0.003109,-0.004244,-0.003104,0.014833,-0.000289,-0.000197,...,-0.009729,0.002685,0.021347,0.029266,0.079516,0.060627,0.000189,-0.020626,-0.004543,0.062206
"It is a system which has conscripted vast human and material resources into the building of a tightly knit, highly efficient machine that combines military, diplomatic, intelligence, economic, scientific and political operations.\n",0.085096,-0.024363,0.024938,-0.013992,0.010321,-0.013858,-0.034225,0.031936,-0.005478,-0.008566,...,0.016121,-0.029109,0.033661,0.006004,-0.095146,-0.014838,0.012997,0.041346,-0.052919,-0.011706
"We'll invest in innovative programs that are already helping schools meet high standards and close achievement gaps, and we will expand our commitment to charter schools.\n",0.069592,-0.018597,-0.003409,-0.004732,0.019970,-0.006942,-0.027858,-0.013760,-0.010097,0.000084,...,0.012657,0.004527,-0.008438,0.016843,0.025247,-0.011309,-0.007954,-0.021554,-0.060723,0.008634
"We will begin the long, necessary effort to clean up a productive recreational area and a special national resource -- the Chesapeake Bay.\n",0.073063,-0.019906,0.005159,-0.007113,0.035736,-0.018716,-0.026228,0.004233,-0.011581,0.013563,...,0.018779,-0.023772,0.051917,-0.052712,-0.033917,0.020945,-0.059048,0.049563,-0.033167,0.072816
The applicant must swear or affirm as follows:\n,0.049040,-0.014520,0.024483,-0.006802,0.016525,-0.037322,0.031003,0.019538,-0.007317,0.029494,...,0.033233,0.072071,-0.015696,-0.026948,0.028373,-0.119458,0.019690,-0.003035,0.073343,-0.055413
"But we shouldn't cut people off just because they're poor, they're young, or even because they're unmarried.",0.203814,-0.058322,-0.182432,-0.143201,-0.096874,0.080831,0.059336,-0.028415,-0.012772,-0.017854,...,0.009945,-0.004224,-0.050182,0.008994,0.020168,-0.004266,-0.024812,0.029884,-0.027413,0.074437
He had earned their respect because he went to places most leaders never visit and listened to people most leaders never hear and spoke simple truth most leaders never speak.\n,0.095900,-0.025564,-0.063451,-0.066611,-0.052058,0.032212,0.030421,0.011706,-0.009871,0.002342,...,0.007573,0.023457,0.027061,0.022770,-0.018266,-0.000187,-0.000157,-0.032680,-0.019215,-0.028915
"Its true strength consists in leaving individuals and States as much as possible to themselves; in making itself felt, not in its power, but in its beneficence; not in its control, but in its protection; not in binding the States more closely to the center, but leaving each to move unobstructed in its proper constitutional orbit.""",0.151013,-0.038621,0.166758,-0.058694,-0.034406,-0.202307,0.175155,0.031512,0.016131,-0.014718,...,-0.032142,0.052857,0.007333,-0.006254,0.035383,0.025546,0.004555,0.003237,0.058368,-0.078023


In [98]:
Y1

As far as I am concerned, I think that was probably a proper decision.\n\n\n\n                                                                                                                                                                                                                                                                                                                                                                                                              0
With unemployment rising, our nation needs more small businesses to open, more companies to invest and expand, more employers to put up the sign that says, "Help Wanted."\n                                                                                                                                                                                                                                                                                                                0
The West Indians.                           

In [99]:
# Features: the LSA component columns; target: the president label.
# paras_by_component only holds the training rows of the earlier sentence
# split, so this is a further 75/25 split of that subset.
X1 = paras_by_component.drop(['cluster', 'pres'], axis=1)
Y1 = paras_by_component['pres']

X1_train, X1_test, y1_train, y1_test = train_test_split(X1, 
                                                    Y1,
                                                    test_size=0.25,
                                                    random_state=42)

# Default logistic regression; accuracy is printed for both halves.
lr = LogisticRegression()
train = lr.fit(X1_train, y1_train)
print(X1_train.shape, y1_train.shape)
print('Training set score:', lr.score(X1_train, y1_train))
print('\nTest set score:', lr.score(X1_test, y1_test))

(33040, 500) (33040,)
Training set score: 0.4044491525423729

Test set score: 0.380334120210641


In [100]:
# NOTE(review): this cell repeats the earlier full bag-of-words logistic
# regression (same features, split parameters and model).
# Features: every word-count column of pres_data.
X = pres_data.drop(['text_source', 'text_sentence'], axis=1)
# Target: the source label, with category values that no longer occur removed.
Y = pres_data['text_source'].cat.remove_unused_categories()

# 75/25 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    Y,
                                                    test_size=0.25,
                                                    random_state=42)

# Default logistic regression; accuracy is printed for both halves.
lr = LogisticRegression()
train = lr.fit(X_train, y_train)
print(X_train.shape, y_train.shape)
print('Training set score:', lr.score(X_train, y_train))
print('\nTest set score:', lr.score(X_test, y_test))

(41301, 9818) (41301,)
Training set score: 0.672162901624658

Test set score: 0.49393477155516813


In [101]:
pres_data

Unnamed: 0,intensity,immoral,Laws,Unions,intention,ally,eat,windfall,unprovided,fragile,...,Detente,November,undismayed,use,Alabama,assertion,Adjutant,insular,text_sentence,text_source
6933,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"( , I, think, we, 've, seen, a, deterioration,...",bush
6934,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(I, think, for, a, while, as, a, nation, we, c...",bush
6935,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(For, a, while, ,, as, I, recall, ,, it, even,...",bush
6936,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(So, we, 've, seen, a, deterioration, in, valu...",bush
6937,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(We, got, away, ,, we, got, into, this, feelin...",bush
6938,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,"(And, I, do, n't, believe, that, at, all, I, d...",bush
6939,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(And, ,, of, course, ,, as, far, as, the, how,...",bush
6940,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(But, we, 've, got, to, do, a, lot, better, on...",bush
6941,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(We, 've, got, to, get, after, the, users, mor...",bush
6942,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(We, have, to, change, this, whole, culture, .)",bush


In [113]:
from sklearn import ensemble

# Random forest on the full bag-of-words counts.
X = pres_data.drop(['text_source', 'text_sentence'], axis=1)
Y = pres_data['text_source'].cat.remove_unused_categories()

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.25,
                                                    random_state=42)

# random_state fixes the forest's internal sampling so that the printed
# scores are reproducible from run to run.
rfc = ensemble.RandomForestClassifier(max_depth=40, random_state=42)
train = rfc.fit(X_train, y_train)
print(X_train.shape, y_train.shape)
print('Training set score:', rfc.score(X_train, y_train))
print('\nTest set score:', rfc.score(X_test, y_test))

(41301, 9818) (41301,)
Training set score: 0.456235926490884

Test set score: 0.3153192416648507


In [116]:
from sklearn import ensemble

# Gradient boosting on the full bag-of-words counts.
X = pres_data.drop(['text_source', 'text_sentence'], axis=1)
Y = pres_data['text_source'].cat.remove_unused_categories()

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.25,
                                                    random_state=42)

# max_features="sqrt" is what "auto" meant for this classifier; "auto" is no
# longer accepted by newer scikit-learn releases.
# random_state makes the row subsampling (subsample=.2) and the feature
# subsampling repeatable.
gbc = ensemble.GradientBoostingClassifier(verbose=1, subsample=.2,
                                          max_features="sqrt",
                                          random_state=42)
train = gbc.fit(X_train, y_train)
print(X_train.shape, y_train.shape)
print('Training set score:', gbc.score(X_train, y_train))
print('\nTest set score:', gbc.score(X_test, y_test))

      Iter       Train Loss      OOB Improve   Remaining Time 
         1       18308.7288        1734.3374           36.68m


KeyboardInterrupt: 

In [107]:
pres_data.shape

(55068, 9820)

In [110]:
np.sqrt(9820)

99.09591313469996

In [121]:
from sklearn.naive_bayes import MultinomialNB

In [122]:
# Multinomial naive Bayes on the full bag-of-words counts.
X = pres_data.drop(['text_source', 'text_sentence'], axis=1)
Y = pres_data['text_source'].cat.remove_unused_categories()

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.25,
                                                    random_state=42)

# Named `nb` rather than `lr`: this estimator is not a logistic regression.
nb = MultinomialNB()
train = nb.fit(X_train, y_train)
print(X_train.shape, y_train.shape)
print('Training set score:', nb.score(X_train, y_train))
print('\nTest set score:', nb.score(X_test, y_test))

(41301, 9818) (41301,)
Training set score: 0.586208566378538

Test set score: 0.49219147236144406


In [131]:
sents

["I think we've seen a deterioration of values.",
 'I think for a while as a nation we condoned those things we should have condemned.',
 "For a while, as I recall, it even seems to me that there was talk of legalizing or decriminalizing marijuana and other drugs, and I think that's all wrong.",
 "So we've seen a deterioration in values, and one of the things that I think we should do about it in terms of cause is to instill values into the young people in our schools.",
 'We got away, we got into this feeling that value-free education was the thing.',
 "And I don't believe that at all I do believe there are fundamental rights and wrongs as far as use.",
 'And, of course, as far as the how we make it better, yes, we can do better on interdiction.',
 "But we've got to do a lot better on education, and we have to do, be tougher on those who commit crimes.",
 "We've got to get after the users more.",
 'We have to change this whole culture.',
 'You know, I saw a movie, Crocodile Dundee.',
