In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('trump_lies_clean.csv', index_col = 0)
df.drop(['explanation', 'date'],axis = 1, inplace = True)
df['president'] = 'Trump'

In [3]:
df2 = pd.read_csv('nixon_quotes_clean.csv', index_col = 0)
df2.drop('explanation', axis = 1, inplace = True)
df2['president'] = 'Nixon'
df = df.append(df2, ignore_index = True)

In [4]:
import string
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

In [5]:
string.punctuation = string.punctuation + '—'

In [6]:
def text_process(original):
    no_punctuation = [char for char in original 
                      if char not in string.punctuation]
    no_punctuation = ''.join(no_punctuation)
    return [word for word in no_punctuation.split() 
            if word.lower() not in stopwords.words('english')]

In [7]:
bow_transformer = CountVectorizer(analyzer = text_process).fit(df['quote'])
print(len(bow_transformer.vocabulary_))
print("\n")
print(df['quote'][3])

1074


Now, the audience was the biggest ever. But this crowd was massive. Look how far back it goes. This crowd was massive. (Official aerial photos show Obama's 2009 inauguration was much more heavily attended.)


In [8]:
quote_4 = df['quote'][3]
bow4 = bow_transformer.transform([quote_4])
print(bow4)
print(bow4.shape)

  (0, 13)	1
  (0, 137)	1
  (0, 159)	1
  (0, 161)	1
  (0, 244)	1
  (0, 286)	1
  (0, 287)	1
  (0, 290)	1
  (0, 312)	1
  (0, 405)	2
  (0, 473)	1
  (0, 494)	1
  (0, 533)	1
  (0, 562)	1
  (0, 590)	1
  (0, 679)	2
  (0, 705)	1
  (0, 773)	1
  (0, 897)	1
(1, 1074)


In [9]:
print(bow_transformer.get_feature_names()[16:78])

['2014', '2016', '2020', '2500', '3', '306', '325000', '350', '47', '48', '5', '55', '600', '60000', '7', '70', '700', '725', '746', '81', '84', '90', '90s', 'AMERICA', 'Administration', 'Amendment', 'American', 'Americans', 'Aristotle', 'Bad', 'Ban', 'Bannon', 'Barack', 'Big', 'Bill', 'Blow', 'Blumenthal', 'Bob', 'Break', 'Bush', 'Cabinet', 'Canada', 'Catholic', 'Chicago', 'China', 'Chris', 'Christ', 'Church', 'Clapper', 'Clinton', 'Colberts', 'College', 'Congress', 'Could', 'CubanAmericans', 'Cuomo', 'Cuomos', 'Defense', 'Delta', 'Deltas', 'Democrat', 'Democrats']


In [10]:
quote_bow = bow_transformer.transform(df['quote'])
print('Shape of Sparse Matrix: ', quote_bow.shape)
print('Amount of Non-Zero occurences: ', quote_bow.nnz)

Shape of Sparse Matrix:  (136, 1074)
Amount of Non-Zero occurences:  1902


In [11]:
sparsity = (100.0 * quote_bow.nnz / (quote_bow.shape[0] * quote_bow.shape[1]))
print('sparsity: {}'.format(round(sparsity)))

sparsity: 1


In [12]:
from sklearn.feature_extraction.text import TfidfTransformer

In [13]:
tfidf_transformer = TfidfTransformer().fit(quote_bow)
tfidf4 = tfidf_transformer.transform(bow4)
print(tfidf4)

  (0, 897)	0.17319006342931575
  (0, 773)	0.2100050191042455
  (0, 705)	0.2100050191042455
  (0, 679)	0.3643111604596727
  (0, 590)	0.2100050191042455
  (0, 562)	0.2100050191042455
  (0, 533)	0.17319006342931575
  (0, 494)	0.19371414169659013
  (0, 473)	0.19371414169659013
  (0, 405)	0.420010038208491
  (0, 312)	0.19371414169659013
  (0, 290)	0.18215558022983636
  (0, 287)	0.2100050191042455
  (0, 286)	0.2100050191042455
  (0, 244)	0.2100050191042455
  (0, 161)	0.2100050191042455
  (0, 159)	0.2100050191042455
  (0, 137)	0.2100050191042455
  (0, 13)	0.2100050191042455


In [14]:
print(tfidf_transformer.idf_[bow_transformer.vocabulary_['Obama']])
print(tfidf_transformer.idf_[bow_transformer.vocabulary_['Paris']])

4.12822145660007
5.22683374526818


In [15]:
quote_tfidf = tfidf_transformer.transform(quote_bow)
print(quote_tfidf.shape)

(136, 1074)


In [16]:
from sklearn.naive_bayes import MultinomialNB

In [17]:
trump_detect_model = MultinomialNB().fit(quote_tfidf, df['president'])

In [18]:
print('predicted:', trump_detect_model.predict(tfidf4)[0])
print('expected:', df.president[3])

predicted: Trump
expected: Trump


In [19]:
all_predictions = trump_detect_model.predict(quote_tfidf)
print(all_predictions)

['Trump' 'Trump' 'Trump' 'Trump' 'Trump' 'Trump' 'Trump' 'Trump' 'Trump'
 'Trump' 'Trump' 'Trump' 'Trump' 'Trump' 'Trump' 'Trump' 'Trump' 'Trump'
 'Trump' 'Trump' 'Trump' 'Trump' 'Trump' 'Trump' 'Trump' 'Trump' 'Trump'
 'Trump' 'Trump' 'Trump' 'Trump' 'Trump' 'Trump' 'Trump' 'Trump' 'Trump'
 'Trump' 'Trump' 'Trump' 'Trump' 'Trump' 'Trump' 'Trump' 'Trump' 'Trump'
 'Trump' 'Trump' 'Trump' 'Trump' 'Trump' 'Trump' 'Trump' 'Trump' 'Trump'
 'Trump' 'Trump' 'Trump' 'Trump' 'Trump' 'Trump' 'Trump' 'Trump' 'Trump'
 'Trump' 'Trump' 'Trump' 'Trump' 'Trump' 'Trump' 'Trump' 'Trump' 'Trump'
 'Trump' 'Trump' 'Trump' 'Trump' 'Trump' 'Trump' 'Trump' 'Trump' 'Trump'
 'Trump' 'Trump' 'Trump' 'Trump' 'Trump' 'Trump' 'Trump' 'Trump' 'Trump'
 'Trump' 'Trump' 'Trump' 'Trump' 'Trump' 'Trump' 'Trump' 'Trump' 'Trump'
 'Trump' 'Nixon' 'Trump' 'Nixon' 'Trump' 'Trump' 'Nixon' 'Nixon' 'Trump'
 'Nixon' 'Trump' 'Trump' 'Nixon' 'Nixon' 'Nixon' 'Nixon' 'Nixon' 'Trump'
 'Nixon' 'Trump' 'Nixon' 'Nixon' 'Nixon' 'Nixon' 'N

In [20]:
from sklearn.metrics import classification_report

In [21]:
print(classification_report(df['president'], all_predictions))

             precision    recall  f1-score   support

      Nixon       1.00      0.75      0.86        36
      Trump       0.92      1.00      0.96       100

avg / total       0.94      0.93      0.93       136



In [22]:
from sklearn.model_selection import train_test_split

In [23]:
quote_train, quote_test, president_train, president_test = \
train_test_split(df['quote'], df['president'], test_size = 0.4, random_state = 101)

In [24]:
print(len(quote_train), len(quote_test), len(president_train) + len(president_test))

81 55 136


In [25]:
from sklearn.pipeline import Pipeline

In [26]:
pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer = text_process)),
    ('tfidf', TfidfTransformer()),
    ('classifier', MultinomialNB()),
])

In [27]:
pipeline.fit(quote_train, president_train)

Pipeline(memory=None,
     steps=[('bow', CountVectorizer(analyzer=<function text_process at 0x1179d2e18>, binary=False,
        decode_error='strict', dtype=<class 'numpy.int64'>,
        encoding='utf-8', input='content', lowercase=True, max_df=1.0,
        max_features=None, min_df=1, ngram_range=(1, 1), preprocessor=None,...f=False, use_idf=True)), ('classifier', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [28]:
y_pred = pipeline.predict(quote_test)
print(classification_report(y_pred, president_test))

             precision    recall  f1-score   support

      Nixon       0.07      1.00      0.13         1
      Trump       1.00      0.76      0.86        54

avg / total       0.98      0.76      0.85        55

