In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

import os

In [2]:
df_ = pd.read_csv('presidential_speeches.csv', sep=',', usecols=['Party', 'Transcript'], encoding='utf-8')

df_dems = df_[df_['Party'] == 'Democratic']
df_rebs = df_[df_['Party'] == 'Republican']
df = pd.concat([df_dems, df_rebs])
df['Party'] = df['Party'].apply(lambda x: 1 if x == 'Republican' else 0)
df.sample(10)

Unnamed: 0,Party,Transcript
941,0,"Mr. President, Mr. Secretary-General, my fello..."
257,0,To the House of Representatives: I return with...
458,1,"Mr. Chairman *, the message which you have for..."
119,0,Fellow Citizens: The practice of all my predec...
997,0,"THE PRESIDENT: Well, thank you. Please, if you..."
268,1,To the Senate of the United States: I transmit...
439,0,"Madam President, Ladies of the Association: I ..."
680,0,"THE PRESIDENT. Good afternoon, ladies and gent..."
758,1,Good evening. I am here tonight to announce my...
984,1,"Thank you to our First Lady, Melania, who has ..."


In [3]:
df.isnull().sum()

Party         0
Transcript    0
dtype: int64

In [3]:
X = df['Transcript'].values
y = df['Party'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)

In [4]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((702,), (176,), (702,), (176,))

In [14]:
print(X_train[0])



In [16]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((702,), (176,), (702,), (176,))

In [19]:
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)

X_train_ = tokenizer.texts_to_sequences(X_train)
X_test_ = tokenizer.texts_to_sequences(X_test)

voc_size = len(tokenizer.word_index) + 1  # Adding 1 because of reserved 0 index

print(X_train[2])
print(X_train_[2])

[457, 139, 2, 1, 269, 3, 230, 2, 354, 1, 324, 2, 1, 2419, 3572, 57, 15, 2970, 5, 27, 112, 1846, 424, 87, 665, 1964, 9, 969, 3, 2043, 4, 1, 2, 25, 127, 9, 1, 251, 3, 3373, 488, 2, 118, 387, 23, 11, 52, 1866, 1, 102, 2179, 1015, 23, 15, 1534, 5, 12, 92, 538, 210, 66, 95, 685, 425, 4, 57, 21, 1686, 20, 1, 664, 830, 2, 1, 36, 1069, 2396, 1, 173, 239, 2, 1, 63, 14, 3611, 57, 24, 1, 822, 1207, 2, 1, 3889, 130, 34, 1, 69, 1417, 3, 5, 1, 837, 2552, 17, 104, 43, 38, 1126, 1230, 4, 10, 4223, 5, 1981, 210, 1, 1776, 2, 964, 95, 32, 15, 31, 1369, 4223, 24, 3, 5, 25, 173, 2990, 92, 3, 381, 5, 25, 986, 2, 1, 63, 32, 21, 431, 3, 574, 93, 18, 1, 3132, 2, 1981, 1, 659, 65, 1, 285, 9, 603, 3, 778, 5, 1232, 9, 43, 38, 1126, 26, 31, 3191, 390, 3, 1, 2177, 3313, 2, 1981, 3, 3229, 5, 1232, 9, 1126, 26, 31, 100, 251, 1, 1809, 2, 1, 2916, 3573, 4397, 28, 3771, 9, 43, 38, 1126, 115, 1, 338, 2, 43, 38, 1126, 4223, 8, 91, 1463, 106, 106, 1, 3889, 2, 2553, 1456, 26, 31, 1212, 17, 6, 100, 76, 4104, 2, 173, 20, 6, 7

In [20]:
num_words = 5000

def vectorize_sequences(sequences, dimension=num_words): 
    results = np.zeros((len(sequences), dimension)) 
    for i, sequence in enumerate(sequences):
        results[i, sequence] = 1.
    return results 

X_train = vectorize_sequences(X_train_)
X_test = vectorize_sequences(X_test_)

In [6]:
vect = CountVectorizer()
vect.fit(X_train)

CountVectorizer()

In [8]:
vect.get_params()

{'analyzer': 'word',
 'binary': False,
 'decode_error': 'strict',
 'dtype': numpy.int64,
 'encoding': 'utf-8',
 'input': 'content',
 'lowercase': True,
 'max_df': 1.0,
 'max_features': None,
 'min_df': 1,
 'ngram_range': (1, 1),
 'preprocessor': None,
 'stop_words': None,
 'strip_accents': None,
 'token_pattern': '(?u)\\b\\w\\w+\\b',
 'tokenizer': None,
 'vocabulary': None}

In [9]:
vect.get_feature_names()[-20:]

['zeros',
 'zest',
 'zigs',
 'zika',
 'zimbabwe',
 'zimbabweans',
 'zimmerman',
 'zinc',
 'zinke',
 'zion',
 'zionism',
 'zip',
 'zone',
 'zones',
 'zoning',
 'zoological',
 'zooming',
 'zoot',
 'zte',
 'zuckert']

In [7]:
X_train = vect.transform(X_train)
X_test = vect.transform(X_test)

In [8]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((702, 33651), (176, 33651), (702,), (176,))

## Model 1: Dense Network

In [10]:
num_words = X_train.shape[1]

model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Input(shape=(num_words,)))
model.add(tf.keras.layers.Dense(200, activation='relu'))
model.add(tf.keras.layers.Dense(100, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
model.add(tf.keras.layers.Flatten())
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 200)               6730400   
                                                                 
 dense_1 (Dense)             (None, 100)               20100     
                                                                 
 dense_2 (Dense)             (None, 1)                 101       
                                                                 
 flatten (Flatten)           (None, 1)                 0         
                                                                 
Total params: 6,750,601
Trainable params: 6,750,601
Non-trainable params: 0
_________________________________________________________________


In [11]:
early_stopping = tf.keras.callbacks.EarlyStopping(patience=5, restore_best_weights=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)

model.compile(optimizer=optimizer,
                loss='binary_crossentropy',
                metrics=['accuracy'])

history = model.fit(X_train, y_train, epochs=30)

Epoch 1/30




Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [12]:
from sklearn.metrics import classification_report

predictions = [1. if x > 0.5 else 0. for x in model.predict(X_test)]
print(classification_report(predictions, y_test))

              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00         0
         1.0       1.00      0.47      0.64       176

    accuracy                           0.47       176
   macro avg       0.50      0.23      0.32       176
weighted avg       1.00      0.47      0.64       176



  _warn_prf(average, modifier, msg_start, len(result))


## Model 2: Simple RNN

In [20]:
from keras.preprocessing import sequence

maxlen = 50

X_train = sequence.pad_sequences(X_train.toarray(), maxlen=maxlen, truncating="pre", padding="post")
X_test = sequence.pad_sequences(X_test.toarray(), maxlen=maxlen, truncating="pre", padding="post")

In [21]:
max([len(x) for x in X_test])

50

In [22]:
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Embedding(num_words, 32))
model.add(tf.keras.layers.SimpleRNN(32)) # return_sequences=True for getting the complete output sequence
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 32)          1076832   
                                                                 
 simple_rnn (SimpleRNN)      (None, 32)                2080      
                                                                 
 dense_3 (Dense)             (None, 1)                 33        
                                                                 
Total params: 1,078,945
Trainable params: 1,078,945
Non-trainable params: 0
_________________________________________________________________


In [24]:
early_stopping = tf.keras.callbacks.EarlyStopping(patience=5, restore_best_weights=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)

model.compile(optimizer=optimizer,
                loss='binary_crossentropy',
                metrics=['accuracy'])

num_epochs = 30
history = model.fit(X_train, y_train, epochs=num_epochs)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [25]:
predictions = [1. if x > 0.5 else 0. for x in model.predict(X_test)]
print(classification_report(predictions, y_test))

              precision    recall  f1-score   support

         0.0       0.98      0.56      0.71       165
         1.0       0.11      0.82      0.19        11

    accuracy                           0.57       176
   macro avg       0.54      0.69      0.45       176
weighted avg       0.92      0.57      0.68       176



## Model 3: Long-Short Term Memory (LSTM)

In [27]:
maxlen = 200

X_train = sequence.pad_sequences(X_train, maxlen=maxlen, truncating="pre", padding="post")
X_test = sequence.pad_sequences(X_test, maxlen=maxlen, truncating="pre", padding="post")

In [28]:
from keras.layers import LSTM

model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Embedding(num_words, 32))
model.add(tf.keras.layers.LSTM(32))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, None, 32)          1076832   
                                                                 
 lstm (LSTM)                 (None, 32)                8320      
                                                                 
 dense_4 (Dense)             (None, 1)                 33        
                                                                 
Total params: 1,085,185
Trainable params: 1,085,185
Non-trainable params: 0
_________________________________________________________________


In [29]:
optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)

model.compile(optimizer=optimizer,
                loss='binary_crossentropy',
                metrics=['accuracy'])

num_epochs = 30
history = model.fit(X_train, y_train, epochs=num_epochs)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [30]:
predictions = [1. if x > 0.5 else 0. for x in model.predict(X_test)]
print(classification_report(predictions, y_test))

              precision    recall  f1-score   support

         0.0       1.00      0.53      0.70       176
         1.0       0.00      0.00      0.00         0

    accuracy                           0.53       176
   macro avg       0.50      0.27      0.35       176
weighted avg       1.00      0.53      0.70       176



  _warn_prf(average, modifier, msg_start, len(result))
