In [54]:
import numpy as np 
import pandas as pd 

from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import re
import time
import tensorflow as tf
from sklearn.feature_extraction.text import TfidfVectorizer

In [55]:
from utilities import train_test_split_common

# Load data

In [56]:
df = pd.read_csv("inaug_addr_cleaned.csv", encoding="latin").dropna()
full_text = df["text"].to_string(index=False).replace("\n", " ")
sentences = df["text"].tolist()
df.head()

Unnamed: 0,doc index,text,P1,P2,Final,IsSame
0,0,Fellow-Citizens of the Senate and of the House...,0.0,0.0,0.0,True
1,0,"On the one hand, I was summoned by my Country,...",0.0,0.0,0.0,True
2,0,"On the other hand, the magnitude and difficult...",0.0,0.0,0.0,True
3,0,In this conflict of emotions all I dare aver i...,0.0,0.0,0.0,True
4,0,"All I dare hope is that if, in executing this ...",0.0,0.0,0.0,True


In [57]:
len(df[df["Final"]==0])

4630

# Preprocessing
- tokenize only the top 2500 words
- vectorize the text and convert it into sequence of integers 
- pad_sequences to convert the sequences into 2-D numpy array

In [63]:
tokenizer = Tokenizer(num_words=2500, lower=True, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n')
tokenizer.fit_on_texts(df['text'].values)
#print(tokenizer.word_index)  # To see the dicstionary
X = tokenizer.texts_to_sequences(df['text'].values)
X = pad_sequences(X)

In [64]:
Y = pd.get_dummies(df['Final']).values
Y.shape

(4847, 2)

In [65]:
Y2=df['Final']

In [61]:
print("{} rows of data; {} features.".format(X.shape[0], X.shape[1]))


4847 rows of data; 663 features.


# Built LSTM network
- hyperparameters
    - embed_dim = 128
    - lstm_out = 196
    - batch_size = 32
- activation function
    - softmax


In [66]:
# hyperparameters
embed_dim = 128
lstm_out = 196
batch_size = 32

## Metrics

### metrics 1

In [67]:
def as_keras_metric(method):
    import functools
    from keras import backend as K
    import tensorflow as tf
    @functools.wraps(method)
    def wrapper(self, args, **kwargs):
        """ Wrapper for turning tensorflow metrics into keras metrics """
        value, update_op = method(self, args, **kwargs)
        K.get_session().run(tf.local_variables_initializer())
        with tf.control_dependencies([update_op]):
            value = tf.identity(value)
        return value
    return wrapper

In [68]:
precision = as_keras_metric(tf.metrics.precision)
recall = as_keras_metric(tf.metrics.recall)

### metrics 2

In [36]:
import numpy as np
from keras.callbacks import Callback
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score

class Metrics(Callback):
    def on_train_begin(self, logs={}):
        self.val_f1s = []
        self.val_recalls = []
        self.val_precisions = []
 
    def on_epoch_end(self, epoch, logs={}):
        val_predict = (np.asarray(self.model.predict(self.model.validation_data[0]))).round()
        val_targ = self.model.validation_data[1]
        _val_f1 = f1_score(val_targ, val_predict)
        _val_recall = recall_score(val_targ, val_predict)
        _val_precision = precision_score(val_targ, val_predict)
        self.val_f1s.append(_val_f1)
        self.val_recalls.append(_val_recall)
        self.val_precisions.append(_val_precision)
        print(" — val_f1: %f — val_precision: %f — val_recall %f".format(_val_f1, _val_precision, _val_recall))
        return

In [37]:
metrics = Metrics()

### metrics 3

In [42]:
import keras.backend as K
from sklearn.metrics import precision_recall_fscore_support
def custom_metrics(y_true, y_pred):
    
    #>>> y_true = np.array(['cat', 'dog', 'pig', 'cat', 'dog', 'pig'])
    #>>> y_pred = np.array(['cat', 'pig', 'dog', 'cat', 'cat', 'dog'])
    precision, recall, f1, _=precision_recall_fscore_support(y_true, y_pred, average='binary')
    return (precision, recall, f1)

In [74]:
model = Sequential()
model.add(Embedding(2500, embed_dim, input_length = X.shape[1]))
model.add(LSTM(196, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1,activation='softmax'))
model.compile(loss = 'binary_crossentropy', optimizer='adam',metrics = ['accuracy', precision, recall])
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_11 (Embedding)     (None, 663, 128)          320000    
_________________________________________________________________
lstm_11 (LSTM)               (None, 196)               254800    
_________________________________________________________________
dense_11 (Dense)             (None, 1)                 197       
Total params: 574,997
Trainable params: 574,997
Non-trainable params: 0
_________________________________________________________________
None


# Training

In [70]:
# train-test split: test_size=0.2
X_train, X_test, Y_train, Y_test = train_test_split_common(X, Y2)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(3877, 663) (3877,)
(970, 663) (970,)


In [22]:
Y_test[:, 0].sum()

923

In [75]:
t0=time.time()
model.fit(X_train, Y_train, batch_size = batch_size, epochs = 1, validation_split=0.2)
print("Run time: {} s".format(time.time()-t0))

Train on 3101 samples, validate on 776 samples
Epoch 1/1
Run time: 112.31559586524963 s


# Testing

In [None]:
from sklearn.metrics import classification_report

In [76]:
score, acc, test_precision, test_recall = model.evaluate(X_test, Y_test, batch_size = batch_size)
print("Score: %.4f" % (score))
print("Validation Accuracy: %.4f" % (acc))
print("Validation Precision: %.4f" % (test_precision))
print("Validation Recall: %.4f" % (test_recall))
print("Validation F1 score: %.4f" % (2*test_precision*test_recall/(test_precision+test_recall)))

Score: 15.1699
Validation Accuracy: 0.0485
Validation Precision: 0.0444
Validation Recall: 1.0000
Validation F1 score: 0.0850
