In [157]:
# Keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation, GlobalMaxPooling1D
from keras.layers.embeddings import Embedding
from keras.callbacks import Callback

# sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score

# Setence encoder from https://github.com/facebookresearch/InferSent
from InferSent.models import InferSent

# Others
import nltk
import string
import torch

import pandas as pd
import numpy as np

In [None]:
class Metrics(Callback):
    def on_train_begin(self, logs={}):
        self.val_f1s = []
        self.val_recalls = []
        self.val_precisions = []

    def on_epoch_end(self, epoch, logs={}):
        val_predict = (np.asarray(self.model.predict(self.validation_data[0]))).round()
        val_targ = self.validation_data[1]
        _val_f1 = f1_score(val_targ, val_predict)
        _val_recall = recall_score(val_targ, val_predict)
        _val_precision = precision_score(val_targ, val_predict)
        self.val_f1s.append(_val_f1)
        self.val_recalls.append(_val_recall)
        self.val_precisions.append(_val_precision)
        print("— val_f1: %f — val_precision: %f — val_recall %f" %(_val_f1, _val_precision, _val_recall))
        return

metrics = Metrics()

In [169]:
def create_CNN():
    model = Sequential()
    model.add(Conv1D(filters=64, kernel_size=5, activation='relu', input_shape=(4096, 1)))
    model.add(MaxPooling1D(pool_size=4))
    # We add a vanilla hidden layer:
    model.add(Flatten())
    model.add(Dense(32))
    model.add(Dropout(0.2))
    model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))
    # Compile model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [79]:
V = 1
MODEL_PATH = 'InferSent/encoder/infersent%s.pkl' % V
params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                'pool_type': 'max', 'dpout_model': 0.0, 'version': V}
infersent = InferSent(params_model)
infersent.load_state_dict(torch.load(MODEL_PATH))
W2V_PATH = 'InferSent/dataset/GloVe/glove.840B.300d.txt'
infersent.set_w2v_path(W2V_PATH)

In [80]:
# Hyperparmeters
BATCH_SIZE = 256
NUM_EPOCHS = 3

In [81]:
# Load Data
df = pd.read_csv("inaug_addr_cleaned.csv", encoding="latin").dropna()
full_text = df["text"].to_string(index=False).replace("\n", " ")
sentences = df["text"].tolist()

In [196]:
print(df[df['Final'] == 1].count())
print(df[df['Final'] == 0].count())

doc index    217
text         217
P1           217
P2           217
Final        217
IsSame       217
dtype: int64
doc index    4630
text         4630
P1           4630
P2           4630
Final        4630
IsSame       4630
dtype: int64


In [83]:
# Enocde sentence
infersent.build_vocab(sentences, tokenize=True)
embeddings = infersent.encode(sentences, tokenize=True)

Found 9780(/9859) words with w2v vectors
Vocab size : 9780


In [165]:
# reshaped for CNN
embeddings_reshaped = np.expand_dims(embeddings, axis=2)
target = np.expand_dims(np.array(df["Final"]), axis=1)

In [164]:
# Split data
seed = 7
X_train, X_test, y_train, y_test = train_test_split(embeddings_reshaped, target, test_size=0.2, random_state=seed)

In [174]:
model = create_CNN()
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs = 3, callbacks=[metrics])

Train on 3877 samples, validate on 970 samples
Epoch 1/3


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


— val_f1: 0.000000 — val_precision: 0.000000 — val_recall 0.000000
Epoch 2/3
— val_f1: 0.000000 — val_precision: 0.000000 — val_recall 0.000000
Epoch 3/3
— val_f1: 0.000000 — val_precision: 0.000000 — val_recall 0.000000


<keras.callbacks.History at 0x1211e4be0>

## Appendix

In [None]:
# Word embedding
glove_dict = {}
with open('glove.twitter.27B.25d.txt', 'r') as f:
    for line in f.readlines():
        glove_dict[line.split()[0]] = np.array(line.split()[1:], dtype=np.float32)

from keras.preprocessing.text import text_to_word_sequence
words = set(text_to_word_sequence(full_text))
vocab_size = len(words)

glove_dict.get(words.pop())

In [114]:
# CNN + LSTM + Embedding
def create_conv_model():
    model_conv = Sequential()
    model_conv.add(Embedding(4096, 100, input_length=4096))
    model_conv.add(Dropout(0.2))
    model_conv.add(Conv1D(64, 5, activation='relu'))
    model_conv.add(MaxPooling1D(pool_size=4))
    model_conv.add(LSTM(100))
    model_conv.add(Dense(1, activation='sigmoid'))
    model_conv.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model_conv

model_conv = create_conv_model()
model_conv.fit(embeddings, df["Final"], validation_split=0.4, epochs = 3,)

Train on 2908 samples, validate on 1939 samples
Epoch 1/3
Epoch 2/3
 640/2908 [=====>........................] - ETA: 5:37 - loss: 0.1863 - acc: 0.9547

KeyboardInterrupt: 