In [110]:
# Keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation, GlobalMaxPooling1D
from keras.layers.embeddings import Embedding
from keras.callbacks import Callback

# sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score

# Setence encoder from https://github.com/facebookresearch/InferSent
from InferSent.models import InferSent

# nltk
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

import tensorflow as tf

# Others

import string
import torch
nltk.download('punkt')
nltk.download('stopwords')

import pandas as pd
import numpy as np
import string

from utilities import train_test_split_common

[nltk_data] Downloading package punkt to /Users/chingyi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/chingyi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [69]:
def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    filtered_text = []
    for line in text:
        filtered_line = []
        for word in line:
            if word not in stop_words:
                filtered_line.append(word)
        filtered_text.append(filtered_line)

    return filtered_text

In [52]:
class Metrics(Callback):
    def on_train_begin(self, logs={}):
        self.val_f1s = []
        self.val_recalls = []
        self.val_precisions = []

    def on_epoch_end(self, epoch, logs={}):
        val_predict = (np.asarray(self.model.predict(self.validation_data[0]))).round()
        val_targ = self.validation_data[1]
        _val_f1 = f1_score(val_targ, val_predict)
        _val_recall = recall_score(val_targ, val_predict)
        _val_precision = precision_score(val_targ, val_predict)
        self.val_f1s.append(_val_f1)
        self.val_recalls.append(_val_recall)
        self.val_precisions.append(_val_precision)
        print("— val_f1: %f — val_precision: %f — val_recall %f" %(_val_f1, _val_precision, _val_recall))
        return

metrics = Metrics()

In [53]:
def as_keras_metric(method):
    import functools
    from keras import backend as K
    @functools.wraps(method)
    def wrapper(self, args, **kwargs):
        """ Wrapper for turning tensorflow metrics into keras metrics """
        value, update_op = method(self, args, **kwargs)
        K.get_session().run(tf.local_variables_initializer())
        with tf.control_dependencies([update_op]):
            value = tf.identity(value)
        return value
    return wrapper


In [45]:
def create_CNN_2():
    model = Sequential()
    model.add(Conv1D(filters=64, kernel_size=5, activation='relu', input_shape=(4096, 1)))
    model.add(MaxPooling1D(pool_size=4))
    # We add a vanilla hidden layer:
    model.add(Flatten())
    model.add(Dense(64))
    model.add(Dropout(0.2))
    model.add(Dense(2, kernel_initializer='normal', activation='softmax'))
    # Compile model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics = ['accuracy', precision, recall])
    print(model.summary())
    return model

In [48]:
def create_CNN():
    model = Sequential()
    model.add(Conv1D(filters=64, kernel_size=5, activation='relu', input_shape=(4096, 1)))
    model.add(MaxPooling1D(pool_size=4))
    # We add a vanilla hidden layer:
    model.add(Flatten())
    model.add(Dense(64))
    model.add(Dropout(0.2))
    model.add(Dense(1, activation='sigmoid'))
    # Compile model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics = ['accuracy', precision, recall])
    print(model.summary())
    return model

In [89]:
def tokenize_sentence(text):
    result = []
    for line in text:
        result.append(nltk.word_tokenize(line))
    return result

In [104]:
def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    filtered_text = []
    for line in text:
        filtered_line = []
        for word in line:
            if word not in stop_words:
                filtered_line.append(word)
        filtered_text.append(filtered_line)

    return filtered_text

## Features

In [220]:
def extract_features(text):
    features = {}

    # adds bag of word representation to features
#     features.update(bag_of_words(text))

    # your code here
    features.update(extract_bigrams(text))
    features.update(extract_trigrams(text))

    return features

In [221]:
def featurize(data):
    featurized_data = []
    for label, text in data:
        feats = extract_features(text)
        featurized_data.append((label, feats))
    return featurized_data

In [157]:
def get_grams(text):
    bigrams = set()
    trigrams = set()
    [bigrams.update(list(nltk.bigrams(line))) for line in text]
    [trigrams.update(list(nltk.trigrams(line))) for line in text]
    return bigrams, trigrams

In [166]:
def get_word_features(wordlist):
    wordlist = nltk.FreqDist(wordlist)
    word_features = wordlist.keys()
    return word_features

In [229]:
def extract_bigrams(sentence):
    feature_dict = {}
    for pair in nltk.bigrams(sentence):
        if pair in feature_dict:
            feature_dict[pair] += 1
        else:
            feature_dict[pair] = 1
    return feature_dict

In [230]:
def extract_trigrams(sentence):
    feature_dict = {}
    for pair in nltk.trigrams(sentence):
        if pair in feature_dict:
            feature_dict[pair] += 1
        else:
            feature_dict[pair] = 1
    return feature_dict

In [231]:
featurized_data = featurize(dataset)

In [234]:
classifier = nltk.NaiveBayesClassifier.train(featurized_data)

TypeError: unhashable type: 'dict'

In [9]:
V = 1
MODEL_PATH = 'InferSent/encoder/infersent%s.pkl' % V
params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                'pool_type': 'max', 'dpout_model': 0.0, 'version': V}
infersent = InferSent(params_model)
infersent.load_state_dict(torch.load(MODEL_PATH))
W2V_PATH = 'InferSent/dataset/GloVe/glove.840B.300d.txt'
infersent.set_w2v_path(W2V_PATH)

In [10]:
# Hyperparmeters
BATCH_SIZE = 32
NUM_EPOCHS = 3

In [223]:
# Load Data
df = pd.read_csv("inaug_addr_cleaned.csv", encoding="latin").dropna()
full_text = df["text"].to_string(index=False).replace("\n", " ").replace("\r", " ").lower()
sentences = df["text"].tolist()
tokenized_sentences = tokenize_sentence(df["text"].tolist())
cleaned_tokenized_sentences = remove_stopwords(tokenized_sentences)
dataset = list(zip(df["Final"], cleaned_tokenized_sentences))

In [158]:
# Features
bigrams, trigrams = get_grams(cleaned_tokenized_sentences)

In [7]:
print(df[df['Final'] == 1].count())
print(df[df['Final'] == 0].count())

doc index    217
text         217
P1           217
P2           217
Final        217
IsSame       217
dtype: int64
doc index    4630
text         4630
P1           4630
P2           4630
Final        4630
IsSame       4630
dtype: int64


In [12]:
# Enocde sentence
infersent.build_vocab(sentences, tokenize=True)
embeddings = infersent.encode(sentences, tokenize=True)

Found 9780(/9859) words with w2v vectors
Vocab size : 9780


In [49]:
# reshaped for CNN
embeddings_reshaped = np.expand_dims(embeddings, axis=2)
target = np.expand_dims(np.array(df["Final"]), axis=1)
# target = pd.get_dummies(df['Final']).values

In [42]:
print(embeddings.shape)
print(target.shape)

(4847, 4096)
(4847, 2)


In [50]:
# Split data: test_size=0.2
X_train, X_test, y_train, y_test = train_test_split_common(embeddings_reshaped, target)

In [22]:
X_train.shape

(3877, 4096, 1)

In [54]:
precision = as_keras_metric(tf.metrics.precision)
recall = as_keras_metric(tf.metrics.recall)
model = create_CNN()
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs = 3, batch_size = BATCH_SIZE)
score, acc, precision, recall = model.evaluate(X_test, y_test, verbose = 2, batch_size = BATCH_SIZE)
print("Score: %.4f" % (score))
print("Validation Accuracy: %.4f" % (acc))
print("Validation Precision: %.4f" % (precision))
print("Validation Recall: %.4f" % (recall))
print("Validation F1 score: %.4f" % (2*precision*recall/(precision+recall)))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_11 (Conv1D)           (None, 4092, 64)          384       
_________________________________________________________________
max_pooling1d_11 (MaxPooling (None, 1023, 64)          0         
_________________________________________________________________
flatten_11 (Flatten)         (None, 65472)             0         
_________________________________________________________________
dense_21 (Dense)             (None, 64)                4190272   
_________________________________________________________________
dropout_11 (Dropout)         (None, 64)                0         
_________________________________________________________________
dense_22 (Dense)             (None, 1)                 65        
Total params: 4,190,721
Trainable params: 4,190,721
Non-trainable params: 0
_________________________________________________________________


<keras.callbacks.History at 0x13011ecf8>

In [47]:
precision = as_keras_metric(tf.metrics.precision)
recall = as_keras_metric(tf.metrics.recall)
model = create_CNN_2()
model.fit(X_train, y_train, batch_size = BATCH_SIZE, epochs = 1, verbose = 2)
score, acc, precision, recall = model.evaluate(X_test, y_test, verbose = 2, batch_size = BATCH_SIZE)
print("Score: %.4f" % (score))
print("Validation Accuracy: %.4f" % (acc))
print("Validation Precision: %.4f" % (precision))
print("Validation Recall: %.4f" % (recall))
print("Validation F1 score: %.4f" % (2*precision*recall/(precision+recall)))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_9 (Conv1D)            (None, 4092, 64)          384       
_________________________________________________________________
max_pooling1d_9 (MaxPooling1 (None, 1023, 64)          0         
_________________________________________________________________
flatten_9 (Flatten)          (None, 65472)             0         
_________________________________________________________________
dense_17 (Dense)             (None, 64)                4190272   
_________________________________________________________________
dropout_9 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_18 (Dense)             (None, 2)                 130       
Total params: 4,190,786
Trainable params: 4,190,786
Non-trainable params: 0
_________________________________________________________________


In [27]:
model.predict(X_test)

array([[0.9816565 , 0.01733556],
       [0.9731406 , 0.02289199],
       [0.95631623, 0.04263417],
       ...,
       [0.96244633, 0.03607179],
       [0.979934  , 0.01616571],
       [0.97211504, 0.02514414]], dtype=float32)

In [32]:
for i in y_test:
    print(i)

[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[0 1]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[0 1]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[0 1]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[0 1]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[0 1]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[0 1]
[1 0]
[1 0]
[0 1]
[0 1]
[1 0]
[1 0]
[1 0]
[1 0]
[0 1]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0]
[1 0

## Appendix

In [None]:
# Word embedding
glove_dict = {}
with open('glove.twitter.27B.25d.txt', 'r') as f:
    for line in f.readlines():
        glove_dict[line.split()[0]] = np.array(line.split()[1:], dtype=np.float32)

from keras.preprocessing.text import text_to_word_sequence
words = set(text_to_word_sequence(full_text))
vocab_size = len(words)

glove_dict.get(words.pop())

In [None]:
# CNN + LSTM + Embedding
def create_conv_model():
    model_conv = Sequential()
    model_conv.add(Embedding(4096, 100, input_length=4096))
    model_conv.add(Dropout(0.2))
    model_conv.add(Conv1D(64, 5, activation='relu'))
    model_conv.add(MaxPooling1D(pool_size=4))
    model_conv.add(LSTM(100))
    model_conv.add(Dense(1, activation='sigmoid'))
    model_conv.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model_conv

model_conv = create_conv_model()
model_conv.fit(embeddings, df["Final"], validation_split=0.4, epochs = 3,)