## Deep Learning Models with F1 score

In [93]:
import pandas as pd
import numpy as np
import pickle
import string
import re
import numpy as np
import os
from collections import Counter
from tqdm import tqdm_notebook as tqdm
import warnings
warnings.filterwarnings('ignore')

#sklearn
from sklearn.model_selection import train_test_split

#keras
from keras.preprocessing import sequence
from keras.utils import to_categorical
from keras.layers import Embedding, LSTM, Dense, Conv1D, MaxPooling1D, Dropout, Activation
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import plot_model
from keras import models
from keras import layers
from keras import regularizers
from keras.callbacks import Callback

# data preprocessing
import data_preprocessing

# nlp packages
from gensim.models import Word2Vec

# machine learning packages
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
from sklearn.metrics import classification_report

# plot
import matplotlib.pyplot as plt
plt.style.use('ggplot')

###  load data 

In [63]:
# Read the dataset from CSV and split it into train/test speeches with their labels.
# NOTE(review): the split ratio and any random seed are decided inside
# data_preprocessing.split — confirm they are fixed so the scores below are reproducible.
data = data_preprocessing.read_data('./data/data.csv')
train_corpus, test_corpus, train_labels, test_labels = data_preprocessing.split(data)

HBox(children=(IntProgress(value=0, max=47428), HTML(value='')))




HBox(children=(IntProgress(value=0, max=47428), HTML(value='')))




### Text preprocessing for deep learning models

**Keep speeches whose length (in whitespace-separated tokens) lies between a minimum and a maximum**

In [64]:
# remove speeches whose length is shorter than 5 or longer than 200 for training data
def get_fixed_length_range_data(corpus,labels,min_len,max_len):
    '''
    Filter speeches by their number of whitespace-separated tokens.

    A speech is kept when its token count is strictly greater than min_len
    and less than or equal to max_len; its label is kept alongside it.

    input: corpus (indexable collection of speech strings),
           labels (indexable collection aligned with corpus),
           min_len (exclusive lower bound), max_len (inclusive upper bound)
    output: target corpus, target labels (two lists, original order preserved)
    '''
    kept_corpus, kept_labels = [], []
    for idx in range(len(corpus)):
        speech = corpus[idx]
        n_tokens = len(speech.split())
        # Guard clause: skip every speech outside the (min_len, max_len] window.
        if not (min_len < n_tokens <= max_len):
            continue
        kept_corpus.append(speech)
        kept_labels.append(labels[idx])
    return kept_corpus, kept_labels

In [65]:
# Apply the same length window (more than 5 and at most 200 tokens) to both splits.
new_train_corpus, new_train_labels = get_fixed_length_range_data(
    corpus=train_corpus, labels=train_labels, min_len=5, max_len=200)
new_test_corpus, new_test_labels = get_fixed_length_range_data(
    corpus=test_corpus, labels=test_labels, min_len=5, max_len=200)

In [66]:
# Rewrite every training label equal to -1 as 0, in place, so the labels can be
# used as targets for the sigmoid / binary cross-entropy models below.
for i, label in enumerate(new_train_labels):
    if label == -1:
        new_train_labels[i] = 0

In [67]:
# Rewrite every test label equal to -1 as 0, in place, mirroring the encoding
# applied to the training labels.
for j, label in enumerate(new_test_labels):
    if label == -1:
        new_test_labels[j] = 0

**Keep only the tokens whose occurrence count lies between a minimum and a maximum**

In [68]:
def get_common_tokens(min_occurence,max_occurence,corpus=None):
    '''
    Return the tokens whose frequency in the corpus lies strictly between
    min_occurence and max_occurence.

    input: min_occurence (exclusive lower bound on a token's count),
           max_occurence (exclusive upper bound on a token's count),
           corpus (optional iterable of speech strings; when omitted, the
                   notebook-level new_train_corpus + new_test_corpus is used)
    output: list of the tokens that satisfy both bounds
    '''
    if corpus is None:
        # Backward-compatible default: count over the filtered train + test speeches.
        corpus = new_train_corpus + new_test_corpus
    vocab = Counter()
    for speech in corpus:
        vocab.update(speech.split())
    # keep tokens whose count is inside the open interval (min_occurence, max_occurence)
    return [k for k,c in vocab.items() if min_occurence < c < max_occurence]

In [69]:
def clean_corpus(corpus,min_occurence,max_occurence):
    '''
    Remove from every speech the tokens that are not in the common-token vocabulary.

    input: corpus (iterable of speech strings),
           min_occurence, max_occurence (frequency bounds forwarded to get_common_tokens)
    output: new target corpus, one space-joined string per input speech
            (a speech becomes an empty string when none of its tokens are kept)
    '''
    # Convert once to a set: membership tests become O(1) instead of scanning
    # the whole token list for every token of every speech.
    common_tokens = set(get_common_tokens(min_occurence,max_occurence))
    new_corpus = []
    for speech in tqdm(corpus):
        kept_tokens = [w for w in speech.split() if w in common_tokens]
        new_corpus.append(' '.join(kept_tokens))
    return new_corpus

In [70]:
# Strip rare and very frequent tokens from the training speeches
# (a token is kept when its count is above 1000 and below 80000).
new_train_corpus2 = clean_corpus(new_train_corpus, min_occurence=1000, max_occurence=80000)

HBox(children=(IntProgress(value=0, max=18673), HTML(value='')))




In [71]:
# Strip rare and very frequent tokens from the test speeches, using the same
# frequency bounds as for the training speeches.
new_test_corpus2 = clean_corpus(new_test_corpus, min_occurence=1000, max_occurence=80000)

HBox(children=(IntProgress(value=0, max=7896), HTML(value='')))




###  Models

**pretrained GloVe word embedding + LSTM**

In [72]:
# Tokenization: fit one Keras Tokenizer on the cleaned train + test speeches so
# both splits share a single word -> integer index.
# NOTE(review): the vocabulary is therefore built from test text as well as train text.
corpus = new_train_corpus2 + new_test_corpus2
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
# Disabled experiment, kept for reference; `num_words` is not defined in the cells shown here.
#tokenizer.word_index = {e:i for e,i in tokenizer.word_index.items() if i > num_words}

In [73]:
# Encode each split as integer sequences, then pad/truncate every sequence to 200 entries.
train_sequences = tokenizer.texts_to_sequences(new_train_corpus2)
train_padded = pad_sequences(train_sequences, maxlen=200)

test_sequences = tokenizer.texts_to_sequences(new_test_corpus2)
test_padded = pad_sequences(test_sequences, maxlen=200)

In [74]:
# vocab_size: number of distinct words known to the tokenizer, plus one because
# Keras Tokenizer word indices start at 1 and index 0 is reserved (it is the
# default padding value used by pad_sequences).
vocab_size = len(tokenizer.word_index) + 1

In [75]:
# load the whole embedding into memory as a dict: word -> float32 vector
embeddings_index = dict()
# The context manager closes the file even if a line fails to parse, and the
# explicit UTF-8 encoding avoids platform-dependent decoding of the GloVe text file.
with open('./GloVe/glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        # Each line is "<word> <v1> <v2> ... <vN>" separated by whitespace.
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 400000 word vectors.


In [76]:
# Build the (vocab_size, 100) weight matrix: row i holds the GloVe vector of the
# word whose tokenizer index is i; words missing from GloVe keep an all-zero row.
embedding_matrix = np.zeros(shape=(vocab_size, 100))
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        continue
    embedding_matrix[i] = embedding_vector

In [77]:
# LSTM binary classifier on top of frozen, pretrained GloVe embeddings.
glove_model = models.Sequential([
    Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=200, trainable=False),
    Dropout(0.2),
    LSTM(100, dropout=0.5, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])
# Load the pre-trained vectors into the Embedding layer (layer 0) and freeze it so
# training does not change them. These two statements repeat what the constructor
# arguments above already request; they are kept so the cell behaves exactly as before.
glove_model.layers[0].set_weights([embedding_matrix])
glove_model.layers[0].trainable = False
glove_model.summary()
#plot_model(glove_model,to_file='glove_lstm.png',show_shapes=True)
glove_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 200, 100)          41000     
_________________________________________________________________
dropout_6 (Dropout)          (None, 200, 100)          0         
_________________________________________________________________
lstm_8 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_8 (Dense)              (None, 1)                 101       
Total params: 121,501
Trainable params: 80,501
Non-trainable params: 41,000
_________________________________________________________________


In [79]:
# Train silently for 10 epochs; Keras holds out the last 30% of the samples for validation.
glove_model.fit(x=train_padded,
                y=np.array(new_train_labels),
                validation_split=0.3,
                epochs=10,
                verbose=False)

<keras.callbacks.History at 0x1a66d63ac8>

In [94]:
# get F1 score: threshold the sigmoid outputs at 0.5 (strictly greater -> class 1)
# and print per-class precision / recall / F1 on the test split
predictions = glove_model.predict(test_padded)
new_predictions = [1 if score > 0.5 else 0 for score in predictions.reshape(1,-1)[0]]
print(classification_report(np.array(new_test_labels),new_predictions))

              precision    recall  f1-score   support

           0       0.92      1.00      0.96      5765
           1       0.99      0.75      0.86      2131

   micro avg       0.93      0.93      0.93      7896
   macro avg       0.95      0.88      0.91      7896
weighted avg       0.94      0.93      0.93      7896



**pretrained GloVe word embedding + CNN + LSTM**

In [95]:
# define the model
def define_cnn_lstm_model(vocab_size):
    '''
    Build and compile the CNN + LSTM binary classifier.

    The Embedding layer is loaded from the notebook-level embedding_matrix and
    frozen before the model is compiled. The model summary is printed as a
    side effect.

    input: vocab_size, number of rows of the Embedding layer
    output: the compiled keras Sequential model
    '''
    model = Sequential([
        Embedding(vocab_size, 100, input_length=200),
        Dropout(0.2),
        Conv1D(32, 3, activation='relu'),
        MaxPooling1D(pool_size=2),
        LSTM(100),
        Dense(1, activation='sigmoid'),
    ])
    # Layer 0 is the Embedding: load the pretrained vectors and keep them fixed.
    embedding_layer = model.layers[0]
    embedding_layer.set_weights([embedding_matrix])
    embedding_layer.trainable = False
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    model.summary()
    #plot_model(model_conv, to_file='cnn_lstm.png', show_shapes=True)
    return model

In [96]:
# Instantiate the CNN + LSTM model (the call also prints its summary).
glove_model_CNN_LSTM = define_cnn_lstm_model(vocab_size=vocab_size)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, 200, 100)          41000     
_________________________________________________________________
dropout_7 (Dropout)          (None, 200, 100)          0         
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 198, 32)           9632      
_________________________________________________________________
max_pooling1d_5 (MaxPooling1 (None, 99, 32)            0         
_________________________________________________________________
lstm_9 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dense_9 (Dense)              (None, 1)                 101       
Total params: 103,933
Trainable params: 62,933
Non-trainable params: 41,000
_________________________________________________________________


In [98]:
# fit model: train silently for 10 epochs; Keras holds out the last 30% of the
# samples for validation
glove_model_CNN_LSTM.fit(x=train_padded,
                         y=np.array(new_train_labels),
                         validation_split=0.3,
                         epochs=10,
                         verbose=False)

# save the model
#glove_model_CNN_LSTM.save('glove_model_CNN_LSTM.h5')

<keras.callbacks.History at 0x1a5d47e0b8>

In [99]:
# get F1 score: threshold the sigmoid outputs at 0.5 (strictly greater -> class 1)
# and print per-class precision / recall / F1 on the test split
predictions = glove_model_CNN_LSTM.predict(test_padded)
new_predictions = [1 if score > 0.5 else 0 for score in predictions.reshape(1,-1)[0]]
print(classification_report(np.array(new_test_labels),new_predictions))

              precision    recall  f1-score   support

           0       0.92      0.99      0.95      5765
           1       0.96      0.78      0.86      2131

   micro avg       0.93      0.93      0.93      7896
   macro avg       0.94      0.88      0.91      7896
weighted avg       0.93      0.93      0.93      7896

