# Part 0: Preprocessing

In [1]:
# Import modules
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
np.random.seed(42)

import matplotlib.pyplot as plt
import seaborn as sns
import os, json, random, pickle
random.seed(42)
# ignore deprecation warnings in sklearn

import warnings
warnings.filterwarnings("ignore")

# Specify data directory: a "Data" folder next to the current working directory
data_dir = os.path.join(os.path.dirname(os.getcwd()), 'Data')

# Set model directory: a "Model" folder next to the current working directory
model_dir = os.path.join(os.path.dirname(os.getcwd()), 'Model')

# Set embedding directory
# Raw string: the backslashes are literal Windows path separators. In a normal
# string literal "\J" and "\E" are invalid escape sequences, which Python
# reports as deprecated.
embedding_dir = r'Z:\Jupyter\Embeddings'

# Set data paths: the raw training CSV plus the two JSON files under Data/interim
train_path = os.path.join(data_dir, 'train.csv')
train_processed_path, meta_feat_path = (
    os.path.join(data_dir, 'interim', file_name)
    for file_name in ('train_preprocessed.txt', 'meta_feat.txt')
)

# Load the raw frame, then the preprocessed text and the meta features
train = pd.read_csv(train_path)
train_processed, meta_feat = (
    pd.read_json(json_path)
    for json_path in (train_processed_path, meta_feat_path)
)

In [2]:
# Imports
import tensorflow as tf
import tensorflow.keras.backend as K
tf.set_random_seed(42)

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split, StratifiedKFold

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, model_from_json
from keras.layers import Dense, Embedding, LSTM, Bidirectional, SpatialDropout1D
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils.np_utils import to_categorical
from keras.callbacks import Callback

from keras_tqdm import TQDMNotebookCallback
tf.logging.set_verbosity(tf.logging.ERROR)

Using TensorFlow backend.


In [3]:
# Balanced class weights, used to counter class imbalance when fitting the models
from sklearn.utils.class_weight import compute_class_weight, compute_sample_weight

# compute_class_weight is the sklearn helper whose arguments are
# (class_weight, classes, y) and which returns one weight per class;
# compute_sample_weight takes (class_weight, y, indices) instead.
# Keras expects `class_weight` as a {class index: weight} dict, so the
# per-class weights are turned into that mapping (plain Python int/float).
_sentiment_classes = np.unique(train_processed.sentiment)
class_weights = dict(zip(
    _sentiment_classes.tolist(),
    compute_class_weight(class_weight='balanced',
                         classes=_sentiment_classes,
                         y=train_processed.sentiment).tolist(),
))

def get_label(row):
    """
    Convert a one-hot encoded row into its plain class label.

    Returns the first label among 0, 1 and 2 whose entry in `row` equals 1,
    or None when none of those three entries equals 1.
    """
    return next((label for label in (0, 1, 2) if row[label] == 1), None)

# Part 1: Machine Learning with Recurrent Neural Networks

# <font color='Blue'>Bi-Directional LSTM </font>

In [4]:
# Hyper-parameters for the tokenizer and the network

vocab_size, input_length = 5000, 120   # tokenizer vocabulary cap, padded sequence length
embed_dim, lstm_out = 100, 100         # embedding width, LSTM units
batch_size, num_epochs = 32, 5         # training schedule

In [5]:
# Tokenization and build model input

# Fit the tokenizer on the preprocessed text, capped at `vocab_size` words
tokenizer = Tokenizer(split=' ', num_words=vocab_size)
tokenizer.fit_on_texts(train_processed['text'].values)

# Encode every document as integer ids and bring it to `input_length` entries
X = pad_sequences(tokenizer.texts_to_sequences(train_processed['text'].values),
                  maxlen = input_length)

# One-hot encode the sentiment labels
y = to_categorical(train_processed['sentiment'].values)

# Hold out a test set with a fixed seed, then report the resulting shapes
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, random_state = 42)
print(Xtrain.shape, ytrain.shape)
print(Xtest.shape, ytest.shape)

(3959, 120) (3959, 3)
(1320, 120) (1320, 3)


%%time
# Build Neural Network architecture
model = Sequential()
model.add(Embedding(vocab_size, 
                    embed_dim, 
                    input_length = X.shape[1], 
                    dropout=0.2))

model.add(Bidirectional(LSTM(lstm_out, 
                       dropout_U=0.2,
                       dropout_W=0.2)))
model.add(Dense(3,
                activation='softmax'))
model.compile(loss = 'categorical_crossentropy', 
              optimizer='adam',
              metrics=['accuracy'])

# Fit model
model.fit(Xtrain, ytrain, 
          batch_size=batch_size,
          epochs=num_epochs,
          class_weight = class_weights,
          verbose = 0,
          callbacks = [TQDMNotebookCallback()])

# Save model
model_json = model.to_json()
with open(os.path.join(model_dir, "LSTM_120inputlen_32bsize_5epoch.json"), 'w') as json_file:
    json_file.write(model_json)
model.save_weights("LSTM_120inputlen_32bsize_5epoch.h5")

In [6]:
# Load model: rebuild the architecture from its JSON description, then restore the weights.
# `with` closes the file even if reading fails.
with open(os.path.join(model_dir, "LSTM_120inputlen_32bsize_5epoch.json"), 'r') as json_file:
    loaded_model_json = json_file.read()
model = model_from_json(loaded_model_json)
# Relative path: the weights are read from the current working directory
model.load_weights("LSTM_120inputlen_32bsize_5epoch.h5")
print("loaded model from disk")

loaded model from disk


In [7]:
# Get model results: one row of class probabilities per test sample
ypred = model.predict(Xtest)

# Turn probability into one-hot predictions.
# Done as a vectorised comparison against each row's maximum instead of
# assigning into the rows yielded by iterrows(): pandas does not guarantee
# that such writes reach the underlying frame.
ypred_df = pd.DataFrame(ypred)
ypred_max = ypred_df.max(axis = 1)
ypred_df = ypred_df.eq(ypred_max, axis = 0).astype(ypred.dtype)

# Get confusion matrix (true labels first, predicted labels second)
ypred_label = ypred_df.apply(get_label, axis = 1)
ytest_label = pd.DataFrame(ytest).apply(get_label, axis = 1)

cm = confusion_matrix(ytest_label, ypred_label)
print(cm, "\n\n")

# Macro average: every class contributes equally regardless of its frequency
f1 = f1_score(ytest_label, ypred_label, average = 'macro')

print("Model achieve %.3f F1 Macro Score" % f1)

# Save score
f1_lstm = f1

[[ 27  18  88]
 [ 22  50 154]
 [ 86  73 802]] 


Model achieve 0.425 F1 Macro Score


# <font color='Blue'>LSTM with Pre-trained Embeddings</font>

### Prepare text data

In [8]:
# Hyper-parameters for the GloVe-based model
vocab_size, input_length = 5000, 120   # tokenizer vocabulary cap, padded sequence length
embed_dim, lstm_out = 100, 100         # embedding width, LSTM units
batch_size, num_epochs = 32, 5         # training schedule

In [9]:
# Fit the tokenizer on the preprocessed text.
# `num_words` is the Keras 2 keyword (the Keras 1 spelling was `nb_words`).
tokenizer = Tokenizer(num_words = vocab_size)
tokenizer.fit_on_texts(train_processed['text'].values)

# word -> integer index mapping learned by the tokenizer
word_index = tokenizer.word_index
print("Found %s unique tokens." % len(word_index))

# Encode every document as integer ids and bring it to `input_length` entries
X = tokenizer.texts_to_sequences(train_processed['text'].values)
X = pad_sequences(X, maxlen = input_length)

# One-hot encode the sentiment labels
y = to_categorical(train_processed['sentiment'].values)

# Hold out a test set with a fixed seed, then report the resulting shapes
Xtrain, Xtest, ytrain, ytest = train_test_split(X,y, random_state = 42)
print(Xtrain.shape,ytrain.shape)
print(Xtest.shape,ytest.shape)

Found 37408 unique tokens.
(3959, 120) (3959, 3)
(1320, 120) (1320, 3)


# split data into training and validation set (80% / 20% of Xtrain, shuffled)
indices = np.arange(Xtrain.shape[0])
np.random.shuffle(indices)
nb_validation_samples = int(0.2 * Xtrain.shape[0])

# Position in the shuffled index where the validation part starts. Using an
# explicit position (instead of negative slices) also behaves correctly when
# nb_validation_samples is 0.
split_at = Xtrain.shape[0] - nb_validation_samples

# Both subsets are sliced from the same shuffled index, so they are disjoint.
# Slicing the validation rows from the already-truncated training arrays would
# make them a subset of the training rows.
Xtrain1 = Xtrain[indices[:split_at]]
ytrain1 = ytrain[indices[:split_at]]
Xval = Xtrain[indices[split_at:]]
yval = ytrain[indices[split_at:]]

### Prepare embeddings

# Read GloVe embeddings into a {word: float32 vector} lookup
embeddings_index = {}
# `with` closes the file even if a line fails to parse
with open(os.path.join(embedding_dir, 'glove.6B.100d.txt'), encoding="utf8") as f:
    for line in f:
        values = line.split()
        if not values:
            # skip blank lines instead of failing on values[0]
            continue
        # first field is the word, the remaining fields are its coefficients
        word = values[0]
        coefs = np.asarray(values[1:], dtype = 'float32')
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))

# Build embedding matrix: row i holds the GloVe vector of the word whose
# tokenizer index is i (one extra row because index 0 is not in word_index)
embedding_matrix = np.zeros(shape=(1 + len(word_index), embed_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word, None)
    if embedding_vector is None:
        # word has no GloVe vector: its row stays all zeros
        continue
    embedding_matrix[i] = embedding_vector

### Build LSTM and run model

# Build Neural Network architecture:
# frozen pre-trained embedding -> bidirectional LSTM -> softmax over the 3 classes
model = Sequential()
model.add(Embedding(len(word_index) + 1,
                    embed_dim,
                    weights = [embedding_matrix],
                    input_length = X.shape[1],
                    trainable = False))

# `dropout` / `recurrent_dropout` are the Keras 2 names of `dropout_W` / `dropout_U`
model.add(Bidirectional(LSTM(lstm_out,
                             dropout=0.2,
                             recurrent_dropout=0.2)))
model.add(Dense(3,
                activation='softmax'))
model.compile(loss = 'categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# Fit model
# batch_size is passed explicitly, as for the first LSTM model, so the
# configured hyper-parameter is actually used.
# NOTE(review): training uses the full Xtrain/ytrain; the Xtrain1/Xval split
# prepared earlier in the notebook is not passed to fit().
model.fit(Xtrain, ytrain,
          batch_size=batch_size,
          epochs=num_epochs,
          class_weight = class_weights,
          verbose = 0,
          callbacks = [TQDMNotebookCallback()])

# Save model: architecture as JSON under model_dir, weights as HDF5.
# NOTE(review): the weights path is relative, so the .h5 file lands in the
# current working directory rather than model_dir; the loading cell reads it
# from the same relative path, so the two must be changed together.
model_json = model.to_json()
with open(os.path.join(model_dir, "LSTM_pretrained_GloVe.json"), 'w') as json_file:
    json_file.write(model_json)
model.save_weights("LSTM_pretrained_GloVe.h5")

In [10]:
# Load model: rebuild the architecture from its JSON description, then restore the weights.
# `with` closes the file even if reading fails.
with open(os.path.join(model_dir, "LSTM_pretrained_GloVe.json"), 'r') as json_file:
    loaded_model_json = json_file.read()
model = model_from_json(loaded_model_json)
# Relative path: the weights are read from the current working directory
model.load_weights("LSTM_pretrained_GloVe.h5")
print("loaded model from disk")

loaded model from disk


In [11]:
# Get model results: one row of class probabilities per test sample
ypred = model.predict(Xtest)

# Turn probability into one-hot predictions.
# Done as a vectorised comparison against each row's maximum instead of
# assigning into the rows yielded by iterrows(): pandas does not guarantee
# that such writes reach the underlying frame.
ypred_df = pd.DataFrame(ypred)
ypred_max = ypred_df.max(axis = 1)
ypred_df = ypred_df.eq(ypred_max, axis = 0).astype(ypred.dtype)

# Get confusion matrix (true labels first, predicted labels second)
ypred_label = ypred_df.apply(get_label, axis = 1)
ytest_label = pd.DataFrame(ytest).apply(get_label, axis = 1)

# Class distribution of the test labels.
# NOTE: the dict is not the last expression of the cell, so it is not displayed.
unique, counts = np.unique(ytest_label, return_counts = True)
dict(zip(unique, counts))

cm = confusion_matrix(ytest_label, ypred_label)
print(cm, "\n\n")

# Macro average: every class contributes equally regardless of its frequency
f1 = f1_score(ytest_label, ypred_label, average = 'macro')

print("Model achieve %.3f F1 Macro Score" % f1)

# Save score
f1_lstm_pretrained = f1

[[  1   1 131]
 [  0  11 215]
 [  2  12 947]] 


Model achieve 0.314 F1 Macro Score


# <font color='Blue'>LSTM with Word2Vec Embeddings</font>

### Prepare text data

In [12]:
# Hyper-parameters for the Word2Vec-based model (200-dimensional embeddings)
vocab_size, input_length = 5000, 120   # tokenizer vocabulary cap, padded sequence length
embed_dim, lstm_out = 200, 100         # embedding width, LSTM units
batch_size, num_epochs = 32, 5         # training schedule

In [13]:
# Fit the tokenizer on the preprocessed text.
# `num_words` is the Keras 2 keyword (the Keras 1 spelling was `nb_words`).
tokenizer = Tokenizer(num_words = vocab_size)
tokenizer.fit_on_texts(train_processed['text'].values)

# word -> integer index mapping learned by the tokenizer
word_index = tokenizer.word_index
print("Found %s unique tokens." % len(word_index))

# Encode every document as integer ids and bring it to `input_length` entries
X = tokenizer.texts_to_sequences(train_processed['text'].values)
X = pad_sequences(X, maxlen = input_length)

# One-hot encode the sentiment labels
y = to_categorical(train_processed['sentiment'].values)

# Hold out a test set with a fixed seed, then report the resulting shapes
Xtrain, Xtest, ytrain, ytest = train_test_split(X,y, random_state = 42)
print(Xtrain.shape,ytrain.shape)
print(Xtest.shape,ytest.shape)

Found 37408 unique tokens.
(3959, 120) (3959, 3)
(1320, 120) (1320, 3)


# split data into training and validation set (80% / 20% of Xtrain, shuffled)
indices = np.arange(Xtrain.shape[0])
np.random.shuffle(indices)
nb_validation_samples = int(0.2 * Xtrain.shape[0])

# Position in the shuffled index where the validation part starts. Using an
# explicit position (instead of negative slices) also behaves correctly when
# nb_validation_samples is 0.
split_at = Xtrain.shape[0] - nb_validation_samples

# Both subsets are sliced from the same shuffled index, so they are disjoint.
# Slicing the validation rows from the already-truncated training arrays would
# make them a subset of the training rows.
Xtrain1 = Xtrain[indices[:split_at]]
ytrain1 = ytrain[indices[:split_at]]
Xval = Xtrain[indices[split_at:]]
yval = ytrain[indices[split_at:]]

### Prepare embeddings

from gensim.models import Word2Vec

# Load the saved word2vec model from the embedding directory and show its summary
w2v_model = Word2Vec.load(os.path.join(embedding_dir, 'w2v_best.bin'))
print(w2v_model)

# Keep a handle on the word vectors and report how many the model contains
word_vectors = w2v_model.wv
print("Number of word vectors: %d" % (len(word_vectors.vocab),))

# Create embedding matrix: row i holds the Word2Vec vector of the word whose
# tokenizer index is i; words missing from the Word2Vec vocabulary keep the
# all-zero row they were initialised with.
embedding_matrix = np.zeros((len(word_index) + 1, embed_dim))
for word, i in word_index.items():
    # Membership is tested directly on the vocab mapping; building
    # list(vocab.keys()) on every iteration made this loop quadratic.
    if word in word_vectors.vocab:
        embedding_matrix[i] = word_vectors[word]

### Build LSTM and run model

# Build Neural Network architecture:
# frozen Word2Vec embedding -> bidirectional LSTM -> softmax over the 3 classes
model = Sequential()
model.add(Embedding(len(word_index) + 1,
                    embed_dim,
                    weights = [embedding_matrix],
                    input_length = X.shape[1],
                    trainable = False))

# `dropout` / `recurrent_dropout` are the Keras 2 names of `dropout_W` / `dropout_U`
model.add(Bidirectional(LSTM(lstm_out,
                             dropout=0.2,
                             recurrent_dropout=0.2)))
model.add(Dense(3,
                activation='softmax'))
model.compile(loss = 'categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# Fit model
# batch_size is passed explicitly, as for the first LSTM model, so the
# configured hyper-parameter is actually used.
# NOTE(review): training uses the full Xtrain/ytrain; the Xtrain1/Xval split
# prepared earlier in the notebook is not passed to fit().
model.fit(Xtrain, ytrain,
          batch_size=batch_size,
          epochs=num_epochs,
          class_weight = class_weights,
          verbose = 0,
          callbacks = [TQDMNotebookCallback()])

# Save model: architecture as JSON under model_dir, weights as HDF5.
# NOTE(review): the weights path is relative, so the .h5 file lands in the
# current working directory rather than model_dir; the loading cell reads it
# from the same relative path, so the two must be changed together.
model_json = model.to_json()
with open(os.path.join(model_dir, "LSTM_w2v.json"), 'w') as json_file:
    json_file.write(model_json)
model.save_weights("LSTM_w2v.h5")

In [14]:
# Load model: rebuild the architecture from its JSON description, then restore the weights.
# `with` closes the file even if reading fails.
with open(os.path.join(model_dir, "LSTM_w2v.json"), 'r') as json_file:
    loaded_model_json = json_file.read()
model = model_from_json(loaded_model_json)
# Relative path: the weights are read from the current working directory
model.load_weights("LSTM_w2v.h5")
print("loaded model from disk")

loaded model from disk


In [15]:
# Get model results: one row of class probabilities per test sample
ypred = model.predict(Xtest)

# Turn probability into one-hot predictions.
# Done as a vectorised comparison against each row's maximum instead of
# assigning into the rows yielded by iterrows(): pandas does not guarantee
# that such writes reach the underlying frame.
ypred_df = pd.DataFrame(ypred)
ypred_max = ypred_df.max(axis = 1)
ypred_df = ypred_df.eq(ypred_max, axis = 0).astype(ypred.dtype)

# Get confusion matrix (true labels first, predicted labels second)
ypred_label = ypred_df.apply(get_label, axis = 1)
ytest_label = pd.DataFrame(ytest).apply(get_label, axis = 1)

# Class distribution of the test labels.
# NOTE: the dict is not the last expression of the cell, so it is not displayed.
unique, counts = np.unique(ytest_label, return_counts = True)
dict(zip(unique, counts))

cm = confusion_matrix(ytest_label, ypred_label)
print(cm, "\n\n")

# Macro average: every class contributes equally regardless of its frequency
f1 = f1_score(ytest_label, ypred_label, average = 'macro')

print("Model achieve %.3f F1 Macro Score" % f1)

# Save score
f1_lstm_w2v = f1

[[ 11   7 115]
 [  3  39 184]
 [ 29  42 890]] 


Model achieve 0.400 F1 Macro Score


# <font color='Blue'>LSTM results</font>

In [17]:
# Collect the macro-F1 of each LSTM variant and rank them best-first
results = {
    "model": [
        "Vanilla LSTM",
        "LSTM With Pre-trained Glove Embeddings",
        "LSTM with Custom-trained Word2Vec Embeddings",
    ],
    "F1-macro-score": [
        f1_lstm,
        f1_lstm_pretrained,
        f1_lstm_w2v,
    ],
}
pd.DataFrame(results).sort_values(by = "F1-macro-score", ascending = False)

Unnamed: 0,model,F1-macro-score
0,Vanilla LSTM,0.424657
2,LSTM with Custom-trained Word2Vec Embeddings,0.400438
1,LSTM With Pre-trained Glove Embeddings,0.31433
