# Model building

In this notebook we build LSTM models using the processed text data from the previous notebook.

In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report
import pickle
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import CountVectorizer

%matplotlib inline
# Notebook-wide matplotlib styling: font sizes for legend, axis labels, titles
# and tick labels, plus the axes background colour.
fig_params = dict([
    ('legend.fontsize', 18),
    ('axes.labelsize', 18),
    ('axes.titlesize', 20),
    ('xtick.labelsize', 16),
    ('ytick.labelsize', 16),
    ('axes.facecolor', '#D9DDD1'),
])
plt.rcParams.update(fig_params)

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# Load the train / validation / test frames pickled by the preprocessing notebook.
df_train, df_val, df_test = map(
    pd.read_pickle,
    (
        './data/train_processed.pkl',
        './data/val_processed.pkl',
        './data/test_processed.pkl',
    ),
)

In [3]:
# Word-level tokenizer: the vocabulary is capped via num_words and words
# outside it are mapped to the '<UKN>' out-of-vocabulary token.
tokenizer = Tokenizer(
    oov_token='<UKN>',
    num_words=12000,
)

In [4]:
# Learn the vocabulary from the training texts only (validation and test
# texts are never shown to the tokenizer).
tokenizer.fit_on_texts(df_train['processed'])

# Persist the fitted tokenizer. The context manager guarantees the file is
# flushed and closed even if pickling raises, unlike a bare open() call.
with open('./tokenizer_model.model', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

In [5]:
# NOTE(review): disabled bag-of-words experiment -- nothing in this cell runs.
# Consider deleting the cell (CountVectorizer appears unused otherwise; confirm).
# cv = CountVectorizer()
# cv.fit(df_train['processed'])

# cv.transform(df_train['processed']).shape

In [6]:
def get_sequence(tokenizer, df, maxlen):
    """Convert texts into a fixed-width matrix of token ids.

    Parameters
    ----------
    tokenizer : fitted tokenizer exposing ``texts_to_sequences``.
    df : iterable of texts to encode (e.g. a DataFrame column).
    maxlen : int, number of token ids kept per text.

    Returns
    -------
    The result of ``pad_sequences``: one row per text, ``maxlen`` columns.
    Shorter texts are padded at the end and longer texts are cut at the end
    (both ``padding`` and ``truncating`` are ``'post'``).
    """
    token_ids = tokenizer.texts_to_sequences(df)
    return pad_sequences(token_ids, maxlen=maxlen, padding='post', truncating='post')

In [7]:
# Every text is padded / truncated to this many token ids.
maxlen = 550

# get_sequence returns the output of pad_sequences, which is already a NumPy
# array. np.asarray keeps the "is an ndarray" guarantee without the full extra
# copy of each (n_samples, maxlen) matrix that np.array(...) would make.
X_tr = np.asarray(get_sequence(tokenizer, df_train['processed'], maxlen))
X_cv = np.asarray(get_sequence(tokenizer, df_val['processed'], maxlen))
X_test = np.asarray(get_sequence(tokenizer, df_test['processed'], maxlen))

In [8]:
from sklearn.preprocessing import LabelEncoder

# Learn the label <-> integer id mapping from the training labels only.
# fit() returns the encoder itself, so it can be chained onto the constructor.
le = LabelEncoder().fit(df_train['stars'])
le

LabelEncoder()

In [9]:
# Encode the 'stars' column of each split with the encoder fitted on the training data.
y_tr, y_cv, y_test = (
    le.transform(split_frame['stars'])
    for split_frame in (df_train, df_val, df_test)
)

In [21]:
# Sanity check: feature matrix and label vector shapes for train, validation, test.
for split_X, split_y in ((X_tr, y_tr), (X_cv, y_cv), (X_test, y_test)):
    print(split_X.shape, split_y.shape)

(147140, 550) (147140,)
(36785, 550) (36785,)
(45982, 550) (45982,)


## Create a model

In [22]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, LSTM, Bidirectional, Dropout
from tensorflow.keras.callbacks import EarlyStopping

In [23]:
# Stop training once val_loss has not improved for `patience` epochs.
# The fit calls in this notebook run for at most 25 epochs, so the previous
# patience of 25 could never trigger; a small patience makes the callback
# effective. restore_best_weights rolls the model back to the epoch with the
# lowest val_loss instead of keeping the weights of the last (worse) epoch.
early_stop = EarlyStopping(monitor='val_loss',
                           mode='min',
                           verbose=1,
                           patience=3,
                           restore_best_weights=True)

In [24]:
# Baseline model: embedding -> single LSTM -> softmax over 5 output classes.
# The embedding table is sized from the tokenizer itself (instead of repeating
# the literal 12000) so the two can never drift apart; ids produced by the
# tokenizer are always below tokenizer.num_words.
# NOTE(review): inputs are zero-padded at the end (padding='post') and nothing
# masks the padding, so the LSTM also steps over pad ids -- consider
# mask_zero=True or pre-padding after verifying on the TensorFlow version in use.
model1 = Sequential([
    Embedding(input_dim=tokenizer.num_words, output_dim=16, input_length=maxlen),
    LSTM(100),
    Dense(5, activation='softmax'),
])

# Targets are integer class ids (LabelEncoder output), hence the sparse loss.
model1.compile(loss='sparse_categorical_crossentropy',
               optimizer='adam',
               metrics=['accuracy'])
model1.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 550, 16)           192000    
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               46800     
_________________________________________________________________
dense_2 (Dense)              (None, 5)                 505       
Total params: 239,305
Trainable params: 239,305
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Train the baseline LSTM, validating on the held-out validation split after every epoch.
model1.fit(
    X_tr,
    y_tr,
    validation_data=(X_cv, y_cv),
    epochs=15,
    batch_size=64,
    verbose=1,
    callbacks=[early_stop],
)


Train on 147140 samples, validate on 36785 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15

In [None]:
# Per-epoch metrics recorded by the last model1.fit call, one row per epoch.
history = pd.DataFrame(model1.history.history)

# Title and axis labels let the figure stand on its own and distinguish it
# from the loss curves of the other models in this notebook.
ax = history[['loss', 'val_loss']].plot(title='Model 1 (LSTM): training vs. validation loss')
ax.set(xlabel='epoch', ylabel='loss');

In [None]:
# Sequential.predict_classes was deprecated and then removed in TensorFlow 2.6.
# For a softmax output the predicted class is the argmax over the last axis,
# which gives the same result on every TensorFlow version.
y_tr_pred = np.argmax(model1.predict(X_tr), axis=-1)
y_cv_pred = np.argmax(model1.predict(X_cv), axis=-1)

# Per-class precision / recall / F1 on the training and validation splits.
print('=' * 20)
print('training data \n', classification_report(y_tr, y_tr_pred))
print('=' * 20)
print('validation data \n', classification_report(y_cv, y_cv_pred))
print('=' * 20)

## RNN with dropout layer

In [None]:
# Same architecture as the baseline, with dropout applied to the embedded
# sequence and to the LSTM output.
# The embedding table is sized from the tokenizer itself (instead of repeating
# the literal 12000) so the two can never drift apart; ids produced by the
# tokenizer are always below tokenizer.num_words.
model2 = Sequential([
    Embedding(input_dim=tokenizer.num_words, output_dim=16, input_length=maxlen),
    Dropout(0.4),
    LSTM(100),
    Dropout(0.4),
    Dense(5, activation='softmax'),
])

# Targets are integer class ids (LabelEncoder output), hence the sparse loss.
model2.compile(loss='sparse_categorical_crossentropy',
               optimizer='adam',
               metrics=['accuracy'])
model2.summary()

In [None]:
# Train the dropout-regularised LSTM, validating on the validation split after every epoch.
model2.fit(
    X_tr,
    y_tr,
    validation_data=(X_cv, y_cv),
    epochs=25,
    batch_size=64,
    verbose=1,
    callbacks=[early_stop],
)

In [None]:
# Per-epoch metrics recorded by the last model2.fit call, one row per epoch.
history = pd.DataFrame(model2.history.history)

# Title and axis labels let the figure stand on its own and distinguish it
# from the loss curves of the other models in this notebook.
ax = history[['loss', 'val_loss']].plot(title='Model 2 (LSTM + dropout): training vs. validation loss')
ax.set(xlabel='epoch', ylabel='loss');

In [None]:
# Sequential.predict_classes was deprecated and then removed in TensorFlow 2.6.
# For a softmax output the predicted class is the argmax over the last axis,
# which gives the same result on every TensorFlow version.
y_tr_pred = np.argmax(model2.predict(X_tr), axis=-1)
y_cv_pred = np.argmax(model2.predict(X_cv), axis=-1)

# Per-class precision / recall / F1 on the training and validation splits.
print('=' * 20)
print('training data \n', classification_report(y_tr, y_tr_pred))
print('=' * 20)
print('validation data \n', classification_report(y_cv, y_cv_pred))
print('=' * 20)

## RNN with bidirectional LSTM  

In [None]:
# Bidirectional variant: the LSTM is wrapped in Bidirectional, with dropout
# before and after it.
# The embedding table is sized from the tokenizer itself (instead of repeating
# the literal 12000) so the two can never drift apart; ids produced by the
# tokenizer are always below tokenizer.num_words.
model3 = Sequential([
    Embedding(input_dim=tokenizer.num_words, output_dim=16, input_length=maxlen),
    Dropout(0.2),
    Bidirectional(LSTM(100)),
    Dropout(0.4),
    Dense(5, activation='softmax'),
])

# Targets are integer class ids (LabelEncoder output), hence the sparse loss.
model3.compile(loss='sparse_categorical_crossentropy',
               optimizer='adam',
               metrics=['accuracy'])
model3.summary()

In [None]:
# Train the bidirectional LSTM, validating on the validation split after every epoch.
model3.fit(x=X_tr, y=y_tr,
           validation_data=(X_cv, y_cv),
           epochs=25, batch_size=64,
           verbose=1, callbacks=[early_stop])

In [None]:
# Per-epoch metrics recorded by the last model3.fit call, one row per epoch.
history = pd.DataFrame(model3.history.history)

# Title and axis labels let the figure stand on its own and distinguish it
# from the loss curves of the other models in this notebook.
ax = history[['loss', 'val_loss']].plot(title='Model 3 (bidirectional LSTM): training vs. validation loss')
ax.set(xlabel='epoch', ylabel='loss');

In [None]:
# Sequential.predict_classes was deprecated and then removed in TensorFlow 2.6.
# For a softmax output the predicted class is the argmax over the last axis,
# which gives the same result on every TensorFlow version.
y_tr_pred = np.argmax(model3.predict(X_tr), axis=-1)
y_cv_pred = np.argmax(model3.predict(X_cv), axis=-1)

# Per-class precision / recall / F1 on the training and validation splits.
print('=' * 20)
print('training data \n', classification_report(y_tr, y_tr_pred))
print('=' * 20)
print('validation data \n', classification_report(y_cv, y_cv_pred))
print('=' * 20)

## Test data

In [None]:
# Predicted class ids on the held-out test split for each of the three models.
# Sequential.predict_classes was deprecated and then removed in TensorFlow 2.6;
# argmax over the softmax output is the version-independent equivalent.
y_test_pred1 = np.argmax(model1.predict(X_test), axis=-1)
y_test_pred2 = np.argmax(model2.predict(X_test), axis=-1)
y_test_pred3 = np.argmax(model3.predict(X_test), axis=-1)

In [None]:
# One classification report per model (model1, model2, model3 in that order),
# each preceded by a separator line, with a closing separator at the end.
for test_predictions in (y_test_pred1, y_test_pred2, y_test_pred3):
    print('=' * 20)
    print(classification_report(y_test, test_predictions))
print('=' * 20)
print('=' * 20)