In [6]:
# adapted from https://colab.research.google.com/drive/1tGdPsqG-jAmgwRItq1z7oXM5HfYNOuEs?usp=sharing
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import tensorflow as tf
from tensorflow.keras import regularizers, optimizers
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras.layers import Embedding, Dense, Dropout, InputLayer, LSTM, \
Flatten, GlobalMaxPool1D
from tensorflow.keras import optimizers
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.initializers import Constant

from nltk.corpus import stopwords
from string import punctuation

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix

In [7]:
from google.colab import drive
# Mount Google Drive at /content/gdrive so files stored there can be read.
drive.mount('/content/gdrive')

# Put the project folder on sys.path so the custom helper module `mylibrary`
# can be imported below. The mount above has to run first.
# NOTE(review): this path uses 'MyDrive' while the %cd cell uses 'My Drive'
# (with a space) -- confirm both resolve to the same folder.
import sys
sys.path.append('/content/gdrive/MyDrive/Github/capstone/')
import mylibrary as mylib

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [8]:
%cd /content/gdrive/My Drive/Github/capstone

/content/gdrive/My Drive/Github/capstone


In [9]:
def confusion_plot(y_true, y_pred, labels=None):
  """
  Draw a row-normalized confusion matrix as an annotated seaborn heatmap.

  Args:
    y_true: true class labels, passed unchanged to sklearn's confusion_matrix.
    y_pred: predicted class labels, passed unchanged to confusion_matrix.
    labels: optional sequence of class names used as tick labels on both
      axes. When None, seaborn picks the tick labels automatically.

  Returns:
    The matplotlib Axes the heatmap was drawn on.

  NOTE(review): callers holding one-hot / probability arrays presumably need
  to reduce them to class indices (e.g. argmax) first -- confirm at call site.
  """
  # normalize='true' divides each row by its total, so every cell is the
  # fraction of that true class assigned to the predicted class.
  conf = confusion_matrix(y_true, y_pred, normalize='true')

  # seaborn.heatmap does not accept None for tick labels; 'auto' is its
  # documented default, so map the "no labels given" case onto it.
  tick_labels = 'auto' if labels is None else labels

  ax = sns.heatmap(conf, annot=True, xticklabels=tick_labels,
                   yticklabels=tick_labels, cmap="Greens")
  # confusion_matrix puts true classes on rows and predictions on columns.
  ax.set(xlabel='Predicted label', ylabel='True label')
  return ax

In [10]:
def plot_history(history, title=None):
    """
    Plot a Keras training history and print the final value of every metric.

    Columns of ``history.history`` whose names end in 'loss' are drawn on the
    left subplot, those ending in 'accuracy' on the right one. After the
    figure is shown, the last-epoch value of each of those columns is printed.

    Args:
        history: object with a ``history`` attribute mapping metric names to
            per-epoch values (e.g. the return value of ``model.fit``).
        title: optional overall figure title. Ignored when None.

    Returns:
        None.
    """
    hist = pd.DataFrame(history.history)
    loss_cols = hist.columns[hist.columns.str.endswith('loss')]
    accuracy_cols = hist.columns[hist.columns.str.endswith('accuracy')]

    fig, (ax_loss, ax_acc) = plt.subplots(1, 2, figsize=(10, 5))
    # Plotting an empty column selection raises, so skip groups the model
    # did not record.
    if len(loss_cols):
        hist[loss_cols].plot(title='Loss', ax=ax_loss)
    if len(accuracy_cols):
        hist[accuracy_cols].plot(title='accuracy', ax=ax_acc)
    ax_loss.set_xlabel('epoch')
    ax_acc.set_xlabel('epoch')

    # Use a figure-level title: plt.title() would target the last axes and
    # overwrite the 'accuracy' subplot title.
    if title is not None:
        fig.suptitle(title)
    plt.show()

    for col in [*loss_cols, *accuracy_cols]:
        print(f'final {col}: {hist[col].iloc[-1]}')

In [11]:
# Load the pickled two-class features (X) and labels (y) from the data folder.
# NOTE: unpickling can execute arbitrary code; only load files you produced.
X, y = (
    pd.read_pickle('data/X_2class.pkl'),
    pd.read_pickle('data/y_2class.pkl'),
)

In [12]:
# Turn the raw labels into integer class ids, then into one-hot rows, which is
# the target format 'categorical_crossentropy' expects.
# NOTE(review): `y` is overwritten in place, so this cell cannot be re-run
# without first re-running the data-loading cell.
encoder = LabelEncoder()
y = to_categorical(encoder.fit_transform(y))

# 60% train / 40% remainder; the remainder is then halved into a validation
# split and a final holdout split (20% of the data each).
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=.4,
    random_state=123,
)
X_val, X_holdout, y_val, y_holdout = train_test_split(
    X_test, y_test,
    test_size=.5,
    random_state=123,
)

In [13]:
# Length of the longest training document, counted in whitespace-separated
# tokens (not characters). Punctuation stripping can only remove tokens, so
# this is an upper bound on the vectorizer's sequence length.
# It is only used as the `input_length` hint of the Embedding layer.
output_sequence_length = int(X_train.str.split().str.len().max())

# Text -> integer-id vectorizer. The ids are lookup indices for the Embedding
# layer. No `output_sequence_length` is set, so each batch is zero-padded to
# the length of its own longest document rather than to a fixed length.
# NOTE(review): an earlier, fixed-length configuration was built here and then
# immediately overwritten; it has been removed because it never took effect.
# Pass `output_sequence_length=output_sequence_length` to restore fixed-length
# padding/truncation if that was the intent.
vectorizer = TextVectorization(
    # We want lists of integers as our output.
    output_mode='int',
    # Default standardization; a custom callable could be passed instead.
    standardize='lower_and_strip_punctuation')

# Fit the vectorizer to the training data only.
# It needs a numpy array rather than a pandas Series.
vectorizer.adapt(X_train.to_numpy())

# Total vocabulary size, needed as the input dimension of the Embedding layer.
vocab_len = vectorizer.vocabulary_size()

In [14]:
def create_LSTM(n_classes=2, embedding_dim=300, lstm_units=50, dense_units=50,
                dropout_rate=0.3, learning_rate=.01):
  """
  Build and compile a string-in, class-probabilities-out LSTM classifier.

  The model reads raw strings, converts them to token ids with the
  module-level fitted `vectorizer`, embeds them with a trainable Embedding
  layer, runs one LSTM, max-pools over time, and classifies through two
  regularized dense layers.

  Relies on the module-level names `vectorizer`, `vocab_len` and
  `output_sequence_length` being defined before it is called.

  Args:
    n_classes: number of output nodes (one per class).
    embedding_dim: size of each embedding vector.
    lstm_units: number of LSTM units.
    dense_units: number of units in each of the two hidden dense layers.
    dropout_rate: dropout rate used in the LSTM and after each dense layer.
    learning_rate: Adam learning rate.

  Returns:
    A compiled `Sequential` model (categorical cross-entropy loss, accuracy
    metric). Targets are expected to be one-hot encoded.
  """
  model = Sequential()

  #Define the input layer for one feature (the whole string) and the dtype.
  #If Keras is not told to expect a string, it will assume it's looking
  #for a float and raise an error.
  model.add(InputLayer(input_shape=(1,), dtype=tf.string))

  #Fitted TextVectorization layer
  model.add(vectorizer)

  #Untrained (randomly initialized) Embedding layer, learned during training.
  model.add(Embedding(vocab_len, embedding_dim,
                      input_length=output_sequence_length))

  #The recurrent LSTM layer. return_sequences=True makes it emit its output
  #at every time step instead of only the last one.
  model.add(LSTM(lstm_units, return_sequences=True,
                 dropout=dropout_rate,
                 kernel_regularizer=regularizers.l1_l2(l1=1e-4, l2=1e-3)))
  #Keep the highest activation of each LSTM unit across all time steps
  #and pass those to the dense layers.
  model.add(GlobalMaxPool1D())

  #Two identical regularized hidden blocks, each with its own regularizer.
  for _ in range(2):
    model.add(Dense(dense_units, activation='relu',
                    kernel_regularizer=regularizers.l1_l2(l1=1e-4, l2=1e-3)))
    model.add(Dropout(dropout_rate))

  #Output layer: one node per class with a softmax activation.
  model.add(Dense(n_classes, activation='softmax'))

  optimizer = optimizers.Adam(learning_rate=learning_rate)
  model.compile(loss = 'categorical_crossentropy', optimizer = optimizer, metrics = ['accuracy'])

  return model

In [None]:
%%time
tf.keras.backend.clear_session()

self_train = create_LSTM()

self_train.summary()

self_trained_history = self_train.fit(X_train,
                    y_train,
                    validation_data = (X_val, y_val),
                    epochs = 3,                           ########
                    batch_size = 10)                      ########

self_trained_score = self_train.evaluate(X_val, y_val)

print(f'Accuracy on Test Set {self_trained_score[1]}, Loss: {self_trained_score[0]}')
plot_history(self_trained_history)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
text_vectorization_1 (TextVe (None, None)              0         
_________________________________________________________________
embedding (Embedding)        (None, None, 300)         33970800  
_________________________________________________________________
lstm (LSTM)                  (None, None, 50)          70200     
_________________________________________________________________
global_max_pooling1d (Global (None, 50)                0         
_________________________________________________________________
dense (Dense)                (None, 50)                2550      
_________________________________________________________________
dropout (Dropout)            (None, 50)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 50)                2