In [1]:
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras import layers,Sequential
from tensorflow.keras.layers import Dense, Flatten, Conv2D, MaxPool2D,Dropout
from tensorflow.keras.callbacks import EarlyStopping
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import accuracy_score
import pathlib
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import OneHotEncoder

In [2]:
class LearningRateReducerCb(tf.keras.callbacks.Callback):
  """Keras callback that shrinks the optimizer's learning rate after every epoch.

  Args:
    factor: Multiplier applied to the current learning rate at the end of
      each epoch. Defaults to 0.99 (a 1% reduction per epoch).
  """

  def __init__(self, factor=0.99):
    super().__init__()
    self.factor = factor

  def on_epoch_end(self, epoch, logs=None):
    """Scale the learning rate by `self.factor` and print the change.

    Args:
      epoch: Index of the epoch that just finished, as passed in by Keras.
      logs: Metrics dict passed in by Keras; not read here. Defaults to
        None instead of a shared mutable `{}`.
    """
    old_lr = self.model.optimizer.lr.read_value()
    new_lr = old_lr * self.factor
    print("\nEpoch: {}. Reducing Learning Rate from {} to {}".format(epoch, old_lr, new_lr))
    # Write the reduced value back into the optimizer's learning-rate variable.
    self.model.optimizer.lr.assign(new_lr)
    
def create_checkpoint_callback(model_name):
  """Build a ModelCheckpoint callback that writes under a folder named after the model.

  Args:
    model_name: Used as the directory part of the checkpoint path.

  Returns:
    A `tf.keras.callbacks.ModelCheckpoint` saving weights only, once per
    epoch, and only when the monitored quantity improves. No `monitor` is
    passed, so Keras' default monitor applies.
  """
  checkpoint_path = f"{model_name}/checkpoint.ckpt"
  return tf.keras.callbacks.ModelCheckpoint(
      filepath=checkpoint_path,
      save_best_only=True,
      save_weights_only=True,
      save_freq="epoch",
  )

# Stop training once validation loss has gone 10 epochs without improving.
# NOTE(review): this callback is not passed to either `model.fit` call visible in this notebook.
es_cb = EarlyStopping(
    monitor='val_loss',
    patience=10,
)

In [12]:
# Read the dataset from the CSV next to the notebook and display it.
data = pd.read_csv('nlp.csv')
data

Unnamed: 0.1,Unnamed: 0,Text,Class
0,0,I have outdated information on my credit repor...,C
1,2,An account on my credit report has a mistaken ...,C
2,3,This company refuses to provide me verificatio...,A
3,4,This complaint is in regards to Square Two Fin...,A
4,5,Started the refinance of home mortgage process...,B
...,...,...,...
128951,179770,Barclay closed my Barclay XXXX MasterCard acco...,D
128952,179771,Our son was taken to XXXX XXXX XXXX XXXX XXXX ...,A
128953,179773,I had an account with XXXX in XX/XX/XXXX this ...,A
128954,179774,I was contacted on XX/XX/XXXX email by XXXX fr...,B


In [13]:
# --- Tokenizer / embedding hyper-parameters ---
vocab_size = 1000        # passed as `num_words` to the Tokenizer and as the Embedding input size
embedding_dim = 16       # width of each embedding vector
max_length = 120         # every sequence is padded/truncated to this many tokens

# --- Padding behaviour for pad_sequences ---
trunc_type = 'post'      # drop tokens from the end of over-long sequences
padding_type = 'post'    # pad short sequences at the end

# Token used for words that are not in the tokenizer's vocabulary.
oov_tok = "<OOV>"

In [14]:
# One-hot encode the "Class" column (reshaped to a single column for the encoder).
one_hot_encoder = OneHotEncoder(sparse=False)
label = one_hot_encoder.fit_transform(
    data["Class"].to_numpy().reshape(-1, 1)
)

# Category labels the encoder learned for its one input column, as strings.
class_names = np.asarray(one_hot_encoder.categories_[0], dtype="str")
class_names

array(['A', 'B', 'C', 'D'], dtype='<U1')

In [15]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
train_sentences, val_sentences, train_labels, val_labels = train_test_split(
    data["Text"].to_numpy(),
    label,
    test_size=0.2,
    random_state=42,
)

# Sanity check: sizes of the four pieces.
tuple(len(part) for part in (train_sentences, val_sentences, train_labels, val_labels))

(103164, 25792, 103164, 25792)

In [16]:
# Build the word index from the training split only, so validation text stays unseen.
tokenizer = Tokenizer(
    num_words=vocab_size,
    oov_token=oov_tok,
)
tokenizer.fit_on_texts(train_sentences)

In [17]:
# Training split: text -> integer sequences -> fixed-length arrays.
training_sequences = tokenizer.texts_to_sequences(train_sentences)
training_padded = pad_sequences(
    training_sequences,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)

# Validation split: same tokenizer and same padding settings.
testing_sequences = tokenizer.texts_to_sequences(val_sentences)
testing_padded = pad_sequences(
    testing_sequences,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)

In [18]:
# Small 1-D convolutional classifier over the padded token ids.
model = Sequential([
    layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    layers.Conv1D(128, 5, activation='relu'),
    layers.GlobalMaxPooling1D(),
    Dense(24, activation='relu'),
    # One softmax unit per class found by the one-hot encoder.
    Dense(len(class_names), activation='softmax'),
])

# Labels are one-hot vectors, hence categorical cross-entropy.
model.compile(
    loss=tf.keras.losses.CategoricalCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(),
    metrics=["accuracy"],
)

# Train for 10 epochs, checkpointing weights under a folder named after the model.
model_history = model.fit(
    training_padded,
    train_labels,
    validation_data=(testing_padded, val_labels),
    epochs=10,
    callbacks=[create_checkpoint_callback(model.name)],
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [22]:
# Predict on the validation set and collapse each row of outputs to its highest-scoring class index.
model_0_preds = tf.argmax(model.predict(testing_padded), axis=1)
model_0_preds.numpy()

array([1, 3, 0, ..., 0, 0, 0], dtype=int64)

In [34]:
# Shape of the full label column once reshaped into a single column.
data["Class"].to_numpy().reshape((-1, 1)).shape

(128956, 1)

In [35]:
# Shape of the predicted class indices as a single column, for comparison with the label column.
model_0_preds.numpy().reshape((-1, 1)).shape

(25792, 1)

In [52]:
# Re-run inference and keep the raw model outputs this time. This rebinds
# `model_0_preds`, which an earlier cell set to argmax class indices.
model_0_preds = model.predict(testing_padded)

In [54]:
# Map the model outputs back to the original class labels using the fitted encoder.
one_hot_encoder.inverse_transform(model_0_preds)

array([['B'],
       ['D'],
       ['A'],
       ...,
       ['A'],
       ['A'],
       ['A']], dtype=object)

array([[0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [1., 0., 0., 0.],
       ...,
       [1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [1., 0., 0., 0.]])

In [11]:
# NOTE(review): the `data` frame displayed earlier in this notebook has columns
# "Text" and "Class", not "Title"/"Conference" — presumably a different dataset was
# loaded in a cell that is not shown here. Confirm; otherwise these lookups fail on
# a fresh Restart & Run All.
sentences = data["Title"]
# Re-fits the shared `one_hot_encoder` on the "Conference" column, replacing whatever it learned before.
label = one_hot_encoder.fit_transform(data["Conference"].to_numpy().reshape(-1,1))

In [12]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
train_sentences, val_sentences, train_labels, val_labels = train_test_split(
    sentences.to_numpy(),
    label,
    test_size=0.2,
    random_state=42,
)

# Sanity check: sizes of the four pieces.
tuple(len(part) for part in (train_sentences, val_sentences, train_labels, val_labels))

(2005, 502, 2005, 502)

In [13]:
# Mean number of whitespace-separated words per training sentence, rounded to an integer.
round(
    sum(len(sentence.split()) for sentence in train_sentences)
    / len(train_sentences)
)

9

In [16]:
# Distinct space-separated tokens across all training sentences
# (no lower-casing or punctuation stripping is applied here).
vocab = set(" ".join(train_sentences).split(" "))
vocab_len = len(vocab)
vocab_len

6631

In [26]:
# Text-vectorization settings.
max_vocab_length = 1000  # cap on the number of tokens kept in the vocabulary
max_length = 9           # every sentence is padded/truncated to this many tokens

text_vectorizer = TextVectorization(
    max_tokens=max_vocab_length,
    output_mode="int",                          # emit integer token ids
    output_sequence_length=max_length,
    standardize="lower_and_strip_punctuation",  # lower-case and drop punctuation first
    split="whitespace",                         # tokenize on whitespace
    ngrams=None,                                # single tokens only, no n-grams
)

# Learn the vocabulary from the training sentences only.
text_vectorizer.adapt(train_sentences)

In [27]:
# Trainable embedding: one 64-dimensional vector per vocabulary id.
embedding = layers.Embedding(
    name="embedding_1",
    input_dim=max_vocab_length,        # number of distinct token ids
    output_dim=64,                     # size of each embedding vector
    input_length=max_length,           # tokens per input sequence
    embeddings_initializer="uniform",  # random uniform initialization
)

In [34]:
# Bidirectional-LSTM classifier that takes raw strings and vectorizes them inside the model.
inputs = layers.Input(shape=(1,), dtype=tf.string, name="input_layer")
x = text_vectorizer(inputs)
x = embedding(x)
# First recurrent layer returns the full sequence so the second one can consume it.
x = layers.Bidirectional(layers.LSTM(64, return_sequences=True))(x)
x = layers.Bidirectional(layers.LSTM(32))(x)
x = layers.Dense(64, activation="relu")(x)

# Size the softmax from the one-hot labels actually being trained on. `class_names`
# was computed from a different column ("Class") earlier in the notebook and is not
# refreshed when the encoder is re-fit on "Conference", so it can be stale here.
num_classes = train_labels.shape[1]
outputs = layers.Dense(num_classes, activation="softmax")(x)
model = tf.keras.Model(inputs, outputs, name="model_LSTM")

# Compile the model; labels are one-hot vectors, hence categorical cross-entropy.
model.compile(
    loss=tf.keras.losses.CategoricalCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(),
    metrics=["accuracy"]
)

# Fit the model, checkpointing weights under a folder named after the model.
model_history = model.fit(train_sentences, train_labels,
                          validation_data=(val_sentences, val_labels),
                          epochs=10,
                          callbacks=[create_checkpoint_callback(model.name)])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
