In [16]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from keras.callbacks import EarlyStopping
from keras.layers import Dropout
import re
import tensorflow as tf
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [111]:
df = pd.read_json('../dataset/processed_data.json', lines=True)
df = df[['text', 'Complaint']]
df.head(2)

Unnamed: 0,text,Complaint
0,film special foil adapter ring fz82 60x12...,0
1,centre awesome collection plushies lizardele...,0


In [112]:
# The maximum number of words to be used. (most frequent)
MAX_NB_WORDS = 50000
# Max number of words in each complaint.
MAX_SEQUENCE_LENGTH = 250

tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~')
tokenizer.fit_on_texts(df['text'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 6604 unique tokens.


In [115]:
X = df['text']
Y = df['Complaint']

raw_X_train, raw_X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.30, random_state=42)

def convert_text(texts):
    X = tokenizer.texts_to_sequences(texts.values)
    X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
    return X
X_train = convert_text(raw_X_train)
X_test = convert_text(raw_X_test)

print('Shape of label tensor:', Y.shape)
print('Shape of data tensor:', X.shape)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

Shape of label tensor: (2421,)
Shape of data tensor: (2421,)
(1694, 250) (1694,)
(727, 250) (727,)


In [116]:

import tensorflow_hub as hub

Precision = tf.keras.metrics.Precision()
Recall = tf.keras.metrics.Recall()
# This is fixed.
EMBEDDING_DIM = 20

# embedding = "https://tfhub.dev/google/nnlm-en-dim50/2"
# embedding = 'https://tfhub.dev/google/Wiki-words-500-with-normalization/2'
# hub_layer = hub.KerasLayer(embedding, input_shape=[], dtype=tf.string, trainable=False)
embed_dim = 100


model = Sequential()
# model.add(hub_layer)
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=250))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy',
              optimizer='adam', 
              metrics=[Recall, Precision])
# print(model.summary())

epochs = 5
batch_size = 64

history = model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size,validation_split=0.1,callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [147]:
import tensorflow_addons as tfa
from sklearn.metrics import precision_score, recall_score

results = model.evaluate(X_test, Y_test, verbose=2)


# TEST on HOLD-OUT set.
for name, value in zip(model.metrics_names, results):
  print("%s: %.3f" % (name, value))


def fbeta_score(Y_test, y_test_pred, beta):
    metric = tfa.metrics.FBetaScore(num_classes=1, beta=beta, threshold=0.5)
    Actuals = np.reshape(Y_test.values, (Y_test.shape[0], 1))
    metric.update_state(Actuals, y_test_pred)
    result = metric.result()
    return result.numpy()[0]

output = model.predict(X_test)
y_test_pred = np.round(output)
prec = precision_score(Y_test, y_test_pred)
recall = recall_score(Y_test, y_test_pred)
f125 = fbeta_score(Y_test, y_test_pred, beta=1.25)
    
print(f'Precision: {prec}\tRecall: {recall}\tFbeta: {f125}')
# print(f'F2:{f2}\tF1.25: {f125}')


23/23 - 1s - loss: 0.1612 - recall_16: 0.8173 - precision_16: 0.7870 - 723ms/epoch - 31ms/step
loss: 0.161
recall_16: 0.817
precision_16: 0.787
Precision: 0.7870370370370371	Recall: 0.8173076923076923	Fbeta: 0.8052217960357666


In [85]:
model.save('../experiments/lstm_local_embedding')

INFO:tensorflow:Assets written to: ../experiments/lstm_local_embedding/assets


INFO:tensorflow:Assets written to: ../experiments/lstm_local_embedding/assets


In [106]:
df = pd.DataFrame()
df['test_text'] = raw_X_test
df['Actuals'] = Y_test
df['Predicted'] = np.reshape(y_test_pred, (y_test_pred.shape[0], 1))
df

Unnamed: 0,test_text,Actuals,Predicted
479,lockdown could excuse non communication custo...,0,0.0
1010,make battery deal wrong people,0,0.0
2197,bad service unsatisfied customer default pr...,1,1.0
2274,entry no keycap 5 team challenge complete...,0,0.0
610,last light dmcg81 lumix g vario 1235f28 1640 s...,0,0.0
...,...,...,...
1127,know bad make look sell merchandise ...,1,0.0
1838,close operation india last 1 week chase servi...,1,1.0
270,memories,0,0.0
1347,s1r s1 land thud compete poorly medium format,0,0.0
