In [1]:
# The basics
import numpy as np
import pandas as pd

# Sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

# Keras
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import one_hot, Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense, Softmax, LSTM

# utils
import os
import subprocess

In [2]:
parent_relative_path = ".."

In [3]:
colab = 'google.colab' in str(get_ipython())

In [4]:
if colab:
    print("Colab babe")
    from google.colab import drive
    drive.mount('/content/gdrive', force_remount=True)

    subprocess.run(["cp", "/content/gdrive/My Drive/Colab Notebooks/.kaggle/kaggle.json", "/root/.kaggle/kaggle.json"])
    subprocess.run(["git", "clone", "https://github.com/codefupanda/customer_interaction_summary.git"])
    os.system('cd customer_interaction_summary && make requirements && make data > logs.logs')
    parent_relative_path = "./customer_interaction_summary"

In [5]:
isear = pd.read_csv(parent_relative_path + '/data/raw/isear.csv', sep='|', error_bad_lines=False, usecols=['Field1', 'SIT', 'EMOT'])

In [6]:
number_of_classes = len(isear.EMOT.unique())

In [7]:
maxlen = 1000
max_words = 10000

In [8]:
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(isear['SIT'])
sequences = tokenizer.texts_to_sequences(isear['SIT'])

In [9]:
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 9063 unique tokens.


In [10]:
data = pad_sequences(sequences, maxlen=maxlen, padding='post')

In [11]:
x_train, x_test, y_train, y_test = train_test_split(data, isear['EMOT'])

## Model creation time

In [12]:
model = Sequential()
model.add(Embedding(max_words, output_dim=50, input_length=maxlen))
#model.add(Flatten())
model.add(LSTM(128))
model.add(Dense(number_of_classes + 1,  activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 1000, 50)          500000    
_________________________________________________________________
lstm (LSTM)                  (None, 128)               91648     
_________________________________________________________________
dense (Dense)                (None, 8)                 1032      
Total params: 592,680
Trainable params: 592,680
Non-trainable params: 0
_________________________________________________________________


In [13]:
model.fit(x_train, to_categorical(y_train),
          epochs=5,
          batch_size=32,
          validation_data=(x_test, to_categorical(y_test)))

Train on 5749 samples, validate on 1917 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7fd2ca589cd0>

In [14]:
y_pred = model.predict_classes(x_test)
y_pred

array([4, 4, 4, ..., 4, 4, 4])

In [15]:
confusion_matrix(y_test, y_pred)

array([[  0,   0,   0, 267,   0,   0,   0],
       [  0,   0,   0, 275,   0,   0,   0],
       [  0,   0,   0, 265,   0,   0,   0],
       [  0,   0,   0, 280,   0,   0,   0],
       [  0,   0,   0, 298,   0,   0,   0],
       [  0,   0,   0, 269,   0,   0,   0],
       [  0,   0,   0, 263,   0,   0,   0]])