In [1]:
# The basics
import numpy as np
import pandas as pd

# Sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

# Keras
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import one_hot, Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense, Softmax

# utils
import os

In [2]:
isear = pd.read_csv('../data/raw/isear.csv', sep='|', error_bad_lines=False, usecols=['Field1', 'SIT', 'EMOT'])

In [3]:
number_of_classes = len(isear.EMOT.unique())

In [4]:
maxlen = 1000
max_words = 10000

In [5]:
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(isear['SIT'])
sequences = tokenizer.texts_to_sequences(isear['SIT'])

In [6]:
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 9063 unique tokens.


In [7]:
data = pad_sequences(sequences, maxlen=maxlen, padding='post')

In [8]:
x_train, x_test, y_train, y_test = train_test_split(data, isear['EMOT'])

## Model creation time

In [10]:
model = Sequential()
model.add(Embedding(max_words, output_dim=50, input_length=maxlen))
model.add(Flatten())
model.add(Dense(32, activation='relu'))
model.add(Dense(number_of_classes + 1,  activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 1000, 50)          500000    
_________________________________________________________________
flatten (Flatten)            (None, 50000)             0         
_________________________________________________________________
dense (Dense)                (None, 32)                1600032   
_________________________________________________________________
dense_1 (Dense)              (None, 8)                 264       
Total params: 2,100,296
Trainable params: 2,100,296
Non-trainable params: 0
_________________________________________________________________


In [11]:
model.fit(x_train, to_categorical(y_train),
          epochs=20,
          batch_size=32,
          validation_data=(x_test, to_categorical(y_test)))

Train on 5749 samples, validate on 1917 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7f8ea1485f90>

In [12]:
y_pred = model.predict_classes(x_test)
y_pred

array([7, 2, 6, ..., 3, 6, 5])

In [13]:
confusion_matrix(y_test, y_pred)

array([[146,  11,  15,  61,  12,  21,  18],
       [ 14, 123,  18,   9,  24,  70,  12],
       [  9,   8,  85,  26,  21,  72,  54],
       [ 23,   5,  25, 153,  11,  26,  31],
       [  4,  17,  28,   8, 114,  58,  39],
       [ 12,  13,  27,  15,  30, 132,  55],
       [  6,   6,  27,  22,  20,  58, 123]])