In [1]:
# The basics
import numpy as np
import pandas as pd

# Sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

# Keras
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import one_hot, Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense, Softmax, SimpleRNN

# utils
import os

In [2]:
isear = pd.read_csv('../data/raw/isear.csv', sep='|', error_bad_lines=False, usecols=['Field1', 'SIT', 'EMOT'])

In [3]:
number_of_classes = len(isear.EMOT.unique())

In [4]:
maxlen = 1000
max_words = 10000

In [5]:
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(isear['SIT'])
sequences = tokenizer.texts_to_sequences(isear['SIT'])

In [6]:
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 9063 unique tokens.


In [7]:
data = pad_sequences(sequences, maxlen=maxlen, padding='post')

In [8]:
x_train, x_test, y_train, y_test = train_test_split(data, isear['EMOT'])

In [9]:
glove_dir = '../data/external'
embeddings_index = {}

f = open(os.path.join(glove_dir, 'glove.6B.50d.txt'))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [10]:
embeddings_index['king'] - embeddings_index['man'] + embeddings_index['woman'] - embeddings_index['queen']

array([ 0.03882596, -0.91902995,  0.25977004,  0.04227898,  0.13896999,
        0.20638007,  0.02683   , -0.03402001, -0.09974198,  0.08821005,
        0.007544  , -0.36111003,  0.320001  , -0.6568099 ,  0.63689005,
        0.33902   , -0.7714    , -0.59767103,  0.711054  ,  0.44857004,
       -0.08957994, -0.01644999, -0.08847399, -0.17911002,  0.43223003,
       -0.28219986, -0.09410012, -0.9570599 ,  0.01775998,  0.32842597,
        0.11489999, -0.36365002, -0.52906007,  0.11805284, -0.10499001,
       -0.219787  , -0.33020002, -0.137476  ,  0.10705006,  0.92508006,
       -0.223916  ,  0.71121997, -0.29156998,  0.24670503, -0.05374995,
        0.2274299 , -0.09986001,  0.33875012, -0.69724536,  0.10499996],
      dtype=float32)

In [11]:
embedding_dim = 50 # if chaning this, update the file name above 

embedding_matrix = np.zeros((max_words, embedding_dim))

for word, i in word_index.items():
    if i < max_words:
        embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

## Model creation time

In [12]:
model = Sequential()
model.add(Embedding(max_words, embedding_dim, input_length=maxlen))
model.add(SimpleRNN(50, return_sequences=True))
model.add(Flatten())
model.add(Dense(number_of_classes + 1,  activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 1000, 50)          500000    
_________________________________________________________________
simple_rnn (SimpleRNN)       (None, 1000, 50)          5050      
_________________________________________________________________
flatten (Flatten)            (None, 50000)             0         
_________________________________________________________________
dense (Dense)                (None, 8)                 400008    
Total params: 905,058
Trainable params: 905,058
Non-trainable params: 0
_________________________________________________________________


In [13]:
model.fit(x_train, to_categorical(y_train),
          epochs=5,
          batch_size=32,
          validation_data=(x_test, to_categorical(y_test)))

Train on 5749 samples, validate on 1917 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7fcdd9eaf910>

In [14]:
y_pred = model.predict_classes(x_test)
y_pred

array([6, 2, 4, ..., 7, 6, 7])

In [15]:
confusion_matrix(y_test, y_pred)

array([[114,  27,  15,  51,  17,  24,  14],
       [ 16, 111,  17,  34,  29,  36,  22],
       [ 18,  26,  47,  64,  37,  59,  30],
       [ 24,  20,  31, 143,  17,  38,  23],
       [ 19,  34,  22,  38,  73,  50,  20],
       [ 35,  37,  22,  58,  22,  56,  46],
       [ 24,  22,  31,  55,  24,  59,  66]])