In [56]:
import pandas as pd
import numpy as np
from keras import Sequential
from keras.layers import LSTM, Dense, CuDNNLSTM, Dropout
from keras.utils import np_utils
from sklearn.model_selection import train_test_split

## Data preparation

In [35]:

data = pd.read_csv('induce-data-2019-08-08.csv').iloc[:, :]
vocab = ['C_E_F_T',
         'C_E_F_C',
         'C_E_F_O',
         'A_E_F_T',
         'A_E_F_O',
         'A_E_F_C',
         'G_E_F_C',
         'G_E_F_T',
         'G_E_F_O',
         'A_E_M_T',
         'A_E_M_O',
         'A_E_M_C',
         'G_E_M_O',
         'G_E_M_C',
         'G_E_M_T',
         'C_E_M_O',
         'C_E_M_C',
         'C_E_M_T',
         'C_H_F_CO',
         'C_H_F_CT',
         'C_H_F_OT',
         'G_H_F_OT',
         'G_H_F_CO',
         'G_H_F_CT',
         'A_H_F_CT',
         'A_H_F_OT',
         'A_H_F_CO',
         'C_H_M_CO',
         'C_H_M_CT',
         'C_H_M_OT',
         'A_H_M_CT',
         'A_H_M_OT',
         'A_H_M_CO',
         'G_H_M_OT',
         'G_H_M_CO',
         'G_H_M_CT', ]

labels = ['correct',
          'wrong',
          'type',
          'orientation',
          'color']

types = ['INTRO',
         'CORE',
         'FLEX',
         'TRIK',
         'DELY'
]

feat = ['type',
        'color',
        'orientation',
        'dual'
]

qts_id = ['A_E_F_C_INTRO', 'A_E_F_O_INTRO', 'A_E_F_T_INTRO', 'A_E_M_C_INTRO',
        'A_E_M_O_INTRO', 'A_E_M_T_INTRO', 'A_H_F_CO_CORE', 'A_H_F_CO_DELY',
        'A_H_F_CO_TRIK', 'A_H_F_CT_CORE', 'A_H_F_CT_DELY', 'A_H_F_CT_TRIK',
        'A_H_F_OT_CORE', 'A_H_F_OT_DELY', 'A_H_F_OT_TRIK', 'A_H_M_CO_CORE',
        'A_H_M_CO_FLEX', 'A_H_M_CT_CORE', 'A_H_M_CT_FLEX', 'A_H_M_OT_CORE',
        'A_H_M_OT_FLEX', 'C_E_F_C_INTRO', 'C_E_F_O_INTRO', 'C_E_F_T_INTRO',
        'C_E_M_C_INTRO', 'C_E_M_O_INTRO', 'C_E_M_T_INTRO', 'C_H_F_CO_CORE',
        'C_H_F_CO_DELY', 'C_H_F_CO_TRIK', 'C_H_F_CT_CORE', 'C_H_F_CT_DELY',
        'C_H_F_CT_TRIK', 'C_H_F_OT_CORE', 'C_H_F_OT_DELY', 'C_H_F_OT_TRIK',
        'C_H_M_CO_CORE', 'C_H_M_CO_FLEX', 'C_H_M_CT_CORE', 'C_H_M_CT_FLEX',
        'C_H_M_OT_CORE', 'C_H_M_OT_FLEX', 'G_E_F_C_INTRO', 'G_E_F_O_INTRO',
        'G_E_F_T_INTRO', 'G_E_M_C_INTRO', 'G_E_M_O_INTRO', 'G_E_M_T_INTRO',
        'G_H_F_CO_CORE', 'G_H_F_CO_DELY', 'G_H_F_CO_TRIK', 'G_H_F_CT_CORE',
        'G_H_F_CT_DELY', 'G_H_F_CT_TRIK', 'G_H_F_OT_CORE', 'G_H_F_OT_DELY',
        'G_H_F_OT_TRIK', 'G_H_M_CO_CORE', 'G_H_M_CO_FLEX', 'G_H_M_CT_CORE',
        'G_H_M_CT_FLEX', 'G_H_M_OT_CORE', 'G_H_M_OT_FLEX']




def seq_to_int(qts,vocab, labels, types, feat, qts_id, n_steps):
    integ = list()
    for i,x in enumerate(qts):
        if i != n_steps:
            features = list()
            features.append(vocab.index(qts[i, 4]))
            features.append(labels.index(qts[i, 5]))
            features.append(qts[i, 2])
            features.append(types.index(qts[i, 9]))
            features.append(feat.index(qts[i, 10]))
            features.append(qts_id.index(qts[i, 8]))
        else:
            features = list()
            features.append(vocab.index(qts[i, 4]))
            features.append(-1)
            features.append(qts[i, 2])
            features.append(types.index(qts[i, 9]))
            features.append(feat.index(qts[i, 10]))
            features.append(qts_id.index(qts[i, 8]))
        integ.append(features)
    return integ

def split_sequence(data, n_steps, vocab, labels, types, feat, qts_id):
    X, Y = list(), list()
    users = list(dict.fromkeys(data.loc[:, "user"]))
    for u in users:
        sequence = data[data.user == u]
        for i in range(len(sequence)):
            end_idx = i + n_steps
            if end_idx > len(sequence)-1:
                break
            x = seq_to_int(sequence.iloc[i:end_idx+1, :].values, vocab, labels, types, feat, qts_id, n_steps)
            y = labels.index(str(sequence.iloc[end_idx, 5]))
            X.append(x)
            Y.append(y)
    return np.array(X), np.array(Y)


### Test split_sequence


In [36]:
print(split_sequence(data.iloc[:20, :], 10, vocab, labels, types, feat, qts_id))


(array([[[ 0,  0,  0,  0,  0, 23],
        [ 1,  0,  1,  0,  1, 21],
        [ 2,  0,  2,  0,  2, 22],
        [ 3,  0,  3,  0,  0,  2],
        [ 4,  0,  4,  0,  2,  1],
        [ 5,  1,  5,  0,  1,  0],
        [ 6,  0,  6,  0,  1, 42],
        [ 7,  0,  7,  0,  0, 44],
        [ 8,  0,  8,  0,  2, 43],
        [ 9,  0,  9,  0,  0,  5],
        [10, -1, 10,  0,  2,  4]],

       [[ 1,  0,  1,  0,  1, 21],
        [ 2,  0,  2,  0,  2, 22],
        [ 3,  0,  3,  0,  0,  2],
        [ 4,  0,  4,  0,  2,  1],
        [ 5,  1,  5,  0,  1,  0],
        [ 6,  0,  6,  0,  1, 42],
        [ 7,  0,  7,  0,  0, 44],
        [ 8,  0,  8,  0,  2, 43],
        [ 9,  0,  9,  0,  0,  5],
        [10,  0, 10,  0,  2,  4],
        [11, -1, 11,  0,  1,  3]],

       [[ 2,  0,  2,  0,  2, 22],
        [ 3,  0,  3,  0,  0,  2],
        [ 4,  0,  4,  0,  2,  1],
        [ 5,  1,  5,  0,  1,  0],
        [ 6,  0,  6,  0,  1, 42],
        [ 7,  0,  7,  0,  0, 44],
        [ 8,  0,  8,  0,  2, 43],
        [

## The model


In [66]:
n_features = 6
n_steps = 5


X, y = split_sequence(data, n_steps, vocab, labels, types, feat, qts_id)
y = np_utils.to_categorical(y)
X = np_utils.to_categorical(X)
X = X.reshape((X.shape[0], X.shape[1], n_features*X.shape[-1]))
print(X.shape)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=42)
print('train set size: '+str(len(y_train)))
print('test set size: '+str(len(y_test)))

model = Sequential()
model.add(CuDNNLSTM(X.shape[1], return_sequences=True, input_shape=(n_steps+1, X.shape[-1])))
model.add(Dropout(0.2))
model.add(CuDNNLSTM(X.shape[1], return_sequences=True))
model.add(Dropout(0.2))
model.add(CuDNNLSTM(X.shape[1], return_sequences=True))
model.add(Dropout(0.2))
model.add(CuDNNLSTM(X.shape[1]))
model.add(Dropout(0.2))

model.add(Dense(5))
model.compile(optimizer='adam', loss='mse', metrics=['categorical_accuracy'])

(12857, 6, 378)
train set size: 11571
test set size: 1286


### Training

In [67]:
# fit model
model.fit(X_train, y_train, epochs=2000, verbose=2)

Epoch 1/2000
 - 5s - loss: 0.1346 - categorical_accuracy: 0.4152
Epoch 2/2000
 - 3s - loss: 0.1187 - categorical_accuracy: 0.5117
Epoch 3/2000
 - 3s - loss: 0.1122 - categorical_accuracy: 0.5646
Epoch 4/2000
 - 3s - loss: 0.1100 - categorical_accuracy: 0.5683
Epoch 5/2000
 - 3s - loss: 0.1082 - categorical_accuracy: 0.5759
Epoch 6/2000
 - 3s - loss: 0.1069 - categorical_accuracy: 0.5792
Epoch 7/2000
 - 3s - loss: 0.1064 - categorical_accuracy: 0.5757
Epoch 8/2000
 - 3s - loss: 0.1054 - categorical_accuracy: 0.5762
Epoch 9/2000
 - 3s - loss: 0.1049 - categorical_accuracy: 0.5764
Epoch 10/2000
 - 3s - loss: 0.1045 - categorical_accuracy: 0.5798
Epoch 11/2000
 - 3s - loss: 0.1040 - categorical_accuracy: 0.5838
Epoch 12/2000
 - 3s - loss: 0.1037 - categorical_accuracy: 0.5824
Epoch 13/2000
 - 3s - loss: 0.1034 - categorical_accuracy: 0.5867
Epoch 14/2000
 - 3s - loss: 0.1031 - categorical_accuracy: 0.5833
Epoch 15/2000
 - 3s - loss: 0.1026 - categorical_accuracy: 0.5870
Epoch 16/2000
 - 3s

<keras.callbacks.History at 0x13fac6969b0>

### Validation

In [68]:
scores = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

print(X_test[0])
print(model.predict(np.array([X_test[0]])))
print(np.array([X_test[1]]))
print(y_test[0])



Accuracy: 70.53%
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[[0.02034289 0.00592759 0.4767301  0.02915283 0.4704451 ]]
[[[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 1. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]]
[0. 0. 0. 0. 1.]
