In [90]:
import pandas as pd
import numpy as np
from keras import Sequential
from keras.layers import LSTM, Dense, Dropout, Activation
from keras.optimizers import Adam
from keras.utils import np_utils
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from keras.preprocessing.sequence import pad_sequences

In [2]:
data = pd.read_csv('induce-data-2019-08-08.csv').iloc[:, :]
vocab = ['C_E_F_T',
         'C_E_F_C',
         'C_E_F_O',
         'A_E_F_T',
         'A_E_F_O',
         'A_E_F_C',
         'G_E_F_C',
         'G_E_F_T',
         'G_E_F_O',
         'A_E_M_T',
         'A_E_M_O',
         'A_E_M_C',
         'G_E_M_O',
         'G_E_M_C',
         'G_E_M_T',
         'C_E_M_O',
         'C_E_M_C',
         'C_E_M_T',
         'C_H_F_CO',
         'C_H_F_CT',
         'C_H_F_OT',
         'G_H_F_OT',
         'G_H_F_CO',
         'G_H_F_CT',
         'A_H_F_CT',
         'A_H_F_OT',
         'A_H_F_CO',
         'C_H_M_CO',
         'C_H_M_CT',
         'C_H_M_OT',
         'A_H_M_CT',
         'A_H_M_OT',
         'A_H_M_CO',
         'G_H_M_OT',
         'G_H_M_CO',
         'G_H_M_CT', ]

labels = ['correct',
          'wrong',
          'type',
          'orientation',
          'color']

types = ['INTRO',
         'CORE',
         'FLEX',
         'TRIK',
         'DELY'
]

topics = ['cards',
          'animals',
          'geometry'
    
]

feat = ['type',
        'color',
        'orientation',
        'dual'
]

age = ['8-10','11-13']

In [3]:
def seq_to_int(qts,vocab, labels, types, feat, topics, n_steps, age):
    integ = list()
    for i,x in enumerate(qts):
        if i != n_steps:
            features = list()
            features.append(vocab.index(qts[i, 4]))
            features.append(labels.index(qts[i, 5]))
            features.append(qts[i, 2])
            features.append(types.index(qts[i, 9]))
            features.append(feat.index(qts[i, 10]))
            features.append(topics.index(qts[i, 6]))
            features.append(age.index(qts[i, 7]))
        else:
            features = list()
            features.append(vocab.index(qts[i, 4]))
            features.append(-1)
            features.append(qts[i, 2])
            features.append(types.index(qts[i, 9]))
            features.append(feat.index(qts[i, 10]))
            features.append(topics.index(qts[i, 6]))
            features.append(age.index(qts[i, 7]))
        integ.append(features)
    return integ

def split_sequence(data, n_steps, vocab, labels, types, feat, topics, age):
    X, Y = list(), list()
    users = list(dict.fromkeys(data.loc[:, "user"]))
    for u in users:
        sequence = data[data.user == u]
        for i in range(len(sequence)):
            end_idx = i + n_steps
            if end_idx > len(sequence)-1:
                break
            x = seq_to_int(sequence.iloc[i:end_idx+1, :].values, vocab, labels, types, feat, topics, n_steps, age)
            y = labels.index(str(sequence.iloc[end_idx, 5]))
            X.append(x)
            Y.append(y)
    return np.array(X), np.array(Y)

In [4]:

data_train, data_test = np.split(data.sample(frac=1, random_state=42), 
                                   [int(0.8 * len(data))])
print(len(data_train),len(data_test))



11173 2794


In [79]:
# n_steps = 10

X_train_split = list()
y_train_split = list()


for i in tqdm(range(5, 10)):
    X_seq, y_seq = split_sequence(data_train, i, vocab, labels, types, feat, topics, age)
    for x in X_seq:
        X_train_split.append(x)
    for _y in y_seq:
        y_train_split.append(_y)
y_train = np_utils.to_categorical(y_train_split)
X_train = np.array(X_train_split)



X_test_split = list()
y_test_split = list()


for i in tqdm(range(5, 10)):
    X_seq, y_seq = split_sequence(data_test, i, vocab, labels, types, feat, topics, age)
    for x in X_seq:
        X_test_split.append(x)
    for _y in y_seq:
        y_test_split.append(_y)
y_test = np_utils.to_categorical(y_test_split)
X_test = np.array(X_test_split)


  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:04<00:18,  4.55s/it][A
 40%|████      | 2/5 [00:08<00:13,  4.48s/it][A
 60%|██████    | 3/5 [00:13<00:09,  4.58s/it][A
 80%|████████  | 4/5 [00:18<00:04,  4.53s/it][A
100%|██████████| 5/5 [00:25<00:00,  5.07s/it][A

  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:01<00:07,  1.82s/it][A
 40%|████      | 2/5 [00:02<00:04,  1.59s/it][A
 60%|██████    | 3/5 [00:03<00:02,  1.39s/it][A
 80%|████████  | 4/5 [00:04<00:01,  1.25s/it][A
100%|██████████| 5/5 [00:05<00:00,  1.08s/it][A


In [80]:
def one_hot_encode(row):
    encoded = []
    for i in range(len(row)):
        encoded.append([1 if row[i] == j else 0 for j in range(99)])
    return encoded

In [81]:
X_train = pad_sequences(X_train, value=99)
print(X_train)
print(X_train.shape)

[[[99 99 99 ... 99 99 99]
  [99 99 99 ... 99 99 99]
  [99 99 99 ... 99 99 99]
  ...
  [24  2 44 ...  3  1  1]
  [22  4 25 ...  3  2  1]
  [23 -1 56 ...  3  2  1]]

 [[99 99 99 ... 99 99 99]
  [99 99 99 ... 99 99 99]
  [99 99 99 ... 99 99 99]
  ...
  [22  4 25 ...  3  2  1]
  [23  2 56 ...  3  2  1]
  [21 -1 42 ...  3  2  1]]

 [[99 99 99 ... 99 99 99]
  [99 99 99 ... 99 99 99]
  [99 99 99 ... 99 99 99]
  ...
  [23  2 56 ...  3  2  1]
  [21  3 42 ...  3  2  1]
  [32 -1 35 ...  3  1  1]]

 ...

 [[17  0 14 ...  0  0  0]
  [22  4 49 ...  3  2  0]
  [20  3 60 ...  3  0  0]
  ...
  [27  4 39 ...  3  0  0]
  [35  4 57 ...  3  2  0]
  [ 4 -1  1 ...  2  1  0]]

 [[22  4 49 ...  3  2  0]
  [20  3 60 ...  3  0  0]
  [34  4 50 ...  3  2  0]
  ...
  [35  4 57 ...  3  2  0]
  [ 4  1  1 ...  2  1  0]
  [13 -1 16 ...  1  2  0]]

 [[20  3 60 ...  3  0  0]
  [34  4 50 ...  3  2  0]
  [26  3 62 ...  3  1  0]
  ...
  [ 4  1  1 ...  2  1  0]
  [13  0 16 ...  1  2  0]
  [29 -1 55 ...  3  0  0]]]
(48095, 10

In [114]:
n_hidden = int(2/3 * (X_train.shape[1]+X_train.shape[2]))
print(n_hidden)

11


In [118]:
model = Sequential()
model.add(LSTM(units=n_hidden, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(Dropout(0.2))
model.add(LSTM(units=n_hidden, return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(units=5))
amsgrad = Adam(amsgrad=False)
model.add(Activation('softmax'))
model.compile(optimizer=amsgrad, loss='categorical_crossentropy', metrics=['categorical_accuracy'])
model.summary()

Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_55 (LSTM)               (None, 10, 11)            836       
_________________________________________________________________
dropout_1 (Dropout)          (None, 10, 11)            0         
_________________________________________________________________
lstm_56 (LSTM)               (None, 11)                1012      
_________________________________________________________________
dropout_2 (Dropout)          (None, 11)                0         
_________________________________________________________________
dense_25 (Dense)             (None, 5)                 60        
_________________________________________________________________
activation_7 (Activation)    (None, 5)                 0         
Total params: 1,908
Trainable params: 1,908
Non-trainable params: 0
_________________________________________________________________


In [119]:
model.fit(X_train, y_train, epochs=20, batch_size=300, verbose=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x1a5cf05490>