In [3]:
import pandas as pd
import numpy as np
from keras import Sequential
from keras.layers import LSTM, Dense, Dropout, CuDNNLSTM
from keras.optimizers import Adam
from keras.utils import np_utils
from sklearn.model_selection import train_test_split
from tqdm import tqdm

## Data preparation

In [4]:

# Raw interaction log. The code below addresses its columns by position
# (2, 4, 5, 6, 7, 9, 10) and by the column name "user".
# NOTE(review): `.iloc[:, :]` selects every row and column, so it looks like
# a leftover from a temporary subsetting experiment -- confirm.
data = pd.read_csv('induce-data-2019-08-08.csv').iloc[:, :]

# Category vocabularies. seq_to_int turns each categorical value into its
# position in the matching list via `list.index`, so the ORDER of the entries
# defines the integer encoding fed to the model; reordering them changes it.

# Question identifiers, looked up from column 4 of `data`.
# The first 18 entries contain "_E_" and the remaining 18 contain "_H_".
vocab = ['C_E_F_T',
         'C_E_F_C',
         'C_E_F_O',
         'A_E_F_T',
         'A_E_F_O',
         'A_E_F_C',
         'G_E_F_C',
         'G_E_F_T',
         'G_E_F_O',
         'A_E_M_T',
         'A_E_M_O',
         'A_E_M_C',
         'G_E_M_O',
         'G_E_M_C',
         'G_E_M_T',
         'C_E_M_O',
         'C_E_M_C',
         'C_E_M_T',
         'C_H_F_CO',
         'C_H_F_CT',
         'C_H_F_OT',
         'G_H_F_OT',
         'G_H_F_CO',
         'G_H_F_CT',
         'A_H_F_CT',
         'A_H_F_OT',
         'A_H_F_CO',
         'C_H_M_CO',
         'C_H_M_CT',
         'C_H_M_OT',
         'A_H_M_CT',
         'A_H_M_OT',
         'A_H_M_CO',
         'G_H_M_OT',
         'G_H_M_CO',
         'G_H_M_CT', ]

# Answer outcomes, looked up from column 5 of `data`. Also the prediction
# target: split_sequence uses the position in this list as the class id.
labels = ['correct',
          'wrong',
          'type',
          'orientation',
          'color']

# Values looked up from column 9 of `data`.
types = ['INTRO',
         'CORE',
         'FLEX',
         'TRIK',
         'DELY'
]

# Values looked up from column 6 of `data`.
topics = ['cards',
          'animals',
          'geometry'
    
]

# Values looked up from column 10 of `data`.
feat = ['type',
        'color',
        'orientation',
        'dual'
]

# Values looked up from column 7 of `data`.
age = ['8-10','11-13']

# qts_id = ['A_E_F_C_INTRO', 'A_E_F_O_INTRO', 'A_E_F_T_INTRO', 'A_E_M_C_INTRO',
#         'A_E_M_O_INTRO', 'A_E_M_T_INTRO', 'A_H_F_CO_CORE', 'A_H_F_CO_DELY',
#         'A_H_F_CO_TRIK', 'A_H_F_CT_CORE', 'A_H_F_CT_DELY', 'A_H_F_CT_TRIK',
#         'A_H_F_OT_CORE', 'A_H_F_OT_DELY', 'A_H_F_OT_TRIK', 'A_H_M_CO_CORE',
#         'A_H_M_CO_FLEX', 'A_H_M_CT_CORE', 'A_H_M_CT_FLEX', 'A_H_M_OT_CORE',
#         'A_H_M_OT_FLEX', 'C_E_F_C_INTRO', 'C_E_F_O_INTRO', 'C_E_F_T_INTRO',
#         'C_E_M_C_INTRO', 'C_E_M_O_INTRO', 'C_E_M_T_INTRO', 'C_H_F_CO_CORE',
#         'C_H_F_CO_DELY', 'C_H_F_CO_TRIK', 'C_H_F_CT_CORE', 'C_H_F_CT_DELY',
#         'C_H_F_CT_TRIK', 'C_H_F_OT_CORE', 'C_H_F_OT_DELY', 'C_H_F_OT_TRIK',
#         'C_H_M_CO_CORE', 'C_H_M_CO_FLEX', 'C_H_M_CT_CORE', 'C_H_M_CT_FLEX',
#         'C_H_M_OT_CORE', 'C_H_M_OT_FLEX', 'G_E_F_C_INTRO', 'G_E_F_O_INTRO',
#         'G_E_F_T_INTRO', 'G_E_M_C_INTRO', 'G_E_M_O_INTRO', 'G_E_M_T_INTRO',
#         'G_H_F_CO_CORE', 'G_H_F_CO_DELY', 'G_H_F_CO_TRIK', 'G_H_F_CT_CORE',
#         'G_H_F_CT_DELY', 'G_H_F_CT_TRIK', 'G_H_F_OT_CORE', 'G_H_F_OT_DELY',
#         'G_H_F_OT_TRIK', 'G_H_M_CO_CORE', 'G_H_M_CO_FLEX', 'G_H_M_CT_CORE',
#         'G_H_M_CT_FLEX', 'G_H_M_OT_CORE', 'G_H_M_OT_FLEX']




def seq_to_int(qts, vocab, labels, types, feat, topics, n_steps, age):
    """Encode a window of raw rows as lists of seven integer-coded features.

    Args:
        qts: 2-D array of raw rows, indexed as ``qts[row, column]``.
        vocab: lookup list for column 4.
        labels: lookup list for column 5.
        types: lookup list for column 9.
        feat: lookup list for column 10.
        topics: lookup list for column 6.
        n_steps: position of the row whose label must be hidden.
        age: lookup list for column 7.

    Returns:
        One list per row, laid out as
        ``[vocab idx, label idx, column 2, types idx, feat idx, topics idx,
        age idx]``. The row at position ``n_steps`` gets ``-1`` in place of
        its label index; its label is not looked up at all.

    Raises:
        ValueError: if a value is missing from its lookup list.
    """
    encoded = []
    for pos, _row in enumerate(qts):
        encoded.append([
            vocab.index(qts[pos, 4]),
            # The row at n_steps is the one to predict: mask its outcome.
            -1 if pos == n_steps else labels.index(qts[pos, 5]),
            qts[pos, 2],  # kept as-is, no lookup
            types.index(qts[pos, 9]),
            feat.index(qts[pos, 10]),
            topics.index(qts[pos, 6]),
            age.index(qts[pos, 7]),
        ])
    return encoded

def split_sequence(data, n_steps, vocab, labels, types, feat, topics, age):
    """Cut every user's history into sliding windows with a target label.

    Rows are grouped by the "user" column, users being visited in order of
    first appearance. For each user a window of ``n_steps + 1`` consecutive
    rows is slid one row at a time; the last row of the window is the one to
    predict.

    Args:
        data: DataFrame with a "user" column; other columns are read by
            position.
        n_steps: number of context rows preceding the target row.
        vocab, labels, types, feat, topics, age: lookup lists forwarded to
            ``seq_to_int``.

    Returns:
        ``(X, Y)`` as NumPy arrays. ``X[k]`` is the ``seq_to_int`` encoding of
        window ``k``; ``Y[k]`` is the position in ``labels`` of
        ``str(column 5)`` of that window's last row.
    """
    windows, targets = [], []
    for user in dict.fromkeys(data.loc[:, "user"]):
        history = data[data.user == user]
        for start in range(len(history)):
            target_pos = start + n_steps
            # Stop once the target row would fall past the user's last row.
            if target_pos >= len(history):
                break
            window = seq_to_int(history.iloc[start:target_pos + 1, :].values,
                                vocab, labels, types, feat, topics, n_steps, age)
            target = labels.index(str(history.iloc[target_pos, 5]))
            windows.append(window)
            targets.append(target)
    return np.array(windows), np.array(targets)


### Test split_sequence


In [5]:
# Smoke test: window the first 20 rows of `data` with 10 context steps and
# print the resulting (X, Y) tuple. In each printed window the last row
# carries -1 in its second column (the masked label).
print(split_sequence(data.iloc[:20, :], 10, vocab, labels, types, feat, topics, age))


(array([[[ 0,  0,  0,  0,  0,  0,  1],
        [ 1,  0,  1,  0,  1,  0,  1],
        [ 2,  0,  2,  0,  2,  0,  1],
        [ 3,  0,  3,  0,  0,  1,  1],
        [ 4,  0,  4,  0,  2,  1,  1],
        [ 5,  1,  5,  0,  1,  1,  1],
        [ 6,  0,  6,  0,  1,  2,  1],
        [ 7,  0,  7,  0,  0,  2,  1],
        [ 8,  0,  8,  0,  2,  2,  1],
        [ 9,  0,  9,  0,  0,  1,  1],
        [10, -1, 10,  0,  2,  1,  1]],

       [[ 1,  0,  1,  0,  1,  0,  1],
        [ 2,  0,  2,  0,  2,  0,  1],
        [ 3,  0,  3,  0,  0,  1,  1],
        [ 4,  0,  4,  0,  2,  1,  1],
        [ 5,  1,  5,  0,  1,  1,  1],
        [ 6,  0,  6,  0,  1,  2,  1],
        [ 7,  0,  7,  0,  0,  2,  1],
        [ 8,  0,  8,  0,  2,  2,  1],
        [ 9,  0,  9,  0,  0,  1,  1],
        [10,  0, 10,  0,  2,  1,  1],
        [11, -1, 11,  0,  1,  1,  1]],

       [[ 2,  0,  2,  0,  2,  0,  1],
        [ 3,  0,  3,  0,  0,  1,  1],
        [ 4,  0,  4,  0,  2,  1,  1],
        [ 5,  1,  5,  0,  1,  1,  1],
       

## The model


In [5]:

# Build the dataset from windows of several lengths: for every context length
# in 5..9, slide a window over each user's history. X stays a plain Python
# list because windows of different lengths cannot be stacked into one array.
X = []
y = []

for n_steps in tqdm(range(5, 10)):
    X_seq, y_seq = split_sequence(data, n_steps, vocab, labels, types, feat, topics, age)
    X.extend(X_seq)
    y.extend(y_seq)

# Pin the one-hot width to the label vocabulary. Without `num_classes`,
# to_categorical infers the width from max(y) + 1, so a dataset in which the
# highest class never occurs would produce targets narrower than the model's
# 5-unit output layer.
y = np_utils.to_categorical(y, num_classes=len(labels))


  0%|          | 0/5 [00:00<?, ?it/s] 20%|██        | 1/5 [00:04<00:19,  4.82s/it] 40%|████      | 2/5 [00:09<00:14,  4.77s/it] 60%|██████    | 3/5 [00:14<00:09,  4.76s/it] 80%|████████  | 4/5 [00:20<00:05,  5.11s/it]100%|██████████| 5/5 [00:29<00:00,  6.24s/it]


In [17]:
# Each timestep carries the 7 values produced by seq_to_int.
n_features = 7

# Hold out 10% of the windows for evaluation; the fixed seed keeps the split
# reproducible.
# NOTE(review): windows overlap (stride 1, several lengths per user) and
# train_test_split shuffles individual windows, so near-identical windows of
# the same user can end up on both sides of the split -- confirm this is
# acceptable for the reported accuracies.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.10, random_state=42
)
print('train set size: ' + str(len(y_train)))
print('test set size: ' + str(len(y_test)))

# Single recurrent layer followed by a 5-unit output. The time dimension is
# left as None so windows of any length are accepted.
# NOTE(review): the output layer has no activation and the loss is MSE on
# one-hot targets; softmax + categorical_crossentropy is the usual pairing
# for this kind of target -- confirm the current choice is intentional.
model = Sequential([
    CuDNNLSTM(256, input_shape=(None, n_features)),
    Dense(5),
])
amsgrad = Adam(amsgrad=True)
model.compile(loss='mse', optimizer=amsgrad, metrics=['categorical_accuracy'])

train set size: 55858
test set size: 6207


### Training

In [24]:
def train_generator():
    """Endlessly yield one random training sample as a batch of size 1.

    Reads the notebook-level ``X_train`` / ``y_train``. Every draw is
    independent, so the same sample can be yielded more than once.

    Yields:
        ``(x_batch, y_batch)``: the chosen window and its target, each wrapped
        in an array with a leading axis of length 1.
    """
    while True:
        candidates = np.arange(len(y_train))
        pick = np.random.choice(candidates, 1, replace=False)[0]
        # Wrapping in a list adds the leading batch axis.
        yield np.array([X_train[pick]]), np.array([y_train[pick]])

# Train on single-sample batches from train_generator: 10000 random draws per
# epoch, for 2000 epochs. No validation data is passed, so the logged metrics
# are training metrics only.
# NOTE(review): the recorded log shows ~41 s per epoch and is cut off after
# epoch 15, so the full 2000 epochs were presumably never run -- confirm the
# intended epoch count.
model.fit_generator(train_generator(), steps_per_epoch=10000, epochs=2000, verbose=2)

Epoch 1/2000
 - 41s - loss: 0.1233 - categorical_accuracy: 0.4512
Epoch 2/2000
 - 41s - loss: 0.1136 - categorical_accuracy: 0.5199
Epoch 3/2000
 - 41s - loss: 0.1031 - categorical_accuracy: 0.5925
Epoch 4/2000
 - 41s - loss: 0.0969 - categorical_accuracy: 0.6255
Epoch 5/2000
 - 41s - loss: 0.0910 - categorical_accuracy: 0.6485
Epoch 6/2000
 - 41s - loss: 0.0863 - categorical_accuracy: 0.6643
Epoch 7/2000
 - 41s - loss: 0.0858 - categorical_accuracy: 0.6591
Epoch 8/2000
 - 40s - loss: 0.0853 - categorical_accuracy: 0.6606
Epoch 9/2000
 - 41s - loss: 0.0842 - categorical_accuracy: 0.6621
Epoch 10/2000
 - 40s - loss: 0.0840 - categorical_accuracy: 0.6611
Epoch 11/2000
 - 41s - loss: 0.0826 - categorical_accuracy: 0.6726
Epoch 12/2000
 - 41s - loss: 0.0828 - categorical_accuracy: 0.6716
Epoch 13/2000
 - 40s - loss: 0.0822 - categorical_accuracy: 0.6721
Epoch 14/2000
 - 41s - loss: 0.0814 - categorical_accuracy: 0.6767
Epoch 15/2000
 - 41s - loss: 0.0810 - categorical_accuracy: 0.6799
Epoc

<keras.callbacks.History at 0x1ec00082ba8>

### Validation

In [18]:
def test_generator():
    """Endlessly yield one random test sample as a batch of size 1.

    Reads the notebook-level ``X_test`` / ``y_test``. Every draw is
    independent, so the same sample can be yielded more than once.

    Yields:
        ``(x_batch, y_batch)``: the chosen window and its target, each wrapped
        in an array with a leading axis of length 1.
    """
    while True:
        candidates = np.arange(len(y_test))
        pick = np.random.choice(candidates, 1, replace=False)[0]
        # Wrapping in a list adds the leading batch axis.
        yield np.array([X_test[pick]]), np.array([y_test[pick]])
# Evaluate on 1000 single-sample batches drawn at random from the test split.
# The draws are independent, so this is a sampled estimate rather than one
# pass over every test window.
scores = model.evaluate_generator(test_generator(), 1000)

def test_18_generator():
    """Endlessly yield random test samples whose target is not class 0 or 1.

    Reads the notebook-level ``X_test`` / ``y_test`` (one-hot targets). A
    sample is eligible when neither of the first two target columns is set,
    i.e. its label is none of the first two entries of ``labels``.

    The eligible indices are computed once, when the generator is first
    advanced, and then sampled uniformly. This replaces a draw-and-retry loop
    that never terminated when no test sample matched the filter.

    Original author's note (translated from French): "to change, because the
    targets are categorical".
    NOTE(review): the filter looks at the target label, not at a question
    index, despite the "18" in the name -- confirm the intended criterion.

    Yields:
        ``(x_batch, y_batch)``: the chosen window and its target, each wrapped
        in an array with a leading axis of length 1.

    Raises:
        ValueError: if no test sample satisfies the filter.
    """
    targets = np.asarray(y_test)
    eligible = np.flatnonzero((targets[:, 0] != 1) & (targets[:, 1] != 1))
    if eligible.size == 0:
        raise ValueError("no test sample has a target outside classes 0 and 1")
    while True:
        idx = np.random.choice(eligible)
        # Wrapping in a list adds the leading batch axis.
        yield np.array([X_test[idx]]), np.array([y_test[idx]])
        
def test_inf_18_generator():
    """Endlessly yield random test samples whose target is class 0 or 1.

    Reads the notebook-level ``X_test`` / ``y_test`` (one-hot targets). A
    sample is eligible when one of the first two target columns is set, i.e.
    its label is one of the first two entries of ``labels``.

    The eligible indices are computed once, when the generator is first
    advanced, and then sampled uniformly. This replaces a draw-and-retry loop
    that never terminated when no test sample matched the filter.

    Original author's note (translated from French): "to change, because the
    targets are categorical".
    NOTE(review): the filter looks at the target label, not at a question
    index, despite the "18" in the name -- confirm the intended criterion.

    Yields:
        ``(x_batch, y_batch)``: the chosen window and its target, each wrapped
        in an array with a leading axis of length 1.

    Raises:
        ValueError: if no test sample satisfies the filter.
    """
    targets = np.asarray(y_test)
    eligible = np.flatnonzero((targets[:, 0] == 1) | (targets[:, 1] == 1))
    if eligible.size == 0:
        raise ValueError("no test sample has a target in classes 0 or 1")
    while True:
        idx = np.random.choice(eligible)
        # Wrapping in a list adds the leading batch axis.
        yield np.array([X_test[idx]]), np.array([y_test[idx]])





In [19]:
# Replace the in-memory model with the one stored in "model.h5".
# NOTE(review): the save call below is commented out, so "model.h5" must
# already exist on disk; on a fresh Restart & Run All this cell fails
# otherwise, and the scores printed afterwards describe the saved model, not
# necessarily the one trained above. Consider moving this import to the
# import cell at the top of the notebook.
#model.save("model.h5")
from keras.models import load_model
model = load_model("model.h5")

In [26]:
# Sampled accuracy over 1000 random single-sample test batches. scores[0] is
# the loss; scores[1] is the first compiled metric (categorical_accuracy for
# the model compiled in this notebook).
scores = model.evaluate_generator(test_generator(), 1000)
print("Accuracy: %.2f%%" % (scores[1]*100))

# Inspect one example: the encoded window, the raw model output for it, and
# (last line) its one-hot target.
print(X_test[0])
print(model.predict(np.array([X_test[0]])))
# NOTE(review): this prints sample 1 while the prediction above and the
# target below belong to sample 0 -- confirm whether X_test[0] was intended.
print(np.array([X_test[1]]))
print(y_test[0])

Accuracy: 80.10%
[[33  2 33  1  3  2  0]
 [34  4 34  1  3  2  0]
 [35  4 35  1  3  2  0]
 [21  2 36  3  3  2  0]
 [33  2 37  2  3  2  0]
 [18  4 38  3  3  0  0]
 [27  4 39  2  3  0  0]
 [24 -1 40  3  3  1  0]]
[[ 4.8574805e-04 -9.7772479e-04  9.1076994e-01 -6.8823531e-02
   1.5689443e-01]]
[[[15  0 12  0  2  0  0]
  [16  0 13  0  1  0  0]
  [17  0 14  0  0  0  0]
  [12  0 15  0  2  2  0]
  [13  0 16  0  1  2  0]
  [14  0 17  0  0  2  0]
  [24  2 18  1  3  1  0]
  [25 -1 19  1  3  1  0]]]
[0. 0. 0. 0. 1.]


#### Questions greater than 18

In [27]:
# Sampled accuracy over 1000 draws from test_18_generator, which only yields
# test samples whose one-hot target has neither of its first two columns set.
# NOTE(review): that is a filter on the target label, not on a question
# number as the section heading suggests -- confirm.
scores = model.evaluate_generator(test_18_generator(), 1000)
print("Accuracy: %.2f%%" % (scores[1]*100))

Accuracy: 81.30%


#### Questions lower than 18

In [28]:
# Sampled accuracy over 1000 draws from test_inf_18_generator, which only
# yields test samples whose one-hot target has one of its first two columns
# set.
# NOTE(review): that is a filter on the target label, not on a question
# number as the section heading suggests -- confirm.
scores = model.evaluate_generator(test_inf_18_generator(), 1000)
print("Accuracy: %.2f%%" % (scores[1]*100))



Accuracy: 90.20%
