In [54]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import math

# Tensorflow
import tensorflow as tf

%matplotlib inline

In [55]:
events = pd.read_csv('data/calcularis_small_events.csv', encoding='latin', low_memory=False)
subtasks = pd.read_csv('data/calcularis_small_subtasks.csv', encoding='latin', low_memory=False)
users = pd.read_csv('data/calcularis_small_users.csv', encoding='latin', low_memory=False)

In [56]:
events.head(15)

Unnamed: 0,event_id,user_id,mode,game_name,learning_time_ms,number_range,start,end,skill_id,type
0,0,1,NORMAL,Subitizing,8835.0,R10,2022-11-02T08:39:12.355Z,2022-11-02T08:39:25.130Z,1.0,task
1,1,1,NORMAL,Conversion,21167.0,R10,2022-11-11T10:26:27.893Z,2022-11-11T10:26:49.260Z,4.0,task
2,2,1,NORMAL,Conversion,11182.0,R10,2022-11-18T10:34:01.044Z,2022-11-18T10:34:12.423Z,7.0,task
3,3,1,NORMAL,Landing,6823.0,R10,2022-11-25T10:32:43.428Z,2022-11-25T10:32:56.986Z,19.0,task
4,4,1,END_OF_NR,Conversion,9107.0,R10,2022-12-02T10:44:40.555Z,2022-12-02T10:44:49.874Z,7.0,task
5,5,1,END_OF_NR,Conversion,10703.0,R10,2022-12-09T10:12:16.068Z,2022-12-09T10:12:26.984Z,4.0,task
6,6,1,NORMAL,Comparison,1383.0,R20,2022-12-16T10:25:42.441Z,2022-12-16T10:25:45.508Z,33.0,task
7,7,1,END_OF_NR,Landing,6052.0,R20,2023-01-20T10:13:41.496Z,2023-01-20T10:13:49.096Z,50.0,task
8,8,1,END_OF_NR,Landing,6055.0,R20,2023-01-27T10:18:17.427Z,2023-01-27T10:18:26.356Z,49.0,task
9,9,1,END_OF_NR,Estimation on Number Line,10541.0,R20,2023-02-03T10:20:25.581Z,2023-02-03T10:20:38.687Z,39.0,task


In [57]:
subtasks.head(15)

Unnamed: 0,subtask_id,event_id,user_id,aim,answer,answerMode,availableNumbers,correct,correctAnswerObject,correctNumber,...,startPosition,subtask_finished_timestamp,target,timeoutInSeconds,timeoutInSteps,type,upperBound,divisor,orderIndependent,step
0,0,0,1,,4,,,True,4,4.0,...,,2022-11-02T08:39:24.930Z,,,,ConciseSubitizingTaskDescription,,,,
1,1,0,1,,1,,,True,,,...,,2022-11-02T08:39:24.930Z,,0.0,2.0,ConciseTimeoutDescription,,,,
2,2,1,1,,3,,,True,3,,...,,2022-11-11T10:26:49.007Z,,,,ConciseConversionTaskDescription,,,,
3,3,2,1,,5,,,True,5,,...,,2022-11-18T10:34:12.191Z,,,,ConciseConversionTaskDescription,,,,
4,4,3,1,3.0,"{'a': 2, 'b': 2.0402703}",,,False,"{'a': 3, 'b': 3.0}",,...,0.5,2022-11-25T10:32:56.805Z,,,,ConciseLandingTaskDescription,3.5,,,
5,5,4,1,,9,,,True,9,,...,,2022-12-02T10:44:49.621Z,,,,ConciseConversionTaskDescription,,,,
6,6,5,1,,7,,,False,9,,...,,2022-12-09T10:12:26.729Z,,,,ConciseConversionTaskDescription,,,,
7,7,6,1,,16,,,True,16,,...,,2022-12-16T10:25:45.293Z,,,,ConciseSetComparisonTaskDescription,,,,
8,8,6,1,,1.3659,,,True,,,...,,2022-12-16T10:25:45.293Z,,4.0,0.0,ConciseTimeoutDescription,,,,
9,9,7,1,4.0,"{'a': 4, 'b': 4.148817}",,,True,"{'a': 4, 'b': 4.0}",,...,0.5,2023-01-20T10:13:48.879Z,,,,ConciseLandingTaskDescription,5.0,,,


In [58]:
users.head(15)

Unnamed: 0,user_id,learning_time_ms,start,end,logged_in_time_ms,language,country
0,1,14032710,2022-11-02T08:37:56.549Z,2023-02-09T11:08:02.599Z,22151340,de,CH
1,2,16268350,2022-09-07T07:53:38.865Z,2023-02-09T08:39:14.692Z,85421273,nl,NL
2,3,8012030,2021-09-27T07:45:51.806Z,2022-01-13T12:14:09.565Z,16651482,de,DE
3,4,1414421,2019-11-12T12:18:15.724Z,2020-10-02T09:20:28.798Z,4561768,de,CH
4,5,17502108,2022-04-26T11:38:44.114Z,2022-08-29T15:52:11.087Z,25601470,de,CH
5,6,8353125,2022-09-02T07:27:20.675Z,2023-02-01T10:35:15.218Z,19249399,nl,NL
6,7,7226229,2015-03-19T18:47:32.621Z,2015-05-29T18:17:32.889Z,9225357,,
7,8,64629753,2021-08-23T09:04:32.478Z,2022-06-20T12:13:06.081Z,125236245,de,CH
8,9,7647781,2019-08-20T17:02:06.155Z,2020-03-08T14:08:06.321Z,9089877,de,DE
9,10,22569161,2017-08-30T06:27:19.365Z,2021-07-08T14:09:17.450Z,33642991,de,


In [59]:
# extract game_name and number_range from the events dataframe to match the subtasks dataframe (given the event_id)
game_name = subtasks.merge(events[['event_id', 'game_name']], on='event_id', how='left').game_name
number_range = subtasks.merge(events[['event_id', 'number_range']], on='event_id', how='left').number_range

#split the game_name into skills
skill = game_name.apply(lambda x: 'Number representation' 
                        if x in ['Subitizing', 'Conversion', 'Estimation', 'Estimation on Number Line', 'Scale: Conversion', 'Landing'] 
                        else 'Number comparison/manipulation' if x in ['Comparison', 'Secret Number', 'Distance', 'Scale: Composition', 'Order'] 
                        else 'Addition/Substraction' if x in ['Scale: Decomposition', 'Scale: Subtraction', 'Scale: Subtraction by Tens and Ones', 'Sliderule', 'Plus-minus', 'Difference', 'Completion', 'Calculator'] 
                        else 'Multiplication/Division' if x in ['Write as multiplication', 'Shelf: Jump Ahead', 'Shelf: given height', 'Shelf: random height', 'Distribution', 'Calculator: Multiplication', 'Calculator: Multiplication with ?', 'Calculator: Division', 'Jump backwards', 'Series'] 
                        else 'Other')

# concatenate skills and number range and add it to the subtasks dataframe
subtasks['skill_name'] = skill + ' ' + number_range
subtasks

Unnamed: 0,subtask_id,event_id,user_id,aim,answer,answerMode,availableNumbers,correct,correctAnswerObject,correctNumber,...,subtask_finished_timestamp,target,timeoutInSeconds,timeoutInSteps,type,upperBound,divisor,orderIndependent,step,skill_name
0,0,0,1,,4,,,True,4,4.00000,...,2022-11-02T08:39:24.930Z,,,,ConciseSubitizingTaskDescription,,,,,Number representation R10
1,1,0,1,,1,,,True,,,...,2022-11-02T08:39:24.930Z,,0.00000,2.00000,ConciseTimeoutDescription,,,,,Number representation R10
2,2,1,1,,3,,,True,3,,...,2022-11-11T10:26:49.007Z,,,,ConciseConversionTaskDescription,,,,,Number representation R10
3,3,2,1,,5,,,True,5,,...,2022-11-18T10:34:12.191Z,,,,ConciseConversionTaskDescription,,,,,Number representation R10
4,4,3,1,3.00000,"{'a': 2, 'b': 2.0402703}",,,False,"{'a': 3, 'b': 3.0}",,...,2022-11-25T10:32:56.805Z,,,,ConciseLandingTaskDescription,3.50000,,,,Number representation R10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55042,55042,37415,998,46.00000,"{'a': 47, 'b': 47.33128}",,,True,"{'a': 46, 'b': 46.0}",,...,2021-01-06T14:14:46.133Z,,,,ConciseLandingTaskDescription,50.00000,,,,Number representation R100
55043,55043,37417,1000,,1,,,False,3,3.00000,...,2019-09-30T10:04:58.024Z,,,,ConciseSubitizingTaskDescription,,,,,Number representation R10
55044,55044,37417,1000,,2,,,True,,,...,2019-09-30T10:04:58.024Z,,0.00000,2.00000,ConciseTimeoutDescription,,,,,Number representation R10
55045,55045,37418,1000,,3,,,True,3,,...,2020-01-20T10:03:51.556Z,,,,ConciseConversionTaskDescription,,,,,Number representation R10


# BKT Model

In [7]:
#from pyBKT.models import Model
from pyBKT.models import Model

In [8]:
model = Model(seed=0, defaults={'order_id' : 'subtask_id'})
model.fit(data=subtasks, 
          multilearn = 'user_id',
          multigs = 'user_id',
          forgets = True)

KeyboardInterrupt: 

In [None]:
params = model.params()
params

# DKT model

In [60]:
from sklearn import feature_extraction, model_selection

In [61]:
# Useful functions for what comes next
def create_iterator(data):
    '''
    Create an iterator to split interactions in data into train and test, with the same student not appearing in two diverse folds.
    :param data:        Dataframe with student's interactions.
    :return:            An iterator.
    '''    
    # Both passing a matrix with the raw data or just an array of indexes works
    X = np.arange(len(data.index)) 
    # Groups of interactions are identified by the user id (we do not want the same user appearing in two folds)
    groups = data['user_id'].values 
    return model_selection.GroupShuffleSplit(n_splits=1, train_size=.8, test_size=0.2, random_state=0).split(X, groups=groups)

def prepare_seq(df):
    # Step 1 - Enumerate skill id, each different skill has a different id
    df['skill'], skill_codes = pd.factorize(df['skill_name'], sort=True)

    # Step 2 - Cross skill id with answer to form a synthetic feature, being 2*skill_id + correct, which is 1 if subtask was correct, 0 else 
    df['skill_with_answer'] = df['skill'] * 2 + df['correct']

    # Step 3 - Convert to a sequence per user id and shift features 1 timestep, take all skill_with_answer except last value, take all skill and correct values except first
    seq = df.groupby('user_id').apply(lambda r: (r['skill_with_answer'].values[:-1], r['skill'].values[1:], r['correct'].values[1:],))
    
    # Step 4 - Get max skill depth and max feature depth
    skill_depth = df['skill'].max() 
    features_depth = df['skill_with_answer'].max() + 1

    return seq, features_depth, skill_depth

def prepare_data(seq, params, features_depth, skill_depth):
    
    # Step 1 - Transform sequence to Tensorflow Dataset
    dataset = tf.data.Dataset.from_generator(generator=lambda: seq, output_types=(tf.int32, tf.int32, tf.float32))

    # Step 2 - Encode categorical features and merge skills with labels to compute target loss.
    dataset = dataset.map(
        lambda feat, skill, label: (
            tf.one_hot(feat, depth=features_depth),
            tf.concat(values=[tf.one_hot(skill, depth=skill_depth), tf.expand_dims(label, -1)], axis=-1)
        )
    )

    # Step 3 - Pad sequences per batch
    dataset = dataset.padded_batch(
        batch_size=params['batch_size'],
        padding_values=(params['mask_value'], params['mask_value']),
        padded_shapes=([None, None], [None, None]),
        drop_remainder=True
    )

    return dataset.repeat(), len(seq)

### Split data in train and test sets

In [62]:
# Obtain indexes
train_index, test_index = next(create_iterator(subtasks))
# Split the data
X_train, X_test = subtasks.iloc[train_index], subtasks.iloc[test_index]

# Split train data in train and validation sets
# Obtain indexes for necessary validation set
train_val_index, val_index = next(create_iterator(X_train))
# Split the training data into training and validation
X_train_val, X_val = X_train.iloc[train_val_index], X_train.iloc[val_index]

### Data preparation

In [63]:
# Specify parameters
params = {}
params['batch_size'] = 32
params['mask_value'] = -1.0

In [64]:
# Prepare the data
seq, features_depth, skill_depth = prepare_seq(subtasks)
seq_train = seq[X_train.user_id.unique()]
seq_val = seq[X_train_val.user_id.unique()]
seq_test = seq[X_test.user_id.unique()]

tf_train, length = prepare_data(seq_train, params, features_depth, skill_depth)
tf_val, val_length  = prepare_data(seq_val, params, features_depth, skill_depth)
tf_test, test_length = prepare_data(seq_test, params, features_depth, skill_depth)

# Specify further params
params['train_size'] = int(length // params['batch_size'])
params['val_size'] = int(val_length // params['batch_size'])
params['test_size'] = int(test_length // params['batch_size'])

In [65]:
seq

user_id
1       ([25, 25, 25, 25, 24, 25, 24, 23, 23, 31, 31, ...
2       ([25, 25, 25, 25, 25, 25, 24, 24, 24, 24, 25, ...
3       ([24, 25, 16, 16, 23, 23, 23, 23, 27, 27, 26, ...
4                      ([24, 25], [12, 12], [True, True])
5       ([25, 25, 25, 31, 30, 31, 31, 30, 30, 27, 27, ...
                              ...                        
995     ([25, 25, 25, 24, 30, 31, 26, 27, 19, 19, 19, ...
996     ([24, 25, 24, 25, 25, 25, 17, 17, 24, 25], [12...
997     ([24, 25, 25, 25, 24, 31, 23, 23, 23, 23, 23, ...
998     ([25, 25, 24, 25, 1, 25, 25, 24, 31, 31, 31, 3...
1000     ([24, 25, 25], [12, 12, 12], [True, True, True])
Length: 926, dtype: object

### Model creation

In [66]:
# Define model parameters
params['verbose'] = 1 # Verbose = {0,1,2}
params['best_model_weights'] = 'weights/bestmodel' # File to save the model
params['optimizer'] = 'adam' # Optimizer to use
params['backbone_nn'] = tf.keras.layers.RNN # Backbone neural network
params['recurrent_units'] = 16 # Number of RNN units
params['epochs'] = 10  # Number of epochs to train
params['dropout_rate'] = 0.3 # Dropout rate

In [67]:
# Function that removes predictions on time step associated with padding, and match outputs to specific skills
def get_target(y_true, y_pred, mask_value=params['mask_value']):
    
    # Get skills and labels from y_true
    mask = 1. - tf.cast(tf.equal(y_true, mask_value), y_true.dtype)
    y_true = y_true * mask

    skills, y_true = tf.split(y_true, num_or_size_splits=[-1, 1], axis=-1)

    # Get predictions for each skill
    y_pred = tf.reduce_sum(y_pred * skills, axis=-1, keepdims=True)

    return y_true, y_pred

In [68]:
# Define the metrics
class AUC(tf.keras.metrics.AUC):
    def update_state(self, y_true, y_pred, sample_weight=None):
        true, pred = get_target(y_true, y_pred)
        super(AUC, self).update_state(y_true=true, y_pred=pred, sample_weight=sample_weight)

class RMSE(tf.keras.metrics.RootMeanSquaredError):
    def update_state(self, y_true, y_pred, sample_weight=None):
        true, pred = get_target(y_true, y_pred)
        super(RMSE, self).update_state(y_true=true, y_pred=pred, sample_weight=sample_weight)
        
def CustomBinaryCrossEntropy(y_true, y_pred):    
    y_true, y_pred = get_target(y_true, y_pred)
    return tf.keras.losses.binary_crossentropy(y_true, y_pred)   

In [69]:
# Model creation 
def create_model(nb_features, nb_skills, params):
    
    # Create the model architecture
    inputs = tf.keras.Input(shape=(None, nb_features), name='inputs')
    x = tf.keras.layers.Masking(mask_value=params['mask_value'])(inputs)
    x = tf.keras.layers.LSTM(params['recurrent_units'], return_sequences=True, dropout=params['dropout_rate'])(x)
    dense = tf.keras.layers.Dense(nb_skills, activation='sigmoid')
    outputs = tf.keras.layers.TimeDistributed(dense, name='outputs')(x)
    model = tf.keras.models.Model(inputs=inputs, outputs=outputs, name='DKT')

    # Compile the model
    model.compile(loss=CustomBinaryCrossEntropy, 
                  optimizer=params['optimizer'], 
                  metrics=[AUC(), RMSE()])
    
    return model

model = create_model(features_depth, skill_depth, params)

In [70]:
model.summary()

Model: "DKT"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 inputs (InputLayer)         [(None, None, 32)]        0         
                                                                 
 masking_1 (Masking)         (None, None, 32)          0         
                                                                 
 lstm_1 (LSTM)               (None, None, 16)          3136      
                                                                 
 outputs (TimeDistributed)   (None, None, 15)          255       
                                                                 
Total params: 3,391
Trainable params: 3,391
Non-trainable params: 0
_________________________________________________________________


### Model fitting and evaluation

In [71]:
# Model fit
ckp_callback = tf.keras.callbacks.ModelCheckpoint(params['best_model_weights'], save_best_only=True, save_weights_only=True)
history = model.fit(tf_train, epochs=params['epochs'], steps_per_epoch=params['train_size'], 
                    validation_data=tf_val,  validation_steps = params['val_size'], 
                    callbacks=[ckp_callback], verbose=params['verbose'])

# Model evaluation
model.load_weights(params['best_model_weights'])
metrics_dkt_small = model.evaluate(tf_test, verbose=params['verbose'], steps = params['test_size'])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [72]:
# Getting the metrics
# Binary cross entropy, AUC, RMSE
metrics_dkt_small

[1.6921628713607788, 0.6680315732955933, 0.45636826753616333]

In [83]:
prediction = model.predict(x = tf_val, steps = 5)



In [93]:
prediction

<tf.RaggedTensor [[[0.5643526, 0.539063, 0.5067277, ..., 0.5752451, 0.52491564, 0.53036386],
  [0.59281737, 0.5631015, 0.49441013, ..., 0.6025807, 0.5426111, 0.5461789],
  [0.6218778, 0.58773005, 0.48795396, ..., 0.62230724, 0.5653233,
   0.5623036],
  ...,
  [0.7170984, 0.7721602, 0.5919929, ..., 0.6113148, 0.72336537, 0.58951926],
  [0.7170984, 0.7721602, 0.5919929, ..., 0.6113148, 0.72336537, 0.58951926],
  [0.7170984, 0.7721602, 0.5919929, ..., 0.6113148, 0.72336537, 0.58951926]],
 [[0.5699824, 0.5274181, 0.5143932, ..., 0.54954123, 0.527062, 0.5363871],
  [0.59659976, 0.5533096, 0.49897668, ..., 0.582076, 0.54126, 0.5508278],
  [0.59659976, 0.5533096, 0.49897668, ..., 0.582076, 0.54126, 0.5508278],
  ...,
  [0.59659976, 0.5533096, 0.49897668, ..., 0.582076, 0.54126, 0.5508278],
  [0.59659976, 0.5533096, 0.49897668, ..., 0.582076, 0.54126, 0.5508278],
  [0.59659976, 0.5533096, 0.49897668, ..., 0.582076, 0.54126, 0.5508278]] ,
 [[0.5643526, 0.539063, 0.5067277, ..., 0.5752451, 0.524

In [82]:
X_val

Unnamed: 0,subtask_id,event_id,user_id,aim,answer,answerMode,availableNumbers,correct,correctAnswerObject,correctNumber,...,subtask_finished_timestamp,target,timeoutInSeconds,timeoutInSteps,type,upperBound,divisor,orderIndependent,step,skill_name
14,14,12,2,,1,,,True,1,1.00000,...,2022-09-07T07:55:20.991Z,,,,ConciseSubitizingTaskDescription,,,,,Number representation R10
15,15,12,2,,1,,,True,,,...,2022-09-07T07:55:20.991Z,,0.00000,2.00000,ConciseTimeoutDescription,,,,,Number representation R10
16,16,13,2,,2,,,True,2,2.00000,...,2022-09-13T09:03:55.646Z,,,,ConciseSubitizingTaskDescription,,,,,Number representation R10
17,17,13,2,,0,,,True,,,...,2022-09-13T09:03:55.646Z,,0.00000,2.00000,ConciseTimeoutDescription,,,,,Number representation R10
18,18,14,2,,2,,,True,2,2.00000,...,2022-09-14T07:24:54.895Z,,,,ConciseSubitizingTaskDescription,,,,,Number representation R10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55008,55008,37387,997,19.00000,"{'a': 19, 'b': 19.173439}",,,True,"{'a': 19, 'b': 19.0}",,...,2021-06-07T11:26:45.986Z,,,,ConciseLandingTaskDescription,20.00000,,,,Number representation R20
55009,55009,37388,997,,8,,,True,8,,...,2021-06-10T09:33:23.334Z,,,,ConciseConversionTaskDescription,,,,,Number representation R20
55010,55010,37389,997,4.00000,"{'a': 4, 'b': 3.5028758}",,,True,"{'a': 4, 'b': 4.0}",,...,2021-06-10T09:38:38.457Z,,,,ConciseLandingTaskDescription,5.00000,,,,Number representation R20
55011,55011,37390,997,,20,,,True,20,,...,2021-06-14T10:23:24.287Z,,,,ConciseConversionTaskDescription,,,,,Number representation R20
