In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
import os
import random
from copy import deepcopy
import _pickle as pickle
import gc
from multiprocess import Pool
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import KBinsDiscretizer

from tensorflow.keras.optimizers import Adam, SGD
def save(file,name, folder = ""):
    """Pickle ``file`` to ``<name>.pickle``.

    Parameters
    ----------
    file : object
        Any picklable object.
    name : str
        File name without the ``.pickle`` extension.
    folder : str, optional
        Sub-folder (relative to the working directory) to write into.
        It is created when it does not exist yet. Empty string means
        the working directory itself.
    """
    if folder != "":
        os.makedirs('./'+folder, exist_ok=True)
        path = './'+folder+'/'+name+'.pickle'
    else:
        path = name+'.pickle'
    # The context manager guarantees the handle is flushed and closed,
    # even when pickling fails half-way.
    with open(path, 'wb') as outfile:
        pickle.dump(file, outfile, protocol=4)
    
def load(name, folder = ""):
    """Load and return the object pickled in ``<name>.pickle``.

    Parameters
    ----------
    name : str
        File name without the ``.pickle`` extension.
    folder : str, optional
        Sub-folder (relative to the working directory) that holds the file.
        Empty string means the working directory itself.

    Returns
    -------
    object
        The unpickled object.

    Notes
    -----
    Unpickling executes arbitrary code stored in the file; only load
    files produced by a trusted source.
    """
    if folder != "":
        path = './'+folder+'/'+name+'.pickle'
    else:
        path = name+'.pickle'
    # The context manager closes the handle even if unpickling raises.
    with open(path, 'rb') as infile:
        return pickle.load(infile)

class Discretiser:
    """Quantile-based mapping of a 1-D numeric array to integer codes.

    ``fit`` stores the empirical quantiles of the data at evenly spaced
    levels; ``transform`` interpolates new values onto those levels and
    truncates the scaled result to an integer.
    """

    def __init__(self, nbins):
        # One level fewer than requested: levels are k / (nbins - 1)
        # for k = 0 .. nbins - 2.
        self.nbins = nbins - 1
        self.map_to = np.arange(self.nbins) / self.nbins

    def fit(self, X):
        """Record the quantiles of the 1-D array ``X`` at the stored levels."""
        self.map_from = np.quantile(X, self.map_to)

    def transform(self, X):
        """Return the integer code of each value in ``X``.

        Values below the smallest fitted quantile get code 0, values
        above the largest fitted quantile get code ``self.nbins``.
        """
        levels = np.interp(X, self.map_from, self.map_to, left=0, right=1)
        scaled = levels * self.nbins
        return scaled.astype(int)

## Batch Generation

In [None]:
# train = load('train_train')

In [None]:
# test_user = train[train['user_id'] == 115]

In [None]:
# def build_user_sequence(df_user):
    
#     df_user =  df_user.sort_values(by = 'timestamp')
#     df_user.index = list(range(df_user.shape[0]))
    
#     df_user['content_type'] =  df_user['content_type_id'].apply(lambda x : 'q' if x == 0 else 'l')
#     df_user['content_seq'] = df_user['content_type'].astype(str) + '_' + df_user['content_id'].astype(str)
    
#     ## Encoder
#     exercise_id = df_user['content_seq'].values
#     container_id = df_user['task_container_id'].values
#     timestamp = df_user['timestamp'].values
    
#     ## Decoder
#     correctness = df_user['answered_correctly'].values
#     answer = df_user['user_answer'].values
    
#     elapsed_time = df_user['prior_question_elapsed_time'].values[1:] ## Already Padded
#     prior_question_had_explanation = df_user['prior_question_had_explanation'].values[1:]*1 ## Already Padded
    
#     lag_time = timestamp[1:] - timestamp[:1] + elapsed_time
    
#     dico = {
#         'exercise_id' : exercise_id,
#         'container_id' : container_id,
#         'timestamp' : timestamp,
#         'correctness' : correctness,
#         'answer' : answer, 
#         'elapsed_time' : elapsed_time,
#         'prior_question_had_explanation' : prior_question_had_explanation,
#         'lag_time' : lag_time
#     }
#     return dico
    

In [None]:
# batch_size = 2000
# count = 0
# vect = []
# count = 0
# p = Pool(12)

# for elt in tqdm(train.groupby('user_id'), total = train['user_id'].nunique()):
#     vect.append(elt)
#     if len(vect) == batch_size:
#         vect = np.array(vect)
#         vect_user = vect[:,0]
#         vect_data = vect[:,1]
#         vect = []
        
#         processed_dico = p.map(build_user_sequence, vect_data)
        
#         dico_user = {}
#         for i, elt in enumerate(vect_user):
#             dico_user[elt] = processed_dico[i]
#         save(dico_user, 'batch_'+str(count), 'user_batch_saint')
#         count += 1
        
# p.close()

## Tokenization

In [None]:
## timestamp encoder
# Fit the timestamp discretiser on the `timestamp` column of the pickled
# training frame ('train_train.pickle').
train = load('train_train')
t = train['timestamp'].values
timestamp_enc = Discretiser(300)
timestamp_enc.fit(t)

In [None]:
## Elapsed time encoder
# Fitted on the elapsed times found in a single pickled batch (batch_0 of
# 'user_batch_saint_2000'), not on the whole training set.
dico_user = load('batch_'+str(0), 'user_batch_saint_2000')
el = []
for elt in dico_user:
    ela = dico_user[elt]['elapsed_time']
    ela[np.isnan(ela)] = 0  # missing elapsed times are replaced by 0 (in place)
    el += list(ela)
elapsed_enc = Discretiser(300)
elapsed_enc.fit(el)   

In [None]:
## Question mean encoder
# Fit the discretiser on the values of the pickled 'dico_questions_mean'
# dictionary (presumably one mean score per question -- TODO confirm).
dico_question = load('dico_questions_mean')
val = list(dico_question.values())
qmean_enc = Discretiser(300)
qmean_enc.fit(val) 

In [None]:
## Saving
# Persist the three fitted discretisers as one tuple; DataGenerator unpacks
# them in this exact order (timestamp, elapsed, question mean).
save((timestamp_enc, elapsed_enc, qmean_enc), 'discrete_encoders')

In [None]:
## Building tokenizer
# The vocabulary is the four special tokens followed by every lecture id
# ('l_<id>') and every question id ('q_<id>').
lectures = pd.read_csv('lectures.csv')
questions = pd.read_csv('questions.csv')
# NOTE(review): the two arrays below are not used anywhere else in this cell.
user_answer = np.array([-1,0,1,2,3])
answered_correctly = np.array([-1,0,1])

lectures_id = lectures['lecture_id'].unique()
question_id = questions['question_id'].unique()

lectures_id = ['l_' +  elt for elt in  lectures_id.astype(str)]
question_id = ['q_' +  elt for elt in  question_id.astype(str)]

all_tokens = np.array(['[PAD]', '[CLS]', '[SEP]', '[MASK]'] + lectures_id + question_id)

# filters='' keeps characters such as '[', ']' and '_' inside the tokens.
tokenizer = Tokenizer(filters = '')

# Each token is passed as its own one-word "text".
tokenizer.fit_on_texts(
    all_tokens
)

save(tokenizer, 'tokenizer')

In [None]:
def create_dictionnaries():
    """Build per-content lookup tables from ``questions.csv`` and ``lectures.csv``.

    Content keys are ``'q_<question_id>'`` and ``'l_<lecture_id>'``.

    Returns
    -------
    dico_utags : dict
        content key -> id of its exact tag value (``'nan'`` is kept for
        questions without tags).
    dico_gtags : dict
        content key -> index of the connected component the content belongs
        to in the question/lecture graph (a question is linked to every
        lecture that shares one of its tags).
    dico_parts : dict
        content key -> value of the ``part`` column.

    Notes
    -----
    Unique-tag ids are now assigned once per distinct tag value. The previous
    membership test compared the raw tag against string keys, never matched,
    and therefore consumed a new id for every lecture row. Ids produced by
    this version differ from the old ones, so a previously saved
    ``dico_tags`` pickle must be regenerated together with anything trained
    on it.
    """
    import networkx as nx

    df = pd.read_csv('questions.csv')
    df1 = pd.read_csv('lectures.csv')

    def to_tab(x):
        # Space-separated tag string -> sorted array of ints ([] when missing).
        if str(x)!='nan':
            x = np.array(str(x).split(' ')).astype(int)
        else:
            x = []
        x.sort()
        return x

    df['tag'] = df['tags'].apply(to_tab)
    df['qu'] = df['question_id'].apply(lambda x: 'q_'+str(x))
    df1['l'] = df1['lecture_id'].apply(lambda x: 'l_'+str(x))

    ## unique tags part
    tags_to_utags = {}
    count = 0
    for elt in df1['tag']:
        key = str(elt)  # keys are strings because the lookup below runs on str columns
        if key not in tags_to_utags:
            tags_to_utags[key] = count
            count+=1

    for elt in df['tags']:
        if pd.isna(elt):
            # Questions without tags keep the literal 'nan' after astype(str).
            continue
        if elt not in tags_to_utags:
            tags_to_utags[elt] = count
            count+=1
    df['utags'] = df['tags'].astype(str).replace(tags_to_utags)
    df1['utags'] = df1['tag'].astype(str).replace(tags_to_utags)

    ## Graph tags part
    # tag -> lectures carrying that tag
    dico_l = {}
    for t, data in df1.groupby('tag'):
        dico_l[t] = data['l'].unique()

    G = nx.Graph()
    G.add_nodes_from(df['qu'])
    G.add_nodes_from(df1['l'])

    for qu, tags in tqdm(zip(df['qu'], df['tag']), total=len(df)):
        for j in tags:
            # Tags that no lecture carries simply add no edge.
            for k in dico_l.get(j, []):
                G.add_edge(qu, k)

    co = list(nx.connected_components(G))

    tags_to_gtags = {}
    for i, elt in enumerate(tqdm(co)):
        for j in elt:
            tags_to_gtags[j] = i

    df['gtags'] = df['qu'].replace(tags_to_gtags)
    df1['gtags'] = df1['l'].replace(tags_to_gtags)

    dico_utags = {}
    dico_gtags = {}
    dico_parts = {}
    for frame, key_col in ((df, 'qu'), (df1, 'l')):
        for key, utag, gtag, part in zip(frame[key_col], frame['utags'], frame['gtags'], frame['part']):
            dico_utags[key] = utag
            dico_gtags[key] = gtag
            dico_parts[key] = part

    return dico_utags, dico_gtags, dico_parts

# Build the three lookup tables once and persist them as one tuple;
# DataGenerator unpacks 'dico_tags' in this exact order.
dico_utags, dico_gtags, dico_parts = create_dictionnaries()

save((dico_utags, dico_gtags, dico_parts), 'dico_tags')

## Data Generator

In [None]:
import tensorflow as tf
class DataGenerator(tf.keras.utils.Sequence):
    """Keras ``Sequence`` that serves fixed-length batches of user histories.

    Every call to ``__getitem__`` loads one randomly chosen pickle from
    ``folder``, samples ``batch_size`` users from it (with replacement) and
    turns each user's history into ``max_len``-long feature rows.

    Parameters
    ----------
    batch_size : int
        Number of users per batch.
    max_len : int
        Length every sequence is cut or padded to.
    folder : str
        Folder (relative to the working directory) holding the batch pickles.
    strategy : str
        ``'begin'`` keeps the first ``max_len`` events of a history, any
        other value keeps the last ``max_len`` events.
    """

    # Feature names in the order build_sequence() emits them. The same order
    # is used for the returned X / y lists, so it must match the model inputs.
    _ENCODER_FEATURES = ['exercise', 'part', 'utag', 'gtag', 'timestamp', 'question_mean']
    _DECODER_FEATURES = ['correct', 'answer', 'elapsed_time', 'lag_time', 'was_explained']
    _OUTPUT_FEATURES = ['exercise', 'answer', 'correct']

    def __init__(self,batch_size=32, max_len = 128, folder = 'user_batch_saint_100', strategy = 'begin'):
        self.batch_size = batch_size
        self.tokenizer = load('tokenizer')
        self.max_len = max_len
        self.folder = folder
        self.dico_question = load('dico_questions_mean')
        self.dico_utags, self.dico_gtags, self.dico_parts = load('dico_tags')
        self.timestamp_enc, self.elapsed_enc, self.qmean_enc = load('discrete_encoders')
        self.strategy = strategy
        
    def __len__(self):
        # Batches are drawn at random, so the length is only a nominal upper
        # bound; the number of steps is chosen by the caller.
        return 1000000
    
    def initiate_dico(self):
        """Return zero-filled ``(dico_input, dico_output)`` batch buffers.

        The ``'exercise'`` buffers hold strings (raw tokens), every other
        buffer is ``int32``.
        """
        shape = (self.batch_size, self.max_len)

        def blank(name):
            dtype = str if name == 'exercise' else 'int32'
            return np.zeros(shape).astype(dtype)

        dico_input = {name: blank(name)
                      for name in self._ENCODER_FEATURES + self._DECODER_FEATURES}
        dico_output = {name: blank(name) for name in self._OUTPUT_FEATURES}
        return dico_input, dico_output

    def map_part(self, ids):
        """Map content keys to their part; unknown keys (e.g. padding) map to 0."""
        return np.array([self.dico_parts.get(x, 0) for x in ids])
    
    def map_utags(self, ids):
        """Map content keys to their unique-tag id (as str); unknown or 'nan' -> 0."""
        def replace_dico_utags(x):
            try:
                value = str(self.dico_utags[x])
            except KeyError:
                return 0
            return value if value != 'nan' else 0
        return np.array(list(map(replace_dico_utags,ids)))
    
    def map_gtags(self, ids):
        """Map content keys to their graph-tag id (as str); unknown or 'nan' -> 0."""
        def replace_dico_gtags(x):
            try:
                value = str(self.dico_gtags[x])
            except KeyError:
                return 0
            return value if value != 'nan' else 0
        return np.array(list(map(replace_dico_gtags,ids)))
    
    def map_mean(self, ids):
        """Map content keys to their stored mean; unknown keys map to 0.5."""
        return np.array([self.dico_question.get(x, 0.5) for x in ids])
    
    def update_dico(self, dico_input, dico_output, input_vals, output_vals, i):
        """Write one user's feature rows into row ``i`` of the batch buffers."""
        for j, elt in enumerate(self._ENCODER_FEATURES + self._DECODER_FEATURES):
            dico_input[elt][i] = input_vals[j]
        
        for j, elt in enumerate(self._OUTPUT_FEATURES):
            dico_output[elt][i] = output_vals[j]
        return dico_input, dico_output

    def remove_na(self, x):
        """Return ``x`` as a fresh numeric array with NaN replaced by 0."""
        x = np.array(list(x))
        x[np.isnan(x)] = 0
        return x
    

    def build_sequence(self, user_history):
        """Turn one user's history dict into model inputs and targets.

        Returns
        -------
        input_vals : list of 11 arrays
            Encoder features followed by decoder features. Decoder features
            are shifted one step to the right (position ``t`` sees the value
            of position ``t - 1``, position 0 sees 0).
        output_vals : list of 3 arrays
            Next exercise token, answer + 1 and correctness + 1.
        """
        dico_sequence = deepcopy(user_history)        
        dico_sequence['elapsed_time'] = self.remove_na(dico_sequence['elapsed_time'])
        dico_sequence['lag_time'] = self.remove_na(dico_sequence['lag_time'])
        dico_sequence['prior_question_had_explanation'] = self.remove_na(dico_sequence['prior_question_had_explanation'])
        
        ## Cut sequence
        # NOTE(review): every array is sliced independently. If the stored
        # arrays differ in length, the 'end' window of the shorter ones is
        # offset against the others -- TODO confirm against the batch builder.
        if self.strategy == 'begin':
            for elt in dico_sequence:
                dico_sequence[elt] = dico_sequence[elt][:self.max_len]
        else:
            for elt in dico_sequence:
                dico_sequence[elt] = dico_sequence[elt][-self.max_len:]
        
        ## Pad sequence
        # One pad value per key, matched by position: this relies on the
        # key order of the stored history dict.
        pad_tokens = ['[PAD]', 0, 0, -1, -1, 0, 0, 0]
        for j, elt in enumerate(dico_sequence):
            size = len(dico_sequence[elt])
            if size <= self.max_len:
                adding = self.max_len - size
                tok = pad_tokens[j]
                if isinstance(tok, str):
                    add = np.array([tok] * adding)
                else:
                    add = np.zeros(adding) + tok
                dico_sequence[elt] = np.concatenate([dico_sequence[elt], add], axis = 0)

        def shift_right(values):
            # Prepend a 0 and drop the last element.
            return np.concatenate([np.array([0]), values[:-1]])

        exercise_id = dico_sequence['exercise_id']
        input_vals = [
            exercise_id,
            self.map_part(exercise_id),
            self.map_utags(exercise_id),
            self.map_gtags(exercise_id),
            self.timestamp_enc.transform(dico_sequence['timestamp']),
            self.qmean_enc.transform(self.map_mean(exercise_id)),
            
            shift_right(dico_sequence['correctness'] + 1),
            shift_right(dico_sequence['answer'] + 1),
            shift_right(self.elapsed_enc.transform(dico_sequence['elapsed_time'])),
            # lag_time is binned with the elapsed-time discretiser.
            shift_right(self.elapsed_enc.transform(dico_sequence['lag_time'])),
            shift_right(dico_sequence['prior_question_had_explanation']),
        ]
        
        output_vals = [
            np.concatenate([exercise_id[1:], np.array(['[PAD]'])]),
            dico_sequence['answer'] + 1,
            dico_sequence['correctness'] + 1,
        ]

        return input_vals,output_vals

    def _tokenize(self, token_rows):
        """Convert a 2-D array of string tokens into an array of token ids."""
        texts = [" ".join(row) for row in token_rows]
        return np.array(self.tokenizer.texts_to_sequences(texts))
    

    def __getitem__(self, index):
        """Return one random batch as ``(X, y)`` lists of ``int32`` arrays.

        ``index`` is ignored: the batch file and the users are drawn at random.
        """
        ## Load random batch
        file_name = random.choice(os.listdir('./'+self.folder))
        dico_user = load(file_name.split('.')[0], self.folder)
        
        list_user = np.random.choice(list(dico_user.keys()), size = self.batch_size)
        
        dico_input, dico_output = self.initiate_dico()
        
        for i, elt in enumerate(list_user):
            user_history = dico_user[elt]
            input_vals, output_vals = self.build_sequence(user_history)
            dico_input, dico_output = self.update_dico(dico_input, dico_output, input_vals, output_vals, i)
        
        dico_input['exercise'] = self._tokenize(dico_input['exercise'])
        dico_output['exercise'] = self._tokenize(dico_output['exercise'])
        
        X = list(np.array(list(dico_input.values())).astype('int32'))
        y = list(np.array(list(dico_output.values())).astype('int32')) 
        
        return X, y

    def on_epoch_end(self):
        pass

    def __get_data(self, batch):
        pass

In [None]:
# Throw-away generator used by the next cells to inspect one batch.
gen = DataGenerator(batch_size=64, max_len = 128, folder = 'user_batch_saint_100')

In [None]:
%%time
# Time the construction of one batch (the index is ignored by the generator).
x, y = gen[0]

In [None]:
# Shape of every input feature array of the sampled batch.
for elt in x:
    print(elt.shape)

In [None]:
# Shape of every target array of the sampled batch.
for elt in y:
    print(elt.shape)

In [None]:
from tf_transformers2 import *
from tensorflow.keras.layers import Input, Dense, Dropout, TimeDistributed, LSTM
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

In [None]:
class SaintEncoder(tf.keras.layers.Layer):    
    """Transformer encoder over the exercise sequence with optional side embeddings.

    The exercise-token embedding (scaled by ``sqrt(d_model)``) plus the
    positional encoding is always used; the part, unique-tag, graph-tag,
    timestamp and question-mean embeddings are added only when their name is
    listed in ``use``.

    Parameters
    ----------
    num_layers, d_model, num_heads, dff, rate
        Forwarded to every ``EncoderLayer``.
    input_vocab_size : int
        Size of the exercise-token embedding table.
    maximum_position_encoding : int
        Number of positions in the positional-encoding table.
    bidirectional_encoder : bool
        When False a look-ahead mask is combined with the padding mask.
    """

    def __init__(self, num_layers = 2, d_model = 512, num_heads = 8, 
                 dff = 1024, input_vocab_size = 15000, maximum_position_encoding = 512, 
                 rate=0.1, bidirectional_encoder = True):
        super(SaintEncoder, self).__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        self.embedding = tf.keras.layers.Embedding(input_vocab_size, d_model)
        self.part_embedding = tf.keras.layers.Embedding(8, d_model)
        self.utag_embedding = tf.keras.layers.Embedding(2000, d_model)
        self.gtag_embedding = tf.keras.layers.Embedding(100, d_model)
        self.timestamp_embedding = tf.keras.layers.Embedding(301, d_model)
        self.question_mean_embedding = tf.keras.layers.Embedding(301, d_model)
        
        self.pos_encoding = positional_encoding(maximum_position_encoding, 
                                                self.d_model)

 
        self.enc_layers = [EncoderLayer(d_model, num_heads, dff, rate) 
                           for _ in range(num_layers)]

        self.dropout = tf.keras.layers.Dropout(rate)
        
        self.bidirectional_encoder = bidirectional_encoder
        
    def call(self, x, training,
            part_id = None,
            utag_id = None,
            gtag_id = None,
            timestamp_id = None,
            question_mean_id = None,
            use = ()
            ):
        """Encode the exercise-token ids ``x``.

        Parameters
        ----------
        x : tensor of exercise-token ids; id 1 is treated as padding.
        training : bool, forwarded to dropout and to the encoder layers.
        part_id, utag_id, gtag_id, timestamp_id, question_mean_id : tensors
            Side-feature ids. All five are looked up on every call, so all
            five must be provided even when not listed in ``use``.
        use : iterable of str
            Subset of ``'part'``, ``'utag'``, ``'gtag'``, ``'timestamp'``,
            ``'question_mean'`` naming the embeddings added to the input.

        Returns
        -------
        (encoded, mask) : the encoder output and the attention mask used.
        """
        seq_len = tf.shape(x)[1]
        
        if not self.bidirectional_encoder:
            look_ahead_mask = create_look_ahead_mask(tf.shape(x)[1])
            dec_target_padding_mask = create_padding_mask(x, pad_token = 1)
            mask = tf.maximum(dec_target_padding_mask, look_ahead_mask)
        else:
            mask = create_padding_mask(x, pad_token = 1)
        
        # adding embedding and position encoding.
        x = self.embedding(x)  # (batch_size, input_seq_len, d_model)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x += self.pos_encoding[:, :seq_len, :]
        
        ## Adding all the embeddings
        # Every table is looked up unconditionally so that all embedding
        # weights get built; skipping unused ones would change the set of
        # weights of the model and break loading of existing checkpoints.
        part_emb = self.part_embedding(part_id)
        utag_emb = self.utag_embedding(utag_id)
        gtag_emb = self.gtag_embedding(gtag_id)
        timestamp_emb = self.timestamp_embedding(timestamp_id)
        # Fixed: this lookup used part_id, so the question-mean feature was
        # never seen by the model.
        question_mean_emb = self.question_mean_embedding(question_mean_id)
        
        if 'part' in use:
            x += part_emb
            
        if 'utag' in use:
            x += utag_emb
            
        if 'gtag' in use:
            x += gtag_emb
            
        if 'timestamp' in use:
            x += timestamp_emb
            
        if 'question_mean' in use:
            x += question_mean_emb

        x = self.dropout(x, training=training)
    
        for i in range(self.num_layers):
            x = self.enc_layers[i](x, training, mask)

        return x, mask  # (batch_size, input_seq_len, d_model)

In [None]:
class SaintDecoder(tf.keras.layers.Layer):
    """Transformer decoder over the response sequence with optional side embeddings.

    The main embedding (scaled by ``sqrt(d_model)``) plus the positional
    encoding is always used; the answer, elapsed-time, lag-time and
    was-explained embeddings are added only when named in ``calls``.
    """
    def __init__(self, num_layers, d_model, num_heads, dff, target_vocab_size,
               maximum_position_encoding, rate=0.1, bidirectional_decoder = False):
        super(SaintDecoder, self).__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        # NOTE(review): sub-layers are created in this order; presumably the
        # order matters for loading saved weights -- do not reorder.
        self.embedding = tf.keras.layers.Embedding(target_vocab_size, d_model)
        self.answer_embedding = tf.keras.layers.Embedding(5, d_model)
        self.elapsed_time_embedding = tf.keras.layers.Embedding(301, d_model)
        self.lag_time_embedding = tf.keras.layers.Embedding(301, d_model)
        self.was_explained_embedding = tf.keras.layers.Embedding(2, d_model)
        
        
        self.pos_encoding = positional_encoding(maximum_position_encoding, d_model)

        self.dec_layers = [DecoderLayer(d_model, num_heads, dff, rate) 
                           for _ in range(num_layers)]
        self.dropout = tf.keras.layers.Dropout(rate)
        
        self.bidirectional_decoder = bidirectional_decoder
    
    def call(self, x, enc_output, training = True, padding_mask = None, 
            answer_id = None,
            elapsed_time_id = None,
            lag_time_id = None,
            was_explained_id = None,
            calls = []
            ):
        """Decode ``x`` while attending to ``enc_output``.

        ``x`` holds the ids fed to the main embedding; id 0 is treated as
        padding. ``padding_mask`` is forwarded to every decoder layer.
        All four side-feature id tensors are looked up on every call, so they
        must be provided even when not listed in ``calls``.
        Returns only the decoder output tensor.
        """

        seq_len = tf.shape(x)[1]
        attention_weights = {}
        
        # Self-attention mask: padding only, or padding combined with a
        # look-ahead mask when the decoder is not bidirectional.
        if self.bidirectional_decoder == False:
            look_ahead_mask = create_look_ahead_mask(tf.shape(x)[1])
            dec_target_padding_mask = create_padding_mask(x, pad_token = 0)
            mask = tf.maximum(dec_target_padding_mask, look_ahead_mask)
        else:
            mask = create_padding_mask(x, pad_token = 0)
        
        x = self.embedding(x)  # (batch_size, target_seq_len, d_model)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x += self.pos_encoding[:, :seq_len, :]
        
        
        ## Adding embeddings
        answer_emb = self.answer_embedding(answer_id)
        elapsed_time_emb = self.elapsed_time_embedding(elapsed_time_id)
        lag_time_emb = self.lag_time_embedding(lag_time_id)
        was_explained_emb = self.was_explained_embedding(was_explained_id)
        
        if 'answer' in calls:
            x += answer_emb
        
        if 'elapsed_time' in calls:
            x += elapsed_time_emb
        
        if 'lag_time' in calls:
            x += lag_time_emb
            
        if 'was_explained' in calls:
            x += was_explained_emb
            
        x = self.dropout(x, training=training)
                
        for i in range(self.num_layers):
            x, block1, block2 = self.dec_layers[i](x, enc_output, training,
                                                 mask, padding_mask)

            # Collected per layer but not returned by this method.
            attention_weights['decoder_layer{}_block1'.format(i+1)] = block1
            attention_weights['decoder_layer{}_block2'.format(i+1)] = block2

        # x.shape == (batch_size, target_seq_len, d_model)
        return x

In [None]:
# Assemble the functional model: 11 integer inputs of length max_len
# (6 encoder features, 5 decoder features) and 3 softmax heads.
max_len = 128

# Feature names, in the same order as the `inputs` list below and as the
# arrays returned by DataGenerator.
list_encoder = ['exercise', 'part', 'utag', 'gtag', 'timestamp', 'question_mean']
list_decoder = ['correct', 'answer', 'elapsed_time', 'lag_time', 'was_explained']
list_output = ['exercise', 'answer', 'correct']

inputs_exercise = tf.keras.Input(shape = (max_len,))
inputs_part = tf.keras.Input(shape = (max_len,))
inputs_utag = tf.keras.Input(shape = (max_len,))
inputs_gtag = tf.keras.Input(shape = (max_len,))
inputs_timestamp = tf.keras.Input(shape = (max_len,))
inputs_question_mean = tf.keras.Input(shape = (max_len,))

inputs_correct = tf.keras.Input(shape = (max_len,))
inputs_answer = tf.keras.Input(shape = (max_len,))
inputs_elapsed_time = tf.keras.Input(shape = (max_len,))
inputs_lag_time = tf.keras.Input(shape = (max_len,))
inputs_lag_was_explained = tf.keras.Input(shape = (max_len,))

inputs = [
    inputs_exercise,
    inputs_part,
    inputs_utag,
    inputs_gtag,
    inputs_timestamp,
    inputs_question_mean,
    
    inputs_correct,
    inputs_answer,
    inputs_elapsed_time,
    inputs_lag_time,
    inputs_lag_was_explained
]

# Both stacks are built with a look-ahead mask (bidirectional_* = False)
# and with dropout disabled (rate=0).
encoder = SaintEncoder(num_layers = 4, d_model = 512, num_heads = 8, 
                 dff = 1024, input_vocab_size = 15000, maximum_position_encoding = 512, 
                 rate=0, bidirectional_encoder = False)

decoder = SaintDecoder(num_layers = 4, d_model = 512, num_heads = 8, 
                       dff = 1024, target_vocab_size = 3, maximum_position_encoding = 512, 
                       rate=0, bidirectional_decoder = False)

# Side embeddings actually added to the encoder input.
calls_encoder = [
    'part', 
#     'utag', 
#     'gtag', 
    'timestamp', 
    'question_mean',
]

# NOTE(review): training = True is hard-coded into the graph here and in the
# decoder call below; it has no effect on dropout while rate=0.
encoded, padding_mask = encoder(inputs_exercise, training = True,
            part_id = inputs_part,
            utag_id = inputs_utag,
            gtag_id = inputs_gtag,
            timestamp_id = inputs_timestamp,
            question_mean_id = inputs_question_mean,
            use = calls_encoder)

# Side embeddings actually added to the decoder input.
calls_decoder = [
    'answer', 
    'elapsed_time', 
    'lag_time', 
#     'was_explained',
]

decoded = decoder(inputs_correct, encoded, training = True, padding_mask = padding_mask, 
            answer_id = inputs_answer,
            elapsed_time_id = inputs_elapsed_time,
            lag_time_id = inputs_lag_time,
            was_explained_id = inputs_lag_was_explained,
            calls = calls_decoder
                 )

# Encoder and decoder outputs are summed before the shared dense layer.
mix = encoded + decoded
mix = tf.keras.layers.Dense(512, activation = 'relu')(mix)

# The heads output probabilities (softmax), not logits.
question_head = tf.keras.layers.Dense(15000, activation = 'softmax', name = 'question_head')(mix)
answer_head = tf.keras.layers.Dense(5, activation = 'softmax', name = 'answer_head')(mix)
correct_head = tf.keras.layers.Dense(3, activation = 'softmax', name = 'correct_head')(mix)

outputs = [question_head, answer_head, correct_head]

model = tf.keras.Model(inputs, outputs)

In [None]:
# Resume from the checkpoint written by the save_weights cell further down;
# the file must already exist when this cell runs.
model.load_weights('./weights/saint+++_base_question_.h5')

In [None]:
# Print the layer and parameter overview of the assembled model.
model.summary()

In [None]:
## losses and metrics

# Per-position cross-entropy shared by the three masked losses below.
# The model heads end in a softmax, i.e. they output probabilities, so the
# loss must not apply another softmax: from_logits has to be False.
# reduction='none' keeps one value per position so padding can be masked out.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
                    from_logits=False, reduction='none')

def loss_qu(real_qu, pred_qu):
    """Masked cross-entropy for the question head.

    Positions whose label equals 1 contribute zero; the mean is taken over
    all positions, masked ones included.
    """
    per_position = loss_object(real_qu, pred_qu)
    keep = tf.cast(tf.math.not_equal(real_qu, 1), dtype=per_position.dtype)
    return tf.reduce_mean(per_position * keep)

def loss_co(real_co, pred_co):
    """Masked cross-entropy for the correctness head, scaled by 10.

    Positions whose label equals 0 contribute zero; the mean is taken over
    all positions, masked ones included.
    """
    per_position = loss_object(real_co, pred_co)
    keep = tf.cast(tf.math.not_equal(real_co, 0), dtype=per_position.dtype)
    return tf.reduce_mean(per_position * keep) * 10

def loss_an(real_an, pred_an):
    """Masked cross-entropy for the answer head, scaled by 10.

    Positions whose label equals 0 contribute zero; the mean is taken over
    all positions, masked ones included.
    """
    per_position = loss_object(real_an, pred_an)
    keep = tf.cast(tf.math.not_equal(real_an, 0), dtype=per_position.dtype)
    return tf.reduce_mean(per_position * keep) * 10

def acc_qu(true, pred):
    """Accuracy of the question head over positions whose label is not 1."""
    dtype = true.dtype
    keep = tf.cast(tf.math.logical_not(tf.math.equal(true, 1)), dtype=dtype)
    guess = tf.cast(
        tf.math.argmax(pred, axis=-1, output_type=tf.dtypes.int64), dtype=dtype
    ) * keep
    true = true * keep
    # Masked positions are 0 on both sides, so they always count as hits ...
    hits = tf.math.reduce_sum(tf.cast(tf.math.equal(guess, true), dtype=dtype))
    total = tf.math.reduce_sum(tf.cast(tf.math.greater(true, -1), dtype=dtype))
    ignored = total - tf.math.reduce_sum(keep)
    # ... and are therefore removed from numerator and denominator alike.
    return (hits - ignored) / ((total - ignored))

def acc_co(true, pred):
    """Accuracy of the correctness head over positions whose label is not 0."""
    dtype = true.dtype
    keep = tf.cast(tf.math.logical_not(tf.math.equal(true, 0)), dtype=dtype)
    guess = tf.cast(
        tf.math.argmax(pred, axis=-1, output_type=tf.dtypes.int64), dtype=dtype
    ) * keep
    true = true * keep
    # Masked positions are 0 on both sides, so they always count as hits ...
    hits = tf.math.reduce_sum(tf.cast(tf.math.equal(guess, true), dtype=dtype))
    total = tf.math.reduce_sum(tf.cast(tf.math.greater(true, -1), dtype=dtype))
    ignored = total - tf.math.reduce_sum(keep)
    # ... and are therefore removed from numerator and denominator alike.
    return (hits - ignored) / (total - ignored)

def acc_an(true, pred):
    """Accuracy of the answer head over positions whose label is not 0."""
    dtype = true.dtype
    keep = tf.cast(tf.math.logical_not(tf.math.equal(true, 0)), dtype=dtype)
    guess = tf.cast(
        tf.math.argmax(pred, axis=-1, output_type=tf.dtypes.int64), dtype=dtype
    ) * keep
    true = true * keep
    # Masked positions are 0 on both sides, so they always count as hits ...
    hits = tf.math.reduce_sum(tf.cast(tf.math.equal(guess, true), dtype=dtype))
    total = tf.math.reduce_sum(tf.cast(tf.math.greater(true, -1), dtype=dtype))
    ignored = total - tf.math.reduce_sum(keep)
    # ... and are therefore removed from numerator and denominator alike.
    return (hits - ignored) / (total - ignored)


In [None]:
from __future__ import division
import warnings

import matplotlib
import matplotlib.pyplot as plt
from IPython.display import clear_output

class CustomCallback(tf.keras.callbacks.Callback):
    """Record, plot and persist the training history after every epoch.

    At the end of each epoch the callback stores the Keras log values,
    computes the ROC AUC of the correctness head on the validation batch,
    redraws a 5x2 grid of curves and pickles the whole history.
    """

    # Attribute names, in the order they are stored in the history pickle.
    _HISTORY_FIELDS = (
        'epoch',
        'loss', 'val_loss', 'roc_auc', 'val_roc_auc',
        'question_head_loss', 'val_question_head_loss',
        'question_head_acc_qu', 'val_question_head_acc_qu',
        'correct_head_loss', 'val_correct_head_loss',
        'correct_head_acc_co', 'val_correct_head_acc_co',
        'answer_head_loss', 'val_answer_head_loss',
        'answer_head_acc_an', 'val_answer_head_acc_an',
        'lr', 'qushare', 'coshare', 'anshare',
    )

    # Keras log keys copied verbatim into the attribute of the same name.
    _LOGGED_KEYS = (
        'loss', 'val_loss',
        'question_head_loss', 'val_question_head_loss',
        'question_head_acc_qu', 'val_question_head_acc_qu',
        'correct_head_loss', 'val_correct_head_loss',
        'correct_head_acc_co', 'val_correct_head_acc_co',
        'answer_head_loss', 'val_answer_head_loss',
        'answer_head_acc_an', 'val_answer_head_acc_an',
    )

    # (grid position, title, ((attribute, legend label), ...)) per panel.
    _PANELS = (
        ((0, 0), 'losses', (('loss', 'loss'), ('val_loss', 'val_loss'))),
        ((0, 1), 'roc_auc', (('val_roc_auc', 'roc_auc_val'),)),
        ((1, 0), 'question_loss', (('question_head_loss', 'question_loss'),
                                   ('val_question_head_loss', 'val_question_loss'))),
        ((1, 1), 'question_accuracy', (('question_head_acc_qu', 'question_accuracy'),
                                       ('val_question_head_acc_qu', 'val_question_accuracy'))),
        ((2, 0), 'correct_loss', (('correct_head_loss', 'correct_loss'),
                                  ('val_correct_head_loss', 'val_correct_loss'))),
        ((2, 1), 'correct_accuracy', (('correct_head_acc_co', 'correct_accuracy'),
                                      ('val_correct_head_acc_co', 'val_correct_accuracy'))),
        ((3, 0), 'answer_loss', (('answer_head_loss', 'answer_loss'),
                                 ('val_answer_head_loss', 'val_answer_loss'))),
        ((3, 1), 'answer_accuracy', (('answer_head_acc_an', 'answer_accuracy'),
                                     ('val_answer_head_acc_an', 'val_answer_accuracy'))),
        ((4, 0), 'learning_rate', (('lr', 'learning_rate'),)),
        ((4, 1), 'ratio of training objective', (('qushare', 'question ratio'),
                                                 ('coshare', 'correct ratio'),
                                                 ('anshare', 'answer ratio'))),
    )

    def __init__(self, validation=None, logs = {}, dico_params = {}, from_path = None):
        """Create the callback.

        validation  : (x_val, y_val) pair used for the ROC AUC.
        logs        : unused.
        dico_params : dict with the keys 'lr', 'qu', 'co' and 'an', recorded
                      once per epoch.
        from_path   : optional name passed to ``load`` to resume from a
                      history pickle written by a previous run.
        """
        super(CustomCallback, self).__init__()
        self.validation = validation
        for field in self._HISTORY_FIELDS:
            setattr(self, field, [])

        self.dico_params = dico_params

        if from_path is not None:
            (self.epoch,
             self.loss, self.val_loss, self.roc_auc, self.val_roc_auc,
             self.question_head_loss, self.val_question_head_loss,
             self.question_head_acc_qu, self.val_question_head_acc_qu,
             self.correct_head_loss, self.val_correct_head_loss,
             self.correct_head_acc_co, self.val_correct_head_acc_co,
             self.answer_head_loss, self.val_answer_head_loss,
             self.answer_head_acc_an, self.val_answer_head_acc_an,
             self.lr, self.qushare, self.coshare, self.anshare
             ) = load(from_path)

    def on_epoch_end(self, epoch, logs={}):
        """Update the history, redraw the curves and save the history pickle."""
        # The epoch counter continues from a restored history rather than
        # following the `epoch` argument.
        curr_epoch = len(self.loss)
        self.epoch.append(curr_epoch)

        for key in self._LOGGED_KEYS:
            getattr(self, key).append(logs[key])

        ## Roc auc calculation on test set
        x_val, y_val = self.validation[0], self.validation[1]
        pred = self.model.predict(x_val, verbose = 0)
        scores = pred[2][:,:,2]   # third output head, class index 2
        labels = y_val[2]

        labels = labels.reshape(labels.shape[0]*labels.shape[1])
        scores = scores.reshape(scores.shape[0]*scores.shape[1])

        # Label 0 positions are excluded; the remaining labels are shifted
        # down by one.
        answered = labels != 0
        scores = scores[answered]
        labels = labels[answered] - 1

        roc_auc = roc_auc_score(labels, scores)
        # The same validation score is stored in both lists.
        self.roc_auc.append(roc_auc)
        self.val_roc_auc.append(roc_auc)
        print(roc_auc)

        logs['roc_auc'] = roc_auc

        self.lr.append(self.dico_params['lr'])
        self.qushare.append(self.dico_params['qu'])
        self.coshare.append(self.dico_params['co'])
        self.anshare.append(self.dico_params['an'])

        clear_output(wait=True)
        print(logs)
        n_rows = 5
        n_cols = 2
        fig, ax = plt.subplots(n_rows, n_cols,
                        gridspec_kw={'hspace': 0.3, 'wspace': 0.2}, figsize = (20,n_rows*7))

        for (row, col), title, curves in self._PANELS:
            panel = ax[row, col]
            for attribute, label in curves:
                panel.plot(self.epoch, getattr(self, attribute), label = label)
            panel.set_title(title)
            panel.legend()

        plt.legend()
        plt.show()

        params = tuple(getattr(self, field) for field in self._HISTORY_FIELDS)
        save(params, 'history_epoch_'+str(curr_epoch), 'historysaint+++')
        

In [None]:
from tensorflow.keras.optimizers import Adam, SGD

# losses = [loss_cl, loss_qu, loss_co, loss_an]
# Training hyper-parameters: learning rate and the weight of each head's loss.
lr = 3e-6
qu = 0.1
an = 0.1
co = 0.9

# Hyper-parameters handed to the history callback.
dico_params = {
    'lr':lr,
    'qu': qu,
    'an':an,
    'co':co
}


# Per-head loss, loss weight and metric, keyed by output name.
# loss_* and acc_* are defined elsewhere in the notebook.
losses = {"question_head": loss_qu, 'answer_head': loss_an, 'correct_head': loss_co}

lossWeights = { "question_head": qu, 'answer_head': an, 'correct_head': co}

metrics = {"question_head": acc_qu, 'answer_head': acc_an, 'correct_head': acc_co}

loss_classif     =  losses
# Adam's second positional argument is beta_1, not epsilon: the previous
# `Adam(lr, 1e-8)` set beta_1=1e-8. Pass epsilon by keyword so the default
# beta_1 is kept and 1e-8 is used as the numerical-stability term.
optimizer        =  Adam(lr, epsilon=1e-8)
metrics_classif  =  []  # not used by the compile call below; kept so the name stays defined

model.compile(loss=loss_classif,
              optimizer=optimizer,
              metrics=metrics,
             loss_weights=lossWeights)

In [None]:
# Batch generators for training and evaluation; both read pre-built user
# batches from their folder and use the 'end' strategy.
train_gen = DataGenerator(
    batch_size=64,
    max_len=max_len,
    folder='user_batch_saint_100',
    strategy='end',
)
test_gen = DataGenerator(
    batch_size=512,
    max_len=max_len,
    folder='user_batch_saint_test',
    strategy='end',
)

# The first test batch serves as the fixed validation set.
x_test, y_test = test_gen[0]

In [None]:
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ReduceLROnPlateau

# History callback evaluated on the fixed validation batch.
# NOTE(review): `from_path` presumably resumes the curves pickled at epoch 46 — confirm in CustomCallback.
history = CustomCallback(
    validation=(x_test, y_test),
    dico_params=dico_params,
    from_path='./historysaint+++/history_epoch_46',
)

callbacks = [history]

# Training schedule (batch_size is not passed to fit; the generator sets the batch size).
batch_size = 64
n_epochs = 500
steps_per_epoch = 200

model.fit(
    train_gen,
    epochs=n_epochs,
    steps_per_epoch=steps_per_epoch,
    validation_data=(x_test, y_test),
    max_queue_size=20,
    # workers=6,
    callbacks=callbacks,
    verbose=1,
)


In [None]:
# Persist the current model weights to an .h5 file under ./weights/.
model.save_weights('./weights/saint+++_base_question_.h5')

In [None]:
# Display `pred`.
# NOTE(review): in the visible part of the notebook `pred` is only assigned in later cells —
# confirm this cell still works after Restart & Run All.
pred

In [None]:
# Display `true`.
# NOTE(review): in the visible part of the notebook `true` is only assigned in later cells —
# confirm this cell still works after Restart & Run All.
true

In [None]:
# Print the docstring of model.save_weights.
# NOTE(review): looks like leftover interactive exploration — consider removing from the final notebook.
help(model.save_weights)

In [None]:
# Save the whole model (not only its weights) to ./model.
model.save('./model')

In [None]:
# Take the first batch of test_gen as evaluation inputs and targets.
# NOTE(review): whether indexing the generator again returns the same samples depends on DataGenerator — confirm.
x_test, y_test = test_gen[0]

In [None]:
# Keep only the third model output and the matching third target array.
# NOTE(review): index 2 is presumably the 'correct' head — confirm against the model's output order.
pred1 = model.predict(x_test, verbose = 1)[2]
true1 = y_test[2]

In [None]:
# Window of sequence positions [m, M) to evaluate; with these values, the single position 25.
m = 25
M = 26
# Expected number of rows in the test batch (test_gen is built with batch_size=512).
bs = 512
# Channel 2 of the last prediction axis, restricted to the selected positions.
# NOTE(review): presumably the score of the "answered correctly" class — confirm against the label encoding.
pred = pred1[:,m:M,2]
true = true1[:,m:M]

In [None]:
# Flatten the selected window to 1-D. reshape(-1) infers the length from the
# data, so the cell no longer fails when the batch holds fewer (or more) rows
# than the hard-coded `bs`.
y_pred = pred.reshape(-1)
true = true.reshape(-1)

# Input 5 over the same window, flattened the same way.
# NOTE(review): presumably the 'question_mean' feature — confirm against the input order.
qm = x_test[5][:,m:M].reshape(-1)

# Label 0 is excluded (presumably padding — confirm); the mask is computed once,
# before `true` is overwritten, and applied to all three arrays.
keep = true != 0
qm = qm[keep]

y_pred = y_pred[keep]
# Shift the remaining labels down by one.
true = true[keep]-1

In [None]:
from sklearn.metrics import accuracy_score
# Accuracy on the selected window, thresholding the scores at 0.5.
accuracy_score(true, (y_pred >= 0.5)*1)

In [None]:
from sklearn.metrics import roc_auc_score
# ROC AUC of the raw scores on the selected window.
roc_auc_score(true, y_pred)

In [None]:
# Names of the encoder inputs, decoder inputs and model outputs.
# NOTE(review): presumably listed in the same positional order as the model's inputs/outputs — confirm.
list_encoder = [
    'exercise',
    'part',
    'utag',
    'gtag',
    'timestamp',
    'question_mean',
]
list_decoder = [
    'correct',
    'answer',
    'elapsed_time',
    'lag_time',
    'was_explained',
]
list_output = [
    'exercise',
    'answer',
    'correct',
]

In [None]:
from sklearn.metrics import roc_auc_score
def acc(true, pred):
    """Return the ROC AUC of ``pred`` against ``true`` for one sequence.

    Positions whose label is 2 or more are ignored. When the remaining labels
    are all 0 or all 1 (an empty selection counts as all 0), one synthetic
    negative (label 0, score 0) and one synthetic positive (label 1, score 1)
    are appended so that ``roc_auc_score`` has both classes to work with.

    NOTE(review): for such single-class sequences the result is driven by the
    synthetic pair, which may inflate averages — confirm this is intended.
    """
    labels = np.array(true)
    scores = np.array(pred)

    # One mask, taken from the labels, applied to both arrays.
    valid = labels < 2
    scores = scores[valid]
    labels = labels[valid]

    n_positive = labels.sum()
    single_class = n_positive == 0 or n_positive == len(labels)
    if single_class:
        synthetic_pair = np.array([0,1])
        labels = np.concatenate([labels, synthetic_pair])
        scores = np.concatenate([scores, synthetic_pair])

    return roc_auc_score(labels, scores)

def test(true, pred):
    """Print the global and mean per-row ROC AUC and plot the per-row scores.

    ``true`` and ``pred`` are indexed as 2-D arrays of the same size. Labels
    of 2 or more are left out of the global score; each row is scored with
    ``acc``. A histogram of the per-row scores is drawn on a new figure.

    Returns the list of per-row scores.
    """
    n_cells = true.shape[0] * true.shape[1]
    flat_pred = pred.reshape(n_cells)
    flat_true = true.reshape(n_cells)

    # Global AUC over every position whose label is below 2.
    valid = flat_true < 2
    print(roc_auc_score(flat_true[valid], flat_pred[valid]))

    # One score per row, with a progress bar.
    p = [acc(row, pred[i]) for i, row in enumerate(tqdm(true))]

    plt.figure(figsize = (25,15))
    plt.hist(p, bins = 50)

    print(np.mean(p))

    return p

In [None]:
# NOTE(review): this uses `X_test` (capital X) while the surrounding cells use `x_test`; no assignment
# to `X_test` is visible in this part of the notebook — confirm it exists or that `x_test` was meant.
pred = model.predict(X_test)

In [None]:
# Display the current predictions.
pred

In [None]:
# Display the current targets.
true

In [None]:
# Check the shape of the predictions before slicing them.
pred.shape

In [None]:
# Keep only the first two channels of the last axis.
# NOTE(review): this overwrites `pred` in place, so re-running the cell is not idempotent.
pred = pred[:,:,:2]

def softmax(tab):
    """Return the softmax probability of class index 1 along the last axis.

    Parameters
    ----------
    tab : np.ndarray
        Logits with the classes on the last axis (at least two classes).

    Returns
    -------
    np.ndarray
        Array shaped like ``tab`` without its last axis, holding
        ``exp(tab[..., 1]) / sum(exp(tab), axis=-1)``.
    """
    # Subtracting the per-row maximum leaves the softmax unchanged but keeps
    # np.exp from overflowing (inf / inf -> nan) on large logits.
    shifted = tab - np.max(tab, axis=-1, keepdims=True)
    e = np.exp(shifted)
    s = np.sum(e, axis = -1)

    # Ellipsis indexing matches the former e[:,:,1] on 3-D input and also
    # accepts inputs of other ranks.
    return e[..., 1] / s

# Replace the logits by the probability of class index 1.
# NOTE(review): this overwrites `pred`, so re-running the cell applies softmax to its own output.
pred = softmax(pred)


In [None]:
# Inspect column 0 of input 5 for every row of the test batch.
x_test[5][:,0]

In [None]:
# Build one evaluation sample per non-padding position of a single test row:
# the first six inputs are shifted left so that the position comes first and
# are re-padded on the right; the last five inputs are passed unchanged.
ind = 115
x_val = [[] for _ in range(11)]
y_val = []

for pos, token in enumerate(x_test[0][ind]):
    # NOTE(review): value 1 in input 0 appears to be the padding id (the shifted
    # sequence is re-padded with ones) — confirm.
    if token == 1:
        continue

    pad_len = pos
    # Input 0 is re-padded with ones, inputs 1-5 with zeros.
    shifted = [np.concatenate([x_test[0][ind, pos:], np.ones(pad_len)])]
    shifted += [
        np.concatenate([x_test[k][ind, pos:], np.zeros(pad_len)])
        for k in range(1, 6)
    ]
    unshifted = [x_test[k][ind, :] for k in range(6, 11)]

    for bucket, feature in zip(x_val, shifted + unshifted):
        bucket.append(feature)

    # Target of the third output at the position that was moved to the front.
    y_val.append(y_test[2][ind, pos])

x_val = [np.array(samples).astype('int32') for samples in x_val]
y_val = np.array(y_val)

In [None]:
# Third model output for the shifted samples.
a = model.predict(x_val, verbose = True)[2]

In [None]:
# For every sample keep position 0 and channel 2 of the last axis.
# NOTE(review): this overwrites `a`, so the cell cannot be re-run without re-running the prediction.
a = a[:,0,2]

In [None]:
# Display the selected scores.
a

In [None]:
# Display the targets shifted down by one; y_val itself is left unchanged.
y_val -1

In [None]:
# ROC AUC of the position-0 scores against the (unshifted) targets.
# NOTE(review): assumes y_val holds exactly two distinct label values — confirm no padding label is present.
roc_auc_score(y_val, a)

In [None]:
# Check the shape of `pred`.
pred.shape

In [None]:
# Print the shape of every sample stored for the first input.
for elt in x_val[0]:
    print(elt.shape)

In [None]:
# Display the predictions at index 1.
pred[1]

In [None]:
# Display the first element of y_test.
y_test[0]

In [None]:
# Per-row AUC report on the test batch.
# NOTE(review): y_test is indexed like a collection of per-head targets elsewhere (y_test[2]), while
# test() reads true.shape — confirm that passing the whole y_test is intended.
perf = test(y_test, pred)

## Improvements

In [None]:
# Shape of the stacked samples for the first input.
x_val[0].shape

In [None]:
# Inspect column 15 of input 5 for every row of the test batch.
x_test[5][:,15]

In [None]:
# Inspect column 0 of the first target array.
y_test[0][:,0]

In [None]:
# Display the current predictions.
pred

In [None]:
# Display the current targets.
true

Add context on lectures and tasks.

Cluster lectures and tasks.

Give the average score of a given task.

Enhance the test set with the train set (optimization constraint).


In [None]:
# Persist the current model weights to an .h5 file under ./weights/.
# NOTE(review): the file name says 'lstmgpt' while the other checkpoint saved in this part of the
# notebook is named 'saint+++' — confirm these are the weights of the intended model.
model.save_weights('./weights/lstmgpt_auc_0.757.h5')

In [None]:
# Display the current targets.
true