In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
import os
import random
from copy import deepcopy
import _pickle as pickle
import gc

from tensorflow.keras.preprocessing.text import Tokenizer

def save(file,name, folder = ""):
    """Pickle ``file`` to ``<name>.pickle``.

    Parameters
    ----------
    file : object
        Any picklable Python object.
    name : str
        Base file name, without the ``.pickle`` extension.
    folder : str, optional
        Sub-directory of the current working directory to write into.
        It is not created here and must already exist.
    """
    if folder != "":
        path = './'+folder+'/'+name+'.pickle'
    else:
        path = name+'.pickle'
    # The context manager guarantees the handle is flushed and closed; the
    # previous `outfile.close` (no parentheses) never actually closed it.
    with open(path, 'wb') as outfile:
        pickle.dump(file, outfile, protocol=4)
    
def load(name, folder = ""):
    """Load and return the object pickled in ``<name>.pickle``.

    Parameters
    ----------
    name : str
        Base file name, without the ``.pickle`` extension.
    folder : str, optional
        Sub-directory of the current working directory to read from.

    Returns
    -------
    object
        The unpickled object.

    Notes
    -----
    Unpickling executes arbitrary code: only use this on files produced by
    this notebook, never on untrusted data.
    """
    if folder != "":
        path = './'+folder+'/'+name+'.pickle'
    else:
        path = name+'.pickle'
    # `with` closes the handle; the previous `outfile.close` was a no-op
    # attribute access (missing parentheses).
    with open(path, 'rb') as infile:
        return pickle.load(infile)

In [None]:
os.listdir()

In [None]:
# The training set is read from a pickle ('train.pickle') instead of the CSV.
# train = pd.read_csv('train.csv')
train = load('train')

# train[train['content_id'] == 0] = 13433

# Metadata tables for lectures and questions.
lectures = pd.read_csv('lectures.csv')
questions = pd.read_csv('questions.csv')

# Example files describing the format of the test batches and of the submission.
test = pd.read_csv('example_test.csv')
sample = pd.read_csv('example_sample_submission.csv')

# NOTE(review): `gc` is already imported in the first cell; this import is redundant.
import gc
gc.collect()

In [None]:
train.head()

In [None]:
# Keep only rows with content_type_id == 0 (questions, per the column notes below).
# NOTE(review): this rebinds `train`, so every later cell only sees question rows.
train = train[train['content_type_id'] == 0]

In [None]:
# Mean of `answered_correctly` per question, keyed by the 'q_<content_id>' token
# used in the user sequences. A single vectorised groupby replaces the Python
# loop that materialised one DataFrame per question.
question_means = train.groupby('content_id')['answered_correctly'].mean()
dico_questions = {'q_'+str(q): mean for q, mean in question_means.items()}

In [None]:
save(dico_questions, 'dico_questions_mean')

In [None]:
class FakeDataGenerator:
    """Replays a sample of the training set as a stream of test-like batches.

    The attributes are filled by ``build_from_train`` or ``load``:

    * ``data``       : dict ``{group_num -> DataFrame}`` holding the stored batches
    * ``all_rows``   : array of the train ``row_id`` values used by the generator
    * ``data_index`` : array of the available group numbers (the keys of ``data``)
    """

    def __init__(self):
        self.data = None
        self.all_rows = None
        self.data_index = None

    def __getitem__(self, idx):
        """Return ``(sample, sub)`` for batch ``idx``.

        ``sample`` is the stored batch DataFrame; ``sub`` is a submission
        template with ``row_id``, ``group_num`` and a constant 0.5 prediction.
        """
        sample = self.data[idx]
        sub = sample[['row_id', 'group_num']].copy()
        sub['answered_correctly'] = np.zeros(sub.shape[0])+0.5
        return (sample, sub)

    def load(self, save_name):
        """Restore ``data`` / ``all_rows`` from the pickle named ``save_name``."""
        self.data,self.all_rows = load(save_name)
        self.data_index = np.array(list(self.data.keys()))

    def build_from_train(self, train, n_users, beginner_rate = 0.3, save_name = 'fake_train_generator'):
        """Build the batches from ``train``, store them on ``self`` and pickle them.

        Parameters
        ----------
        train : DataFrame
            The loaded training set. It is not modified.
        n_users : int
            Number of distinct users to sample (capped at the number of
            users present in ``train``).
        beginner_rate : float
            Probability that a sampled user is replayed from their first
            task container instead of from a random one.
        save_name : str
            Name under which ``(data, all_rows)`` is pickled.
        """

        ## Sampling a restricted list of users
        user_list = train['user_id'].unique()
        # replace=False: sampling with replacement could pick the same user twice,
        # which duplicated every one of that user's rows in the generated batches.
        n_sampled = min(n_users, len(user_list))
        test_user_list = np.random.choice(user_list, size = n_sampled, replace = False)
        # Boolean selection leaves the caller's `train` index untouched.
        test_data_non_filter = train[train['user_id'].isin(test_user_list)].reset_index(drop = True)

        ## Building a dictionary with all the rows and container ids of each user
        work_dico = {}

        print("Generating user dictionnary")
        for user, frame in tqdm(test_data_non_filter.groupby('user_id'), total =test_data_non_filter['user_id'].nunique()):
            if frame.shape[0] == 0:
                continue

            min_indice = int(frame['task_container_id'].min())
            max_indice = int(frame['task_container_id'].max())

            if random.uniform(0,1) < beginner_rate:
                current_indice = min_indice
            else:
                # Clamp the upper bound: randint raises ValueError on an empty
                # range, which happened for users with fewer than 3 containers.
                current_indice = random.randint(min_indice, max(min_indice, max_indice-2))

            work_dico[user] = {
                'min_indice': min_indice,
                'max_indice': max_indice,
                'current_indice': current_indice,
                # {task_container_id -> [row_id, ...]}
                'row_ids': frame.groupby('task_container_id')['row_id'].apply(list).to_dict(),
            }

        ## Choosing batch_data to generate
        batches = {}
        all_rows = []
        batch_number = 0

        print('Creating batches')
        # NOTE: as in the original, the loop stops when a single user is left, so
        # that user's remaining containers are never emitted.
        while len(work_dico)> 1:

            size = min(random.randint(20,500), len(work_dico))
            users = np.random.choice(np.array(list(work_dico.keys())),replace = False,  size = size)

            batch = []
            for u in users:
                state = work_dico[u]
                try:
                    rows = state['row_ids'][state['current_indice']]
                except KeyError:
                    # Container ids are not necessarily contiguous: a missing id
                    # ends this user's stream.
                    work_dico.pop(u)
                    continue

                batch.extend(rows)
                all_rows.extend(rows)
                state['current_indice'] += 1
                # The user's last container (max_indice) is never served.
                if state['current_indice'] >= state['max_indice']:
                    work_dico.pop(u)

            batches[batch_number] = batch
            batch_number += 1

        ## Building data
        data = {}
        saved_answer = None
        saved_correct_answer = None

        print("Building dataset")
        indexed = test_data_non_filter.set_index('row_id', drop = False)
        for i in tqdm(batches):
            # .copy() so the column assignments below never target a view.
            current_data = indexed.loc[batches[i]].copy()
            n_rows = current_data.shape[0]
            current_data['group_num'] = i

            # Only the first row of a batch carries the previous batch's answers.
            prior_correct = [np.nan] * n_rows
            prior_responses = [np.nan] * n_rows
            if i != 0 and n_rows > 0:
                prior_correct[0] = saved_correct_answer
                prior_responses[0] = saved_answer
            current_data['prior_group_answers_correct'] = prior_correct
            current_data['prior_group_responses'] = prior_responses

            is_question = current_data['content_type_id'] == 0
            saved_answer = str(list(current_data[is_question]['user_answer'].values))
            saved_correct_answer = str(list(current_data[is_question]['answered_correctly'].values))
            # The labels are removed from the batch.
            current_data = current_data.drop(columns = ['user_answer', 'answered_correctly'])

            data[i] = current_data

        save((data,np.array(all_rows)) , save_name)

        self.data = data
        self.all_rows = np.array(all_rows)
        self.data_index = np.array(list(data.keys()))
        print('finished')

In [None]:
env = FakeDataGenerator()

In [None]:
env.build_from_train(train, 15000, beginner_rate = 0.3, save_name = 'fake_train_generator')

In [None]:
len(env.all_rows)

In [None]:
env.load('fake_train_generator')

In [None]:
env.all_rows

In [None]:
train.index = train['row_id']

In [None]:
train = train.drop(index = env.all_rows)

In [None]:
save(train, 'train_train')

## EDA

In [None]:
393656/5000

In [None]:

# Dump the raw per-user frames to disk in chunks of 10000 users.
# save() does not create folders, so make sure the target exists.
os.makedirs('user_batch', exist_ok=True)

dico = {}
count = 0
for userid, data in tqdm(train.groupby('user_id'), total = train['user_id'].nunique()):
    dico[userid] = data
    if len(dico) == 10000:
        save(dico, 'userbatch_'+str(count), 'user_batch')
        count+=1
        dico = {}

# Flush the last, partially filled chunk (it was silently dropped before).
if dico:
    save(dico, 'userbatch_'+str(count), 'user_batch')

In [None]:
train.shape

In [None]:
train.head(50)

In [None]:
for elt in train.columns:
    print(elt + '       '+ str(train[elt].nunique()))

In [None]:
train.describe()

In [None]:
# `u115` was never defined in the notebook (leftover kernel state); rebuild it
# as the interactions of user 115, the same user inspected in the modeling section.
u115 = train[train['user_id'] == 115]
u115.head(50)

timestamp : relative time since first interaction

user_id : identifier of the user

content_id : identifier of the content

content_type_id : 0 = question, 1 = lecture

task_container_id : identifier of a sequence of questions (i.e. the correction is shown at the end of the sequence)

user_answer : user answer

answered_correctly : whether the user answered the question correctly

prior_question_elapsed_time : average time the user spent on the previous container

prior_question_had_explanation : within the same bundle, whether or not the user saw the answer to the previous question

In [None]:
questions.shape

In [None]:
questions.head()

In [None]:
for elt in questions.columns:
    print(elt + '       '+ str(questions[elt].nunique()))

In [None]:
questions['bundle_id'].unique()

Parts
Section 1 listening
1 : 6 questions, four oral statements about a photo, choose the right one
2 : 25 questions, 3 responses for one question, given orally
3 : 39 questions, conversation between people, question written, select best answer
4 : 30 questions, talks or narrations ...
    
Section 2 reading
5 : 30 questions, incomplete sentence completion
6 : 16 questions, text completion
7 : 29 + 25 questions, text understanding

In [None]:
lectures.shape

In [None]:
lectures.head()

In [None]:
lectures['type_of'].unique()

In [None]:
for elt in lectures.columns:
    print(elt + '       '+ str(lectures[elt].nunique()))

## Modeling

In [None]:
# Categorical feature names.
# NOTE(review): 'user_answer_last' and 'answered_correctly_last' are not created
# anywhere in the visible notebook — confirm where they come from.
cat_cols = [
    'content_id',
    'content_type_id',
    'task_container_id',
    'user_answer_last',
    'answered_correctly_last',
    'prior_question_had_explanation',
]

# Numerical feature names.
num_cols = [
    "timestamp",
    "prior_question_elapsed_time",
]

# Name of the target column.
pred_col = 'answered_correctly'

In [None]:
sequence1 : ['content_id' + 'type_id','time_spent_discretised', 'answer', 'answer_correctly']
sequence2 (embedding) : ['timestamp']
sequence3 (embedding): ['number of event before']
mask1 :  'padding_mask'
mask2 :  'answer_correctly_mask'


In [None]:
train = load('train_train')

In [None]:
gc.collect()

In [None]:
def build_user_sequence(df_user):
    """Turn one user's interactions into a flat token sequence with targets.

    Rows are sorted by ``timestamp`` and each one becomes three tokens:
    ``q_<content_id>`` (or ``l_<content_id>`` when ``content_type_id`` is not 0),
    ``a_<user_answer>`` and ``r_<answered_correctly>``.

    Returns
    -------
    seq : list of str
        The flattened tokens, three per row.
    timestamps : array
        The row timestamps, each repeated three times.
    position : index
        The row rank (0, 1, ...), each repeated three times.
    y : array
        For every token but the last two, the class of the token located two
        steps later: 0 for ``r_0``, 1 for ``r_1``, 2 for anything else.
    """
    events = df_user.sort_values(by = 'timestamp')
    events.index = list(range(events.shape[0]))

    # One token column per element of the (content, answer, result) triple.
    kind = events['content_type_id'].apply(lambda x : 'q' if x == 0 else 'l')
    content_tokens = kind.astype(str) + '_' + events['content_id'].astype(str)
    answer_tokens = 'a_' + events['user_answer'].astype(str)
    result_tokens = 'r_' + events['answered_correctly'].astype(str)

    # Interleave the three columns row by row.
    per_event = zip(content_tokens, answer_tokens, result_tokens)
    seq = " ".join(' '.join(triple) for triple in per_event).split(' ')

    timestamps = np.repeat(events['timestamp'].values, 3)
    position = np.repeat(events.index, 3)

    ## Build masks and outputs
    label_of = {'r_0': 0, 'r_1': 1}
    y = np.array([label_of.get(token, 2) for token in seq[2:]])

    return seq, timestamps, position, y

In [None]:
user_test = train[train['user_id'] == 115]

In [None]:
seq, timestamps, position, y = build_user_sequence(user_test)

In [None]:
train.head()

In [None]:
# Encode every user as a token sequence and write them to disk in chunks of
# 1000 users ('user_batch/batch_<n>.pickle').
# save() does not create folders, so make sure the target exists.
os.makedirs('user_batch', exist_ok=True)

dico_user = {}
count = 0
for user, data_user in tqdm(train.groupby('user_id'), total = train['user_id'].nunique()):
    seq, timestamps, position, y = build_user_sequence(data_user)
    dico_user[user] = {
                        "sequence" : seq,
                        "timestamps" : timestamps,
                        "position" : position,
                        "output" : y
                    }

    if len(dico_user) == 1000:
        save(dico_user, 'batch_'+str(count), 'user_batch')
        dico_user = {}
        count +=1
# `dico_user` now holds the users of the last, incomplete chunk (possibly none).

In [None]:
# Write the remaining users. Skip the write when the user count was an exact
# multiple of the chunk size: an empty batch file would make the data
# generator fail when it samples users from it.
if dico_user:
    save(dico_user, 'batch_'+str(count), 'user_batch')

## Tokenization

In [None]:
dico_user = load('batch_'+str(0), 'user_batch')

In [None]:
## Building tokenizer
# The vocabulary is enumerated explicitly instead of being learnt from the data.
lectures = pd.read_csv('lectures.csv')
questions = pd.read_csv('questions.csv')
# Values allowed for the answer / correctness tokens.
# NOTE(review): presumably -1 is the value carried by lecture rows — confirm.
user_answer = np.array([-1,0,1,2,3])
answered_correctly = np.array([-1,0,1])


# Every lecture and question identifier listed in the metadata files.
lectures_id = lectures['lecture_id'].unique()
question_id = questions['question_id'].unique()

In [None]:
# Prefix each value with the marker used by build_user_sequence:
# 'a_' answers, 'r_' correctness, 'l_' lectures, 'q_' questions.
# NOTE(review): the names are rebound from arrays to lists, so this cell cannot
# be run twice without re-running the previous one.
user_answer = ['a_'+ elt for elt in user_answer.astype(str)]
answered_correctly = ['r_' + elt for elt in answered_correctly.astype(str)]
lectures_id = ['l_' +  elt for elt in  lectures_id.astype(str)]
question_id = ['q_' +  elt for elt in  question_id.astype(str)]

In [None]:
all_tokens = np.array(['[PAD]','[CLS]','[RES]'] + answered_correctly + user_answer + lectures_id + question_id)

In [None]:
# filters='' disables the default punctuation stripping, so '[', ']', '_' and
# '-' stay part of the tokens.
# NOTE(review): per the Keras Tokenizer documentation, word indices start at 1
# (0 is reserved), so '[PAD]' does not map to 0 — confirm that the padding mask
# used by the model agrees with the id '[PAD]' actually receives.
tokenizer = Tokenizer(filters = '')

# Each element of `all_tokens` is treated as one text.
tokenizer.fit_on_texts(
    all_tokens
)

In [None]:
dir(tokenizer)

In [None]:
tokenizer.word_index.values()

In [None]:
save(tokenizer, 'tokenizer')

## Data Generator

In [None]:
import tensorflow as tf
class DataGenerator(tf.keras.utils.Sequence):
    """Keras Sequence yielding random batches of padded user sequences.

    Each call to ``__getitem__`` loads one randomly chosen pickled batch of
    users from ``folder``, samples ``batch_size`` users from it (with
    replacement) and returns ``([token_ids, numericals, positions], y)``.

    Parameters
    ----------
    batch_size : int
        Number of sequences per batch.
    max_len : int
        Length every sequence is padded / truncated to.
    folder : str
        Sub-directory holding the pickled user batches.
    """

    def __init__(self,batch_size=32, max_len = 128, folder = 'user_batch'):
        super().__init__()
        self.batch_size = batch_size
        self.tokenizer = load('tokenizer')
        self.max_len = max_len
        self.folder = folder
        self.dico_question = load('dico_questions_mean')

    def __len__(self):
        # Batches are drawn at random, so the length is an arbitrary large
        # number; the epoch size is set by `steps_per_epoch` at fit time.
        return 1000000

    def _list_batch_files(self):
        """Return the pickle files that can be sampled from ``self.folder``.

        Files named ``batch_*`` are preferred: the EDA section writes raw
        ``userbatch_*`` pickles (DataFrames, not sequences) into the same
        folder, and picking one of those made ``__getitem__`` fail. When no
        ``batch_*`` file exists, every pickle of the folder is returned.
        """
        pickles = [f for f in os.listdir('./'+self.folder) if f.endswith('.pickle')]
        sequence_batches = [f for f in pickles if f.startswith('batch_')]
        return sequence_batches or pickles

    def _fit_length(self, values, fill):
        """Pad ``values`` with ``fill`` up to ``max_len`` and truncate to it."""
        missing = max(0, self.max_len - len(values))
        return (list(values) + [fill] * missing)[:self.max_len]

    def _build_example(self, user_data):
        """Build the padded features of one user.

        Returns ``(tokens, timestamps, positions, question_means,
        user_averages, targets)``, every list having length ``max_len``.
        """
        s = user_data['sequence']
        t = list(user_data['timestamps'])
        # Event rank, repeated for the three tokens of each event.
        p = list(np.repeat(list(range(int(len(t)/3))), 3))
        y_temp = list(user_data['output'])

        ## Choose if we start from start or not
        if random.uniform(0,1) < 0.5:
            s = ['[CLS]'] + s
        else:
            # Resume from a random token; '[RES]' marks a truncated history.
            start = random.choice(list(range(len(s))))
            s = ['[RES]'] + s[start:]
            t = t[start:]
            p = p[start:]
            y_temp = y_temp[start:]

        # Account for the marker token; class 2 means "no target here".
        t = [0] + t
        p = [0] + p
        y_temp = [2] + y_temp +[2,2]

        ## Padding
        s = self._fit_length(s, '[PAD]')
        t = self._fit_length(t, 0)
        p = self._fit_length(p, 0)
        y_temp = self._fit_length(y_temp, 2)

        ## Mean of questions (0.5 for any token that is not a known question)
        qm = [self.dico_question.get(elt, 0.5) for elt in s]

        ## Avg user grade: running mean of the results seen so far
        um = []
        n_correct = 0
        n_answers = 0
        for elt in s:
            if elt == 'r_0':
                n_answers += 1
            if elt == 'r_1':
                n_answers += 1
                n_correct += 1
            um.append(0.5 if n_answers == 0 else n_correct / n_answers)

        return s, t, p, qm, um, y_temp

    def __getitem__(self, index):
        """Return one random batch; ``index`` is ignored."""
        ## Load random batch
        file_name = random.choice(self._list_batch_files())
        dico_user = load(os.path.splitext(file_name)[0], self.folder)

        list_user = np.random.choice(list(dico_user.keys()), size = self.batch_size)

        sequence = []
        timestamp = []
        positions = []
        questions_mean = []
        user_average = []
        y = []

        for user in list_user:
            s, t, p, qm, um, y_temp = self._build_example(dico_user[user])
            sequence.append(s)
            timestamp.append(t)
            positions.append(p)
            questions_mean.append(qm)
            user_average.append(um)
            y.append(y_temp)

        sequence = self.tokenizer.texts_to_sequences(sequence)
        sequence = np.array(sequence)
        timestamp = np.array(timestamp)
        positions = np.array(positions)
        questions_mean = np.array(questions_mean).astype("float32")
        user_average = np.array(user_average).astype("float32")
        y = np.array(y)

        # Compress the timestamps into a small range.
        timestamp = timestamp.reshape((timestamp.shape[0],timestamp.shape[1], 1))
        timestamp = np.log(timestamp+1)/10

        questions_mean = questions_mean.reshape((self.batch_size, self.max_len, 1))
        user_average = user_average.reshape((self.batch_size, self.max_len, 1))

        # (batch, max_len, 3): log-timestamp, question mean, running user mean
        numericals = np.concatenate([timestamp, questions_mean, user_average], axis = -1)

        X = [sequence, numericals, positions]
        return X, y

    def on_epoch_end(self):
        pass

    def __get_data(self, batch):
        pass

In [None]:
gen = DataGenerator(batch_size=32, max_len = 128, folder = 'user_batch')

In [None]:
x, y = gen[0]

In [None]:
x[1].shape

In [None]:
from tf_transformers2 import *
from tensorflow.keras.layers import Input, Dense, Dropout, TimeDistributed, LSTM
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

In [None]:
class LSTMGPTDecoderLayer(tf.keras.layers.Layer):
    """Decoder block: LSTM, then masked self-attention, then feed-forward.

    Both the attention and the feed-forward sub-blocks are followed by
    dropout, a residual connection and layer normalisation.

    `MultiHeadAttention` and `point_wise_feed_forward_network` are not defined
    in this notebook (presumably provided by the `tf_transformers2` star
    import — confirm).
    """
    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super(LSTMGPTDecoderLayer, self).__init__()

        self.mha1 = MultiHeadAttention(d_model, num_heads)
        # NOTE(review): mha2, layernorm2 and dropout2 are created but never used
        # in call() (the second attention block is commented out there).
        self.mha2 = MultiHeadAttention(d_model, num_heads)
        # return_sequences=True: the LSTM emits one d_model vector per time step.
        self.lstm = tf.keras.layers.LSTM(
                        d_model, activation='tanh', recurrent_activation='sigmoid', use_bias=True,
                         recurrent_dropout=0.0, implementation=2, return_sequences=True
                        )
        self.ffn = point_wise_feed_forward_network(d_model, dff)

        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)
        self.dropout3 = tf.keras.layers.Dropout(rate)
    
    
    def call(self, x, training, look_ahead_mask):
        """Apply the block to `x` and return `(output, attention_weights)`.

        `training` toggles the dropout layers; `look_ahead_mask` is forwarded
        unchanged to the attention layer.
        """
    # x is assumed to be (batch_size, seq_len, d_model) — TODO confirm
        
        x = self.lstm(x)
#         print(x.shape)
        attn1, attn_weights_block1 = self.mha1(x, x, x, look_ahead_mask)  # (batch_size, target_seq_len, d_model)
        attn1 = self.dropout1(attn1, training=training)
        # Residual connection around the attention, then normalisation.
        out1 = self.layernorm1(attn1 + x)

#        attn2, attn_weights_block2 = self.mha2(out1, out1, out1, look_ahead_mask)  # (batch_size, target_seq_len, d_model)
#        attn2 = self.dropout2(attn2, training=training)
#        out2 = self.layernorm2(attn2 + out1)  # (batch_size, target_seq_len, d_model)

        ffn_output = self.ffn(out1)  # (batch_size, target_seq_len, d_model)
        ffn_output = self.dropout3(ffn_output, training=training)
        # Residual connection around the feed-forward network, then normalisation.
        out3 = self.layernorm3(ffn_output + out1)  # (batch_size, target_seq_len, d_model)

        return out3, attn_weights_block1  

In [None]:
class GPTDecoder(tf.keras.layers.Layer):
    """Stack of ``LSTMGPTDecoderLayer`` blocks over embedded token sequences.

    The input embedding is the sum of the token embedding (scaled by
    ``sqrt(d_model)``), the fixed positional encoding and, when provided, a
    dense projection of the numerical features and a learnt embedding of the
    event positions.

    Parameters
    ----------
    num_layers : int
        Number of decoder blocks.
    d_model : int
        Width of the embeddings and of the blocks.
    num_heads, dff :
        Forwarded to every ``LSTMGPTDecoderLayer``.
    maximum_position_encoding : int
        Number of positions generated by ``positional_encoding``.
    rate : float
        Dropout rate.
    bidirectional_decoder : bool
        When False a look-ahead mask is combined with the padding mask;
        otherwise only the padding mask is used.
    vocab_size : int
        Size of the token embedding table (previously hard-coded to 14000).
    max_event_position : int
        Size of the event-position embedding table (previously hard-coded to
        20000).
    """
    def __init__(self, num_layers, d_model, num_heads, dff,
               maximum_position_encoding, rate=0.1, bidirectional_decoder = False,
               vocab_size = 14000, max_event_position = 20000):
        super(GPTDecoder, self).__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        self.embedding = tf.keras.layers.Embedding(vocab_size, d_model)

        self.pos_encoding = positional_encoding(maximum_position_encoding, d_model)

        self.numericals_encoding = Dense(d_model, activation = 'relu')

        # NOTE(review): created but never used in call().
        self.question_mean_encoding = Dense(d_model, activation = 'relu')

        self.position_bis_encoding = tf.keras.layers.Embedding(max_event_position, d_model)

        self.dec_layers = [LSTMGPTDecoderLayer(d_model, num_heads, dff, rate)
                           for _ in range(num_layers)]
        self.dropout = tf.keras.layers.Dropout(rate)

        self.bidirectional_decoder = bidirectional_decoder

    def call(self, x, training = True, numericals = None, positions = None):
        """Encode the token ids ``x`` and return the output of the last block.

        ``numericals`` and ``positions`` are optional extra inputs added to
        the token embedding; ``training`` toggles dropout.
        """
        seq_len = tf.shape(x)[1]

        # The masks are computed from the raw token ids, before embedding.
        if self.bidirectional_decoder == False:
            look_ahead_mask = create_look_ahead_mask(seq_len)
            dec_target_padding_mask = create_padding_mask(x)
            mask = tf.maximum(dec_target_padding_mask, look_ahead_mask)
        else:
            mask = create_padding_mask(x)

        x = self.embedding(x)  # (batch_size, target_seq_len, d_model)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x += self.pos_encoding[:, :seq_len, :]

        if numericals is not None:
            x += self.numericals_encoding(numericals)

        if positions is not None:
            x += self.position_bis_encoding(positions)

        x = self.dropout(x, training=training)

        # The attention weights returned by each block are not used.
        for i in range(self.num_layers):
            x, _ = self.dec_layers[i](x, training, look_ahead_mask = mask)

        return x

In [None]:
# Sequence length seen by the model, in tokens.
max_len = 3*128

# Three inputs, matching the [sequence, numericals, positions] list produced
# by DataGenerator.
inputs_ids = tf.keras.Input(shape = (max_len,))
inputs_numericals = tf.keras.Input(shape = (max_len,3,))
inputs_positions = tf.keras.Input(shape = (max_len,))

inputs =[inputs_ids, inputs_numericals, inputs_positions]

# The positional encoding must cover the whole sequence, so it is tied to
# `max_len` instead of repeating the literal.
decoder = GPTDecoder(num_layers = 4, d_model = 512,
                     num_heads = 8, dff = 512,
                    maximum_position_encoding = max_len,
                     rate=0.1, bidirectional_decoder = False)

decoded = decoder(inputs_ids, numericals = inputs_numericals, positions = inputs_positions)

# Three classes per position: 0 = wrong, 1 = correct, 2 = no target.
# NOTE: the softmax makes the outputs probabilities, so any loss consuming
# them has to be configured for probabilities (from_logits=False).
outputs = tf.keras.layers.Dense(3, activation = 'softmax')(decoded)

model = Model(inputs, outputs)

In [None]:
model.summary()

In [None]:
# The model ends with a softmax layer, so the loss receives probabilities:
# from_logits must be False (True applied a second softmax to the outputs).
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
                    from_logits=False, reduction='none')


# pred : (batch_size, seq_length, 3)

def loss_function(real, pred):
    """Cross-entropy averaged over the positions that carry a target.

    Positions labelled 2 ("no target") are excluded. The sum is divided by
    the number of kept positions, so the value no longer depends on how much
    of the batch is masked; it is 0 when every position is masked.
    """
    mask1 = tf.math.logical_not(tf.math.equal(real, 2))
    loss_ = loss_object(real, pred)
    mask1 = tf.cast(mask1, dtype=loss_.dtype)
    loss_ *= mask1
    return tf.math.divide_no_nan(tf.reduce_sum(loss_), tf.reduce_sum(mask1))

def acc(true, pred):
    """Accuracy over the positions that carry a target (label different from 2).

    The prediction is the argmax over the first two classes only, so class 2
    is never predicted. Returns 0 instead of NaN when every position is
    masked.
    """
    valid = tf.math.logical_not(tf.math.equal(true, 2))

    pred = tf.math.argmax(pred[:,:,:2], axis=-1, output_type=tf.dtypes.int64)
    labels = tf.cast(true, tf.dtypes.int64)

    # A hit is a correct prediction at a position that has a target.
    hits = tf.math.logical_and(tf.math.equal(pred, labels), valid)

    n_hits = tf.math.reduce_sum(tf.cast(hits, tf.float32))
    n_valid = tf.math.reduce_sum(tf.cast(valid, tf.float32))

    return tf.math.divide_no_nan(n_hits, n_valid)


In [None]:
from tensorflow.keras.optimizers import Adam, SGD

loss_classif     =  loss_function
# Keyword arguments on purpose: the second positional argument of Adam is
# beta_1, so Adam(3e-5, 1e-8) set beta_1 to 1e-8 instead of epsilon.
optimizer        =  Adam(learning_rate=3e-5, epsilon=1e-8)
metrics_classif  =  [acc]

model.compile(loss=loss_classif,
              optimizer=optimizer,
              metrics=metrics_classif)

In [None]:
# Training batches are drawn at random from 'user_batch'.
train_gen = DataGenerator(batch_size=32, max_len = 3*128, folder = 'user_batch')
# NOTE(review): no visible cell of this notebook writes to 'user_batch_test';
# the folder has to be populated beforehand.
test_gen = DataGenerator(batch_size=1024, max_len = 3*128, folder = 'user_batch_test')
# A single fixed batch of 1024 sequences is used as the validation set.
x_test, y_test = test_gen[0]

In [None]:
class Roc_Auc(tf.keras.callbacks.Callback):
    """Callback adding the validation ROC AUC to the epoch logs.

    Parameters
    ----------
    train : optional
        Stored but not used.
    validation : tuple ``(x_val, y_val)``
        Inputs and integer targets. Positions labelled 2 are ignored.
    """

    def __init__(self, train = None, validation=None):
        super(Roc_Auc, self).__init__()
        self.train = train
        self.validation = validation

    def on_epoch_end(self, epoch, logs=None):
        # Imported here because the notebook only imports roc_auc_score in a
        # cell that comes after the training cell.
        from sklearn.metrics import roc_auc_score

        if logs is None:
            logs = {}
        if self.validation is None:
            return

        # Use the data given to the callback rather than notebook globals.
        x_val, y_val = self.validation[0], self.validation[1]

        pred = self.model.predict(x_val, verbose = 0)

        # Score of class 1 ("answered correctly"), flattened whatever the
        # batch size and sequence length are.
        y_pred = pred[:,:,1].reshape(-1)
        true = np.asarray(y_val).reshape(-1)

        keep = true != 2
        metric = roc_auc_score(true[keep], y_pred[keep])
        logs['roc_auc_val'] = metric
        print(logs)

In [None]:
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ReduceLROnPlateau

# Stop after 7 epochs without a val_loss improvement of at least 1e-4 and
# restore the best weights.
early = EarlyStopping(monitor='val_loss', min_delta=0.0001, patience=7, verbose=1, 
                                                mode='auto', restore_best_weights=True)
# Divide the learning rate by 10 after 3 epochs without val_loss improvement.
reduce = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=3, verbose=1, 
                                                 mode='auto', min_delta=0.0001, cooldown=0, min_lr=0)
roc = Roc_Auc(validation = (x_test,  y_test))

# NOTE(review): `bs` is not passed to fit(); the batch size comes from train_gen.
bs = 32
n_epochs = 50
# train_gen has an arbitrary length, so the epoch size is set explicitly.
steps_per_epoch = 1250
# steps_per_epoch = 10
#, batch_size=bs
history = model.fit(train_gen, epochs=n_epochs,steps_per_epoch = steps_per_epoch, validation_data=(x_test,  y_test), callbacks = [early, reduce, roc])

In [None]:
pred = model.predict(x_test, verbose = 1)

In [None]:
pred = pred[:,:,1]
y_pred = pred.reshape(1024*384)
true = y_test.reshape(1024*384)

y_pred = y_pred[true != 2]
true = true[true != 2]

In [None]:
from sklearn.metrics import accuracy_score

accuracy_score(true, (y_pred >= 0.5)*1)

In [None]:
from sklearn.metrics import roc_auc_score
roc_auc_score(true, y_pred)

In [None]:
model.save_weights('./weights/lstmgpt_auc_0.757.h5')

In [None]:
from sklearn.metrics import roc_auc_score
def acc(true, pred):
    """ROC AUC of one sequence, ignoring the positions labelled 2.

    When the kept labels contain a single class, one artificial negative
    (label 0, score 0) and one artificial positive (label 1, score 1) are
    appended so that the AUC is defined.

    NOTE: despite its name this returns an AUC, and it shadows the Keras
    metric `acc` defined earlier in the notebook.
    """
    labels = np.array(true)
    scores = np.array(pred)

    keep = labels < 2
    scores = scores[keep]
    labels = labels[keep]

    n_positive = labels.sum()
    single_class = n_positive == 0 or n_positive == len(labels)
    if single_class:
        extra = np.array([0,1])
        labels = np.concatenate([labels, extra])
        scores = np.concatenate([scores, extra])

    return roc_auc_score(labels, scores)

def test(true, pred):
    """Print the global AUC, then compute and plot the per-sequence AUCs.

    `true` and `pred` are 2-D arrays with one row per sequence; positions
    labelled 2 are ignored. Prints the global AUC and the mean of the
    per-sequence AUCs, draws their histogram and returns them as a list.
    """
    n_values = true.shape[0] * true.shape[1]
    flat_pred = pred.reshape(n_values)
    flat_true = true.reshape(n_values)
    has_target = flat_true < 2

    print(roc_auc_score(flat_true[has_target], flat_pred[has_target]))

    # One AUC per sequence.
    p = [acc(row, pred[i]) for i, row in enumerate(tqdm(true))]

    plt.figure(figsize = (25,15))
    plt.hist(p, bins = 50)

    print(np.mean(p))

    return p

In [None]:
# The validation inputs are stored in `x_test`; `X_test` is never defined.
pred = model.predict(x_test)

In [None]:
pred.shape

In [None]:
# Keep the scores of classes 0 and 1 only (class 2 is the "no target" class).
pred = pred[:,:,:2]

def softmax(tab):
    """Return the softmax weight of class 1 along the last axis of `tab`."""
    e = np.exp(tab)
    s = np.sum(e, axis = -1)
        
    return e[:,:,1] / s

# NOTE(review): the model's last layer already applies a softmax, so this
# exponentiates probabilities rather than logits — confirm this is intended.
pred = softmax(pred)


In [None]:
pred.shape

In [None]:
pred[1]

In [None]:
y_test[0]

In [None]:
perf = test(y_test, pred)

## Improvements

add context on lecture and tasks

cluster lecture and tasks

give average score of a given task

enhance test set with train set (optimization constraint)
