In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
import os
import random
from copy import deepcopy
import _pickle as pickle
import gc
from multiprocess import Pool
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from tensorflow.keras.preprocessing.text import Tokenizer

def save(file,name, folder = ""):
    if folder != "":
        outfile = open('./'+folder+'/'+name+'.pickle', 'wb')
    else:
        outfile = open(name+'.pickle', 'wb')
    pickle.dump(file, outfile, protocol=4)
    outfile.close
    
def load(name, folder = ""):
    if folder != "":
        outfile = open('./'+folder+'/'+name+'.pickle', 'rb')
    else:
        outfile = open(name+'.pickle', 'rb')
    file = pickle.load(outfile)
    outfile.close
    return file

In [None]:
os.listdir()

In [None]:
# train = pd.read_csv('train.csv')
train = load('train')

# train[train['content_id'] == 0] = 13433

lectures = pd.read_csv('lectures.csv')
questions = pd.read_csv('questions.csv')

test = pd.read_csv('example_test.csv')
sample = pd.read_csv('example_sample_submission.csv')

import gc
gc.collect()

In [None]:
train.head()

In [None]:
train = train[train['content_type_id'] == 0]

In [None]:
dico_questions = {}

for q, data in tqdm(train.groupby('content_id'), total = train['content_id'].nunique()):
    dico_questions['q_'+str(q)] = data['answered_correctly'].mean()

In [None]:
save(dico_questions, 'dico_questions_mean')

In [None]:
class FakeDataGenerator:
    
    def __init__(self):
        '''
        self.data will be a dictionnary to iterate over the stored data
        self.all_rows will be the rows of the train set that are used by the generato
        self.data_index will be all the data available in the dataset        
        '''
        self.data = None
        self.all_rows = None
        self.data_index = None
        return None
    
    def __getitem__(self, idx):
        sample = self.data[idx]
        sub = sample[['row_id', 'group_num']].copy()
        sub['answered_correctly'] = np.zeros(sub.shape[0])+0.5
        return (sample, sub)
    
    
    def load(self, save_name):
        self.data,self.all_rows = load(save_name)
        self.data_index = np.array(list(self.data.keys()))
    
    def build_from_train(self, train, n_users, beginner_rate = 0.3, save_name = 'fake_train_generator'):
        """
        train will be the training set you loaded
        n_users is a number of user from whom you will sample the data
        beginner_rate is the rate of these users who will begin their journey during test
        save_name : the name under which the item will be saved
        """
        
        ## Sampling a restricted list of users
        user_list = train['user_id'].unique()
        test_user_list = np.random.choice(user_list, size = n_users)
        train.index = train['user_id']
        test_data_non_filter = train.loc[test_user_list]
        test_data_non_filter.index = list(range(test_data_non_filter.shape[0]))
        
        ## building a dictionnary with all the rows and container id from a user
        dico_user = {}
        def agg(x):
            return [elt for elt in x]
        
        print("Generating user dictionnary")
        for user, frame in tqdm(test_data_non_filter.groupby('user_id'), total =test_data_non_filter['user_id'].nunique()):
            if frame.shape[0] > 0:
                dico_user[user] = {}

                dico_user[user]['min_indice'] = frame['task_container_id'].min()
                dico_user[user]['max_indice'] = frame['task_container_id'].max()

                r = random.uniform(0,1)
                if r < beginner_rate:
                    dico_user[user]['current_indice'] = dico_user[user]['min_indice']
                else:
                    dico_user[user]['current_indice'] = random.randint(dico_user[user]['min_indice'],dico_user[user]['max_indice']-2)

                row_ids = frame[['task_container_id','row_id']].groupby('task_container_id').agg(agg)
                row_ids = row_ids.to_dict()['row_id']
                dico_user[user]['row_ids'] = row_ids

        work_dico = deepcopy(dico_user)
        
        ## Choosing batch_data to generate
        work_dico = deepcopy(dico_user)
        batches = {}

        all_rows = []
        batch_number = 0
        
        print('Creating batches')
        while len(work_dico)> 1:

            size = random.randint(20,500)
            size = min(size, len(work_dico))


            batch = []

            users = np.random.choice(np.array(list(work_dico.keys())),replace = False,  size = size)

            for u in users:
                try:
                    batch.extend(work_dico[u]['row_ids'][work_dico[u]['current_indice']])
                    all_rows.extend(work_dico[u]['row_ids'][work_dico[u]['current_indice']])
                    work_dico[u]['current_indice'] += 1
                    if work_dico[u]['current_indice'] == work_dico[u]['max_indice']:
                        work_dico.pop(u)
                except:
                    work_dico.pop(u)

            batches[batch_number] = batch
            batch_number += 1
        
        ## building data

        data = {}
        
        print("Building dataset")
        test_data_non_filter.index = test_data_non_filter['row_id']
        for i in tqdm(batches):
            current_data = test_data_non_filter.loc[np.array(batches[i])]
            current_data['group_num'] = i

            current_data['prior_group_answers_correct'] = [np.nan for elt in range(current_data.shape[0])]
            current_data['prior_group_responses'] = [np.nan for elt in range(current_data.shape[0])]

            if i != 0:
                current_data['prior_group_answers_correct'].iloc[0] = saved_correct_answer
                current_data['prior_group_responses'].iloc[0] = saved_answer

            saved_answer = str(list(current_data[current_data['content_type_id'] == 0]['user_answer'].values))
            saved_correct_answer = str(list(current_data[current_data['content_type_id'] == 0]['answered_correctly'].values))
            current_data = current_data.drop(columns = ['user_answer', 'answered_correctly'])

            data[i] = current_data

        save((data,np.array(all_rows)) , save_name)
        
        self.data = data
        self.all_rows = np.array(all_rows)
        self.data_index = np.array(list(data.keys()))
        print('finished')

In [None]:
env = FakeDataGenerator()

In [None]:
env.build_from_train(train, 15000, beginner_rate = 0.3, save_name = 'fake_train_generator')

In [None]:
len(env.all_rows)

In [None]:
env.load('fake_train_generator')

In [None]:
env.all_rows

In [None]:
train.index = train['row_id']

In [None]:
train = train.drop(index = env.all_rows)

In [None]:
save(train, 'train_train')

## EDA

In [None]:
393656/5000

In [None]:

dico = {}
count = 0
for userid, data in tqdm(train.groupby('user_id'), total = train['user_id'].nunique()):
    dico[userid] = data
    if len(dico.keys()) == 10000:
        save(dico, 'userbatch_'+str(count), 'user_batch')
        count+=1
        dico = {}

In [None]:
train.shape

In [None]:
train.head(50)

In [None]:
for elt in train.columns:
    print(elt + '       '+ str(train[elt].nunique()))

In [None]:
train.describe()

In [None]:
u115.head(50)

timestamp : relative time since first interaction

user_id : identifier of the user

content_id : identifier of the content

content_type_id : 0 = question, 1 = lecture

task_container_id : identifier of a sequence of question (ie correction a la fin de la sequence)

user_answer : user answer

answered correctly : the user answered correctly to the question

prior_quesiton_elapsed_time : avg time the user spend on the last container

prior_question_had_explanatione : in a same bundle if the user have seen the answer of the last question or not

In [None]:
questions.shape

In [None]:
questions.head()

In [None]:
for elt in questions.columns:
    print(elt + '       '+ str(questions[elt].nunique()))

In [None]:
questions['bundle_id'].unique()

Parts
Section 1 listening
1 : 6 questions, four oral statement about photo choose right one
2 : 25 questions, 3 reponse for one question oraly
3 : 39 questions, conversation between people, question written, select best answer
4 : 30 questions, talks or narrations ...
    
Section 2 reading
5 : 30 questions, incomplete sentence completion
6 : 16 questions, text completion
7 : 29 + 25 questions, text understanding

In [None]:
lectures.shape

In [None]:
lectures.head()

In [None]:
lectures['type_of'].unique()

In [None]:
for elt in lectures.columns:
    print(elt + '       '+ str(lectures[elt].nunique()))

## Modeling

In [None]:
cat_cols = [
    'content_id',
    'content_type_id',
    'task_container_id',
    'user_answer_last',
    'answered_correctly_last',
    'prior_question_had_explanation',
]

num_cols = [
    "timestamp",
    "prior_question_elapsed_time",
]

pred_col = 'answered_correctly'

In [None]:
sequence1 : ['content_id' + 'type_id','time_spent_discretised', 'answer', 'answer_correctly']
sequence2 (embedding) : ['timestamp']
sequence3 (embedding): ['number of event before']
mask1 :  'padding_mask'
mask2 :  'answer_correctly_mask'


In [None]:
train = load('train_train')

In [None]:
gc.collect()

In [None]:
test_user = train[train['user_id'] == 115]

In [None]:
def build_user_sequence(df_user):
    
    df_user =  df_user.sort_values(by = 'timestamp')
    df_user.index = list(range(df_user.shape[0]))
    
    df_user['content_type'] =  df_user['content_type_id'].apply(lambda x : 'q' if x == 0 else 'l')
    df_user['content_seq'] = df_user['content_type'].astype(str) + '_' + df_user['content_id'].astype(str)
    
    ## Encoder
    exercise_id = df_user['content_seq'].values
    container_id = df_user['task_container_id'].values
    timestamp = df_user['timestamp'].values
    
    ## Decoder
    correctness = df_user['answered_correctly'].values
    answer = df_user['user_answer'].values
    
    elapsed_time = df_user['prior_question_elapsed_time'].values[1:] ## Already Padded
    prior_question_had_explanation = df_user['prior_question_had_explanation'].values[1:]*1 ## Already Padded
    
    lag_time = timestamp[1:] - timestamp[:1] + elapsed_time
    
    dico = {
        'exercise_id' : exercise_id,
        'container_id' : container_id,
        'timestamp' : timestamp,
        'correctness' : correctness,
        'answer' : answer, 
        'elapsed_time' : elapsed_time,
        'prior_question_had_explanation' : prior_question_had_explanation,
        'lag_time' : lag_time
    }
    return dico
    

In [None]:
%%timeit
dico = build_user_sequence(test_user)

In [None]:
batch_size = 2000
count = 0
vect = []
count = 0
p = Pool(12)

for elt in tqdm(train.groupby('user_id'), total = train['user_id'].nunique()):
    vect.append(elt)
    if len(vect) == batch_size:
        vect = np.array(vect)
        vect_user = vect[:,0]
        vect_data = vect[:,1]
        vect = []
        
        processed_dico = p.map(build_user_sequence, vect_data)
        
        dico_user = {}
        for i, elt in enumerate(vect_user):
            dico_user[elt] = processed_dico[i]
        save(dico_user, 'batch_'+str(count), 'user_batch_saint')
        count += 1
        
p.close()

## Tokenization

In [None]:
dico_user = load('batch_'+str(0), 'user_batch_saint')

In [None]:
## Building tokenizer
lectures = pd.read_csv('lectures.csv')
questions = pd.read_csv('questions.csv')
user_answer = np.array([-1,0,1,2,3])
answered_correctly = np.array([-1,0,1])


lectures_id = lectures['lecture_id'].unique()
question_id = questions['question_id'].unique()

In [None]:
lectures_id = ['l_' +  elt for elt in  lectures_id.astype(str)]
question_id = ['q_' +  elt for elt in  question_id.astype(str)]

In [None]:
all_tokens = np.array(['[PAD]'] + lectures_id + question_id)

In [None]:
tokenizer = Tokenizer(filters = '')

tokenizer.fit_on_texts(
    all_tokens
)

In [None]:
tokenizer.word_index.values()

In [None]:
save(tokenizer, 'tokenizer')

In [None]:
tokenizer = load('tokenizer')

In [None]:
dir(tokenizer)

In [None]:
tokenizer.word_index

In [None]:
dico_user[115].keys()

In [None]:
m = 0
for elt in dico_user:
    a = len(dico_user[elt]['container_id'])
    if a >= m:
        m = a

In [None]:
m

## Data Generator

In [2]:
def create_dictionnaries():
    lectures = pd.read_csv('lectures.csv')
    questions = pd.read_csv('questions.csv')
    
    ## lecture
    id_lectures = lectures['lecture_id']
    part_lecture = lectures['part']
    tag_lecture = lectures['tag']
    
    id_to_part = {}
    id_to_tag = {}
    
    for i, line in lectures.iterrows():
        ids = 'l_' + str(line['lecture_id'])
        id_to_part[ids] = line['part']
        id_to_tag[ids] = line['tag']
    
    for i, line in questions.iterrows():
        ids = 'q_' + str(line['question_id'])
        id_to_part[ids] = line['part']
        try:
            id_to_tag[ids] = np.array(str(line['tags']).split(' ')).astype(int)
        except:
            id_to_tag[ids] = np.array([])
            
    return id_to_part, id_to_tag

In [3]:
import tensorflow as tf
class DataGenerator(tf.keras.utils.Sequence):
    def __init__(self,batch_size=32, max_len = 128, folder = 'user_batch_saint'):
        self.batch_size = batch_size
        self.tokenizer = load('tokenizer')
        self.max_len = max_len
        self.folder = folder
        self.dico_question = load('dico_questions_mean')
        self.id_to_part, self.id_to_tag = create_dictionnaries()
        
    def __len__(self):
        return 1000000



    def __getitem__(self, index):
        ## Load random batch
        file_name = random.choice(os.listdir('./'+self.folder))
        dico_user = load(file_name.split('.')[0], self.folder)
        
        list_user = np.random.choice(list(dico_user.keys()), size = self.batch_size)
        
        ## Encoder
        exercise = []
        container = []
        was_tagged = []
        timestamp = []
        question_mean = []
        parts = []
        
        ## Ouput
        output = []
        
        ## Decoder
        correctness = []
        answer = []
        elapsed_time = []
        lag_time = []
        was_explained = []
        
        
        for user in list_user:
            ex = list(dico_user[user]['exercise_id'])
            cont  = list(dico_user[user]['container_id'])
            times = list(dico_user[user]['timestamp'])
            
            y = list(dico_user[user]['correctness'] + 1)
            
            ans = list(dico_user[user]['answer'] + 1)
            el = list(dico_user[user]['elapsed_time'])
            lag = list(dico_user[user]['lag_time'])
            expl = dico_user[user]['prior_question_had_explanation']
            
            a = (expl == 0)
            b = (expl == 1)
            c = np.bitwise_not(a|b)
            expl[c] = 0
#             expl[np.isnan(expl)] = 0
            expl = list(expl)
            
            ## Choose if we start from start or not
            a = random.uniform(0,1)
            if a < 0.5:
                start = 0
            else:
                start = random.choice(list(range(len(ex))))
            
            ex = ex[start:]
            cont = cont[start:]
            times = times[start:]
            
            y = y[start:]
            
            cor = [0] + y[:-1]
            ans = [2] + ans[:-1]
            lag = [0] + lag
            el =  [0] + el
            expl = [0] + expl
            
            ## Padding
            while len(ex) <= self.max_len:
                ex += ['[PAD]']
                cont += [0]
                times += [0]
                y += [3]
                cor += [3]
                ans += [0]
                lag += [0]
                el += [0]
                expl += [0]
                
            ex = ex[:self.max_len]
            cont = cont[:self.max_len]
            times = times[:self.max_len]
            y = y[:self.max_len]
            cor = cor[:self.max_len]
            ans = ans[:self.max_len]
            lag = lag[:self.max_len]
            el = el[:self.max_len]
            expl = expl[:self.max_len]
            
            
            
            ## Mean of questions
            qm = []
            for elt in ex:
                try:
                    qm.append(self.dico_question[elt])
                except:
                    qm.append(0.5)
            
            ## Add if a question was tagged in a previous lecture
            tagged = []
            is_tagged = []
            for elt in ex:
                if elt[0] == 'l':
                    tagged.append(self.id_to_tag[elt])
                    is_tagged.append(0)
                else:
                    try:
                        tags = self.id_to_tag[elt]
                    except:
                        tags = []
                    cond = False
                    for elt in tags:
                        if elt in tagged:
                            cond = True
                    if cond:
                        is_tagged.append(1)
                    else:
                        is_tagged.append(0)
                        
            ## Add part of sequence
            p = []
            for elt in ex:
                try:
                    p.append(self.id_to_part[elt])
                except:
                    p.append(0)
            
            
            
            
            exercise.append(ex)
            container.append(cont)
            was_tagged.append(is_tagged)
            timestamp.append(times)
            question_mean.append(qm)
            parts.append(p)

            ## Ouput
            output.append(y)

            ## Decoder
            correctness.append(cor)
            answer.append(ans)
            elapsed_time.append(el)
            lag_time.append(lag)
            was_explained.append(expl)
            
        exercise = self.tokenizer.texts_to_sequences(exercise)
        
        ## Numpyisation
        exercise = np.array(exercise)  ## 14000
        container = np.array(container) ## 10000
        was_tagged = np.array(was_tagged) ## 2
        timestamp = np.array(timestamp) ## Num log
        question_mean = np.array(question_mean) ## Num
        parts = np.array(parts) ## 7
        
        output = np.array(output)
        
        correctness = np.array(correctness) ## 4
        answer = np.array(answer) ## 5
        elapsed_time = np.array(elapsed_time) ## Num logged
        lag_time = np.array(lag_time) ## Num logged
        was_explained = np.array(was_explained) ## 2
        
        ## Log of high numerical values
        timestamp[np.isnan(timestamp)] = 0
        timestamp = timestamp.reshape((timestamp.shape[0],timestamp.shape[1], 1))
        timestamp = np.log(timestamp+1)/5
        
        elapsed_time[np.isnan(elapsed_time)] = 0
        elapsed_time = elapsed_time.reshape((elapsed_time.shape[0],elapsed_time.shape[1], 1))
        elapsed_time = np.log(elapsed_time+1)/5
        
        lag_time[np.isnan(lag_time)] = 0
        lag_time = lag_time.reshape((lag_time.shape[0],lag_time.shape[1], 1))
        lag_time = np.log(lag_time+1)/5
        
        
        
        ## Other Nums
        question_mean[np.isnan(question_mean)] = 0
        question_mean = question_mean.reshape((question_mean.shape[0],question_mean.shape[1], 1))
        

        num_encoder = np.concatenate([timestamp, question_mean], axis = -1).astype('float32')
        num_decoder = np.concatenate([elapsed_time, lag_time], axis = -1).astype('float32')
        
        X = [
            exercise,  ## 0
            container,  ## 1
            was_tagged,  ## 2
            parts,  ## 3
            num_encoder,  ## 4
            
            correctness,  ## 5
            answer,  ## 6
            was_explained,  ## 7
            num_decoder     ## 8        
        ]

        return X, output

    def on_epoch_end(self):
        pass

    def __get_data(self, batch):
        pass

In [None]:
gen = DataGenerator(batch_size=1024, max_len = 128, folder = 'user_batch_saint')

In [None]:
%%time
x, y = gen[0]

In [4]:
from tf_transformers2 import *
from tensorflow.keras.layers import Input, Dense, Dropout, TimeDistributed, LSTM
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

In [5]:
class SaintEncoder(tf.keras.layers.Layer):
    def __init__(self, num_layers = 2, d_model = 512, num_heads = 8, dff = 1024, 
                 maximum_position_encoding = 512, rate=0.1, bidirectional_encoder = False):
        super(SaintEncoder, self).__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        self.embedding = tf.keras.layers.Embedding(14000, d_model)
        self.pos_encoding = positional_encoding(maximum_position_encoding, 
                                                self.d_model)
        
        self.container_embedding = tf.keras.layers.Embedding(10000, d_model)
        self.was_tagged_embedding = tf.keras.layers.Embedding(2, d_model)
        self.parts_embedding = tf.keras.layers.Embedding(8, d_model)
        self.nums_embedding = tf.keras.layers.Dense(d_model)
        
        self.enc_layers = [EncoderLayer(d_model, num_heads, dff, rate) 
                           for _ in range(num_layers)]

        self.dropout = tf.keras.layers.Dropout(rate)
        
        self.bidirectional_encoder = bidirectional_encoder
        
    def call(self, x, training, container = None, was_tagged = None, parts = None, nums = None, calls = []):
        seq_len = tf.shape(x)[1]
        
        if self.bidirectional_encoder == False:
            look_ahead_mask = create_look_ahead_mask(tf.shape(x)[1])
            dec_target_padding_mask = create_padding_mask(x, pad_token = 1)
            mask = tf.maximum(dec_target_padding_mask, look_ahead_mask)
        else:
            mask = create_padding_mask(x)
        
        # adding embedding and position encoding.
        x = self.embedding(x)  # (batch_size, input_seq_len, d_model)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x += self.pos_encoding[:, :seq_len, :]
        
        container_emb = self.container_embedding(container)
        was_tagged_emb = self.was_tagged_embedding(was_tagged)
        parts_emb = self.parts_embedding(parts)
        nums_emb = self.nums_embedding(nums)
        
        if 'container' in calls:
#             container_emb = self.container_embedding(container)
            x += container_emb
            
        if 'tagged' in calls:
#             was_tagged_emb = self.was_tagged_embedding(was_tagged)
            x += was_tagged_emb
            
        if 'parts' in calls:
#             parts_emb = self.parts_embedding(parts)
            x += parts_emb
            
        if 'num' in calls:
#             nums_emb = self.nums_embedding(nums)
            x += nums_emb
        
        x = self.dropout(x, training=training)
        
        for i in range(self.num_layers):
            x = self.enc_layers[i](x, training, mask)

        return x, mask  # (batch_size, input_seq_len, d_model)

In [6]:
class SaintDecoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, dff, rate=0.1, take_encoder = True):
        super(SaintDecoderLayer, self).__init__()

        self.mha1 = MultiHeadAttention(d_model, num_heads)
        self.mha2 = MultiHeadAttention(d_model, num_heads)
        
        self.ffn = point_wise_feed_forward_network(d_model, dff)

        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)
        self.dropout3 = tf.keras.layers.Dropout(rate)
        self.take_encoder = take_encoder
    
    def call(self, x, enc_output, training, 
           look_ahead_mask, padding_mask):
    # enc_output.shape == (batch_size, input_seq_len, d_model)

        attn1, attn_weights_block1 = self.mha1(x, x, x, look_ahead_mask)  # (batch_size, target_seq_len, d_model)
        attn1 = self.dropout1(attn1, training=training)
        out1 = self.layernorm1(attn1 + x)
        
        if self.take_encoder:
            attn2, attn_weights_block2 = self.mha2(
                enc_output, enc_output, out1, padding_mask)  # (batch_size, target_seq_len, d_model)
            attn2 = self.dropout2(attn2, training=training)
            out2 = self.layernorm2(attn2 + out1)  # (batch_size, target_seq_len, d_model)
            ffn_output = self.ffn(out2)  # (batch_size, target_seq_len, d_model)
            res = out2
        else:
            attn_weights_block2 = attn_weights_block1
            ffn_output = self.ffn(out1)  # (batch_size, target_seq_len, d_model)
            res = out1
        
        ffn_output = self.dropout3(ffn_output, training=training)
        out3 = self.layernorm3(ffn_output + res)  # (batch_size, target_seq_len, d_model)

        return out3, attn_weights_block1, attn_weights_block2 

In [7]:
class SaintDecoder(tf.keras.layers.Layer):
    def __init__(self, num_layers, d_model, num_heads, dff,
               maximum_position_encoding, rate=0.1, bidirectional_decoder = False, take_encoder = True):
        super(SaintDecoder, self).__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        self.embedding = tf.keras.layers.Embedding(4, d_model, name = 'embedding')
        
        self.answer_embeddings =  tf.keras.layers.Embedding(5, d_model, name = 'answer_embeddings')
        self.was_explained_embeddings =  tf.keras.layers.Embedding(2, d_model, name = 'was_explained_embeddings')
        self.nums_embeddings =  tf.keras.layers.Dense(d_model, name = 'nums_embeddings')

        self.pos_encoding = positional_encoding(maximum_position_encoding, d_model)

        self.dec_layers = [SaintDecoderLayer(d_model, num_heads, dff, rate, take_encoder = take_encoder) 
                           for i in range(num_layers)]
        
        self.dropout = tf.keras.layers.Dropout(rate)
        self.bidirectional_decoder = bidirectional_decoder
    
    def call(self, x, enc_output, training = True, padding_mask = None, answer = None, was_explained = None, nums = None, calls = []):
        seq_len = tf.shape(x)[1]
        attention_weights = {}
        
        if self.bidirectional_decoder == False:
            look_ahead_mask = create_look_ahead_mask(tf.shape(x)[1])
            dec_target_padding_mask = create_padding_mask(x)
            mask = tf.maximum(dec_target_padding_mask, look_ahead_mask)
        else:
            mask = create_padding_mask(x, pad_token = 3)
        
        x = self.embedding(x)  # (batch_size, target_seq_len, d_model)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x += self.pos_encoding[:, :seq_len, :]
        
        ## Adding different embeddings
        answer_emb = self.answer_embeddings(answer)
        was_explained_emb = self.was_explained_embeddings(was_explained)
        nums_emb = self.nums_embeddings(nums)
        if 'answer' in calls:
#             answer_emb = self.answer_embeddings(answer)
            x += answer_emb
        
        if 'explained' in calls:
#             was_explained_emb = self.was_explained_embeddings(was_explained)
            x += was_explained_emb
            
        if 'nums' in calls:
#             nums_emb = self.nums_embeddings(nums)
            x += nums_emb
            
        x = self.dropout(x, training=training)
        

        
        for i in range(self.num_layers):
            x, block1, block2 = self.dec_layers[i](x, enc_output, training,
                                                 mask, padding_mask)

            attention_weights['decoder_layer{}_block1'.format(i+1)] = block1
            attention_weights['decoder_layer{}_block2'.format(i+1)] = block2

        # x.shape == (batch_size, target_seq_len, d_model)
        return x

In [16]:
max_len = 128

inputs_exercise = tf.keras.Input(shape = (max_len,))
inputs_container = tf.keras.Input(shape = (max_len,))
inputs_was_tagged = tf.keras.Input(shape = (max_len,))
inputs_parts = tf.keras.Input(shape = (max_len,))
inputs_num_encoder = tf.keras.Input(shape = (max_len,2))

inputs_correctness = tf.keras.Input(shape = (max_len,))
inputs_answer = tf.keras.Input(shape = (max_len,))
inputs_was_explained = tf.keras.Input(shape = (max_len,))
inputs_num_decoder = tf.keras.Input(shape = (max_len,2))


inputs = [
    inputs_exercise,
    inputs_container,
    inputs_was_tagged,
    inputs_parts,
    inputs_num_encoder,
    
    inputs_correctness,
    inputs_answer,
    inputs_was_explained,
    inputs_num_decoder
]


encoder = SaintEncoder(num_layers = 4, d_model = 512, 
                       num_heads = 8, dff = 512, 
                 maximum_position_encoding = max_len, rate=0, bidirectional_encoder = False)

decoder = SaintDecoder(num_layers = 4, d_model = 512, 
                       num_heads = 8, dff = 512, 
               maximum_position_encoding = max_len, rate=0, bidirectional_decoder = False, take_encoder = True)


calls_encoder = ['container', 'tagged', 'parts', 'num']
calls_encoder = [
    'container', 
    'tagged', 
    'parts',             
    'num',
]

encoded, masks = encoder(inputs_exercise, training = True, 
                         container = inputs_container, was_tagged = inputs_was_tagged, 
                         parts = inputs_parts, nums = inputs_num_encoder,
                        calls = calls_encoder)

calls_decoder = [
    'answer', 
    'explained', 
    'nums',
]
# calls_decoder = ['answer', 'explained', 'nums']
# calls_decoder = []
decoded = decoder(inputs_correctness, encoded, training = True, padding_mask = masks, 
                          answer = inputs_answer, was_explained = inputs_was_explained, nums = inputs_num_decoder,
                 calls = calls_decoder)

outputs = tf.keras.layers.Dense(4, activation = 'softmax')(decoded)

model = Model(inputs, outputs)

In [17]:
model.load_weights('./weights/saint_base.h5')

In [18]:
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_10 (InputLayer)           [(None, 128)]        0                                            
__________________________________________________________________________________________________
input_11 (InputLayer)           [(None, 128)]        0                                            
__________________________________________________________________________________________________
input_14 (InputLayer)           [(None, 128, 2)]     0                                            
__________________________________________________________________________________________________
input_13 (InputLayer)           [(None, 128)]        0                                            
____________________________________________________________________________________________

In [None]:
# with tf.device('/CPU:0'):
#     encoded, masks = encoder(x[0], training = True, 
#                              container = x[1], was_tagged = x[2], 
#                              parts = x[3], nums = x[4])

In [None]:
# with tf.device('/CPU:0'):
#     out = decoder(x[5], encoded, training = True, padding_mask = masks, 
#                               answer = x[6], was_explained = x[7], nums = x[8])

In [19]:
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
                    from_logits=True, reduction='none')


# pred batch_size, seq_lenght, 3

def loss_function(real, pred):
    mask1 = tf.math.logical_not(tf.math.equal(real, 3))
    loss_ = loss_object(real, pred)
    mask1 = tf.cast(mask1, dtype=loss_.dtype)
    loss_ *= mask1
    return tf.reduce_mean(loss_)

def acc(true, pred):
    mask = tf.cast(tf.math.logical_not(tf.math.equal(true, 3)),dtype = true.dtype)
    
    pred = pred[:,:,:3]
    pred = tf.math.argmax(pred, axis=-1, output_type=tf.dtypes.int64, name=None)
    pred = tf.cast(pred, dtype = true.dtype)
    
    pred = pred*mask
    true = true*mask
    
    equal = tf.cast(tf.math.equal(pred, true), dtype = true.dtype)
    
    n_equal = tf.math.reduce_sum(equal)
    n_mask = tf.math.reduce_sum(mask)
    n_tot = tf.math.reduce_sum(tf.cast(tf.math.greater(true, -1), dtype = true.dtype))
    n_masked = n_tot - n_mask
    
    return (n_equal - n_masked) / (n_tot - n_masked)


In [20]:
class Roc_Auc(tf.keras.callbacks.Callback):

    def __init__(self, train = None, validation=None):
        super(Roc_Auc, self).__init__()
        self.train = train
        self.validation = validation

    def on_epoch_end(self, epoch, logs={}):
        
        x_val, y_val = self.validation[0], self.validation[1]
        
        pred = self.model.predict(x_test, verbose = 0)
        
        pred = pred[:,:,2]
        y_pred = pred.reshape(pred.shape[0]*pred.shape[1])
        true = y_test.reshape(pred.shape[0]*pred.shape[1])

        y_pred = y_pred[true != 3]
        true = true[true != 3]
        
        y_pred = y_pred[true != 0]
        true = true[true != 0]
        
        true = true - 1 
        
        metric = roc_auc_score(true, y_pred)
        logs['roc_auc_val'] = metric
        print(logs)

In [21]:
from tensorflow.keras.optimizers import Adam, SGD

loss_classif     =  loss_function # find the right loss for multi-class classification
optimizer        =  Adam(3e-5, 1e-8) # find the right optimizer
metrics_classif  =  [acc]

model.compile(loss=loss_classif,
              optimizer=optimizer,
              metrics=metrics_classif)

In [22]:
train_gen = DataGenerator(batch_size=64, max_len = max_len, folder = 'user_batch_saint')
test_gen = DataGenerator(batch_size=1024, max_len = max_len, folder = 'user_batch_saint_test')
x_test, y_test = test_gen[0]

In [23]:
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ReduceLROnPlateau

early = EarlyStopping(monitor='val_loss', min_delta=0.0001, patience=7, verbose=1, 
                                                mode='auto', restore_best_weights=True)
reduce = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=3, verbose=1, 
                                                 mode='auto', min_delta=0.0001, cooldown=0, min_lr=0)
roc = Roc_Auc(validation = (x_test,  y_test))

bs = 32
n_epochs = 50
steps_per_epoch = 1250
# steps_per_epoch = 500
#, batch_size=bs
history = model.fit(train_gen, epochs=n_epochs,steps_per_epoch = steps_per_epoch, validation_data=(x_test,  y_test), callbacks = [early, reduce, roc])

  ...
    to  
  ['...']
Train for 1250 steps, validate on 1024 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 00004: ReduceLROnPlateau reducing learning rate to 2.9999999242136257e-06.
{'loss': 0.42661825578212736, 'acc': 0.70062006, 'val_loss': 0.4315062630921602, 'val_acc': 0.70532787, 'lr': 3e-05, 'roc_auc_val': 0.7588080238672412}
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 00012: ReduceLROnPlateau reducing learning rate to 2.9999998787388907e-07.
{'loss': 0.4295765555381775, 'acc': 0.70300025, 'val_loss': 0.4306689687073231, 'val_acc': 0.7059049, 'lr': 2.9999999e-06, 'roc_auc_val': 0.7633278097815374}
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 00015: ReduceLROnPlateau reducing learning rate to 2.999999821895472e-08.
{'loss': 0.4267364072084427, 'acc': 0.7032428, 'val_loss': 0.43042660132050514, 'val_acc': 0.70720744, 'lr': 2.9999998e-07, 'roc_auc_val': 0.7626915321390921}
Epoch 16/50
{'loss': 0.428847366809845, 'acc':

KeyboardInterrupt: 

In [24]:
model.save_weights('./weights/saint_base.h5')

In [None]:
help(model.save_weights)

In [None]:
model.save('./model')

In [None]:
pred = model.predict(x_test, verbose = 1)

In [None]:
pred = pred[:,:,1]
y_pred = pred.reshape(1024*384)
true = y_test.reshape(1024*384)

y_pred = y_pred[true != 2]
true = true[true != 2]

In [None]:
from sklearn.metrics import accuracy_score

accuracy_score(true, (y_pred >= 0.5)*1)

In [None]:
from sklearn.metrics import roc_auc_score
roc_auc_score(true, y_pred)

In [None]:
model.save_weights('./weights/lstmgpt_auc_0.757.h5')

In [None]:
from sklearn.metrics import roc_auc_score
def acc(true, pred):
    true1 = np.array(true)
    pred1 = np.array(pred)
    
    pred1 = pred1[true1 < 2]
    true1 = true1[true1 < 2]
    
    
    if true1.sum() == 0 or true1.sum() == len(true1):
        true1 = np.concatenate([true1, np.array([0,1])])
        pred1 = np.concatenate([pred1, np.array([0,1])])
    
    return roc_auc_score(true1, pred1)

def test(true, pred):
    p = []
    
    pred2 = pred.reshape(true.shape[0] * true.shape[1])
    true2 = true.reshape(true.shape[0] * true.shape[1])
    pred2 = pred2[true2 < 2]
    true2 = true2[true2 < 2]
    
    print(roc_auc_score(true2, pred2))
    
    for i, elt in enumerate(tqdm(true)):
#         print(pred[i])
        p.append(acc(elt, pred[i]))
    
    plt.figure(figsize = (25,15))
    plt.hist(p, bins = 50)
    
    print(np.mean(p))
    
    return p

In [None]:
pred = model.predict(X_test)

In [None]:
pred.shape

In [None]:
pred = pred[:,:,:2]

def softmax(tab):
    e = np.exp(tab)
    s = np.sum(e, axis = -1)
        
    return e[:,:,1] / s

pred = softmax(pred)


In [None]:
pred.shape

In [None]:
pred[1]

In [None]:
y_test[0]

In [None]:
perf = test(y_test, pred)

## Ameliorations

add context on lecture and tasks

cluster lecture and tasks

give average score of a given task

enhance test set with train set (optimization constraint)
