In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
import os
import random
from copy import deepcopy
import _pickle as pickle
import gc
from multiprocess import Pool
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import KBinsDiscretizer

from tensorflow.keras.optimizers import Adam, SGD
def save(file,name, folder = ""):
    """Pickle ``file`` to ``<name>.pickle``, optionally inside ``./<folder>/``.

    Parameters
    ----------
    file : object
        Any picklable object to serialise (pickle protocol 4 is used).
    name : str
        Target file name without the ``.pickle`` extension.
    folder : str, optional
        Sub-directory, relative to the current working directory, that holds
        the file. With the default empty string the file is written to
        ``<name>.pickle`` directly.

    Raises
    ------
    OSError
        If the target path cannot be opened for writing (for instance when
        ``folder`` does not exist).
    """
    if folder != "":
        path = './'+folder+'/'+name+'.pickle'
    else:
        path = name+'.pickle'
    # The context manager flushes and closes the handle deterministically; the
    # previous `outfile.close` (without parentheses) never called close().
    with open(path, 'wb') as outfile:
        pickle.dump(file, outfile, protocol=4)
    
def load(name, folder = ""):
    """Read back an object pickled under ``<name>.pickle``.

    Parameters
    ----------
    name : str
        File name without the ``.pickle`` extension.
    folder : str, optional
        Sub-directory, relative to the current working directory, that holds
        the file. With the default empty string ``<name>.pickle`` is opened
        directly.

    Returns
    -------
    object
        The unpickled object.

    Raises
    ------
    OSError
        If the file cannot be opened for reading.
    """
    if folder != "":
        path = './'+folder+'/'+name+'.pickle'
    else:
        path = name+'.pickle'
    # SECURITY: unpickling executes arbitrary code embedded in the file, so
    # only use this on files produced by this project.
    # The context manager closes the handle; the previous `outfile.close`
    # (without parentheses) never called close().
    with open(path, 'rb') as infile:
        return pickle.load(infile)

class Discretiser:
    """Quantile-based binning of a 1-D numeric array into integer codes.

    ``fit`` records the empirical quantiles of the training values at evenly
    spaced levels; ``transform`` interpolates new values onto those levels and
    truncates the scaled result to an integer code.
    """

    def __init__(self, nbins):
        """Store ``nbins - 1`` as the scale and build the evenly spaced levels."""
        scale = nbins - 1
        self.nbins = scale
        # Levels 0, 1/scale, ..., (scale-1)/scale; the level 1.0 itself is
        # only reached through the `right=1` clamp in `transform`.
        self.map_to = np.arange(scale) / scale

    def fit(self, X):
        """Record the quantiles of ``X`` (a one-dimensional array) at each level."""
        levels = self.map_to
        self.map_from = np.quantile(X, levels)

    def transform(self, X):
        """Map ``X`` to integer codes using the fitted quantiles.

        Values below the smallest fitted quantile map to 0 and values above
        the largest one map to ``self.nbins``.
        """
        fractions = np.interp(X, self.map_from, self.map_to, left=0, right=1)
        scaled = fractions * self.nbins
        return scaled.astype(int)
    
from tf_transformers2 import *
from tensorflow.keras.layers import Input, Dense, Dropout, TimeDistributed, LSTM
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

In [None]:
# Read ./user_batch_2000/batch_1.pickle through the `load` helper.
dico_batch = load('batch_1', 'user_batch_2000')

In [None]:
# Pick a single entry (key 11084192) to try the feature builder on.
# presumably the keys are user ids — verify against how the batch was written
test_user = dico_batch[11084192]

In [None]:
# Display the selected entry.
test_user

In [None]:
## features present
exercise to predict    ## cat
part of exercise       ## cat
gtag of exercise       ## cat
cluster of exercise    ## cat
qmean on user          ## num
container              ## num
timestamp              ## num

## feature past
avg correctness
number explanation
number question
number lecture
avg elapsed time

## per part
avg correct
number explained
number lecture
number question
avg elapsed time
time since first question
time since last question
time since last lecture

## Per cluster
avg correct
number explained
number lecture
number question
avg elapsed time
time since first question
time since last question
time since last lecture

In [None]:
## Segments          28
user                 1
content              1
parts                6
cluster              20



## Time windows      17 possible
history
# based on id
first five / ten / twenty
last five / ten / twenty
# Based on time
last 5/12/24 hours
first 5/12/24 hours
first week/month
last week/month

## KPIs              4 possible
interaction_type
time spent
time elapsed
explanation

## KPI types         7 possible
count
mean
hmean
std
min
max
slope - % improvement

#####################################

## Total: 28 segments x 17 time windows x 4 KPIs x 7 KPI types = 13328 features

In [None]:
# Pickled artefacts read from the working directory with `load`; they become
# module-level globals used by the helper functions defined below.
# `tokenizer` must expose `texts_to_sequences` (see `apply_tokenizer`).
tokenizer = load('tokenizer')
# 'dico_tags' unpacks into four objects; the fourth one is discarded.
dico_utags, dico_gtags, dico_parts, _ = load('dico_tags')
# Looked up per exercise id to produce the 'qmean' feature.
dico_question = load('dico_questions_mean')
# Looked up per exercise id to produce the 'cluster' feature.
dico_cluster = load('transformer_clusters')
# Four encoders stored together; none of them is referenced in the code visible here.
timestamp_enc, elapsed_enc,lag_time_enc, qmean_enc = load('discrete_encoders')
# Only referenced by the commented-out transformer features in `build_lgb_features`.
reducer = load('umap_reducer')

def map_dict(ids, dico):
    """Look every element of ``ids`` up in ``dico``, defaulting to 0.

    Parameters
    ----------
    ids : iterable
        Keys to translate.
    dico : mapping (or any object supporting ``dico[key]``)
        Lookup table.

    Returns
    -------
    numpy.ndarray
        One entry per element of ``ids``; keys that cannot be resolved are
        mapped to 0.
    """
    def replace_dico(x):
        try:
            return dico[x]
        # Only lookup failures fall back to 0. The previous bare `except:`
        # also swallowed KeyboardInterrupt/SystemExit and hid genuine bugs.
        except (KeyError, IndexError, TypeError):
            return 0
    return np.array([replace_dico(x) for x in ids])

def map_question_type(ids):
    """Encode each id by its first character: 'l' -> 1, 'q' -> 0, anything else -> -1.

    Returns a numpy array with one code per element of ``ids``.
    """
    def code_for(identifier):
        first = identifier[0]
        return 1 if first == 'l' else (0 if first == 'q' else -1)

    return np.array([code_for(identifier) for identifier in ids])

def apply_tokenizer(ids):
    """Convert ``ids`` to a flat array of tokens with the module-level ``tokenizer``.

    The reshape to ``len(ids)`` only succeeds when every id yields exactly one
    token.
    """
    sequences = tokenizer.texts_to_sequences(ids)
    expected_length = len(ids)
    return np.array(sequences).reshape(expected_length)

def divide(a, b):
    """Element-wise ``a / b`` in which zero denominators are replaced by 1e9.

    ``b`` itself is left untouched; the substitution happens on a copy, so a
    zero denominator yields a quotient close to 0 instead of inf/nan.
    """
    denominator = deepcopy(b)
    zero_positions = (denominator == 0)
    denominator[zero_positions] = 1e9
    return a / denominator

def propagate_values(x):
    """Forward-fill zeros: every 0 after the first element takes the last kept value.

    The first element is always kept as is, even when it is 0.
    """
    filled = [x[0]]
    for value in x[1:]:
        filled.append(filled[-1] if value == 0 else value)
    return np.array(filled)

def minimum(x):
    """Return ``x.min()``, or 0 when no minimum can be computed.

    The fallback covers empty arrays (numpy raises ValueError) and inputs
    without a usable ``min`` method.
    """
    try:
        return x.min()
    # Narrowed from a bare `except:` so that KeyboardInterrupt/SystemExit and
    # unrelated errors are no longer silently turned into 0.
    except (ValueError, AttributeError, TypeError):
        return 0
    
def treat_neg(x):
    """Set every non-positive entry of ``x`` to 0, in place, and return ``x``."""
    non_positive = (x <= 0)
    x[non_positive] = 0
    return x

In [None]:
def _segment_features(correct, id_type, elapsed, expl, timest, mask):
    """Compute the 12 running-history features restricted to the rows selected by ``mask``.

    Parameters
    ----------
    correct, id_type, elapsed, expl : numpy.ndarray
        History arrays as prepared in ``build_lgb_features`` (already shifted
        there). ``id_type`` is 0 for questions and 1 for lectures.
    timest : numpy.ndarray
        Timestamps of the interactions (not shifted).
    mask : numpy.ndarray of bool
        Rows belonging to the segment; an all-True mask gives the global
        features.

    Returns
    -------
    list of numpy.ndarray
        ``[nb_qu, nb_correct, avg_correct, std_correct, nb_le, nb_expl,
        avg_el, std_el, t_first_question, t_last_question, t_first_lecture,
        t_last_lecture]``.
    """
    is_question = (id_type == 0)
    is_lecture = (id_type == 1)

    # Correctness statistics. `divide` yields ~0 while no question has been seen.
    nb_correct = np.cumsum(correct * is_question * mask)
    nb_qu = np.cumsum(is_question * mask)
    avg_correct = divide(nb_correct, nb_qu)
    # Squared deviations are taken from the running mean at each step.
    std_correct = np.sqrt(divide(np.cumsum(np.square(correct * is_question * mask - avg_correct)), nb_qu))

    nb_le = np.cumsum(is_lecture * mask)
    nb_expl = np.cumsum((expl == 1) * mask)

    # Elapsed-time statistics. The deviation is centred on the running mean of
    # the elapsed time; it was previously centred on `avg_correct` by mistake.
    elapsed_q = elapsed * is_question * mask
    avg_el = divide(np.cumsum(elapsed_q), nb_qu)
    std_el = np.sqrt(divide(np.cumsum(np.square(elapsed_q - avg_el)), nb_qu))

    # Time since the last question / lecture of the segment (0 for the first row).
    t_last_question = propagate_values(timest * is_question * mask)
    t_last_question = treat_neg(np.concatenate([[0], timest[1:] - t_last_question[:-1]]))

    t_last_lecture = propagate_values(timest * is_lecture * mask)
    t_last_lecture = treat_neg(np.concatenate([[0], timest[1:] - t_last_lecture[:-1]]))

    # Time since the first non-zero question / lecture timestamp of the segment.
    t_first_question = timest * is_question * mask
    t_first_question = treat_neg(timest - minimum(t_first_question[t_first_question != 0]))

    t_first_lecture = timest * is_lecture * mask
    t_first_lecture = treat_neg(timest - minimum(t_first_lecture[t_first_lecture != 0]))

    return [nb_qu, nb_correct, avg_correct, std_correct, nb_le, nb_expl,
            avg_el, std_el,
            t_first_question, t_last_question, t_first_lecture, t_last_lecture]


def build_lgb_features(user_dico):
    """Build the tabular feature frame and the targets for one user.

    Parameters
    ----------
    user_dico : mapping
        Must provide the keys 'exercise_id', 'container_id', 'timestamp',
        'correctness', 'elapsed_time' and 'prior_question_had_explanation'.

    Returns
    -------
    (pandas.DataFrame, array)
        The frame has one row per interaction: 7 present-interaction columns
        followed by 12 history columns computed globally, per part (1..6) and
        per cluster (0..19). The second element is a copy of
        ``user_dico['correctness']``.

    Notes
    -----
    Relies on the module-level ``tokenizer`` (through ``apply_tokenizer``),
    ``dico_parts``, ``dico_gtags``, ``dico_cluster`` and ``dico_question``.

    NOTE(review): ``elapsed`` and ``expl`` are prefixed with 0 but, unlike the
    other history arrays, not truncated with ``[:-1]``; they still have to
    broadcast against ``id_type``. Confirm the length of the 'elapsed_time'
    and 'prior_question_had_explanation' arrays.
    """
    features = []

    ids = user_dico['exercise_id']
    num_ids = apply_tokenizer(ids)
    parts = map_dict(ids, dico_parts)
    gtags = map_dict(ids, dico_gtags)
    cl = map_dict(ids, dico_cluster)
    id_type = map_question_type(ids)
    qmean = map_dict(ids, dico_question)
    cont = user_dico['container_id']
    timest = user_dico['timestamp']
    correct = user_dico['correctness']
    elapsed = user_dico['elapsed_time']
    expl = user_dico['prior_question_had_explanation']

    y = deepcopy(user_dico['correctness'])

    # Features describing the interaction being predicted.
    headers = ['id_question', 'part', 'gtags', 'cluster', 'qmean', 'cont', 'timest']
    features.extend(deepcopy([num_ids, parts, gtags, cl, qmean, cont, timest]))

    ## Past processing: shift by one row so that each row only sees earlier interactions
    parts = np.concatenate([[-1], parts])[:-1]
    cl = np.concatenate([[-1], cl])[:-1]
#     timest = np.concatenate([[0], timest])
    correct = np.concatenate([[0], correct])[:-1]
    elapsed = np.concatenate([[0], elapsed])
    expl = np.concatenate([[0], expl])
    id_type = np.concatenate([[-1], id_type])[:-1]

    # Global history: same computation as for a segment, with every row selected.
    headers.extend(['nb_question', 'nb_correct', 'avg_correct','std_correct',
                    'nb_lecture', 'nb_explained', 'avg_time', 'std_time',
                    'time_first_question', 'time_last_question', 'time_first_lecture', 'time_last_lecture'])
    every_row = np.ones(id_type.shape, dtype=bool)
    features.extend(_segment_features(correct, id_type, elapsed, expl, timest, every_row))

    base_header = ['nb_question', 'nb_correct', 'avg_correct','std_correct', 'nb_lecture', 'nb_explained',
                   'avg_time', 'std_time',
                   'time_since_first_question', 'time_since_last_question','time_since_first_lecture', 'time_since_last_lecture']

    # Parts
    for i in range(1,7):
        mask = (parts == i)
        headers.extend([elt + '_part_'+str(i) for elt in base_header])
        features.extend(_segment_features(correct, id_type, elapsed, expl, timest, mask))

    # Clusters
    for i in range(20):
        mask = (cl == i)
        headers.extend([elt + '_cluster_'+str(i) for elt in base_header])
        features.extend(_segment_features(correct, id_type, elapsed, expl, timest, mask))

    ## Transformers features
#     size = len(ids)
#     input_seq = build_sequence(user_dico)
    
#     with tf.device('/GPU:0'):
#         p1 = model_inf(input_seq).numpy()
    
#     pred = np.concatenate([p1, input_seq[-1]], axis = -1)[0][:size]
#     new_feats = reducer.transform(pred)
    
#     headers.extend(['t_'+str(i) for i in range(new_feats.shape[1])])
    
#     features.extend(list(new_feats.T))

    return pd.DataFrame(np.array(features).T, columns = headers), y

# number question
# number correct
# avg correct
# std correct
# number lecture
# number explained

# avg elapsed time
# std elapsed time

# time since first question
# time since last question
# time since last lecture  

In [None]:
%%time
# Time one run of the feature builder on the single test user.
f, y = build_lgb_features(test_user)

In [None]:
# Let pandas display up to 150 columns before truncating.
pd.set_option('display.max_columns', 150)

In [None]:
# Keep the feature column names, then append the label as the last column.
features = f.columns
f['target'] = y

In [None]:
max_len = 1  # only referenced by the commented-out length filter below

# Build one feature frame per user for batches 1..3 and collect them in `df`.
df = []
for i in range(1,4):
    dico_batch = load('batch_'+str(i), 'user_batch_2000')
    for elt in tqdm(dico_batch):
        user_dico = dico_batch[elt]
        seq_len = user_dico['exercise_id'].shape[0]
#         if seq_len <= max_len:
        f, y = build_lgb_features(user_dico)
        features = f.columns
        f['target'] = y
        # Every row is indexed by the batch key so rows can later be selected per key.
        f.index = [elt for i in range(f.shape[0])]
        df.append(f.copy())
    
    

In [None]:
df = pd.concat(df)

In [None]:
df.shape

In [None]:
df = df[df['target'] != -1]

In [None]:
cats_features = ['id_question', 'part', 'gtags', 'cluster', 'nb_question']
for elt in cats_features:
    df[elt] = df[elt].astype(int)

In [None]:
# df = df[df['nb_question']>=20]

In [None]:
ids = np.unique(df.index)

In [None]:
tcols = ['t_'+str(i) for i in range(32)]

In [None]:
from sklearn.model_selection import train_test_split
ids_train, ids_test, _, _ = train_test_split(ids, ids, test_size=0.2, random_state=42)

X_train = df.loc[ids_train][df.columns[:-1]].copy()
# X_train = df.loc[ids_train][tcols].copy()
y_train = df.loc[ids_train]['target'].copy().values

X_test = df.loc[ids_test][df.columns[:-1]].copy()
# X_test = df.loc[ids_test][tcols].copy()
y_test = df.loc[ids_test]['target'].copy().values

In [None]:
# The full frame is no longer needed once the train/test copies exist.
del df
gc.collect()

In [None]:
import lightgbm as lgb
# Baseline gradient-boosting classifier: unlimited depth, 500 trees, 12 threads.
clf = lgb.LGBMClassifier(max_depth = -1, n_estimators = 500, n_jobs = 12, silent = False)
# The held-out rows double as the evaluation set, so the AUC logged while
# fitting and the AUC computed below are measured on the same data.
clf.fit(X_train, y_train, eval_set =(X_test, y_test), eval_metric = 'auc')

In [None]:
# Probability of the positive class, scored with ROC AUC.
pred = clf.predict_proba(X_test)[:,1]
roc_auc_score(y_test, pred)

In [None]:
sorted_f = np.argsort(clf.feature_importances_)

for elt in range(len(sorted_f)):
    print(clf.feature_importances_[sorted_f[-elt-1]], clf.feature_name_[sorted_f[-elt-1]])

In [None]:
plt.figure(figsize = (25,15))
plt.bar(clf.feature_name_, )
plt.xticks(rotation = 90)

In [None]:
def split_cat_num(X,y, cat_cols = ['id_question', 'part', 'gtags', 'cluster'], batch_size = 256):
    """Split a feature frame into per-column categorical inputs plus one numeric block.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame containing every column named in ``cat_cols``.
    y : array-like
        Targets, returned unchanged.
    cat_cols : list of str
        Columns emitted individually, each as an ``(n, 1)`` array.
    batch_size : int
        Currently unused (the truncation to whole batches is commented out).

    Returns
    -------
    (list of numpy.ndarray, array-like)
        One ``(n, 1)`` array per categorical column, followed by a single 2-D
        array holding all remaining columns; then ``y``.
    """
    model_inputs = [X[column].values.reshape(-1, 1) for column in cat_cols]

    numeric_names = [column for column in X.columns if column not in cat_cols]
    model_inputs.append(X[numeric_names].values)

    return model_inputs, y

In [None]:
# save((X_train, y_train, X_test, y_test), 'batch_save_tabnet')

# Replaces the in-memory split with the one cached in batch_save_tabnet.pickle.
(X_train, y_train, X_test, y_test) = load('batch_save_tabnet')

In [None]:
X_train.shape

In [None]:
# From here on X_train / X_test are lists: one (n, 1) array per categorical
# column followed by the numeric block. `batch_size` is ignored by split_cat_num.
# NOTE(review): not idempotent — re-running this cell would pass the lists back
# into split_cat_num.
X_train, y_train = split_cat_num(X_train,y_train, batch_size = 2048)
X_test, y_test = split_cat_num(X_test, y_test,batch_size = 2048)

from sklearn.preprocessing import StandardScaler
# Standardise the numeric block only; the scaler is fitted on the training rows
# and reused for the test rows.
sc = StandardScaler().fit(X_train[-1])
X_train[-1] = sc.transform(X_train[-1])
X_test[-1] = sc.transform(X_test[-1])

In [None]:
# save((X_train, y_train, X_test, y_test), 'batch_save_tabnet')

# (X_train, y_train, X_test, y_test) = load('batch_save_tabnet')

In [None]:
# Largest code of each categorical input in the training set; each must stay
# below the matching embedding input size used when the model is built.
for i,elt in enumerate(X_train[:-1]):
    print(i)
    print(np.max(elt))

In [None]:
# Same check on the test set.
for i,elt in enumerate(X_test[:-1]):
    print(i)
    print(np.max(elt))

In [None]:
# Shape of the numeric block (rows, number of numeric features).
X_train[-1].shape

In [None]:
import tensorflow as tf
from tabnet import *

# num_features matches the width of the concatenated input built below:
# 128 + 8 + 30 + 20 embedding dimensions plus the 327-wide numeric input = 513.
tabnet_encoder = TabNet(
        num_features = 513,
        feature_dim = 128,
        output_dim = 128,
        feature_columns = None,
        n_step = 4,
        n_total = 4,
        n_shared = 2,
        relaxation_factor = 1.5,
        bn_epsilon = 1e-5,
        bn_momentum = 0.7,
        bn_virtual_divider = 20,
    )


# Four scalar inputs (one per categorical column) and one 327-wide numeric input.
inputs = [tf.keras.Input(shape=(1,)) for i in range(4)] + [tf.keras.Input(shape=(327,))]

# Embedding input sizes and output dimensions, in the order of the four scalar inputs.
# presumably id_question, part, gtags, cluster (the default cat_cols of split_cat_num) — confirm
in_size = [14000,8,100,20]
out_size = [128,8,30,20]

# Embed each categorical input, drop the length-1 axis, then append the numeric input.
agg = [tf.squeeze(tf.keras.layers.Embedding(in_size[i], out_size[i])(inputs[i]), axis = 1) for i in range(len(in_size))] + [inputs[-1]]
agg = tf.keras.layers.Concatenate(axis = 1)(agg)

# The encoder returns two values; only `enc` feeds the output layer.
enc, masks = tabnet_encoder(agg)

# Single sigmoid unit for the binary target.
out = tf.keras.layers.Dense(1, activation = 'sigmoid')(enc)
model = tf.keras.Model(inputs, out)

In [None]:
model.summary()

In [None]:
optimizer = tf.keras.optimizers.Adam(learning_rate=0.02, beta_1=0.9, beta_2=0.999)

# Binary cross-entropy loss; accuracy and AUC are reported as metrics.
model.compile(
        loss = 'binary_crossentropy',
        optimizer = optimizer,
        metrics = ['accuracy', 'AUC'])

In [None]:
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ReduceLROnPlateau

# Stop after 9 epochs without val_loss improvement and restore the best weights.
early = EarlyStopping(monitor='val_loss', min_delta=0.0001, patience=9, verbose=1, 
                                                mode='auto', restore_best_weights=True)

# Multiply the learning rate by 0.2 after 4 epochs without val_loss improvement.
reduce = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=4, verbose=1, 
                           mode='auto', min_delta=0.0001, cooldown=0, min_lr=0)

callbacks =[early, reduce]

# `epochs` is only an upper bound; early stopping ends training sooner.
epochs = 1000
batch_size = 20000

# Truncate both sets to a whole number of batches so every batch is full.
ls = X_train[0].shape[0]//batch_size * batch_size
lt = X_test[0].shape[0]//batch_size * batch_size

model.fit([elt[:ls] for elt in X_train], y_train[:ls], 
          validation_data = ([elt[:lt] for elt in X_test], y_test[:lt]), 
          batch_size = batch_size, epochs = epochs, callbacks=callbacks)

In [None]:
# Number of test rows expressed in batches of 256.
X_test[0].shape[0]/256

In [None]:
# NOTE(review): duplicate of the earlier LightGBM cell. When the notebook is
# run top to bottom, X_train / X_test are lists of arrays here (output of
# split_cat_num), not DataFrames — confirm this cell is still meant to run at
# this position.
import lightgbm as lgb
clf = lgb.LGBMClassifier(max_depth = -1, n_estimators = 500, n_jobs = 12, silent = False)
clf.fit(X_train, y_train, eval_set =(X_test, y_test), eval_metric = 'auc')