# **Riiid! Answer Correctness Prediction**

We are going to predict if the user will predict the correct answer ... or not.

For that, we will use LGBMClassifier model.

## IMPORTS

In [1]:
import eli5
import gc
import lightgbm as lgb
import matplotlib.pyplot as plt
import nltk
import numpy as np
import os
import pandas as pd
import riiideducation
import seaborn as sns
import time


from collections import defaultdict
from eli5.sklearn import PermutationImportance
from hyperopt import hp
from hyperopt import fmin, tpe, STATUS_OK, Trials
from hyperopt.pyll import scope
from lightgbm import LGBMClassifier
from sklearn import pipeline
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from tqdm.notebook import tqdm



## GLOBAL PARAMETERS

In [2]:
IS_TIMESTAMPS_IN_MONTH = False
IS_TIMESTAMPS_IN_DAY = False
IS_TIMESTAMPS_IN_HOUR = True

IS_PRIOR_TIME_IN_HOUR = True

IS_FIRST_QUESTION = True

IS_TEST = False

IS_EVAL = False

NBMILLISEC_IN_HOUR = 1000 * 60 * 60
nbmillisec_in_hour = 1000 * 60 * 60

## FUNCTIONS

In [3]:
def tags_column_processing(tags):
    '''Function which replace existing tags list by 
    a new list tags by keeping only tags in list SELECTED_TAGS '''
    new_tags = []
    tags_words = tags.split()
    for tag in tags_words:
        if (tag in SELECTED_TAGS) :
            new_tags.append(tag)   
    return new_tags


def preprocessing_data(data_df):
    '''Function which preprocess data (test and train)'''
    data_df = data_df[data_df['content_type_id'] == 0]
    data_df = data_df.drop(['content_type_id'], axis=1)
    data_df['prior_had_explanation_le'] = 0
    data_df.loc[data_df["prior_question_had_explanation"]== True,
                "prior_had_explanation_le"] = 1
    data_df = data_df.drop(['prior_question_had_explanation'], axis=1)
    
    if(IS_FIRST_QUESTION):
        data_df['is_first_question'] = 0
        data_df.loc[(data_df['prior_question_elapsed_time'].isna()) | (data_df['timestamp'] == 0),\
                    'is_first_question'] = 1
        data_df = data_df.astype({"is_first_question":'int8'}) 
    
    if(IS_TIMESTAMPS_IN_MONTH):
        data_df['timestamp_in_month'] = data_df['timestamp'] / (nbmillisec_in_hour * 24 * 365 / 12)
        data_df = data_df.astype({"timestamp_in_month":'float32'}) 
        
    if(IS_TIMESTAMPS_IN_DAY):
        data_df['timestamp_in_day'] = data_df['timestamp'] / (nbmillisec_in_hour * 24)
        data_df = data_df.astype({"timestamp_in_day":'float32'})
    
    if(IS_TIMESTAMPS_IN_HOUR):
        data_df['timestamp_in_hour'] = data_df['timestamp'] / (nbmillisec_in_hour)
        data_df = data_df.astype({"timestamp_in_hour":'float32'}) 
        
    prior_question_elapsed_time_mean = data_df[~data_df['prior_question_elapsed_time'].\
                                               isna()]['prior_question_elapsed_time'].mean()
    data_df['prior_question_elapsed_time'].fillna(prior_question_elapsed_time_mean, inplace=True)
    if(IS_PRIOR_TIME_IN_HOUR):
        data_df['prior_time_in_hours'] = data_df['prior_question_elapsed_time'] / nbmillisec_in_hour
        data_df = data_df.astype({"prior_time_in_hours":'float32'}) 
    return data_df

## Data loading

We will follow this tutorials : [Competition API Detailed Introduction](https://www.kaggle.com/sohier/competition-api-detailed-introduction) and 
[Tutorial on reading large datasets](http://https://www.kaggle.com/rohanrao/tutorial-on-reading-large-datasets/).

In [4]:
%%time

questions_df = pd.read_csv('/kaggle/input/riiid-test-answer-prediction/questions.csv')
example_test = pd.read_csv('/kaggle/input/riiid-test-answer-prediction/example_test.csv')
example_sample_submission = pd.read_csv('/kaggle/input/riiid-test-answer-prediction/example_sample_submission.csv')

CPU times: user 22.7 ms, sys: 4.35 ms, total: 27 ms
Wall time: 32.9 ms


In [5]:
%%time

dtypes = {
    "row_id": "int64",
    "timestamp": "int64",
    "user_id": "int32",
    "content_id": "int16",
    "content_type_id": "int8",
    "task_container_id": "int16",
    "user_answer": "int8",
    "answered_correctly": "int8",
    "prior_question_elapsed_time": "float32", 
    "prior_question_had_explanation": "boolean"
}

train_df = pd.read_csv('/kaggle/input/riiid-test-answer-prediction/train.csv', 
                       low_memory=False, 
                       nrows=10**6, 
                       dtype=dtypes
                      )
print("Train size:", train_df.shape)

Train size: (1000000, 10)
CPU times: user 2.18 s, sys: 339 ms, total: 2.52 s
Wall time: 2.62 s


In [6]:
train_df = preprocessing_data(train_df)

In [7]:
train_df.memory_usage(deep=True)

Index                          7840744
row_id                         7840744
timestamp                      7840744
user_id                        3920372
content_id                     1960186
task_container_id              1960186
user_answer                     980093
answered_correctly              980093
prior_question_elapsed_time    3920372
prior_had_explanation_le       7840744
is_first_question               980093
timestamp_in_hour              3920372
prior_time_in_hours            3920372
dtype: int64

## We keep 2 years (cf EDA)

In [8]:
train_df = train_df[train_df['timestamp'] <= (2 * 365 * 24 * 60 *60 *1000)]

## questions.csv

In [9]:
questions_df['tags'].fillna("", inplace=True)
questions_df["nb_tags"] = questions_df["tags"].apply(lambda text: len(text.split()))
questions_df = questions_df[questions_df["nb_tags"] > 0]

In [10]:
questions_df['tags_list'] = questions_df['tags'].apply(lambda x: x.split())
tags_list = [item for sublist in questions_df['tags_list'].values for item in sublist]

tags_unique_list = list(set(tags_list))
if(IS_TEST):
    print(len(tags_list))
    print(len(tags_unique_list))

In [11]:
tags_frequence= nltk.FreqDist(tags_list)
NB_TAGS = 25
words_most_common = tags_frequence.most_common(NB_TAGS)
fq_words_df = pd.DataFrame(words_most_common, columns = ['tags' , 'nb_tags'])

SELECTED_TAGS = (fq_words_df['tags'][:NB_TAGS]).to_list()
#print(SELECTED_TAGS)
questions_df['new_tags'] = questions_df["tags"].apply(lambda text : tags_column_processing(text))
classes_tags = tuple(SELECTED_TAGS)
one_hot = MultiLabelBinarizer(classes = classes_tags )

types_encoded = pd.DataFrame(one_hot.fit_transform(questions_df['new_tags']),columns=one_hot.classes_)
#types_encoded.head()
questions_df = pd.concat([questions_df,types_encoded], axis = 1)

In [12]:
# Drop column B as it is now encoded
questions_df = questions_df.drop(['new_tags', 'tags', 'tags_list'],axis = 1)

In [13]:
questions_df = questions_df[~questions_df['part'].isna()]

In [14]:
questions_df['is_easy_part'] = 0
questions_df['is_medium_part'] = 0
questions_df['is_difficult_part'] = 0
questions_df.loc[questions_df['part'] <= 3 ,'is_easy_part'] = 1
questions_df.loc[(questions_df['part'] == 4)|(questions_df['part'] == 5) ,'is_difficult_part'] = 1
questions_df.loc[questions_df['part'] >6 ,'is_medium_part'] = 1

In [15]:
questions_df['is_reading_section']= 0 
questions_df.loc[questions_df['part'] >= 5 ,'is_reading_section'] = 1

## Merge questions and train

In [16]:
train_df = pd.merge(train_df,questions_df, how='left', left_on='content_id', right_on='question_id').sort_values('row_id')

In [17]:
user_agg = train_df.groupby('user_id')['answered_correctly'].\
                    agg(['sum', 'count'])
train_df['user_count'] = train_df['user_id'].map(user_agg['count']).astype('int32')
train_df['user_nb_mean'] = train_df['user_id'].map(user_agg['sum']/user_agg['count']).astype('int32')

In [18]:
content_agg = train_df.groupby('content_id')['answered_correctly'].\
                        agg(['sum', 'count']) #'mean', 'median', 'std'
train_df['content_count'] = train_df['content_id'].map(content_agg['count']).astype('int32')
train_df['content_nb_mean'] = train_df['content_id'].map(content_agg['sum'] / content_agg['count'])

Number of attempts

In [19]:
train_df['nb_attempt'] = train_df.groupby(['user_id', 'content_id']).cumcount()+1

In [20]:
if(IS_TEST):
    train_df.head(15)

In [21]:
def get_state_user():
    # compute user features over all train data
    answered_correctly_user = train_df.groupby('user_id')['answered_correctly'].sum().values
    answered_user = train_df.groupby('user_id')['answered_correctly'].count().values  
    mean_user_accuracy = train_df.groupby('user_id')['answered_correctly'].mean().values
    # fill dictionary with default values
    state_user = dict()
    for user_id in train_df['user_id'].unique():
        state_user[user_id] = {}
    total = len(state_user.keys())

    # add user content attempts
    user_content = train_df.groupby('user_id')['content_id'].apply(np.array).apply(np.sort).apply(np.unique)
    user_attempts = train_df.groupby(['user_id', 'content_id'])['content_id'].count().groupby('user_id').apply(np.array).values
    user_attempts -= 1
    
    for user_id, content_id, nb_attempts in tqdm(zip(state_user.keys(), user_content, user_attempts),total=total):
        state_user[user_id]['user_content_attempts'] = dict(zip(content_id, nb_attempts))
        
    del user_content, user_attempts
    gc.collect()
    
    for idx, user_id in enumerate(state_user.keys()):
        state_user[user_id]['mean_user_accuracy'] = mean_user_accuracy[idx]
        state_user[user_id]['answered_correctly_user'] = answered_correctly_user[idx]
        state_user[user_id]['answered_user'] = answered_user[idx]
    return state_user

state_user = get_state_user()

if(IS_TEST):
    display(state_user[2746])

HBox(children=(FloatProgress(value=0.0, max=3824.0), HTML(value='')))




In [22]:
def get_user_data(state, test_df):
    # updated data
    attempt, mean_user_accuracy, answered_correctly_user, answered_user = [], [], [], []
    
    for idx, (user_id, content_id) in test_df[['user_id', 'content_id']].iterrows():
        # check if user exists
        if user_id in state:
            # check if user already answered the question, if so update it to a maximum of 4
            if content_id in state[user_id]['user_content_attempts']:
                state[user_id]['user_content_attempts'][content_id] = min(4, state[user_id]['user_content_attempts'][content_id] + 1)
            # if user did not answered the question already, set the number of attempts to 0
            else:
                state[user_id]['user_content_attempts'][content_id] = 0
        
        # else create user with default values
        else:
            dict_keys = ['mean_user_accuracy', 'answered_correctly_user', 'answered_user', 'user_content_attempts']
            dict_default_vals = [0.680, 0, 0, dict(zip([content_id],[0]))]
            state[user_id] = dict(zip(dict_keys, dict_default_vals))
            
        # add user data to lists
        attempt.append(state[user_id]['user_content_attempts'][content_id])
        mean_user_accuracy.append(state[user_id]['mean_user_accuracy'])
        answered_correctly_user.append(state[user_id]['answered_correctly_user'])
        answered_user.append(state[user_id]['answered_user'])
    
    return attempt, mean_user_accuracy, answered_correctly_user, answered_user

# updates the user data
def update_user_data(state, features_questions_df, prev_test_df):
    for user_id, content_id, answered_correctly in prev_test_df[['user_id', 'content_id', 'answered_correctly']].values:
        # update user features
        state[user_id]['answered_correctly_user'] += answered_correctly
        state[user_id]['answered_user'] += 1
        state[user_id]['mean_user_accuracy'] = state[user_id]['answered_correctly_user'] / state[user_id]['answered_user']

# Baseline

In [23]:
#sc = MinMaxScaler()
sc = StandardScaler()

In [24]:
train_df.columns

Index(['row_id', 'timestamp', 'user_id', 'content_id', 'task_container_id',
       'user_answer', 'answered_correctly', 'prior_question_elapsed_time',
       'prior_had_explanation_le', 'is_first_question', 'timestamp_in_hour',
       'prior_time_in_hours', 'question_id', 'bundle_id', 'correct_answer',
       'part', 'nb_tags', '92', '38', '81', '29', '136', '162', '102', '8',
       '143', '131', '73', '21', '53', '82', '74', '106', '1', '96', '12',
       '97', '67', '55', '27', '122', '79', 'is_easy_part', 'is_medium_part',
       'is_difficult_part', 'is_reading_section', 'user_count', 'user_nb_mean',
       'content_count', 'content_nb_mean', 'nb_attempt'],
      dtype='object')

In [25]:
train_df = train_df[train_df['answered_correctly']!= -1]
features_bsl = ['prior_had_explanation_le', 'nb_attempt',
                'content_id', 'content_count', 'content_nb_mean',
                'user_nb_mean', 'user_count',
                'is_reading_section', 'user_id',
                'nb_tags',
                '92', '38', '81', '29', '136',
                '162', '102', '8', '143', '131'
                ]

if(IS_TIMESTAMPS_IN_MONTH):
    features_bsl.append('timestamp_in_month')
if(IS_TIMESTAMPS_IN_DAY):
    features_bsl.append('timestamp_in_day')
if(IS_TIMESTAMPS_IN_HOUR):
    features_bsl.append('timestamp_in_hour')
if(IS_PRIOR_TIME_IN_HOUR):
    features_bsl.append('prior_time_in_hours')
if(IS_FIRST_QUESTION):
    features_bsl.append('is_first_question')

X = train_df[features_bsl]
X = sc.fit_transform(X)
y = train_df['answered_correctly']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, shuffle=True, random_state=1)

In [26]:
features_bsl

['prior_had_explanation_le',
 'nb_attempt',
 'content_id',
 'content_count',
 'content_nb_mean',
 'user_nb_mean',
 'user_count',
 'is_reading_section',
 'user_id',
 'nb_tags',
 '92',
 '38',
 '81',
 '29',
 '136',
 '162',
 '102',
 '8',
 '143',
 '131',
 'timestamp_in_hour',
 'prior_time_in_hours',
 'is_first_question']

## Optimisation of the model (Hyperparameter tuning and crossvalidation)

### To optimize my model, I use : Bayesian Optimization (hyperopt) and stratification.

In [27]:
param_hyperopt= {
    'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(1)),
    'max_depth': scope.int(hp.quniform('max_depth', 5, 15, 1)),
    'feature_fraction': hp.uniform('feature_fraction', 0.6, 0.9),
    'n_estimators': scope.int(hp.quniform('n_estimators', 25, 300, 25)),
    'num_leaves': scope.int(hp.quniform('num_leaves', 5, 50, 1)),
    'reg_lambda': hp.uniform('reg_lambda', 0.0, 0.2),
}


def hyperopt(param_space, X_train, y_train, X_test, y_test, num_eval):
    num_folds=3

    def objective_function(params):
        clf = lgb.LGBMClassifier(random_state =1, **params)
        kf = KFold(n_splits=num_folds, shuffle=True)
        score = cross_val_score(clf, X_train, y_train, cv=kf, scoring='roc_auc', n_jobs=1).mean()
        return {'loss': 1-score, 'status': STATUS_OK}
    
    trials = Trials()
    
    best_param = fmin(objective_function, 
                      param_space, 
                      algo=tpe.suggest, 
                      max_evals=num_eval, 
                      trials=trials,
                      rstate= np.random.RandomState(1))
    loss = [x['result']['loss'] for x in trials.trials]
    
    best_param_values = [x for x in best_param.values()]
    
    if best_param_values[0] == 0:
        boosting_type = 'gbdt'
    else:
        boosting_type= 'dart'
   
    clf_best = lgb.LGBMClassifier(random_state = 1,
                                  feature_fraction=best_param_values[0],
                                  learning_rate=best_param_values[1],
                                  max_depth=int(best_param_values[2]),
                                  n_estimators=int(best_param_values[3]),
                                  num_leaves=int(best_param_values[4]),
                                  reg_lambda=best_param_values[5]
                                 )
    
    clf_best.fit(X_train, y_train)
    
    print("")
    print("##### Results")
    print("Score best parameters: ", min(loss)*-1)
    print("Best parameters: ", best_param)
    print("Test Score: ", clf_best.score(X_test, y_test))
    print("AUROC : ",roc_auc_score(y_test, clf_best.predict_proba(X_test)[:,1]) )
    print("Parameter combinations evaluated: ", num_eval)
    return trials

IS_EVAL = False
if(IS_EVAL):
    num_eval = 75
    results_hyperopt = hyperopt(param_hyperopt, X_train, y_train, X_test, y_test, num_eval)


75/75 [29:45<00:00, 23.81s/trial, best loss: 0.24545802722089016]

##### Results
Score best parameters:  -0.24545802722089016
Best parameters:  {'feature_fraction': 0.7847766875579623, 'learning_rate': 0.20475753529956947, 'max_depth': 8.0, 'n_estimators': 300.0, 'num_leaves': 25.0, 'reg_lambda': 0.08560393035568381}
Test Score:  0.7178898261301032
AUROC :  0.7537312406572992
Parameter combinations evaluated:  75

In [28]:
'''params = {
    #'bagging_fraction': 0.5817242323514327,
    'feature_fraction': 0.6884588361650144,
    #'learning_rate': 0.42887924851375825, 
    'learning_rate': 0.4, 
    'max_depth': 7,
    'min_child_samples': 100, 
    'min_child_weight': 0.01,
    'min_data_in_leaf': 20, 
    'n_estimators': 200,
    'n_job': 4,
    'num_leaves': 42,
    'random_state': 1,
    'reg_alpha': 5, 
    'reg_lambda': 0,
    'subsample': 0.863184719640143
}'''

params = {
    'bagging_fraction': 0.6,
    'feature_fraction': 0.7847766875579623, 
    'learning_rate': 0.45, #0.20475753529956947, 
    'max_depth': 8, 
    'n_estimators': 300,
    'num_leaves': 25,
    'reg_lambda': 0,
    'reg_alpha': 5,
    'random_state': 1,
    'n_job': 4,
    'min_child_samples': 100, 
    'min_child_weight': 0.01,
    'min_data_in_leaf': 15,
    'subsample': 0.75
}

lgbm = LGBMClassifier(**params)

In [29]:
lgbm.fit(train_df[features_bsl], y)

LGBMClassifier(bagging_fraction=0.6, feature_fraction=0.7847766875579623,
               learning_rate=0.45, max_depth=8, min_child_samples=100,
               min_child_weight=0.01, min_data_in_leaf=15, n_estimators=300,
               n_job=4, num_leaves=25, random_state=1, reg_alpha=5,
               reg_lambda=0, subsample=0.75)

In [30]:
print(roc_auc_score(y.values, lgbm.predict_proba(train_df[features_bsl])[:,1]))

0.7854608254629457


In [31]:
if(IS_EVAL):
    train_df.fillna(train_df.median(), inplace=True)
    perm = PermutationImportance(lgbm, random_state=1).fit(train_df[features_bsl], y)
    eli5.show_weights(perm, feature_names = features_bsl)

In [32]:
#displaying the most important features
if(IS_TEST):
    lgb.plot_importance(lgbm)
    plt.show()

In [33]:
# You can only call make_env() once, so don't lose it!
env = riiideducation.make_env()

In [34]:
user_sum_dict = user_agg['sum'].astype('int16').to_dict(defaultdict(int))
user_count_dict = user_agg['count'].astype('int16').to_dict(defaultdict(int))
content_sum_dict = content_agg['sum'].astype('int32').to_dict(defaultdict(int))
content_count_dict = content_agg['count'].astype('int32').to_dict(defaultdict(int))

In [35]:
prev_test_df = None

iter_test = env.iter_test()

for (test_df, sample_prediction_df) in iter_test:
    if prev_test_df is not None:
        prev_test_df['answered_correctly_01']
        
        for user_id, content_id, answered_correctly in prev_test_df[['user_id', 'content_id', 'answered_correctly_01']].values:
            state_user[user_id]['answered_correctly_user'] += answered_correctly
            state_user[user_id]['answered_user'] += 1
            state_user[user_id]['mean_user_accuracy'] = state_user[user_id]['answered_correctly_user'] / state_user[user_id]['answered_user']
        
    test_df = preprocessing_data(test_df)
    
    test_df = pd.merge(test_df,questions_df, how='left', left_on='content_id', right_on='question_id').sort_values('row_id')
    test_df = test_df.sort_values(['user_id','timestamp'], ascending=False)

    user_sum = np.zeros(len(test_df), dtype=np.int16)
    user_count = np.zeros(len(test_df), dtype=np.int16)
    user_mean = np.zeros(len(test_df), dtype=np.int16)
    content_sum = np.zeros(len(test_df), dtype=np.int32)
    content_count = np.zeros(len(test_df), dtype=np.int32)
    content_mean = np.zeros(len(test_df), dtype=np.int16)
    user_content_attempts = np.zeros(len(test_df), dtype=np.int8)
    for i, (user_id, content_id) in enumerate(zip(test_df['user_id'].values, test_df['content_id'].values)):
        
        if user_id not in state_user:
            dict_keys = ['mean_user_accuracy', 'answered_correctly_user', 'answered_user', 'user_content_attempts']
            dict_default_vals = [0.680, 0, 0, dict(zip([content_id],[0]))]
            state_user[user_id] = dict(zip(dict_keys, dict_default_vals))
            
        if content_id not in state_user[user_id]['user_content_attempts']:
            state_user[user_id]['user_content_attempts'][content_id] = 1
        else:
            state_user[user_id]['user_content_attempts'][content_id] = state_user[user_id]['user_content_attempts'][content_id] + 1
        
        user_content_attempts[i]= state_user[user_id]['user_content_attempts'][content_id]
        
        user_sum[i] = user_sum_dict[user_id]
        user_count[i] = user_count_dict[user_id]
        user_mean[i] = state_user[user_id]['mean_user_accuracy']
        
        content_sum[i] = content_sum_dict[content_id]
        content_count[i] = content_count_dict[content_id]
        if(content_count[i] > 0 ) :
            content_mean[i] = content_sum[i] / content_count[i]
        else :
            content_mean[i] = 0.65
    test_df['nb_attempt'] = user_content_attempts
    
    test_df['user_count'] = user_count
    test_df['user_sum'] = user_sum
    test_df['user_nb_mean'] = user_mean
    
    test_df['content_count'] = content_count
    test_df['content_nb_mean'] = content_sum/content_count
    test_df['answered_correctly_01'] =  lgbm.predict(test_df[features_bsl])
    test_df['answered_correctly'] =  lgbm.predict_proba(test_df[features_bsl])[:,1]
    env.predict(test_df[['row_id', 'answered_correctly']])
    prev_test_df = test_df.copy()

In [36]:
submission = pd.read_csv('./submission.csv')

In [37]:
submission.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104 entries, 0 to 103
Data columns (total 2 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   row_id              104 non-null    int64  
 1   answered_correctly  104 non-null    float64
dtypes: float64(1), int64(1)
memory usage: 1.8 KB


In [38]:
submission.head()

Unnamed: 0,row_id,answered_correctly
0,17,0.704027
1,4,0.277226
2,2,0.601066
3,8,0.353694
4,6,0.534968
