In [1]:
import pandas as pd
import lightgbm as lgb
from matplotlib import pyplot as plt

In [2]:
%%time
#pickle file taken from _____ for improved loading speed
features =  ['user_id', 'content_id', 'answered_correctly', 'prior_question_elapsed_time', 'prior_question_had_explanation']
train_df = pd.read_pickle("../input/riiid-train-data-multiple-formats/riiid_train.pkl.gzip")
train_df = train_df[features]
print("Train size:", train_df.shape)
train_df

Train size: (101230332, 5)
CPU times: user 8.45 s, sys: 8.57 s, total: 17 s
Wall time: 47.1 s


Unnamed: 0,user_id,content_id,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation
0,115,5692,1,,
1,115,5716,1,37000.0,False
2,115,128,1,55000.0,False
3,115,7860,1,19000.0,False
4,115,7922,1,11000.0,False
...,...,...,...,...,...
101230327,2147482888,3586,1,18000.0,True
101230328,2147482888,6341,1,14000.0,True
101230329,2147482888,4212,1,14000.0,True
101230330,2147482888,6343,0,22000.0,True


In [3]:
#Eliminate rows with -1 values in the target
train_df = train_df[train_df['answered_correctly'] != -1].reset_index(drop=True)
#Replace null values with FALSE
train_df.fillna(False, inplace=True)

train_df['user_id'] = train_df['user_id'].astype('int32')
train_df['content_id'] = train_df['content_id'].astype('int16')
train_df['answered_correctly'] = train_df['answered_correctly'].astype('int8')
train_df['prior_question_elapsed_time'] = train_df['prior_question_elapsed_time'].astype('float32')
train_df['prior_question_had_explanation'] = train_df['prior_question_had_explanation'].astype('bool')


In [4]:
train_df['user'] = train_df.groupby('user_id')['answered_correctly'].shift()
#Calculate ratio of correct answers of the whole answers provided by the user
cumulated = train_df.groupby('user_id')['user'].agg(['cumsum', 'cumcount'])
train_df['user_correctness'] = cumulated['cumsum'] / cumulated['cumcount']
train_df.drop(columns=['user'], inplace=True)

In [5]:
train_df.head()

Unnamed: 0,user_id,content_id,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,user_correctness
0,115,5692,1,0.0,False,
1,115,5716,1,37000.0,False,1.0
2,115,128,1,55000.0,False,1.0
3,115,7860,1,19000.0,False,1.0
4,115,7922,1,11000.0,False,1.0


In [6]:
user_agg = train_df.groupby('user_id')['answered_correctly'].agg(['sum', 'count'])
content_agg = train_df.groupby('content_id')['answered_correctly'].agg(['sum', 'count'])

In [7]:
train_df = train_df.groupby('user_id').tail(50).reset_index(drop=True)

In [8]:
questions_df = pd.read_csv('../input/riiid-test-answer-prediction/questions.csv')[['question_id', 'part']]
questions_df['question_id'] = questions_df['question_id'].astype('int16')
questions_df['part'] = questions_df['part'].astype('int8')

train_df = pd.merge(train_df, questions_df, left_on='content_id', right_on='question_id', how='left')
train_df.drop(columns=['question_id'], inplace=True)

In [9]:
train_df['content_count'] = train_df['content_id'].map(content_agg['count']).astype('int32')
train_df['content_id'] = train_df['content_id'].map(content_agg['sum'] / content_agg['count'])

In [10]:
valid_df = train_df.groupby('user_id').tail(15)
train_df.drop(valid_df.index, inplace=True)

In [11]:
train_df.head() #remove later

Unnamed: 0,user_id,content_id,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,user_correctness,part,content_count
0,115,0.745495,1,0.0,False,,5,36514
1,115,0.734901,1,37000.0,False,1.0,5,23991
2,115,0.966869,1,55000.0,False,1.0,1,19619
3,115,0.954815,1,19000.0,False,1.0,1,21733
4,115,0.953218,1,11000.0,False,1.0,1,19409


In [12]:
#Defining the features to consider after feature engineering
features = [
    'content_id',
    'prior_question_elapsed_time',
    'prior_question_had_explanation',
    'user_correctness',
    'part',
    'content_count'
]

target = 'answered_correctly'

In [13]:
#Defining LightGBM parameters
params = {
    'objective': 'binary',
    #'tree_method': 'hist'
    'seed': 42,
    'metric': 'auc',
#     'learning_rate': 0.07,
    'learning_rate: 0.15'
    'max_bin': 800,
    'num_leaves': 100
}

In [14]:
tr_data = lgb.Dataset(train_df[features], label=train_df[target])
va_data = lgb.Dataset(valid_df[features], label=valid_df[target])

#Training of the model
model = lgb.train(
    params, 
    tr_data, 
    num_boost_round=12000,
    valid_sets=[tr_data, va_data], 
    early_stopping_rounds=10,
    verbose_eval=50
)
# original early_stopping_rounds 100

#If you want to save the model
# model.save_model(f'model.txt')

Training until validation scores don't improve for 10 rounds
[50]	training's auc: 0.753427	valid_1's auc: 0.741727
[100]	training's auc: 0.754306	valid_1's auc: 0.742483
[150]	training's auc: 0.75472	valid_1's auc: 0.742678
[200]	training's auc: 0.755108	valid_1's auc: 0.742813
[250]	training's auc: 0.755455	valid_1's auc: 0.742918
[300]	training's auc: 0.755791	valid_1's auc: 0.743009
[350]	training's auc: 0.756114	valid_1's auc: 0.743083
[400]	training's auc: 0.756446	valid_1's auc: 0.743143
[450]	training's auc: 0.756759	valid_1's auc: 0.743191
[500]	training's auc: 0.757063	valid_1's auc: 0.743242
[550]	training's auc: 0.757348	valid_1's auc: 0.743279
[600]	training's auc: 0.757657	valid_1's auc: 0.743312
[650]	training's auc: 0.757952	valid_1's auc: 0.743348
Early stopping, best iteration is:
[663]	training's auc: 0.758014	valid_1's auc: 0.743354


In [15]:
import riiideducation

In [16]:
env = riiideducation.make_env()
iter_test = env.iter_test()
prior_test_df = None

In [17]:
for (test_df, sample_prediction_df) in iter_test:
    if prior_test_df is not None:
        prior_test_df[target] = eval(test_df['prior_group_answers_correct'].iloc[0])
        prior_test_df = prior_test_df[prior_test_df[target] != -1].reset_index(drop=True)
        
        user_ids = prior_test_df['user_id'].values
        content_ids = prior_test_df['content_id'].values
        targets = prior_test_df[target].values
        
        for user_id, content_id, answered_correctly in zip(user_ids, content_ids, targets):
            if user_id in user_agg.index:
                user_agg.loc[user_id, 'sum'] += answered_correctly
                user_agg.loc[user_id, 'count'] += 1
            else:
                user_agg.loc[user_id] = [answered_correctly, 1]
            
            if content_id in content_agg.index:
                content_agg.loc[content_id, 'sum'] += answered_correctly
                content_agg.loc[content_id, 'count'] += 1
            else:
                content_agg.loc[content_id] = [answered_correctly, 1]
                
    prior_test_df = test_df.copy()
    
    test_df = pd.merge(test_df, questions_df, left_on='content_id', right_on='question_id', how='left')
    
    test_df['prior_question_had_explanation'] = test_df['prior_question_had_explanation'].fillna(False).astype('bool')    
    
    test_df['user_correctness'] = test_df['user_id'].map(user_agg['sum'] / user_agg['count'])
    
    test_df['content_count'] = test_df['content_id'].map(content_agg['count']).fillna(1)
    test_df['content_id'] = test_df['content_id'].map(content_agg['sum'] / content_agg['count']).fillna(0.7)
      
    test_df['answered_correctly'] = model.predict(test_df[features])
    env.predict(test_df.loc[test_df['content_type_id'] == 0, ['row_id', 'answered_correctly']])