In [1]:
import ast

import pandas as pd
import lightgbm as lgb
from matplotlib import pyplot as plt

In [2]:
%%time
#pickle file taken from _____ for improved loading speed
features =  ['user_id', 'content_id', 'answered_correctly', 'prior_question_elapsed_time', 'prior_question_had_explanation']
train_df = pd.read_pickle("../input/riiid-train-data-multiple-formats/riiid_train.pkl.gzip")
train_df = train_df[features]

CPU times: user 5.29 s, sys: 7.31 s, total: 12.6 s
Wall time: 35.8 s


In [3]:
#Eliminate rows with -1 values in the target
train_df = train_df[train_df['answered_correctly'] != -1].reset_index(drop=True)
#Replace null values with FALSE
train_df.fillna(False, inplace=True)

train_df['user_id'] = train_df['user_id'].astype('int32')
train_df['content_id'] = train_df['content_id'].astype('int16')
train_df['answered_correctly'] = train_df['answered_correctly'].astype('int8')
train_df['prior_question_elapsed_time'] = train_df['prior_question_elapsed_time'].astype('float32')
train_df['prior_question_had_explanation'] = train_df['prior_question_had_explanation'].astype('bool')

In [4]:
train_df['user'] = train_df.groupby('user_id')['answered_correctly'].shift()
#Calculate ratio of correct answers of the whole answers provided by the user
cumulated = train_df.groupby('user_id')['user'].agg(['cumsum', 'cumcount'])
train_df['user_correctness'] = cumulated['cumsum'] / cumulated['cumcount']
train_df.drop(columns=['user'], inplace=True)

In [5]:
user_agg = train_df.groupby('user_id')['answered_correctly'].agg(['sum', 'count'])
content_agg = train_df.groupby('content_id')['answered_correctly'].agg(['sum', 'count'])

In [6]:
train_df = train_df.groupby('user_id').tail(60).reset_index(drop=True)

In [7]:
questions_df = pd.read_csv('../input/riiid-test-answer-prediction/questions.csv')[['question_id', 'part']]
questions_df['question_id'] = questions_df['question_id'].astype('int16')
questions_df['part'] = questions_df['part'].astype('int8')

train_df = pd.merge(train_df, questions_df, left_on='content_id', right_on='question_id', how='left')
train_df.drop(columns=['question_id'], inplace=True)

In [8]:
train_df['content_count'] = train_df['content_id'].map(content_agg['count']).astype('int32')
train_df['content_id'] = train_df['content_id'].map(content_agg['sum'] / content_agg['count'])

In [9]:
valid_df = train_df.groupby('user_id').tail(15)
train_df.drop(valid_df.index, inplace=True)

In [10]:
#Defining the features to consider after feature engineering
features = [
    'content_id',
    'prior_question_elapsed_time',
    'prior_question_had_explanation',
    'user_correctness',
    'part',
    'content_count'
]

target = 'answered_correctly'

In [11]:
#Defining LightGBM parameters
params = {
    'objective': 'binary',
    #'tree_method': 'hist
    'metric': 'auc',
    'learning_rate': 0.12,
    'max_bin': 400,
    'num_leaves': 80,
    #'bagging_fraction': .6,
    #'bagging_freq':2
}

In [12]:
#tr_data = lgb.Dataset(train_df[features], label=train_df[target])
#va_data = lgb.Dataset(valid_df[features], label=valid_df[target])

In [13]:
#Training of the model
'''
model = lgb.train(
    params, 
    tr_data, 
    num_boost_round=12000,
    valid_sets=[tr_data, va_data], 
    early_stopping_rounds=100,
    verbose_eval=50
)
'''
#If you want to save the model
# model.save_model(f'model.txt')

'\nmodel = lgb.train(\n    params, \n    tr_data, \n    num_boost_round=12000,\n    valid_sets=[tr_data, va_data], \n    early_stopping_rounds=100,\n    verbose_eval=50\n)\n'

In [14]:
from sklearn.model_selection import KFold

In [15]:
#res = pd.DataFrame()
#res['row_id'] = [i for i in range(6611282)]
#res.loc[:, ['answered_correctly']] = 0
models = []

In [16]:
K = 7
for n, (tr, te) in enumerate(KFold(n_splits=K, random_state=626, shuffle=True).split(train_df[target])):
    print(f'Fold {n}')
    
    tr_data = lgb.Dataset(train_df[features].values[tr], label=train_df[target].values[tr])
    va_data = lgb.Dataset(valid_df[features].values, label=valid_df[target].values)
    
    
    model = lgb.train(
        params, 
        tr_data, 
        num_boost_round=2500,
        valid_sets=[tr_data, va_data],        
        early_stopping_rounds=50,
        verbose_eval=50
    )

    #res.loc[te, ['answered_correctly']] = model.predict(train_df[features].values[te])
    models.append(model)

Fold 0
Training until validation scores don't improve for 50 rounds
[50]	training's auc: 0.752701	valid_1's auc: 0.741786
[100]	training's auc: 0.75342	valid_1's auc: 0.742353
[150]	training's auc: 0.753872	valid_1's auc: 0.742639
[200]	training's auc: 0.754281	valid_1's auc: 0.742787
[250]	training's auc: 0.75461	valid_1's auc: 0.742869
[300]	training's auc: 0.754972	valid_1's auc: 0.742987
[350]	training's auc: 0.755298	valid_1's auc: 0.743061
[400]	training's auc: 0.755605	valid_1's auc: 0.743122
[450]	training's auc: 0.755892	valid_1's auc: 0.743179
[500]	training's auc: 0.756193	valid_1's auc: 0.743216
[550]	training's auc: 0.756491	valid_1's auc: 0.74327
[600]	training's auc: 0.756762	valid_1's auc: 0.743306
[650]	training's auc: 0.757048	valid_1's auc: 0.743335
[700]	training's auc: 0.75728	valid_1's auc: 0.74334
[750]	training's auc: 0.757523	valid_1's auc: 0.743354
[800]	training's auc: 0.757783	valid_1's auc: 0.743368
[850]	training's auc: 0.758056	valid_1's auc: 0.743396
[90

In [17]:
import riiideducation

In [18]:
env = riiideducation.make_env()
iter_test = env.iter_test()
prior_test_df = None

In [19]:
for (test_df, sample_prediction_df) in iter_test:
    if prior_test_df is not None:
        prior_test_df[target] = eval(test_df['prior_group_answers_correct'].iloc[0])
        prior_test_df = prior_test_df[prior_test_df[target] != -1].reset_index(drop=True)
        
        user_ids = prior_test_df['user_id'].values
        content_ids = prior_test_df['content_id'].values
        targets = prior_test_df[target].values
        
        for user_id, content_id, answered_correctly in zip(user_ids, content_ids, targets):
            if user_id in user_agg.index:
                user_agg.loc[user_id, 'sum'] += answered_correctly
                user_agg.loc[user_id, 'count'] += 1
            else:
                user_agg.loc[user_id] = [answered_correctly, 1]
            
            if content_id in content_agg.index:
                content_agg.loc[content_id, 'sum'] += answered_correctly
                content_agg.loc[content_id, 'count'] += 1
            else:
                content_agg.loc[content_id] = [answered_correctly, 1]
                
    prior_test_df = test_df.copy()
    
    test_df = pd.merge(test_df, questions_df, left_on='content_id', right_on='question_id', how='left')
    
    test_df['prior_question_had_explanation'] = test_df['prior_question_had_explanation'].fillna(False).astype('bool')    
    
    test_df['user_correctness'] = test_df['user_id'].map(user_agg['sum'] / user_agg['count'])
    
    test_df['content_count'] = test_df['content_id'].map(content_agg['count']).fillna(1)
    test_df['content_id'] = test_df['content_id'].map(content_agg['sum'] / content_agg['count']).fillna(0.7)
    
    preds = [model.predict(test_df[features]) for model in models]
    
    predictions = preds[0]
    for i in range(1, K):
        predictions += preds[i]
    predictions /= K
    
    test_df['answered_correctly'] =  predictions
    
    env.predict(test_df.loc[test_df['content_type_id'] == 0, ['row_id', 'answered_correctly']])
    