In [1]:
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
import gc
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import optuna
from optuna.samplers import TPESampler
SEED = 0

In [2]:
types_dict = {
    'row_id': 'int64',
    'timestamp': 'int64',
    'user_id': 'int32',
    'content_id': 'int16',
    'answered_correctly': 'int8',
    'prior_question_elapsed_time': 'float16',
    'prior_question_had_explanation': 'boolean'
}
train_df = pd.read_csv(
    'data/train.csv',
    usecols=types_dict.keys(),
    dtype=types_dict,
    index_col=0
)
question_df = pd.read_csv('data/questions.csv')
lecture_df = pd.read_csv('data/lectures.csv')

In [4]:
features_df = train_df.iloc[:int(9 /10 * len(train_df))]
train_df = train_df.iloc[int(9 /10 * len(train_df)):]
print(features_df.shape, train_df.shape)

(9110730, 6) (1012304, 6)


In [5]:
train_questions_only_df = features_df[features_df['answered_correctly']!=-1]
train_lectures_only_df = features_df[features_df['answered_correctly']==-1]

In [7]:
new_column = list(question_df.columns)
new_column[0] = 'content_id'
question_df.columns = new_column
question_df = question_df[['content_id', 'part']]
new_column = list(lecture_df.columns)
new_column[0] = 'content_id'
lecture_df.columns = new_column
lecture_df = lecture_df[['content_id', 'part']]

In [8]:
train_questions_only_df = train_questions_only_df.merge(question_df, how='left', on='content_id')
train_lectures_only_df = train_lectures_only_df.merge(lecture_df, how='left', on='content_id')

In [9]:
train_questions_only_df.head()

Unnamed: 0,timestamp,user_id,content_id,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,part
0,8231964660,1933715875,11259,0,13000.0,True,5
1,8232002976,1933715875,4957,1,44000.0,True,5
2,8232096407,1933715875,5113,1,22000.0,True,5
3,8232119872,1933715875,4699,1,inf,True,5
4,8232142930,1933715875,11430,1,9000.0,True,5


In [11]:
train_lectures_only_df.head()

Unnamed: 0,timestamp,user_id,content_id,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,part
0,358247540,1933736600,18545,-1,,False,5
1,363108158,1933736600,21411,-1,,False,2
2,404110757,1933736600,18186,-1,,False,4
3,419176392,1933736600,3852,-1,,False,4
4,447039061,1933736600,10688,-1,,False,7


In [16]:
grouped_by_user_df = train_questions_only_df.groupby('user_id')
user_answers_df = grouped_by_user_df.agg({'answered_correctly': ['mean', 'count', 'std', 'median', 'skew']}).copy()
user_answers_df.columns = ['mean_user_accuracy', 'questions_answered', 'std_user_accuracy', 'median_user_accuracy', 'skew_user_accuracy']
user_question_part_df = grouped_by_user_df.agg({'part': ['mean', 'count', 'std']}).copy()
user_question_part_df.columns = ['mean_user_question_part', 'count_question_part', 'std_question_part']

grouped_by_content_df = train_questions_only_df.groupby('content_id')
content_answers_df = grouped_by_content_df.agg({'answered_correctly': ['mean', 'count', 'std', 'median', 'skew'] }).copy()
content_answers_df.columns = ['mean_accuracy', 'question_asked', 'std_accuracy', 'median_accuracy', 'skew_accuracy']

grouped_by_part_df = train_questions_only_df.groupby('part')
part_answers_df = grouped_by_part_df.agg({'answered_correctly': ['mean', 'count', 'std', 'median', 'skew'] }).copy()
part_answers_df.columns = ['mean_part_accuracy', 'question_part_answered', 'std_part_accuracy', 'median_part_accuracy', 'skew_part_accuracy']

grouped_by_user_df = train_lectures_only_df.groupby('user_id')
user_lecture_part_df = grouped_by_user_df.agg({'part': ['mean', 'count', 'std']}).copy()
user_lecture_part_df.columns = ['mean_user_lecture_part', 'count_lecture_part', 'std_lecture_part']

In [17]:
user_answers_df.head()

Unnamed: 0_level_0,mean_user_accuracy,questions_answered,std_user_accuracy,median_user_accuracy,skew_user_accuracy
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1933715875,0.742857,665,0.437388,1.0,-1.113839
1933716138,0.428571,42,0.50087,0.0,0.299479
1933722593,0.571429,7,0.534522,1.0,-0.374166
1933732537,0.421053,19,0.507257,0.0,0.347892
1933736600,0.732727,1100,0.442737,1.0,-1.053226


In [18]:
user_question_part_df.head()

Unnamed: 0_level_0,mean_user_question_part,count_question_part,std_question_part
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1933715875,4.915789,665,0.79464
1933716138,4.095238,42,1.461865
1933722593,5.0,7,0.0
1933732537,3.421053,19,1.304513
1933736600,4.709091,1100,1.400932


In [19]:
content_answers_df.head()

Unnamed: 0_level_0,mean_accuracy,question_asked,std_accuracy,median_accuracy,skew_accuracy
content_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.898858,613,0.301763,1.0,-2.652174
1,0.876755,641,0.328975,1.0,-2.297648
2,0.553811,3949,0.497159,1.0,-0.216584
3,0.783056,2042,0.412265,1.0,-1.37452
4,0.60682,2874,0.488541,1.0,-0.437605


In [20]:
part_answers_df.head()

Unnamed: 0_level_0,mean_part_accuracy,question_part_answered,std_part_accuracy,median_part_accuracy,skew_part_accuracy
part,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0.741056,659491,0.438055,1,-1.100579
2,0.707604,1675773,0.454864,1,-0.912818
3,0.694886,779845,0.460456,1,-0.846491
4,0.627016,714779,0.483598,1,-0.525298
5,0.610643,3692143,0.487605,1,-0.453823


In [21]:
user_lecture_part_df.head()

Unnamed: 0_level_0,mean_user_lecture_part,count_lecture_part,std_lecture_part
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1933736600,4.931818,44,1.387615
1933758283,4.388889,18,1.719743
1933782835,3.666667,6,2.065591
1933797058,1.0,1,
1933804373,3.0,2,2.828427


In [22]:
del features_df
del grouped_by_user_df
del grouped_by_content_df
del grouped_by_part_df

gc.collect()

46846

In [23]:
features = [
    'mean_user_accuracy', 
    'questions_answered',
    'std_user_accuracy', 
    'median_user_accuracy',
    'skew_user_accuracy',
    'mean_accuracy', 
    'question_asked',
    'std_accuracy', 
    'median_accuracy',
    'skew_accuracy',
    'mean_part_accuracy',
    'question_part_answered',
    'std_part_accuracy',
    'median_part_accuracy',
    'skew_part_accuracy',
    'mean_user_question_part',
    'count_question_part',
    'std_question_part',
    'mean_user_lecture_part',
    'count_lecture_part',
    'std_lecture_part',
    'prior_question_elapsed_time', 
    'prior_question_had_explanation'
]
target = 'answered_correctly'
train_df = train_df[train_df[target] != -1]

In [24]:
train_df = train_df.merge(question_df, how='left', on='content_id')
train_df = train_df.merge(user_answers_df, how='left', on='user_id')
train_df = train_df.merge(user_question_part_df, how='left', on='user_id')
train_df = train_df.merge(content_answers_df, how='left', on='content_id')
train_df = train_df.merge(part_answers_df, how='left', on='part')
train_df = train_df.merge(user_lecture_part_df, how='left', on='user_id')

In [25]:
train_df.head()

Unnamed: 0,timestamp,user_id,content_id,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,part,mean_user_accuracy,questions_answered,std_user_accuracy,...,median_accuracy,skew_accuracy,mean_part_accuracy,question_part_answered,std_part_accuracy,median_part_accuracy,skew_part_accuracy,mean_user_lecture_part,count_lecture_part,std_lecture_part
0,19868244269,2127040843,1172,0,13000.0,True,2,0.702234,3805.0,0.457336,...,1.0,-1.87255,0.707604,1675773,0.454864,1,-0.912818,4.272727,110.0,1.305662
1,19868267527,2127040843,1273,1,16992.0,True,2,0.702234,3805.0,0.457336,...,1.0,-1.003488,0.707604,1675773,0.454864,1,-0.912818,4.272727,110.0,1.305662
2,19868474994,2127040843,1379,1,16000.0,True,2,0.702234,3805.0,0.457336,...,1.0,-1.357657,0.707604,1675773,0.454864,1,-0.912818,4.272727,110.0,1.305662
3,19868496155,2127040843,883,1,13000.0,True,2,0.702234,3805.0,0.457336,...,1.0,-0.56077,0.707604,1675773,0.454864,1,-0.912818,4.272727,110.0,1.305662
4,19868522403,2127040843,429,0,15000.0,True,2,0.702234,3805.0,0.457336,...,1.0,-0.987585,0.707604,1675773,0.454864,1,-0.912818,4.272727,110.0,1.305662


In [26]:
train_df['prior_question_had_explanation'] = train_df['prior_question_had_explanation'].fillna(value = False).astype(bool)
train_df = train_df.fillna(value = -1)
train_df = train_df[features + [target]]
train_df = train_df.replace([np.inf, -np.inf], np.nan)
train_df = train_df.fillna(0)

In [27]:
train_df.head()

Unnamed: 0,mean_user_accuracy,questions_answered,std_user_accuracy,median_user_accuracy,skew_user_accuracy,mean_accuracy,question_asked,std_accuracy,median_accuracy,skew_accuracy,...,skew_part_accuracy,mean_user_question_part,count_question_part,std_question_part,mean_user_lecture_part,count_lecture_part,std_lecture_part,prior_question_elapsed_time,prior_question_had_explanation,answered_correctly
0,0.702234,3805.0,0.457336,1.0,-0.884864,0.841512,1243,0.365345,1.0,-1.87255,...,-0.912818,3.748752,3805.0,1.505144,4.272727,110.0,1.305662,13000.0,True,0
1,0.702234,3805.0,0.457336,1.0,-0.884864,0.723973,1047,0.447244,1.0,-1.003488,...,-0.912818,3.748752,3805.0,1.505144,4.272727,110.0,1.305662,16992.0,True,1
2,0.702234,3805.0,0.457336,1.0,-0.884864,0.78042,715,0.414252,1.0,-1.357657,...,-0.912818,3.748752,3805.0,1.505144,4.272727,110.0,1.305662,16000.0,True,1
3,0.702234,3805.0,0.457336,1.0,-0.884864,0.634783,920,0.481753,1.0,-0.56077,...,-0.912818,3.748752,3805.0,1.505144,4.272727,110.0,1.305662,13000.0,True,1
4,0.702234,3805.0,0.457336,1.0,-0.884864,0.721127,1065,0.448656,1.0,-0.987585,...,-0.912818,3.748752,3805.0,1.505144,4.272727,110.0,1.305662,15000.0,True,0


In [28]:
train_df, test_df = train_test_split(train_df, random_state=SEED, test_size=0.2)

In [29]:
sampler = TPESampler(seed=666)

def create_model(trial):
    num_leaves = trial.suggest_int("num_leaves", 2, 31)
    n_estimators = trial.suggest_int("n_estimators", 50, 300)
    max_depth = trial.suggest_int('max_depth', 3, 8)
    min_child_samples = trial.suggest_int('min_child_samples', 100, 1200)
    learning_rate = trial.suggest_uniform('learning_rate', 0.0001, 0.99)
    min_data_in_leaf = trial.suggest_int('min_data_in_leaf', 5, 90)
    bagging_fraction = trial.suggest_uniform('bagging_fraction', 0.0001, 1.0)
    feature_fraction = trial.suggest_uniform('feature_fraction', 0.0001, 1.0)
    model = LGBMClassifier(
        num_leaves=num_leaves,
        n_estimators=n_estimators, 
        max_depth=max_depth, 
        min_child_samples=min_child_samples, 
        min_data_in_leaf=min_data_in_leaf,
        learning_rate=learning_rate,
        feature_fraction=feature_fraction,
        random_state=SEED
    )
    return model

def objective(trial):
    model = create_model(trial)
    model.fit(train_df[features], train_df[target])
    score = roc_auc_score(test_df[target].values, model.predict_proba(test_df[features])[:,1])
    return score

# uncomment to use optuna
study = optuna.create_study(direction="maximize", sampler=sampler)
study.optimize(objective, n_trials=70)

params = study.best_params
params['random_state'] = SEED
# params = {
#     bagging_fraction=0.7463373281938641,
#     feature_fraction=0.820669839991336,
#     learning_rate=0.09355696690242987, 
#     max_depth=8,
#     min_child_samples=221, 
#     min_data_in_leaf=38, 
#     n_estimators=223,
#     num_leaves=18, 
#     random_state=0
# }
model = LGBMClassifier(**params)
model.fit(train_df[features], train_df[target])

1413156, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.7322400881413156
[32m[I 2020-10-18 21:09:37,697][0m Trial 40 finished with value: 0.7199881269516397 and parameters: {'num_leaves': 22, 'n_estimators': 192, 'max_depth': 7, 'min_child_samples': 162, 'learning_rate': 0.6071687887764963, 'min_data_in_leaf': 40, 'bagging_fraction': 0.8893285835948712, 'feature_fraction': 0.7322400881413156}. Best is trial 36 with value: 0.7235875612146813.[0m
[32m[I 2020-10-18 21:09:41,001][0m Trial 41 finished with value: 0.7235228042989454 and parameters: {'num_leaves': 15, 'n_estimators': 182, 'max_depth': 8, 'min_child_samples': 283, 'learning_rate': 0.07661416200532031, 'min_data_in_leaf': 40, 'bagging_fraction': 0.9330239589037268, 'feature_fraction': 0.7610912808728623}. Best is trial 36 with value: 0.7235875612146813.[0m
[32m[I 2020-10-18 21:09:44,587][0m Trial 42 finished with value: 0.72364968433117 and parameters: {'num_leaves': 18, 'n_estimators': 223, 'ma

LGBMClassifier(bagging_fraction=0.7463373281938641,
               feature_fraction=0.820669839991336,
               learning_rate=0.09355696690242987, max_depth=8,
               min_child_samples=221, min_data_in_leaf=38, n_estimators=223,
               num_leaves=18, random_state=0)

In [30]:
print('LGB score: ', roc_auc_score(test_df[target].values, model.predict_proba(test_df[features])[:,1]))

LGB score:  0.72364968433117


In [None]:
'''
import riiideducation
env = riiideducation.make_env()
iter_test = env.iter_test()
for (test_df, sample_prediction_df) in iter_test:
    test_df = test_df.merge(question_df, how='left', on='content_id')
    test_df = test_df.merge(user_answers_df, how = 'left', on = 'user_id')
    test_df = test_df.merge(user_question_part_df, how='left', on='user_id')
    test_df = test_df.merge(content_answers_df, how = 'left', on = 'content_id')
    test_df = test_df.merge(part_answers_df, how='left', on='part')
    test_df = test_df.merge(user_lecture_part_df, how='left', on='user_id')
    test_df['prior_question_had_explanation'] = test_df['prior_question_had_explanation'].fillna(value = False).astype(bool)
    test_df.fillna(value = -1, inplace = True)

    test_df['answered_correctly'] = model.predict_proba(test_df[features])[:,1]
    env.predict(test_df.loc[test_df['content_type_id'] == 0, ['row_id', 'answered_correctly']])
'''