In [1]:
import pandas as pd, numpy as np, gc
from sklearn.model_selection import KFold, GroupKFold
from xgboost import XGBClassifier
from sklearn.metrics import f1_score

In [3]:
# Read the training event log from disk, report its shape and preview the first rows.
train = pd.read_csv(filepath_or_buffer='train.csv')
print('Train size of first piece:', train.shape)
train.head(n=5)

Train size of first piece: (26296946, 20)


Unnamed: 0,session_id,index,elapsed_time,event_name,name,level,page,room_coor_x,room_coor_y,screen_coor_x,screen_coor_y,hover_duration,text,fqid,room_fqid,text_fqid,fullscreen,hq,music,level_group
0,20090312431273200,0,0,cutscene_click,basic,0,,-413.991405,-159.314686,380.0,494.0,,undefined,intro,tunic.historicalsociety.closet,tunic.historicalsociety.closet.intro,0,0,1,0-4
1,20090312431273200,1,1323,person_click,basic,0,,-413.991405,-159.314686,380.0,494.0,,"Whatcha doing over there, Jo?",gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,0,0,1,0-4
2,20090312431273200,2,831,person_click,basic,0,,-413.991405,-159.314686,380.0,494.0,,Just talking to Teddy.,gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,0,0,1,0-4
3,20090312431273200,3,1147,person_click,basic,0,,-413.991405,-159.314686,380.0,494.0,,I gotta run to my meeting!,gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,0,0,1,0-4
4,20090312431273200,4,1863,person_click,basic,0,,-412.991405,-159.314686,381.0,494.0,,"Can I come, Gramps?",gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,0,0,1,0-4


In [4]:
# Load the labels. Each `session_id` value looks like "<session>_q<question>",
# so split it on "_" and keep the numeric session and question number as columns.
targets = pd.read_csv('train_labels.csv')
id_parts = targets['session_id'].str.split('_')
# First piece is the session; explicit int64 because the ids do not fit in 32 bits.
targets['session'] = id_parts.str[0].astype('int64')
# Last piece is "q<number>"; drop the leading character before converting.
targets['q'] = id_parts.str[-1].str[1:].astype('int64')
print(targets.shape)
targets.head()

(424116, 4)


Unnamed: 0,session_id,correct,session,q
0,20090312431273200_q1,1,20090312431273200,1
1,20090312433251036_q1,0,20090312433251036,1
2,20090312455206810_q1,1,20090312455206810,1
3,20090313091715820_q1,0,20090313091715820,1
4,20090313571836404_q1,1,20090313571836404,1


In [5]:
# Column groups consumed by the feature-engineering function.

# Columns summarised by their number of distinct values.
CATS = [
    'event_name',
    'fqid',
    'room_fqid',
    'text',
]

# Numeric columns summarised by mean and standard deviation.
NUMS = [
    'elapsed_time',
    'level',
    'page',
    'room_coor_x',
    'room_coor_y',
    'screen_coor_x',
    'screen_coor_y',
    'hover_duration',
]

# `event_name` values that are counted individually.
# Idea taken from: https://www.kaggle.com/code/kimtaehun/lightgbm-baseline-with-aggregated-log-data
EVENTS = [
    'navigate_click',
    'person_click',
    'cutscene_click',
    'object_click',
    'map_hover',
    'notification_click',
    'map_click',
    'observation_click',
    'checkpoint',
]

In [6]:
# got the idea and code for a feature engineering function from here: https://www.kaggle.com/code/cdeotte/xgboost-baseline-0-680
def feature_engineer(train, cats=None, nums=None, events=None):
    """Aggregate an event log into one feature row per (session_id, level_group).

    The input frame is NOT modified: event indicator columns are built on a
    separate frame instead of being written into `train`.

    Parameters
    ----------
    train : pandas.DataFrame
        Event log containing 'session_id', 'level_group', 'event_name',
        'elapsed_time' and every column named in `cats` / `nums`.
    cats : list of str, optional
        Columns summarised as '<col>_nunique'. Defaults to the module-level CATS.
    nums : list of str, optional
        Columns summarised as '<col>_mean' and '<col>_std'. Defaults to NUMS.
    events : list of str, optional
        `event_name` values counted as '<event>_sum'. Defaults to EVENTS.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'session_id', with a 'level_group' column followed by the
        feature columns in the order nunique, mean, std, event sums and
        finally 'elapsed_time_sum'. Missing values (for example the std of a
        single-row group) are replaced by -1.
    """
    cats = CATS if cats is None else list(cats)
    nums = NUMS if nums is None else list(nums)
    events = EVENTS if events is None else list(events)

    keys = ['session_id', 'level_group']
    # Build the grouping once and reuse it for every aggregation.
    grouped = train.groupby(keys)

    dfs = []
    for c in cats:
        dfs.append(grouped[c].nunique().rename(c + '_nunique'))
    for c in nums:
        dfs.append(grouped[c].mean().rename(c + '_mean'))
    for c in nums:
        dfs.append(grouped[c].std().rename(c + '_std'))

    # 0/1 indicator per event type, kept on a scratch frame so the caller's
    # DataFrame does not silently gain extra columns.
    event_flags = train[keys].copy()
    for c in events:
        event_flags[c] = (train['event_name'] == c).astype('int8')
    flag_groups = event_flags.groupby(keys)
    for c in events:
        dfs.append(flag_groups[c].sum().rename(c + '_sum'))
    dfs.append(grouped['elapsed_time'].sum().rename('elapsed_time_sum'))

    df = pd.concat(dfs, axis=1)
    df = df.fillna(-1)
    # Keep level_group as a regular column; only session_id stays in the index.
    df = df.reset_index()
    df = df.set_index('session_id')
    return df

In [12]:
# Aggregate the training log into per-(session_id, level_group) feature rows.
train_final = feature_engineer(train=train)
train_final

Unnamed: 0_level_0,level_group,event_name_nunique,fqid_nunique,room_fqid_nunique,text_nunique,elapsed_time_mean,level_mean,page_mean,room_coor_x_mean,room_coor_y_mean,...,navigate_click_sum,person_click_sum,cutscene_click_sum,object_click_sum,map_hover_sum,notification_click_sum,map_click_sum,observation_click_sum,checkpoint_sum,elapsed_time_sum
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
20090312431273200,0-4,10,30,7,56,8.579356e+04,1.945455,-1.000000,7.701275,-71.413749,...,81,22,28,11,4,8,2,4,1,14155937
20090312431273200,13-22,10,49,12,168,1.040601e+06,17.402381,-1.000000,-130.347170,-162.004310,...,170,123,60,20,14,10,6,3,1,437052322
20090312431273200,5-12,10,39,11,124,3.572052e+05,8.054054,-1.000000,14.306062,-57.269322,...,103,104,12,28,9,9,8,1,1,105732736
20090312433251036,0-4,11,22,6,49,9.763342e+04,1.870504,0.000000,-84.045960,-53.671082,...,49,18,36,15,3,5,3,2,1,13571045
20090312433251036,13-22,11,73,16,183,2.498852e+06,17.762529,5.100000,-30.762282,-142.861892,...,637,145,65,83,186,14,45,5,1,3241011333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22100219442786200,13-22,11,49,12,166,9.619192e+05,17.671395,5.230769,-158.599129,-257.988796,...,181,101,54,15,13,8,8,5,1,406891805
22100219442786200,5-12,11,41,11,116,3.866058e+05,8.111511,1.833333,-2.569202,-96.365247,...,85,95,11,23,10,9,7,1,1,107476420
22100221145014656,0-4,11,27,7,64,2.036104e+05,2.061611,0.333333,-1.339605,-39.749052,...,92,27,31,27,2,9,2,5,1,42961793
22100221145014656,13-22,11,54,13,205,4.899580e+06,18.127632,5.181818,-57.838513,-167.203933,...,363,139,76,48,65,6,16,4,1,3723681179


In [9]:
#model building
# XGBoost names its log-loss metric 'logloss'; 'log-loss' is not a recognised
# metric name and would be rejected once the model is fitted.
clf = XGBClassifier(eval_metric='logloss')

In [10]:
from sklearn.model_selection import GridSearchCV
# Candidate hyper-parameter values for the grid search.
param_grid = dict(
    max_depth=[4, 5],
    n_estimators=[500, 600, 700],
    learning_rate=[0.01, 0.015],
)

In [19]:
# Model inputs: every aggregated column except the level_group label column.
FEATURES = [col for col in train_final.columns if col != 'level_group']
print('We will train with', len(FEATURES), 'features')

# Distinct index values of train_final (the index is session_id).
ALL_USERS = train_final.index.unique()
print('We will train with', len(ALL_USERS), 'users info')

We will train with 30 features
We will train with 23562 users info


In [22]:
# The feature table has one row per (session, level_group) while the labels have one
# row per (session, question), so the two cannot be handed to a single fit call as-is.
# The per-question training loop below is therefore adapted from:
# https://www.kaggle.com/code/cdeotte/xgboost-baseline-0-680

# Abandoned grid-search attempt, kept for reference:
#gs = GridSearchCV(clf, param_grid = param_grid)
#gs.fit(train_final, targets)

gkf = GroupKFold(n_splits=5)
# Out-of-fold predicted probabilities: one row per session, one column per question.
oof = pd.DataFrame(data=np.zeros((len(ALL_USERS),18)), index=ALL_USERS)
# Keyed by '<level_group>_<question>'; each fold overwrites the previous fold's entry,
# so after the loop this holds the models from the last fold only.
models = {}

# Hyper-parameters are the same for every fold and question, so build them once.
xgb_params = {
    'objective' : 'binary:logistic',
    'eval_metric':'logloss',
    'learning_rate': 0.05,
    'max_depth': 4,
    'n_estimators': 1000,
    'early_stopping_rounds': 50,
    'tree_method':'hist',
    'subsample':0.8,
    'colsample_bytree': 0.4,
    'use_label_encoder' : False}

# Labels of each question indexed by session, built once instead of being
# re-filtered and re-indexed twice for every (fold, question) pair.
labels_by_question = {
    t: targets.loc[targets.q == t].set_index('session') for t in range(1, 19)
}

# COMPUTE CV SCORE WITH 5 GROUP K FOLD
# Grouping by session id keeps all rows of a session in the same fold.
for i, (train_index, test_index) in enumerate(gkf.split(X=train_final, groups=train_final.index)):
    print('#'*25)
    print('### Fold',i+1)
    print('#'*25)

    # Fold slices do not depend on the question, so take them once per fold.
    fold_train = train_final.iloc[train_index]
    fold_valid = train_final.iloc[test_index]

    # ITERATE THRU QUESTIONS 1 THRU 18
    for t in range(1,19):

        # USE THIS TRAIN DATA WITH THESE QUESTIONS
        if t<=3: grp = '0-4'
        elif t<=13: grp = '5-12'
        else: grp = '13-22'

        labels = labels_by_question[t]

        # TRAIN DATA
        train_x = fold_train.loc[fold_train.level_group == grp]
        train_users = train_x.index.values
        train_y = labels.loc[train_users]

        # VALID DATA
        valid_x = fold_valid.loc[fold_valid.level_group == grp]
        valid_users = valid_x.index.values
        valid_y = labels.loc[valid_users]

        # TRAIN MODEL
        clf =  XGBClassifier(**xgb_params)
        clf.fit(train_x[FEATURES].astype('float32'), train_y['correct'],
                eval_set=[ (valid_x[FEATURES].astype('float32'), valid_y['correct']) ],
                verbose=0)
        # best_iteration is zero-based, so +1 gives the number of trees kept by early
        # stopping; unlike best_ntree_limit it is also available on XGBoost 2.x.
        print(f'{t}({clf.best_iteration + 1}), ',end='')

        # SAVE MODEL, PREDICT VALID OOF
        models[f'{grp}_{t}'] = clf
        oof.loc[valid_users, t-1] = clf.predict_proba(valid_x[FEATURES].astype('float32'))[:,1]

    print()

#########################
### Fold 1
#########################




1(151), 



2(143), 



3(158), 



4(163), 



5(133), 



6(154), 



7(92), 



8(77), 



9(130), 



10(215), 



11(121), 



12(132), 



13(140), 



14(264), 



15(198), 



16(66), 



17(64), 



18(148), 
#########################
### Fold 2
#########################




1(193), 



2(247), 



3(108), 



4(187), 



5(142), 



6(141), 



7(116), 



8(49), 



9(105), 



10(158), 



11(127), 



12(143), 



13(163), 



14(233), 



15(241), 



16(145), 



17(68), 



18(158), 
#########################
### Fold 3
#########################




1(163), 



2(141), 



3(128), 



4(220), 



5(151), 



6(110), 



7(96), 



8(52), 



9(124), 



10(137), 



11(91), 



12(75), 



13(118), 



14(148), 



15(355), 



16(105), 



17(166), 



18(151), 
#########################
### Fold 4
#########################




1(237), 



2(138), 



3(115), 



4(152), 



5(156), 



6(134), 



7(103), 



8(54), 



9(185), 



10(107), 



11(116), 



12(99), 



13(186), 



14(270), 



15(234), 



16(83), 



17(79), 



18(102), 
#########################
### Fold 5
#########################




1(193), 



2(133), 



3(136), 



4(252), 



5(130), 



6(131), 



7(112), 



8(67), 



9(148), 



10(123), 



11(82), 



12(79), 



13(226), 



14(136), 



15(192), 



16(57), 



17(126), 



18(223), 


In [27]:
# Read the test event log from disk and display it.
test = pd.read_csv(filepath_or_buffer='test.csv')
test

Unnamed: 0,session_id,index,elapsed_time,event_name,name,level,page,room_coor_x,room_coor_y,screen_coor_x,...,hover_duration,text,fqid,room_fqid,text_fqid,fullscreen,hq,music,level_group,session_level
0,20090109393214576,0,0,cutscene_click,basic,0,,-413.991405,75.685314,380.0,...,,undefined,intro,tunic.historicalsociety.closet,tunic.historicalsociety.closet.intro,0,0,1,0-4,0
1,20090109393214576,1,1965,person_click,basic,0,,-105.991405,-63.314686,688.0,...,,"Whatcha doing over there, Jo?",gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,0,0,1,0-4,0
2,20090109393214576,2,3614,person_click,basic,0,,-418.991405,47.685314,375.0,...,,Just talking to Teddy.,gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,0,0,1,0-4,0
3,20090109393214576,3,5330,person_click,basic,0,,-110.991405,-57.314686,683.0,...,,I gotta run to my meeting!,gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,0,0,1,0-4,0
4,20090109393214576,4,6397,person_click,basic,0,,-110.991405,-57.314686,683.0,...,,"Can I come, Gramps?",gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,0,0,1,0-4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3723,20090312331414616,1001,1581679,map_hover,basic,22,,,,,...,484.0,,tunic.wildlife,tunic.historicalsociety.entry,,0,0,1,13-22,8
3724,20090312331414616,1002,1583044,map_hover,basic,22,,,,,...,783.0,,tunic.capitol_2,tunic.historicalsociety.entry,,0,0,1,13-22,8
3725,20090312331414616,1003,1583410,map_click,undefined,22,,483.726363,-3.880047,456.0,...,,,tunic.capitol_2,tunic.historicalsociety.entry,,0,0,1,13-22,8
3726,20090312331414616,1004,1585841,navigate_click,undefined,22,,192.372139,38.216178,383.0,...,,,chap4_finale_c,tunic.capitol_2.hall,,0,0,1,13-22,8


In [29]:
# Apply the same aggregation used for the training data to the test log.
test_final = feature_engineer(train=test)
test_final

Unnamed: 0_level_0,level_group,event_name_nunique,fqid_nunique,room_fqid_nunique,text_nunique,elapsed_time_mean,level_mean,page_mean,room_coor_x_mean,room_coor_y_mean,...,navigate_click_sum,person_click_sum,cutscene_click_sum,object_click_sum,map_hover_sum,notification_click_sum,map_click_sum,observation_click_sum,checkpoint_sum,elapsed_time_sum
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
20090109393214576,0-4,11,24,6,53,117119.8,2.05,0.5,39.449107,-64.625451,...,62,21,27,9,2,5,3,3,1,16396773
20090109393214576,13-22,11,52,12,202,6165666.0,17.918089,4.84375,-124.577142,-220.648267,...,240,137,78,40,20,10,7,4,1,3613080500
20090109393214576,5-12,11,43,12,140,2092368.0,7.686567,1.888889,48.26039,-43.350532,...,130,127,11,175,17,9,6,2,1,1121509230
20090312143683264,0-4,11,27,7,58,142418.3,1.803681,0.0,91.577137,-112.486741,...,62,18,33,22,2,9,2,4,1,23214178
20090312143683264,13-22,11,55,15,172,2195108.0,17.335626,4.833333,-21.133174,-198.835547,...,414,113,55,37,31,8,12,5,1,1595843559
20090312143683264,5-12,11,56,13,127,791150.4,8.445172,1.565217,42.823079,-78.412808,...,276,102,16,92,22,9,13,8,1,483392868
20090312331414616,0-4,11,23,6,53,112832.8,1.861538,0.0,67.333464,-61.57197,...,41,19,30,10,4,6,2,1,1,14668268
20090312331414616,13-22,11,62,13,166,1262481.0,18.332689,5.545455,-93.299934,-156.284179,...,202,102,55,50,26,11,9,6,1,652702482
20090312331414616,5-12,11,46,11,117,558520.5,8.459119,2.25,8.512538,-70.478232,...,92,97,11,36,25,9,7,1,1,177609534


In [36]:
# Feature columns of the test table: everything except the level_group label column.
FEATURES_TEST = [col for col in test_final.columns if col != 'level_group']
FEATURES_TEST

['event_name_nunique',
 'fqid_nunique',
 'room_fqid_nunique',
 'text_nunique',
 'elapsed_time_mean',
 'level_mean',
 'page_mean',
 'room_coor_x_mean',
 'room_coor_y_mean',
 'screen_coor_x_mean',
 'screen_coor_y_mean',
 'hover_duration_mean',
 'elapsed_time_std',
 'level_std',
 'page_std',
 'room_coor_x_std',
 'room_coor_y_std',
 'screen_coor_x_std',
 'screen_coor_y_std',
 'hover_duration_std',
 'navigate_click_sum',
 'person_click_sum',
 'cutscene_click_sum',
 'object_click_sum',
 'map_hover_sum',
 'notification_click_sum',
 'map_click_sum',
 'observation_click_sum',
 'checkpoint_sum',
 'elapsed_time_sum']

In [39]:
# NOTE(review): `clf` is whichever classifier was assigned last by the training loop,
# and it is applied here to every row of test_final regardless of level_group.
# Confirm whether a single model for all rows is really intended.
test_features = test_final.loc[:, FEATURES_TEST]
predictions = clf.predict(test_features)
predictions

array([1, 1, 1, 1, 1, 1, 1, 1, 1])

In [41]:
# Build the submission with one prediction per (session, question).
# Each question is scored by the model trained for that question, using only the
# test rows of the level_group that question belongs to. Applying a single model
# to every (session, level_group) row yields neither per-question answers nor
# predictions from the matching model.
# `models` holds the classifiers stored by the training loop under '<level_group>_<question>'.
submission_parts = []
for t in range(1, 19):
    # Same question -> level_group mapping as used during training.
    if t <= 3: grp = '0-4'
    elif t <= 13: grp = '5-12'
    else: grp = '13-22'

    test_rows = test_final.loc[test_final.level_group == grp]
    if test_rows.empty:
        # No test session has reached this level group; nothing to predict.
        continue

    question_preds = models[f'{grp}_{t}'].predict(test_rows[FEATURES_TEST].astype('float32'))
    submission_parts.append(pd.DataFrame({
        # Same "<session>_q<question>" id format as the session_id column of the labels file.
        "Id": [f'{session}_q{t}' for session in test_rows.index],
        "Predictions": question_preds,
    }))

if submission_parts:
    output = pd.concat(submission_parts, ignore_index=True)
else:
    output = pd.DataFrame(columns=["Id", "Predictions"])
output.to_csv('submission.csv', index=False)