In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
import lightgbm as lgb

In [3]:
path=Path('/kaggle/data_science_bowl')
path

PosixPath('/kaggle/data_science_bowl')

### Read Data

In [4]:
def read_data():
    train_df = pd.read_csv(path/'train.csv')
    test_df = pd.read_csv(path/'test.csv')
    train_labels_df = pd.read_csv(path/'train_labels.csv')
    specs_df = pd.read_csv(path/'specs.csv')
    return train_df, test_df, train_labels_df, specs_df

In [5]:
train_df, test_df, train_labels_df, specs_df = read_data()

In [6]:
train_df.shape

(11341042, 11)

In [7]:
train_df.head()

Unnamed: 0,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title,type,world
0,27253bdc,45bb1e1b6b50c07b,2019-09-06T17:53:46.937Z,"{""event_code"": 2000, ""event_count"": 1}",0001e90f,1,2000,0,Welcome to Lost Lagoon!,Clip,NONE
1,27253bdc,17eeb7f223665f53,2019-09-06T17:54:17.519Z,"{""event_code"": 2000, ""event_count"": 1}",0001e90f,1,2000,0,Magma Peak - Level 1,Clip,MAGMAPEAK
2,77261ab5,0848ef14a8dc6892,2019-09-06T17:54:56.302Z,"{""version"":""1.0"",""event_count"":1,""game_time"":0...",0001e90f,1,2000,0,Sandcastle Builder (Activity),Activity,MAGMAPEAK
3,b2dba42b,0848ef14a8dc6892,2019-09-06T17:54:56.387Z,"{""description"":""Let's build a sandcastle! Firs...",0001e90f,2,3010,53,Sandcastle Builder (Activity),Activity,MAGMAPEAK
4,1bb5fbdb,0848ef14a8dc6892,2019-09-06T17:55:03.253Z,"{""description"":""Let's build a sandcastle! Firs...",0001e90f,3,3110,6972,Sandcastle Builder (Activity),Activity,MAGMAPEAK


In [8]:
test_df.head()

Unnamed: 0,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title,type,world
0,27253bdc,0ea9ecc81a565215,2019-09-10T16:50:24.910Z,"{""event_code"": 2000, ""event_count"": 1}",00abaee7,1,2000,0,Welcome to Lost Lagoon!,Clip,NONE
1,27253bdc,c1ea43d8b8261d27,2019-09-10T16:50:55.503Z,"{""event_code"": 2000, ""event_count"": 1}",00abaee7,1,2000,0,Magma Peak - Level 1,Clip,MAGMAPEAK
2,27253bdc,7ed86c6b72e725e2,2019-09-10T16:51:51.805Z,"{""event_code"": 2000, ""event_count"": 1}",00abaee7,1,2000,0,Magma Peak - Level 2,Clip,MAGMAPEAK
3,27253bdc,7e516ace50e7fe67,2019-09-10T16:53:12.825Z,"{""event_code"": 2000, ""event_count"": 1}",00abaee7,1,2000,0,Crystal Caves - Level 1,Clip,CRYSTALCAVES
4,7d093bf9,a022c3f60ba547e7,2019-09-10T16:54:12.115Z,"{""version"":""1.0"",""round"":0,""event_count"":1,""ga...",00abaee7,1,2000,0,Chow Time,Game,CRYSTALCAVES


In [9]:
train_labels_df.head()

Unnamed: 0,game_session,installation_id,title,num_correct,num_incorrect,accuracy,accuracy_group
0,6bdf9623adc94d89,0006a69f,Mushroom Sorter (Assessment),1,0,1.0,3
1,77b8ee947eb84b4e,0006a69f,Bird Measurer (Assessment),0,11,0.0,0
2,901acc108f55a5a1,0006a69f,Mushroom Sorter (Assessment),1,0,1.0,3
3,9501794defd84e4d,0006a69f,Mushroom Sorter (Assessment),1,1,0.5,2
4,a9ef3ecb3d1acc6a,0006a69f,Bird Measurer (Assessment),1,0,1.0,3


In [10]:
specs_df.head()

Unnamed: 0,event_id,info,args
0,2b9272f4,The end of system-initiated feedback (Correct)...,"[{""name"":""game_time"",""type"":""int"",""info"":""mill..."
1,df4fe8b6,The end of system-initiated feedback (Incorrec...,"[{""name"":""game_time"",""type"":""int"",""info"":""mill..."
2,3babcb9b,The end of system-initiated instruction event ...,"[{""name"":""game_time"",""type"":""int"",""info"":""mill..."
3,7f0836bf,The end of system-initiated instruction event ...,"[{""name"":""game_time"",""type"":""int"",""info"":""mill..."
4,ab3136ba,The end of system-initiated instruction event ...,"[{""name"":""game_time"",""type"":""int"",""info"":""mill..."


## Feature Engineering

### Process train set

In [11]:
main_keys = ['installation_id', 'game_session']
merge_args = {'left_index':True, 'right_index':True}

In [12]:
def remove_wrong_event_codes(df):
    return df[((df['title'] == 'Bird Measurer (Assessment)') & (df['event_code'] == 4100)) == False]

In [13]:
train_df = remove_wrong_event_codes(train_df)

In [14]:
test_df = remove_wrong_event_codes(test_df)

In [15]:
test_df.shape

(1156150, 11)

In [16]:
merged_train_df = pd.merge(train_df, train_labels_df, on=main_keys)

In [17]:
pd.options.display.max_colwidth = 2000
merged_train_df.head()

Unnamed: 0,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title_x,type,world,title_y,num_correct,num_incorrect,accuracy,accuracy_group
0,3bfd1a65,901acc108f55a5a1,2019-08-06T05:22:01.344Z,"{""version"":""1.0"",""event_count"":1,""game_time"":0,""event_code"":2000}",0006a69f,1,2000,0,Mushroom Sorter (Assessment),Assessment,TREETOPCITY,Mushroom Sorter (Assessment),1,0,1.0,3
1,db02c830,901acc108f55a5a1,2019-08-06T05:22:01.400Z,"{""event_count"":2,""game_time"":37,""event_code"":2025}",0006a69f,2,2025,37,Mushroom Sorter (Assessment),Assessment,TREETOPCITY,Mushroom Sorter (Assessment),1,0,1.0,3
2,a1e4395d,901acc108f55a5a1,2019-08-06T05:22:01.403Z,"{""description"":""Pull three mushrooms out of the ground and order them from shortest to tallest!"",""identifier"":""Dot_PullMushrooms"",""media_type"":""audio"",""total_duration"":3000,""event_count"":3,""game_time"":37,""event_code"":3010}",0006a69f,3,3010,37,Mushroom Sorter (Assessment),Assessment,TREETOPCITY,Mushroom Sorter (Assessment),1,0,1.0,3
3,a52b92d5,901acc108f55a5a1,2019-08-06T05:22:05.242Z,"{""description"":""Pull three mushrooms out of the ground and order them from shortest to tallest!"",""identifier"":""Dot_PullMushrooms"",""media_type"":""audio"",""duration"":3864,""event_count"":4,""game_time"":3901,""event_code"":3110}",0006a69f,4,3110,3901,Mushroom Sorter (Assessment),Assessment,TREETOPCITY,Mushroom Sorter (Assessment),1,0,1.0,3
4,a1e4395d,901acc108f55a5a1,2019-08-06T05:22:05.244Z,"{""description"":""To pick a mushroom, pull it out of the ground with your finger!"",""identifier"":""Dot_PickFinger"",""media_type"":""audio"",""total_duration"":2680,""event_count"":5,""game_time"":3901,""event_code"":3010}",0006a69f,5,3010,3901,Mushroom Sorter (Assessment),Assessment,TREETOPCITY,Mushroom Sorter (Assessment),1,0,1.0,3


In [18]:
import json

class EventCodeCounter():
    def __init__(self, event_code):
        self.event_code = event_code

    def process_event_code(self, x, is_correct):
        count = 0
        for xi in x:
            fields = json.loads(xi)
            if 'event_code' in fields and fields['event_code'] == self.event_code and 'correct' in fields and fields['correct'] == is_correct:
                count += 1
        return count

    def process_correct_event_code(self, x):
        return self.process_event_code(x, True)

    def process_incorrect_event_code(self, x):
        return self.process_event_code(x, False)

    def extract_correct_incorrect(self, df, field_name, func):
        key = main_keys
        event_code_count = df[:].groupby(key)['event_data'].agg(func)
        event_code_count = event_code_count.reset_index()
        event_code_count.columns = [*key, field_name]
        event_code_count[field_name].fillna(0.0, inplace=True)
        return event_code_count

    def extract_correct(self, df):
        return self.extract_correct_incorrect(df, f'num_correct_{self.event_code}', self.process_correct_event_code)

    def extract_incorrect(self, df):
        return self.extract_correct_incorrect(df, f'num_incorrect_{self.event_code}', self.process_incorrect_event_code)

In [19]:
event_code_counter_4100 = EventCodeCounter(4100)
event_code_counter_4100.extract_correct(merged_train_df)

Unnamed: 0,installation_id,game_session,num_correct_4100
0,0006a69f,6bdf9623adc94d89,1
1,0006a69f,77b8ee947eb84b4e,0
2,0006a69f,901acc108f55a5a1,1
3,0006a69f,9501794defd84e4d,1
4,0006a69f,a9ef3ecb3d1acc6a,0
...,...,...,...
17685,ffc90c32,c996482b11d149dd,0
17686,ffd2871d,b05a02b52d5c1f4c,1
17687,ffeb0b1b,5448d652309a6324,1
17688,ffeb0b1b,a6885ab824fbc32c,0


In [20]:
merged_train_df = merged_train_df.merge(event_code_counter_4100.extract_correct(merged_train_df), how='left')

In [21]:
merged_train_df

Unnamed: 0,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title_x,type,world,title_y,num_correct,num_incorrect,accuracy,accuracy_group,num_correct_4100
0,3bfd1a65,901acc108f55a5a1,2019-08-06T05:22:01.344Z,"{""version"":""1.0"",""event_count"":1,""game_time"":0,""event_code"":2000}",0006a69f,1,2000,0,Mushroom Sorter (Assessment),Assessment,TREETOPCITY,Mushroom Sorter (Assessment),1,0,1.000000,3,1
1,db02c830,901acc108f55a5a1,2019-08-06T05:22:01.400Z,"{""event_count"":2,""game_time"":37,""event_code"":2025}",0006a69f,2,2025,37,Mushroom Sorter (Assessment),Assessment,TREETOPCITY,Mushroom Sorter (Assessment),1,0,1.000000,3,1
2,a1e4395d,901acc108f55a5a1,2019-08-06T05:22:01.403Z,"{""description"":""Pull three mushrooms out of the ground and order them from shortest to tallest!"",""identifier"":""Dot_PullMushrooms"",""media_type"":""audio"",""total_duration"":3000,""event_count"":3,""game_time"":37,""event_code"":3010}",0006a69f,3,3010,37,Mushroom Sorter (Assessment),Assessment,TREETOPCITY,Mushroom Sorter (Assessment),1,0,1.000000,3,1
3,a52b92d5,901acc108f55a5a1,2019-08-06T05:22:05.242Z,"{""description"":""Pull three mushrooms out of the ground and order them from shortest to tallest!"",""identifier"":""Dot_PullMushrooms"",""media_type"":""audio"",""duration"":3864,""event_count"":4,""game_time"":3901,""event_code"":3110}",0006a69f,4,3110,3901,Mushroom Sorter (Assessment),Assessment,TREETOPCITY,Mushroom Sorter (Assessment),1,0,1.000000,3,1
4,a1e4395d,901acc108f55a5a1,2019-08-06T05:22:05.244Z,"{""description"":""To pick a mushroom, pull it out of the ground with your finger!"",""identifier"":""Dot_PickFinger"",""media_type"":""audio"",""total_duration"":2680,""event_count"":5,""game_time"":3901,""event_code"":3010}",0006a69f,5,3010,3901,Mushroom Sorter (Assessment),Assessment,TREETOPCITY,Mushroom Sorter (Assessment),1,0,1.000000,3,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
863093,28520915,5448d652309a6324,2019-09-22T02:07:27.562Z,"{""misses"":1,""prompt"":""holds least"",""mode"":""selecting"",""round_number"":2,""duration"":7067,""event_count"":58,""game_time"":67094,""event_code"":2030}",ffeb0b1b,58,2030,67094,Cauldron Filler (Assessment),Assessment,MAGMAPEAK,Cauldron Filler (Assessment),1,2,0.333333,1,1
863094,91561152,5448d652309a6324,2019-09-22T02:07:27.562Z,"{""bucket"":1,""buckets_placed"":[3,1,2],""target_bucket"":1,""coordinates"":{""x"":552,""y"":401,""stage_width"":1015,""stage_height"":762},""correct"":true,""prompt"":""holds least"",""event_count"":57,""game_time"":67094,""event_code"":4025}",ffeb0b1b,57,4025,67094,Cauldron Filler (Assessment),Assessment,MAGMAPEAK,Cauldron Filler (Assessment),1,2,0.333333,1,1
863095,d3268efa,5448d652309a6324,2019-09-22T02:07:27.566Z,"{""description"":""Awesome."",""identifier"":""Dot_Awesome"",""media_type"":""audio"",""total_duration"":700,""event_count"":59,""game_time"":67094,""event_code"":3021}",ffeb0b1b,59,3021,67094,Cauldron Filler (Assessment),Assessment,MAGMAPEAK,Cauldron Filler (Assessment),1,2,0.333333,1,1
863096,b5053438,5448d652309a6324,2019-09-22T02:07:28.311Z,"{""description"":""Awesome."",""identifier"":""Dot_Awesome"",""media_type"":""audio"",""duration"":753,""event_count"":60,""game_time"":67847,""event_code"":3121}",ffeb0b1b,60,3121,67847,Cauldron Filler (Assessment),Assessment,MAGMAPEAK,Cauldron Filler (Assessment),1,2,0.333333,1,1


In [22]:
merged_train_df = merged_train_df.merge(event_code_counter_4100.extract_incorrect(merged_train_df), how='left')

In [23]:
merged_train_df

Unnamed: 0,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title_x,type,world,title_y,num_correct,num_incorrect,accuracy,accuracy_group,num_correct_4100,num_incorrect_4100
0,3bfd1a65,901acc108f55a5a1,2019-08-06T05:22:01.344Z,"{""version"":""1.0"",""event_count"":1,""game_time"":0,""event_code"":2000}",0006a69f,1,2000,0,Mushroom Sorter (Assessment),Assessment,TREETOPCITY,Mushroom Sorter (Assessment),1,0,1.000000,3,1,0
1,db02c830,901acc108f55a5a1,2019-08-06T05:22:01.400Z,"{""event_count"":2,""game_time"":37,""event_code"":2025}",0006a69f,2,2025,37,Mushroom Sorter (Assessment),Assessment,TREETOPCITY,Mushroom Sorter (Assessment),1,0,1.000000,3,1,0
2,a1e4395d,901acc108f55a5a1,2019-08-06T05:22:01.403Z,"{""description"":""Pull three mushrooms out of the ground and order them from shortest to tallest!"",""identifier"":""Dot_PullMushrooms"",""media_type"":""audio"",""total_duration"":3000,""event_count"":3,""game_time"":37,""event_code"":3010}",0006a69f,3,3010,37,Mushroom Sorter (Assessment),Assessment,TREETOPCITY,Mushroom Sorter (Assessment),1,0,1.000000,3,1,0
3,a52b92d5,901acc108f55a5a1,2019-08-06T05:22:05.242Z,"{""description"":""Pull three mushrooms out of the ground and order them from shortest to tallest!"",""identifier"":""Dot_PullMushrooms"",""media_type"":""audio"",""duration"":3864,""event_count"":4,""game_time"":3901,""event_code"":3110}",0006a69f,4,3110,3901,Mushroom Sorter (Assessment),Assessment,TREETOPCITY,Mushroom Sorter (Assessment),1,0,1.000000,3,1,0
4,a1e4395d,901acc108f55a5a1,2019-08-06T05:22:05.244Z,"{""description"":""To pick a mushroom, pull it out of the ground with your finger!"",""identifier"":""Dot_PickFinger"",""media_type"":""audio"",""total_duration"":2680,""event_count"":5,""game_time"":3901,""event_code"":3010}",0006a69f,5,3010,3901,Mushroom Sorter (Assessment),Assessment,TREETOPCITY,Mushroom Sorter (Assessment),1,0,1.000000,3,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
863093,28520915,5448d652309a6324,2019-09-22T02:07:27.562Z,"{""misses"":1,""prompt"":""holds least"",""mode"":""selecting"",""round_number"":2,""duration"":7067,""event_count"":58,""game_time"":67094,""event_code"":2030}",ffeb0b1b,58,2030,67094,Cauldron Filler (Assessment),Assessment,MAGMAPEAK,Cauldron Filler (Assessment),1,2,0.333333,1,1,2
863094,91561152,5448d652309a6324,2019-09-22T02:07:27.562Z,"{""bucket"":1,""buckets_placed"":[3,1,2],""target_bucket"":1,""coordinates"":{""x"":552,""y"":401,""stage_width"":1015,""stage_height"":762},""correct"":true,""prompt"":""holds least"",""event_count"":57,""game_time"":67094,""event_code"":4025}",ffeb0b1b,57,4025,67094,Cauldron Filler (Assessment),Assessment,MAGMAPEAK,Cauldron Filler (Assessment),1,2,0.333333,1,1,2
863095,d3268efa,5448d652309a6324,2019-09-22T02:07:27.566Z,"{""description"":""Awesome."",""identifier"":""Dot_Awesome"",""media_type"":""audio"",""total_duration"":700,""event_count"":59,""game_time"":67094,""event_code"":3021}",ffeb0b1b,59,3021,67094,Cauldron Filler (Assessment),Assessment,MAGMAPEAK,Cauldron Filler (Assessment),1,2,0.333333,1,1,2
863096,b5053438,5448d652309a6324,2019-09-22T02:07:28.311Z,"{""description"":""Awesome."",""identifier"":""Dot_Awesome"",""media_type"":""audio"",""duration"":753,""event_count"":60,""game_time"":67847,""event_code"":3121}",ffeb0b1b,60,3121,67847,Cauldron Filler (Assessment),Assessment,MAGMAPEAK,Cauldron Filler (Assessment),1,2,0.333333,1,1,2


In [24]:
merged_train_df[(merged_train_df['event_code'] == 4100) & (merged_train_df['num_correct'] != merged_train_df['num_correct_4100'])].sort_values('installation_id').head(20)

Unnamed: 0,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title_x,type,world,title_y,num_correct,num_incorrect,accuracy,accuracy_group,num_correct_4100,num_incorrect_4100


In [25]:
event_code_counter_4110 = EventCodeCounter(4110)
event_code_counter_4110.extract_correct(merged_train_df)

Unnamed: 0,installation_id,game_session,num_correct_4110
0,0006a69f,6bdf9623adc94d89,0
1,0006a69f,77b8ee947eb84b4e,0
2,0006a69f,901acc108f55a5a1,0
3,0006a69f,9501794defd84e4d,0
4,0006a69f,a9ef3ecb3d1acc6a,1
...,...,...,...
17685,ffc90c32,c996482b11d149dd,1
17686,ffd2871d,b05a02b52d5c1f4c,0
17687,ffeb0b1b,5448d652309a6324,0
17688,ffeb0b1b,a6885ab824fbc32c,0


In [26]:
merged_train_df = merged_train_df.merge(event_code_counter_4110.extract_correct(merged_train_df), how='left')

In [27]:
merged_train_df[(merged_train_df['event_code'] == 4110) & (merged_train_df['num_correct'] != merged_train_df['num_correct_4110'])].sort_values('installation_id').head(20)

Unnamed: 0,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title_x,type,world,title_y,num_correct,num_incorrect,accuracy,accuracy_group,num_correct_4100,num_incorrect_4100,num_correct_4110


In [28]:
merged_train_df = merged_train_df.merge(event_code_counter_4110.extract_incorrect(merged_train_df), how='left')

In [29]:
merged_train_df

Unnamed: 0,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title_x,type,world,title_y,num_correct,num_incorrect,accuracy,accuracy_group,num_correct_4100,num_incorrect_4100,num_correct_4110,num_incorrect_4110
0,3bfd1a65,901acc108f55a5a1,2019-08-06T05:22:01.344Z,"{""version"":""1.0"",""event_count"":1,""game_time"":0,""event_code"":2000}",0006a69f,1,2000,0,Mushroom Sorter (Assessment),Assessment,TREETOPCITY,Mushroom Sorter (Assessment),1,0,1.000000,3,1,0,0,0
1,db02c830,901acc108f55a5a1,2019-08-06T05:22:01.400Z,"{""event_count"":2,""game_time"":37,""event_code"":2025}",0006a69f,2,2025,37,Mushroom Sorter (Assessment),Assessment,TREETOPCITY,Mushroom Sorter (Assessment),1,0,1.000000,3,1,0,0,0
2,a1e4395d,901acc108f55a5a1,2019-08-06T05:22:01.403Z,"{""description"":""Pull three mushrooms out of the ground and order them from shortest to tallest!"",""identifier"":""Dot_PullMushrooms"",""media_type"":""audio"",""total_duration"":3000,""event_count"":3,""game_time"":37,""event_code"":3010}",0006a69f,3,3010,37,Mushroom Sorter (Assessment),Assessment,TREETOPCITY,Mushroom Sorter (Assessment),1,0,1.000000,3,1,0,0,0
3,a52b92d5,901acc108f55a5a1,2019-08-06T05:22:05.242Z,"{""description"":""Pull three mushrooms out of the ground and order them from shortest to tallest!"",""identifier"":""Dot_PullMushrooms"",""media_type"":""audio"",""duration"":3864,""event_count"":4,""game_time"":3901,""event_code"":3110}",0006a69f,4,3110,3901,Mushroom Sorter (Assessment),Assessment,TREETOPCITY,Mushroom Sorter (Assessment),1,0,1.000000,3,1,0,0,0
4,a1e4395d,901acc108f55a5a1,2019-08-06T05:22:05.244Z,"{""description"":""To pick a mushroom, pull it out of the ground with your finger!"",""identifier"":""Dot_PickFinger"",""media_type"":""audio"",""total_duration"":2680,""event_count"":5,""game_time"":3901,""event_code"":3010}",0006a69f,5,3010,3901,Mushroom Sorter (Assessment),Assessment,TREETOPCITY,Mushroom Sorter (Assessment),1,0,1.000000,3,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
863093,28520915,5448d652309a6324,2019-09-22T02:07:27.562Z,"{""misses"":1,""prompt"":""holds least"",""mode"":""selecting"",""round_number"":2,""duration"":7067,""event_count"":58,""game_time"":67094,""event_code"":2030}",ffeb0b1b,58,2030,67094,Cauldron Filler (Assessment),Assessment,MAGMAPEAK,Cauldron Filler (Assessment),1,2,0.333333,1,1,2,0,0
863094,91561152,5448d652309a6324,2019-09-22T02:07:27.562Z,"{""bucket"":1,""buckets_placed"":[3,1,2],""target_bucket"":1,""coordinates"":{""x"":552,""y"":401,""stage_width"":1015,""stage_height"":762},""correct"":true,""prompt"":""holds least"",""event_count"":57,""game_time"":67094,""event_code"":4025}",ffeb0b1b,57,4025,67094,Cauldron Filler (Assessment),Assessment,MAGMAPEAK,Cauldron Filler (Assessment),1,2,0.333333,1,1,2,0,0
863095,d3268efa,5448d652309a6324,2019-09-22T02:07:27.566Z,"{""description"":""Awesome."",""identifier"":""Dot_Awesome"",""media_type"":""audio"",""total_duration"":700,""event_count"":59,""game_time"":67094,""event_code"":3021}",ffeb0b1b,59,3021,67094,Cauldron Filler (Assessment),Assessment,MAGMAPEAK,Cauldron Filler (Assessment),1,2,0.333333,1,1,2,0,0
863096,b5053438,5448d652309a6324,2019-09-22T02:07:28.311Z,"{""description"":""Awesome."",""identifier"":""Dot_Awesome"",""media_type"":""audio"",""duration"":753,""event_count"":60,""game_time"":67847,""event_code"":3121}",ffeb0b1b,60,3121,67847,Cauldron Filler (Assessment),Assessment,MAGMAPEAK,Cauldron Filler (Assessment),1,2,0.333333,1,1,2,0,0


In [30]:
merged_train_df['num_correct_new'] = merged_train_df.apply(lambda x : x['num_correct_4100'] + x['num_correct_4110'], axis=1)

In [31]:
merged_train_df['num_incorrect_new'] = merged_train_df.apply(lambda x : x['num_incorrect_4100'] + x['num_incorrect_4110'], axis=1)

In [32]:
merged_train_df

Unnamed: 0,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title_x,type,...,num_correct,num_incorrect,accuracy,accuracy_group,num_correct_4100,num_incorrect_4100,num_correct_4110,num_incorrect_4110,num_correct_new,num_incorrect_new
0,3bfd1a65,901acc108f55a5a1,2019-08-06T05:22:01.344Z,"{""version"":""1.0"",""event_count"":1,""game_time"":0,""event_code"":2000}",0006a69f,1,2000,0,Mushroom Sorter (Assessment),Assessment,...,1,0,1.000000,3,1,0,0,0,1,0
1,db02c830,901acc108f55a5a1,2019-08-06T05:22:01.400Z,"{""event_count"":2,""game_time"":37,""event_code"":2025}",0006a69f,2,2025,37,Mushroom Sorter (Assessment),Assessment,...,1,0,1.000000,3,1,0,0,0,1,0
2,a1e4395d,901acc108f55a5a1,2019-08-06T05:22:01.403Z,"{""description"":""Pull three mushrooms out of the ground and order them from shortest to tallest!"",""identifier"":""Dot_PullMushrooms"",""media_type"":""audio"",""total_duration"":3000,""event_count"":3,""game_time"":37,""event_code"":3010}",0006a69f,3,3010,37,Mushroom Sorter (Assessment),Assessment,...,1,0,1.000000,3,1,0,0,0,1,0
3,a52b92d5,901acc108f55a5a1,2019-08-06T05:22:05.242Z,"{""description"":""Pull three mushrooms out of the ground and order them from shortest to tallest!"",""identifier"":""Dot_PullMushrooms"",""media_type"":""audio"",""duration"":3864,""event_count"":4,""game_time"":3901,""event_code"":3110}",0006a69f,4,3110,3901,Mushroom Sorter (Assessment),Assessment,...,1,0,1.000000,3,1,0,0,0,1,0
4,a1e4395d,901acc108f55a5a1,2019-08-06T05:22:05.244Z,"{""description"":""To pick a mushroom, pull it out of the ground with your finger!"",""identifier"":""Dot_PickFinger"",""media_type"":""audio"",""total_duration"":2680,""event_count"":5,""game_time"":3901,""event_code"":3010}",0006a69f,5,3010,3901,Mushroom Sorter (Assessment),Assessment,...,1,0,1.000000,3,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
863093,28520915,5448d652309a6324,2019-09-22T02:07:27.562Z,"{""misses"":1,""prompt"":""holds least"",""mode"":""selecting"",""round_number"":2,""duration"":7067,""event_count"":58,""game_time"":67094,""event_code"":2030}",ffeb0b1b,58,2030,67094,Cauldron Filler (Assessment),Assessment,...,1,2,0.333333,1,1,2,0,0,1,2
863094,91561152,5448d652309a6324,2019-09-22T02:07:27.562Z,"{""bucket"":1,""buckets_placed"":[3,1,2],""target_bucket"":1,""coordinates"":{""x"":552,""y"":401,""stage_width"":1015,""stage_height"":762},""correct"":true,""prompt"":""holds least"",""event_count"":57,""game_time"":67094,""event_code"":4025}",ffeb0b1b,57,4025,67094,Cauldron Filler (Assessment),Assessment,...,1,2,0.333333,1,1,2,0,0,1,2
863095,d3268efa,5448d652309a6324,2019-09-22T02:07:27.566Z,"{""description"":""Awesome."",""identifier"":""Dot_Awesome"",""media_type"":""audio"",""total_duration"":700,""event_count"":59,""game_time"":67094,""event_code"":3021}",ffeb0b1b,59,3021,67094,Cauldron Filler (Assessment),Assessment,...,1,2,0.333333,1,1,2,0,0,1,2
863096,b5053438,5448d652309a6324,2019-09-22T02:07:28.311Z,"{""description"":""Awesome."",""identifier"":""Dot_Awesome"",""media_type"":""audio"",""duration"":753,""event_count"":60,""game_time"":67847,""event_code"":3121}",ffeb0b1b,60,3121,67847,Cauldron Filler (Assessment),Assessment,...,1,2,0.333333,1,1,2,0,0,1,2


In [33]:
def check_correctness(field_orig='num_correct', field_new='num_correct_new'):
    merged_check = merged_train_df.apply(lambda x : x['num_correct'] == x['num_correct_new'], axis=1).reset_index()
    merged_check.columns = ['index', 'check']
    return merged_check.groupby('check').agg('count')

check_correctness()

Unnamed: 0_level_0,index
check,Unnamed: 1_level_1
True,863098


In [34]:
check_correctness('num_incorrect', 'num_incorrect_new')

Unnamed: 0_level_0,index
check,Unnamed: 1_level_1
True,863098


In [35]:
def extract_time_features(df):
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df['month'] = df['timestamp'].dt.month
    df['hour'] = df['timestamp'].dt.hour
    df['year'] = df['timestamp'].dt.year
    df['dayofweek'] = df['timestamp'].dt.dayofweek
    df['weekofyear'] = df['timestamp'].dt.weekofyear
    df['dayofyear'] = df['timestamp'].dt.dayofyear
    df['quarter'] = df['timestamp'].dt.quarter
    return df

merged_train_df = extract_time_features(merged_train_df)
merged_train_df.head()

Unnamed: 0,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title_x,type,...,num_incorrect_4110,num_correct_new,num_incorrect_new,month,hour,year,dayofweek,weekofyear,dayofyear,quarter
0,3bfd1a65,901acc108f55a5a1,2019-08-06 05:22:01.344000+00:00,"{""version"":""1.0"",""event_count"":1,""game_time"":0,""event_code"":2000}",0006a69f,1,2000,0,Mushroom Sorter (Assessment),Assessment,...,0,1,0,8,5,2019,1,32,218,3
1,db02c830,901acc108f55a5a1,2019-08-06 05:22:01.400000+00:00,"{""event_count"":2,""game_time"":37,""event_code"":2025}",0006a69f,2,2025,37,Mushroom Sorter (Assessment),Assessment,...,0,1,0,8,5,2019,1,32,218,3
2,a1e4395d,901acc108f55a5a1,2019-08-06 05:22:01.403000+00:00,"{""description"":""Pull three mushrooms out of the ground and order them from shortest to tallest!"",""identifier"":""Dot_PullMushrooms"",""media_type"":""audio"",""total_duration"":3000,""event_count"":3,""game_time"":37,""event_code"":3010}",0006a69f,3,3010,37,Mushroom Sorter (Assessment),Assessment,...,0,1,0,8,5,2019,1,32,218,3
3,a52b92d5,901acc108f55a5a1,2019-08-06 05:22:05.242000+00:00,"{""description"":""Pull three mushrooms out of the ground and order them from shortest to tallest!"",""identifier"":""Dot_PullMushrooms"",""media_type"":""audio"",""duration"":3864,""event_count"":4,""game_time"":3901,""event_code"":3110}",0006a69f,4,3110,3901,Mushroom Sorter (Assessment),Assessment,...,0,1,0,8,5,2019,1,32,218,3
4,a1e4395d,901acc108f55a5a1,2019-08-06 05:22:05.244000+00:00,"{""description"":""To pick a mushroom, pull it out of the ground with your finger!"",""identifier"":""Dot_PickFinger"",""media_type"":""audio"",""total_duration"":2680,""event_count"":5,""game_time"":3901,""event_code"":3010}",0006a69f,5,3010,3901,Mushroom Sorter (Assessment),Assessment,...,0,1,0,8,5,2019,1,32,218,3


In [36]:
# Remove some columns
merged_train_df = merged_train_df[[col for col in merged_train_df.columns if col not in ['timestamp', 'num_correct', 'num_incorrect', 'accuracy', 'event_data', 'title_y', 'num_incorrect_4110', 'num_correct_4110', 'num_incorrect_4100', 'num_correct_4100']]]
merged_train_df.head()

Unnamed: 0,event_id,game_session,installation_id,event_count,event_code,game_time,title_x,type,world,accuracy_group,num_correct_new,num_incorrect_new,month,hour,year,dayofweek,weekofyear,dayofyear,quarter
0,3bfd1a65,901acc108f55a5a1,0006a69f,1,2000,0,Mushroom Sorter (Assessment),Assessment,TREETOPCITY,3,1,0,8,5,2019,1,32,218,3
1,db02c830,901acc108f55a5a1,0006a69f,2,2025,37,Mushroom Sorter (Assessment),Assessment,TREETOPCITY,3,1,0,8,5,2019,1,32,218,3
2,a1e4395d,901acc108f55a5a1,0006a69f,3,3010,37,Mushroom Sorter (Assessment),Assessment,TREETOPCITY,3,1,0,8,5,2019,1,32,218,3
3,a52b92d5,901acc108f55a5a1,0006a69f,4,3110,3901,Mushroom Sorter (Assessment),Assessment,TREETOPCITY,3,1,0,8,5,2019,1,32,218,3
4,a1e4395d,901acc108f55a5a1,0006a69f,5,3010,3901,Mushroom Sorter (Assessment),Assessment,TREETOPCITY,3,1,0,8,5,2019,1,32,218,3


In [37]:
merged_train_df = merged_train_df.rename(columns={"title_x": "title", "num_correct_new": "num_correct", "num_incorrect_new": "num_incorrect"})
merged_train_df

Unnamed: 0,event_id,game_session,installation_id,event_count,event_code,game_time,title,type,world,accuracy_group,num_correct,num_incorrect,month,hour,year,dayofweek,weekofyear,dayofyear,quarter
0,3bfd1a65,901acc108f55a5a1,0006a69f,1,2000,0,Mushroom Sorter (Assessment),Assessment,TREETOPCITY,3,1,0,8,5,2019,1,32,218,3
1,db02c830,901acc108f55a5a1,0006a69f,2,2025,37,Mushroom Sorter (Assessment),Assessment,TREETOPCITY,3,1,0,8,5,2019,1,32,218,3
2,a1e4395d,901acc108f55a5a1,0006a69f,3,3010,37,Mushroom Sorter (Assessment),Assessment,TREETOPCITY,3,1,0,8,5,2019,1,32,218,3
3,a52b92d5,901acc108f55a5a1,0006a69f,4,3110,3901,Mushroom Sorter (Assessment),Assessment,TREETOPCITY,3,1,0,8,5,2019,1,32,218,3
4,a1e4395d,901acc108f55a5a1,0006a69f,5,3010,3901,Mushroom Sorter (Assessment),Assessment,TREETOPCITY,3,1,0,8,5,2019,1,32,218,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
863093,28520915,5448d652309a6324,ffeb0b1b,58,2030,67094,Cauldron Filler (Assessment),Assessment,MAGMAPEAK,1,1,2,9,2,2019,6,38,265,3
863094,91561152,5448d652309a6324,ffeb0b1b,57,4025,67094,Cauldron Filler (Assessment),Assessment,MAGMAPEAK,1,1,2,9,2,2019,6,38,265,3
863095,d3268efa,5448d652309a6324,ffeb0b1b,59,3021,67094,Cauldron Filler (Assessment),Assessment,MAGMAPEAK,1,1,2,9,2,2019,6,38,265,3
863096,b5053438,5448d652309a6324,ffeb0b1b,60,3121,67847,Cauldron Filler (Assessment),Assessment,MAGMAPEAK,1,1,2,9,2,2019,6,38,265,3


In [38]:
def convert_categorical_to_num(df, categorical_list):
    for cat_name in categorical_list:
        df[cat_name] = df[cat_name].astype('category')
    # Get categorical columns again
    cat_columns = df.select_dtypes(['category']).columns
    print('cat_columns', cat_columns)
    df[cat_columns] = df[cat_columns].apply(lambda x: x.cat.codes)

convert_categorical_to_num(merged_train_df, ['title', 'type', 'world', 'event_id'])

cat_columns Index(['event_id', 'title', 'type', 'world'], dtype='object')


In [39]:
pd.options.display.max_columns = None
merged_train_df

Unnamed: 0,event_id,game_session,installation_id,event_count,event_code,game_time,title,type,world,accuracy_group,num_correct,num_incorrect,month,hour,year,dayofweek,weekofyear,dayofyear,quarter
0,21,901acc108f55a5a1,0006a69f,1,2000,0,4,0,2,3,1,0,8,5,2019,1,32,218,3
1,84,901acc108f55a5a1,0006a69f,2,2025,37,4,0,2,3,1,0,8,5,2019,1,32,218,3
2,64,901acc108f55a5a1,0006a69f,3,3010,37,4,0,2,3,1,0,8,5,2019,1,32,218,3
3,65,901acc108f55a5a1,0006a69f,4,3110,3901,4,0,2,3,1,0,8,5,2019,1,32,218,3
4,64,901acc108f55a5a1,0006a69f,5,3010,3901,4,0,2,3,1,0,8,5,2019,1,32,218,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
863093,10,5448d652309a6324,ffeb0b1b,58,2030,67094,2,0,1,1,1,2,9,2,2019,6,38,265,3
863094,56,5448d652309a6324,ffeb0b1b,57,4025,67094,2,0,1,1,1,2,9,2,2019,6,38,265,3
863095,82,5448d652309a6324,ffeb0b1b,59,3021,67094,2,0,1,1,1,2,9,2,2019,6,38,265,3
863096,73,5448d652309a6324,ffeb0b1b,60,3121,67847,2,0,1,1,1,2,9,2,2019,6,38,265,3


### Process test set

In [40]:
comp_test_df = test_df.merge(event_code_counter_4100.extract_correct(test_df), on=main_keys, how='left')
comp_test_df[comp_test_df['num_correct_4100'] > 0]

Unnamed: 0,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title,type,world,num_correct_4100
282,7ad3efc6,8b38fc0d2fd315dc,2019-09-11T18:56:11.918Z,"{""version"":""1.0"",""event_count"":1,""game_time"":0,""event_code"":2000}",00abaee7,1,2000,0,Cart Balancer (Assessment),Assessment,CRYSTALCAVES,1
283,65a38bf7,8b38fc0d2fd315dc,2019-09-11T18:56:12.019Z,"{""crystals"":[{""id"":""gem07"",""weight"":6},{""id"":""gem05"",""weight"":4},{""id"":""gem01"",""weight"":1},{""id"":""gem03"",""weight"":3},{""id"":""gem02"",""weight"":1},{""id"":""gem08"",""weight"":6},{""id"":""gem04"",""weight"":3},{""id"":""gem06"",""weight"":4}],""event_count"":2,""game_time"":58,""event_code"":2020}",00abaee7,2,2020,58,Cart Balancer (Assessment),Assessment,CRYSTALCAVES,1
284,795e4a37,8b38fc0d2fd315dc,2019-09-11T18:56:12.030Z,"{""description"":""We've got to balance this scale! Fill the trays with gems so both sides weigh the same! Use your finger to drag the gems to the trays!"",""identifier"":""Dot_BalanceScale,Dot_FillTrays,Dot_UseFingerDragGems"",""media_type"":""audio"",""total_duration"":6970,""event_count"":3,""game_time"":58,""event_code"":3010}",00abaee7,3,3010,58,Cart Balancer (Assessment),Assessment,CRYSTALCAVES,1
285,5e109ec3,8b38fc0d2fd315dc,2019-09-11T18:56:15.790Z,"{""crystal_id"":""gem07"",""weight"":6,""coordinates"":{""x"":133,""y"":607,""stage_width"":1015,""stage_height"":762},""source"":""resources"",""crystals"":[{""id"":""gem05"",""weight"":4},{""id"":""gem01"",""weight"":1},{""id"":""gem03"",""weight"":3},{""id"":""gem02"",""weight"":1},{""id"":""gem08"",""weight"":6},{""id"":""gem04"",""weight"":3},{""id"":""gem06"",""weight"":4}],""event_count"":4,""game_time"":3852,""event_code"":4030}",00abaee7,4,4030,3852,Cart Balancer (Assessment),Assessment,CRYSTALCAVES,1
286,5c2f29ca,8b38fc0d2fd315dc,2019-09-11T18:56:17.869Z,"{""crystal_id"":""gem07"",""weight"":6,""coordinates"":{""x"":164,""y"":260,""stage_width"":1015,""stage_height"":762},""side"":""left"",""left"":[{""id"":""gem07"",""weight"":6}],""duration"":2115,""right"":[],""crystals"":[{""id"":""gem05"",""weight"":4},{""id"":""gem01"",""weight"":1},{""id"":""gem03"",""weight"":3},{""id"":""gem02"",""weight"":1},{""id"":""gem08"",""weight"":6},{""id"":""gem04"",""weight"":3},{""id"":""gem06"",""weight"":4}],""source"":""resources"",""event_count"":5,""game_time"":5968,""event_code"":4020}",00abaee7,5,4020,5968,Cart Balancer (Assessment),Assessment,CRYSTALCAVES,1
...,...,...,...,...,...,...,...,...,...,...,...,...
1156143,a52b92d5,46ff9d3ad2be09f2,2019-09-28T21:20:36.719Z,"{""description"":""Okay, when you want to check your answer - tap here!"",""identifier"":""Dot_CheckTapHere"",""media_type"":""audio"",""duration"":410,""event_count"":37,""game_time"":27797,""event_code"":3110}",ffe774cc,37,3110,27797,Mushroom Sorter (Assessment),Assessment,TREETOPCITY,1
1156144,9d29771f,46ff9d3ad2be09f2,2019-09-28T21:20:36.723Z,"{""description"":""Alright! This one is the littlest mushroom, and this one is the biggest!"",""identifier"":""Dot_AlrightThisLittleThisBig"",""media_type"":""audio"",""total_duration"":4310,""event_count"":38,""game_time"":27797,""event_code"":3021}",ffe774cc,38,3021,27797,Mushroom Sorter (Assessment),Assessment,TREETOPCITY,1
1156145,c74f40cd,46ff9d3ad2be09f2,2019-09-28T21:20:40.918Z,"{""description"":""Alright! This one is the littlest mushroom, and this one is the biggest!"",""identifier"":""Dot_AlrightThisLittleThisBig"",""media_type"":""audio"",""duration"":4233,""event_count"":39,""game_time"":32030,""event_code"":3121}",ffe774cc,39,3121,32030,Mushroom Sorter (Assessment),Assessment,TREETOPCITY,1
1156146,6c930e6e,46ff9d3ad2be09f2,2019-09-28T21:20:41.493Z,"{""duration"":20008,""misses"":0,""event_count"":40,""game_time"":32584,""event_code"":2030}",ffe774cc,40,2030,32584,Mushroom Sorter (Assessment),Assessment,TREETOPCITY,1


In [41]:
comp_test_df = comp_test_df.merge(event_code_counter_4100.extract_incorrect(comp_test_df), on=main_keys, how='left')
comp_test_df

Unnamed: 0,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title,type,world,num_correct_4100,num_incorrect_4100
0,27253bdc,0ea9ecc81a565215,2019-09-10T16:50:24.910Z,"{""event_code"": 2000, ""event_count"": 1}",00abaee7,1,2000,0,Welcome to Lost Lagoon!,Clip,NONE,0,0
1,27253bdc,c1ea43d8b8261d27,2019-09-10T16:50:55.503Z,"{""event_code"": 2000, ""event_count"": 1}",00abaee7,1,2000,0,Magma Peak - Level 1,Clip,MAGMAPEAK,0,0
2,27253bdc,7ed86c6b72e725e2,2019-09-10T16:51:51.805Z,"{""event_code"": 2000, ""event_count"": 1}",00abaee7,1,2000,0,Magma Peak - Level 2,Clip,MAGMAPEAK,0,0
3,27253bdc,7e516ace50e7fe67,2019-09-10T16:53:12.825Z,"{""event_code"": 2000, ""event_count"": 1}",00abaee7,1,2000,0,Crystal Caves - Level 1,Clip,CRYSTALCAVES,0,0
4,7d093bf9,a022c3f60ba547e7,2019-09-10T16:54:12.115Z,"{""version"":""1.0"",""round"":0,""event_count"":1,""game_time"":0,""event_code"":2000}",00abaee7,1,2000,0,Chow Time,Game,CRYSTALCAVES,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1156145,c74f40cd,46ff9d3ad2be09f2,2019-09-28T21:20:40.918Z,"{""description"":""Alright! This one is the littlest mushroom, and this one is the biggest!"",""identifier"":""Dot_AlrightThisLittleThisBig"",""media_type"":""audio"",""duration"":4233,""event_count"":39,""game_time"":32030,""event_code"":3121}",ffe774cc,39,3121,32030,Mushroom Sorter (Assessment),Assessment,TREETOPCITY,1,0
1156146,6c930e6e,46ff9d3ad2be09f2,2019-09-28T21:20:41.493Z,"{""duration"":20008,""misses"":0,""event_count"":40,""game_time"":32584,""event_code"":2030}",ffe774cc,40,2030,32584,Mushroom Sorter (Assessment),Assessment,TREETOPCITY,1,0
1156147,a5be6304,46ff9d3ad2be09f2,2019-09-28T21:20:45.499Z,"{""session_duration"":36607,""exit_type"":""game_completed"",""event_count"":41,""game_time"":36607,""event_code"":2010}",ffe774cc,41,2010,36607,Mushroom Sorter (Assessment),Assessment,TREETOPCITY,1,0
1156148,27253bdc,96d7dc31e822cedc,2019-09-28T21:21:05.670Z,"{""event_code"": 2000, ""event_count"": 1}",ffe774cc,1,2000,0,Tree Top City - Level 3,Clip,TREETOPCITY,0,0


In [42]:
comp_test_df = comp_test_df.merge(event_code_counter_4110.extract_correct(comp_test_df), on=main_keys, how='left')
comp_test_df

Unnamed: 0,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title,type,world,num_correct_4100,num_incorrect_4100,num_correct_4110
0,27253bdc,0ea9ecc81a565215,2019-09-10T16:50:24.910Z,"{""event_code"": 2000, ""event_count"": 1}",00abaee7,1,2000,0,Welcome to Lost Lagoon!,Clip,NONE,0,0,0
1,27253bdc,c1ea43d8b8261d27,2019-09-10T16:50:55.503Z,"{""event_code"": 2000, ""event_count"": 1}",00abaee7,1,2000,0,Magma Peak - Level 1,Clip,MAGMAPEAK,0,0,0
2,27253bdc,7ed86c6b72e725e2,2019-09-10T16:51:51.805Z,"{""event_code"": 2000, ""event_count"": 1}",00abaee7,1,2000,0,Magma Peak - Level 2,Clip,MAGMAPEAK,0,0,0
3,27253bdc,7e516ace50e7fe67,2019-09-10T16:53:12.825Z,"{""event_code"": 2000, ""event_count"": 1}",00abaee7,1,2000,0,Crystal Caves - Level 1,Clip,CRYSTALCAVES,0,0,0
4,7d093bf9,a022c3f60ba547e7,2019-09-10T16:54:12.115Z,"{""version"":""1.0"",""round"":0,""event_count"":1,""game_time"":0,""event_code"":2000}",00abaee7,1,2000,0,Chow Time,Game,CRYSTALCAVES,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1156145,c74f40cd,46ff9d3ad2be09f2,2019-09-28T21:20:40.918Z,"{""description"":""Alright! This one is the littlest mushroom, and this one is the biggest!"",""identifier"":""Dot_AlrightThisLittleThisBig"",""media_type"":""audio"",""duration"":4233,""event_count"":39,""game_time"":32030,""event_code"":3121}",ffe774cc,39,3121,32030,Mushroom Sorter (Assessment),Assessment,TREETOPCITY,1,0,0
1156146,6c930e6e,46ff9d3ad2be09f2,2019-09-28T21:20:41.493Z,"{""duration"":20008,""misses"":0,""event_count"":40,""game_time"":32584,""event_code"":2030}",ffe774cc,40,2030,32584,Mushroom Sorter (Assessment),Assessment,TREETOPCITY,1,0,0
1156147,a5be6304,46ff9d3ad2be09f2,2019-09-28T21:20:45.499Z,"{""session_duration"":36607,""exit_type"":""game_completed"",""event_count"":41,""game_time"":36607,""event_code"":2010}",ffe774cc,41,2010,36607,Mushroom Sorter (Assessment),Assessment,TREETOPCITY,1,0,0
1156148,27253bdc,96d7dc31e822cedc,2019-09-28T21:21:05.670Z,"{""event_code"": 2000, ""event_count"": 1}",ffe774cc,1,2000,0,Tree Top City - Level 3,Clip,TREETOPCITY,0,0,0


In [43]:
comp_test_df = comp_test_df.merge(event_code_counter_4110.extract_incorrect(comp_test_df), on=main_keys, how='left')
comp_test_df[comp_test_df['num_incorrect_4110'] > 0]

Unnamed: 0,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title,type,world,num_correct_4100,num_incorrect_4100,num_correct_4110,num_incorrect_4110
3459,f56e0afc,597a8839a5a3468d,2019-09-22T22:27:47.533Z,"{""version"":""1.0"",""event_count"":1,""game_time"":0,""event_code"":2000}",01242218,1,2000,0,Bird Measurer (Assessment),Assessment,TREETOPCITY,0,0,1,2
3460,ec138c1c,597a8839a5a3468d,2019-09-22T22:27:47.575Z,"{""stage_number"":1,""event_count"":2,""game_time"":0,""event_code"":2020}",01242218,2,2020,0,Bird Measurer (Assessment),Assessment,TREETOPCITY,0,0,1,2
3461,1375ccb7,597a8839a5a3468d,2019-09-22T22:27:47.579Z,"{""description"":""Use the caterpillars to measure the birds. Pull the caterpillars out of their holes with your finger."",""identifier"":""Dot_UseCaterpillars,Dot_PullCaterpillarsFinger"",""media_type"":""audio"",""total_duration"":4410,""event_count"":3,""game_time"":0,""event_code"":3010}",01242218,3,3010,0,Bird Measurer (Assessment),Assessment,TREETOPCITY,0,0,1,2
3462,bdf49a58,597a8839a5a3468d,2019-09-22T22:27:52.059Z,"{""description"":""Use the caterpillars to measure the birds. Pull the caterpillars out of their holes with your finger."",""identifier"":""Dot_UseCaterpillars,Dot_PullCaterpillarsFinger"",""media_type"":""audio"",""duration"":4577,""event_count"":4,""game_time"":4577,""event_code"":3110}",01242218,4,3110,4577,Bird Measurer (Assessment),Assessment,TREETOPCITY,0,0,1,2
3463,51102b85,597a8839a5a3468d,2019-09-22T22:27:52.253Z,"{""hat"":0,""caterpillar"":""left"",""coordinates"":{""x"":80,""y"":581,""stage_width"":1015,""stage_height"":762},""hats"":[0,0,0],""caterpillars"":[2,2,2],""source"":""N/A"",""event_count"":5,""game_time"":4745,""event_code"":4030}",01242218,5,4030,4745,Bird Measurer (Assessment),Assessment,TREETOPCITY,0,0,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1154393,8fee50e2,524d4d7266a30ff9,2019-09-15T04:04:50.477Z,"{""hat"":4,""destination"":""left"",""coordinates"":{""x"":219,""y"":454,""stage_width"":1015,""stage_height"":762},""correct"":true,""hats"":[5,0,0],""duration"":1440,""hats_placed"":[4,8,0],""source"":""resources"",""event_count"":111,""game_time"":106634,""event_code"":4020}",fe8984b5,111,4020,106634,Bird Measurer (Assessment),Assessment,TREETOPCITY,0,0,1,2
1154394,51102b85,524d4d7266a30ff9,2019-09-15T04:04:51.670Z,"{""hat"":5,""caterpillar"":"""",""coordinates"":{""x"":299,""y"":216,""stage_width"":1015,""stage_height"":762},""hats"":[4,8,0],""caterpillars"":[1,1,5],""source"":""resources"",""event_count"":112,""game_time"":107795,""event_code"":4030}",fe8984b5,112,4030,107795,Bird Measurer (Assessment),Assessment,TREETOPCITY,0,0,1,2
1154395,8fee50e2,524d4d7266a30ff9,2019-09-15T04:04:52.792Z,"{""hat"":5,""destination"":""right"",""coordinates"":{""x"":886,""y"":438,""stage_width"":1015,""stage_height"":762},""correct"":true,""hats"":[0,0,0],""duration"":1150,""hats_placed"":[4,8,5],""source"":""resources"",""event_count"":113,""game_time"":108946,""event_code"":4020}",fe8984b5,113,4020,108946,Bird Measurer (Assessment),Assessment,TREETOPCITY,0,0,1,2
1154396,f6947f54,524d4d7266a30ff9,2019-09-15T04:04:56.464Z,"{""duration"":65736,""misses"":0,""stage_number"":2,""event_count"":115,""game_time"":112591,""event_code"":2030}",fe8984b5,115,2030,112591,Bird Measurer (Assessment),Assessment,TREETOPCITY,0,0,1,2


In [44]:
comp_test_df['num_correct_new'] = comp_test_df.apply(lambda x : x['num_correct_4100'] + x['num_correct_4110'], axis=1)

In [45]:
comp_test_df['num_incorrect_new'] = comp_test_df.apply(lambda x : x['num_incorrect_4100'] + x['num_incorrect_4110'], axis=1)

In [46]:
comp_test_df

Unnamed: 0,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title,type,world,num_correct_4100,num_incorrect_4100,num_correct_4110,num_incorrect_4110,num_correct_new,num_incorrect_new
0,27253bdc,0ea9ecc81a565215,2019-09-10T16:50:24.910Z,"{""event_code"": 2000, ""event_count"": 1}",00abaee7,1,2000,0,Welcome to Lost Lagoon!,Clip,NONE,0,0,0,0,0,0
1,27253bdc,c1ea43d8b8261d27,2019-09-10T16:50:55.503Z,"{""event_code"": 2000, ""event_count"": 1}",00abaee7,1,2000,0,Magma Peak - Level 1,Clip,MAGMAPEAK,0,0,0,0,0,0
2,27253bdc,7ed86c6b72e725e2,2019-09-10T16:51:51.805Z,"{""event_code"": 2000, ""event_count"": 1}",00abaee7,1,2000,0,Magma Peak - Level 2,Clip,MAGMAPEAK,0,0,0,0,0,0
3,27253bdc,7e516ace50e7fe67,2019-09-10T16:53:12.825Z,"{""event_code"": 2000, ""event_count"": 1}",00abaee7,1,2000,0,Crystal Caves - Level 1,Clip,CRYSTALCAVES,0,0,0,0,0,0
4,7d093bf9,a022c3f60ba547e7,2019-09-10T16:54:12.115Z,"{""version"":""1.0"",""round"":0,""event_count"":1,""game_time"":0,""event_code"":2000}",00abaee7,1,2000,0,Chow Time,Game,CRYSTALCAVES,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1156145,c74f40cd,46ff9d3ad2be09f2,2019-09-28T21:20:40.918Z,"{""description"":""Alright! This one is the littlest mushroom, and this one is the biggest!"",""identifier"":""Dot_AlrightThisLittleThisBig"",""media_type"":""audio"",""duration"":4233,""event_count"":39,""game_time"":32030,""event_code"":3121}",ffe774cc,39,3121,32030,Mushroom Sorter (Assessment),Assessment,TREETOPCITY,1,0,0,0,1,0
1156146,6c930e6e,46ff9d3ad2be09f2,2019-09-28T21:20:41.493Z,"{""duration"":20008,""misses"":0,""event_count"":40,""game_time"":32584,""event_code"":2030}",ffe774cc,40,2030,32584,Mushroom Sorter (Assessment),Assessment,TREETOPCITY,1,0,0,0,1,0
1156147,a5be6304,46ff9d3ad2be09f2,2019-09-28T21:20:45.499Z,"{""session_duration"":36607,""exit_type"":""game_completed"",""event_count"":41,""game_time"":36607,""event_code"":2010}",ffe774cc,41,2010,36607,Mushroom Sorter (Assessment),Assessment,TREETOPCITY,1,0,0,0,1,0
1156148,27253bdc,96d7dc31e822cedc,2019-09-28T21:21:05.670Z,"{""event_code"": 2000, ""event_count"": 1}",ffe774cc,1,2000,0,Tree Top City - Level 3,Clip,TREETOPCITY,0,0,0,0,0,0


In [47]:
comp_test_df = extract_time_features(comp_test_df)
comp_test_df.head()

Unnamed: 0,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title,type,world,num_correct_4100,num_incorrect_4100,num_correct_4110,num_incorrect_4110,num_correct_new,num_incorrect_new,month,hour,year,dayofweek,weekofyear,dayofyear,quarter
0,27253bdc,0ea9ecc81a565215,2019-09-10 16:50:24.910000+00:00,"{""event_code"": 2000, ""event_count"": 1}",00abaee7,1,2000,0,Welcome to Lost Lagoon!,Clip,NONE,0,0,0,0,0,0,9,16,2019,1,37,253,3
1,27253bdc,c1ea43d8b8261d27,2019-09-10 16:50:55.503000+00:00,"{""event_code"": 2000, ""event_count"": 1}",00abaee7,1,2000,0,Magma Peak - Level 1,Clip,MAGMAPEAK,0,0,0,0,0,0,9,16,2019,1,37,253,3
2,27253bdc,7ed86c6b72e725e2,2019-09-10 16:51:51.805000+00:00,"{""event_code"": 2000, ""event_count"": 1}",00abaee7,1,2000,0,Magma Peak - Level 2,Clip,MAGMAPEAK,0,0,0,0,0,0,9,16,2019,1,37,253,3
3,27253bdc,7e516ace50e7fe67,2019-09-10 16:53:12.825000+00:00,"{""event_code"": 2000, ""event_count"": 1}",00abaee7,1,2000,0,Crystal Caves - Level 1,Clip,CRYSTALCAVES,0,0,0,0,0,0,9,16,2019,1,37,253,3
4,7d093bf9,a022c3f60ba547e7,2019-09-10 16:54:12.115000+00:00,"{""version"":""1.0"",""round"":0,""event_count"":1,""game_time"":0,""event_code"":2000}",00abaee7,1,2000,0,Chow Time,Game,CRYSTALCAVES,0,0,0,0,0,0,9,16,2019,1,37,253,3


In [48]:
# Remove some columns
comp_test_df = comp_test_df[[col for col in comp_test_df.columns if col not in ['timestamp', 'num_correct', 'num_incorrect', 'accuracy', 'event_data', 'title_y', 'num_incorrect_4110', 'num_correct_4110', 'num_incorrect_4100', 'num_correct_4100']]]
comp_test_df.head()

Unnamed: 0,event_id,game_session,installation_id,event_count,event_code,game_time,title,type,world,num_correct_new,num_incorrect_new,month,hour,year,dayofweek,weekofyear,dayofyear,quarter
0,27253bdc,0ea9ecc81a565215,00abaee7,1,2000,0,Welcome to Lost Lagoon!,Clip,NONE,0,0,9,16,2019,1,37,253,3
1,27253bdc,c1ea43d8b8261d27,00abaee7,1,2000,0,Magma Peak - Level 1,Clip,MAGMAPEAK,0,0,9,16,2019,1,37,253,3
2,27253bdc,7ed86c6b72e725e2,00abaee7,1,2000,0,Magma Peak - Level 2,Clip,MAGMAPEAK,0,0,9,16,2019,1,37,253,3
3,27253bdc,7e516ace50e7fe67,00abaee7,1,2000,0,Crystal Caves - Level 1,Clip,CRYSTALCAVES,0,0,9,16,2019,1,37,253,3
4,7d093bf9,a022c3f60ba547e7,00abaee7,1,2000,0,Chow Time,Game,CRYSTALCAVES,0,0,9,16,2019,1,37,253,3


In [49]:
comp_test_df = comp_test_df.rename(columns={"title_x": "title", "num_correct_new": "num_correct", "num_incorrect_new": "num_incorrect"})
comp_test_df.head()

Unnamed: 0,event_id,game_session,installation_id,event_count,event_code,game_time,title,type,world,num_correct,num_incorrect,month,hour,year,dayofweek,weekofyear,dayofyear,quarter
0,27253bdc,0ea9ecc81a565215,00abaee7,1,2000,0,Welcome to Lost Lagoon!,Clip,NONE,0,0,9,16,2019,1,37,253,3
1,27253bdc,c1ea43d8b8261d27,00abaee7,1,2000,0,Magma Peak - Level 1,Clip,MAGMAPEAK,0,0,9,16,2019,1,37,253,3
2,27253bdc,7ed86c6b72e725e2,00abaee7,1,2000,0,Magma Peak - Level 2,Clip,MAGMAPEAK,0,0,9,16,2019,1,37,253,3
3,27253bdc,7e516ace50e7fe67,00abaee7,1,2000,0,Crystal Caves - Level 1,Clip,CRYSTALCAVES,0,0,9,16,2019,1,37,253,3
4,7d093bf9,a022c3f60ba547e7,00abaee7,1,2000,0,Chow Time,Game,CRYSTALCAVES,0,0,9,16,2019,1,37,253,3


In [50]:
convert_categorical_to_num(comp_test_df, ['title', 'type', 'world', 'event_id'])
comp_test_df

cat_columns Index(['event_id', 'title', 'type', 'world'], dtype='object')


Unnamed: 0,event_id,game_session,installation_id,event_count,event_code,game_time,title,type,world,num_correct,num_incorrect,month,hour,year,dayofweek,weekofyear,dayofyear,quarter
0,45,0ea9ecc81a565215,00abaee7,1,2000,0,43,2,2,0,0,9,16,2019,1,37,253,3
1,45,c1ea43d8b8261d27,00abaee7,1,2000,0,28,2,1,0,0,9,16,2019,1,37,253,3
2,45,7ed86c6b72e725e2,00abaee7,1,2000,0,29,2,1,0,0,9,16,2019,1,37,253,3
3,45,7e516ace50e7fe67,00abaee7,1,2000,0,14,2,0,0,0,9,16,2019,1,37,253,3
4,185,a022c3f60ba547e7,00abaee7,1,2000,0,12,3,0,0,0,9,16,2019,1,37,253,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1156145,293,46ff9d3ad2be09f2,ffe774cc,39,3121,32030,30,1,3,1,0,9,21,2019,5,39,271,3
1156146,155,46ff9d3ad2be09f2,ffe774cc,40,2030,32584,30,1,3,1,0,9,21,2019,5,39,271,3
1156147,250,46ff9d3ad2be09f2,ffe774cc,41,2010,36607,30,1,3,1,0,9,21,2019,5,39,271,3
1156148,45,96d7dc31e822cedc,ffe774cc,1,2000,0,41,2,3,0,0,9,21,2019,5,39,271,3


#### Checks

In [51]:
comp_test_df.columns

Index(['event_id', 'game_session', 'installation_id', 'event_count',
       'event_code', 'game_time', 'title', 'type', 'world', 'num_correct',
       'num_incorrect', 'month', 'hour', 'year', 'dayofweek', 'weekofyear',
       'dayofyear', 'quarter'],
      dtype='object')

In [52]:
merged_train_df.columns

Index(['event_id', 'game_session', 'installation_id', 'event_count',
       'event_code', 'game_time', 'title', 'type', 'world', 'accuracy_group',
       'num_correct', 'num_incorrect', 'month', 'hour', 'year', 'dayofweek',
       'weekofyear', 'dayofyear', 'quarter'],
      dtype='object')

In [53]:
merged_train_df.shape

(863098, 19)

In [54]:
comp_test_df.shape

(1156150, 18)

In [55]:
comp_test_df['installation_id'].nunique()

1000

## Training

In [56]:
# quadratic weighted kappa
def qwk3(a1, a2, max_rat=3):
    '''
    a1 - ground truth
    a2 - predicted values
    '''
    assert(len(a1) == len(a2))
    a1 = np.asarray(a1, dtype=int)
    a2 = np.asarray(a2, dtype=int)
    hist1 = np.zeros((max_rat + 1, ))
    hist2 = np.zeros((max_rat + 1, ))
    o = 0
    for k in range(a1.shape[0]):
        i, j = a1[k], a2[k]
        hist1[i] += 1
        hist2[j] += 1
        o +=  (i - j) * (i - j)
    e = 0
    for i in range(max_rat + 1):
        for j in range(max_rat + 1):
            e += hist1[i] * hist2[j] * (i - j) * (i - j)
    e = e / a1.shape[0]
    return 1 - o / e

In [57]:
features = [c for c in merged_train_df.columns if c not in ['accuracy_group', 'installation_id', 'game_session', 'event_data']]
target = 'accuracy_group'

In [58]:
features

['event_id',
 'event_count',
 'event_code',
 'game_time',
 'title',
 'type',
 'world',
 'num_correct',
 'num_incorrect',
 'month',
 'hour',
 'year',
 'dayofweek',
 'weekofyear',
 'dayofyear',
 'quarter']

In [59]:
num_splits = 10
params = {
    'learning_rate': 0.007,
    'metric': 'multiclass',
    'objective': 'multiclass',
    'num_classes': 4,
    'feature_fraction': 0.75,
    "bagging_fraction": 0.8,
    "bagging_seed": 42,
    'max_depth': 11
}

early_stopping_rounds = 100
num_boost_round = 1000 # should be around 11000 to get better results. This is just for quick testing

In [60]:
def train_model(comp_train_df):
    
    kf = KFold(n_splits=num_splits, shuffle=True)
    
    oof_pred = np.zeros((len(comp_train_df), 4))
    models = []
    best_qw3 = 0.0
    best_qw3_index = 0
    
    for fold, (tr_ind, val_ind) in enumerate(kf.split(comp_train_df)):
        print(f'Fold: {fold+1} of {num_splits}')
        x_train, x_val = comp_train_df[features].iloc[tr_ind], comp_train_df[features].iloc[val_ind]
        y_train, y_val = comp_train_df[target][tr_ind], comp_train_df[target][val_ind]
        train_set = lgb.Dataset(x_train, y_train)
        val_set = lgb.Dataset(x_val, y_val)

        model = lgb.train(params, train_set, num_boost_round = num_boost_round, early_stopping_rounds = early_stopping_rounds, 
                          valid_sets=[train_set, val_set], verbose_eval = early_stopping_rounds)
        oof_pred[val_ind] = model.predict(x_val)
        models.append(model)
        
        val_crt_fold = qwk3(y_val, oof_pred[val_ind].argmax(axis = 1))
        print(f'Fold: {fold+1} quadratic weighted kappa score: {np.round(val_crt_fold,4)}')
        if best_qw3 < val_crt_fold:
            best_qw3 = val_crt_fold
            best_qw3_index = fold
        
    res = qwk3(comp_train_df['accuracy_group'], oof_pred.argmax(axis = 1))
    print(f'Quadratic weighted score: {np.round(res,4)}')
        
    return models, best_qw3_index

In [61]:
%%time
all_models, best_qw3_index = train_model(merged_train_df)
print(f'best model index: {best_qw3_index}')

Fold: 1 of 10
Training until validation scores don't improve for 100 rounds
[100]	training's multi_logloss: 0.621828	valid_1's multi_logloss: 0.621971
[200]	training's multi_logloss: 0.327796	valid_1's multi_logloss: 0.327864
[300]	training's multi_logloss: 0.179418	valid_1's multi_logloss: 0.179459
[400]	training's multi_logloss: 0.0985678	valid_1's multi_logloss: 0.0985949
[500]	training's multi_logloss: 0.0551677	valid_1's multi_logloss: 0.0551854
[600]	training's multi_logloss: 0.031269	valid_1's multi_logloss: 0.0312804
[700]	training's multi_logloss: 0.017699	valid_1's multi_logloss: 0.0177067
[800]	training's multi_logloss: 0.0100341	valid_1's multi_logloss: 0.0100398
[900]	training's multi_logloss: 0.00564673	valid_1's multi_logloss: 0.00565046
[1000]	training's multi_logloss: 0.00319443	valid_1's multi_logloss: 0.00319701
Did not meet early stopping. Best iteration is:
[1000]	training's multi_logloss: 0.00319443	valid_1's multi_logloss: 0.00319701
Fold: 1 quadratic weighted ka

[200]	training's multi_logloss: 0.327794	valid_1's multi_logloss: 0.327822
[300]	training's multi_logloss: 0.179414	valid_1's multi_logloss: 0.179439
[400]	training's multi_logloss: 0.0985678	valid_1's multi_logloss: 0.0985827
[500]	training's multi_logloss: 0.0551677	valid_1's multi_logloss: 0.055181
[600]	training's multi_logloss: 0.0312678	valid_1's multi_logloss: 0.031276
[700]	training's multi_logloss: 0.0176984	valid_1's multi_logloss: 0.0177037
[800]	training's multi_logloss: 0.0100342	valid_1's multi_logloss: 0.0100375
[900]	training's multi_logloss: 0.00564706	valid_1's multi_logloss: 0.00564927
[1000]	training's multi_logloss: 0.00319472	valid_1's multi_logloss: 0.00319601
Did not meet early stopping. Best iteration is:
[1000]	training's multi_logloss: 0.00319472	valid_1's multi_logloss: 0.00319601
Fold: 9 quadratic weighted kappa score: 1.0
Fold: 10 of 10
Training until validation scores don't improve for 100 rounds
[100]	training's multi_logloss: 0.621854	valid_1's multi_lo

## Inference

In [62]:
grouped = comp_test_df.groupby('installation_id')
comp_test_tail_df = pd.concat([grouped.tail(1)]).sort_values('installation_id').reset_index(drop=True)

In [63]:
comp_test_df_correction = comp_test_df[['installation_id', 'game_session', 'num_correct', 'num_incorrect', 'event_code', 'game_time']]
comp_test_df_correction

Unnamed: 0,installation_id,game_session,num_correct,num_incorrect,event_code,game_time
0,00abaee7,0ea9ecc81a565215,0,0,2000,0
1,00abaee7,c1ea43d8b8261d27,0,0,2000,0
2,00abaee7,7ed86c6b72e725e2,0,0,2000,0
3,00abaee7,7e516ace50e7fe67,0,0,2000,0
4,00abaee7,a022c3f60ba547e7,0,0,2000,0
...,...,...,...,...,...,...
1156145,ffe774cc,46ff9d3ad2be09f2,1,0,3121,32030
1156146,ffe774cc,46ff9d3ad2be09f2,1,0,2030,32584
1156147,ffe774cc,46ff9d3ad2be09f2,1,0,2010,36607
1156148,ffe774cc,96d7dc31e822cedc,0,0,2000,0


In [64]:
comp_test_df_correction = comp_test_df_correction.groupby(['installation_id'])['num_correct', 'num_incorrect', 'game_time'].agg(['mean'])

In [65]:
comp_test_df_correction.reset_index(inplace=True)

In [66]:
comp_test_df_correction.columns = ['installation_id', 'num_correct', 'num_incorrect', 'game_time']

In [67]:
comp_test_df_correction[comp_test_df_correction['num_correct']==comp_test_df_correction['num_correct'].max()]

Unnamed: 0,installation_id,num_correct,num_incorrect,game_time
193,3412391a,13.761702,1.965957,136192.404255


In [68]:
comp_test_df_correction[comp_test_df_correction['num_incorrect']==comp_test_df_correction['num_incorrect'].max()]

Unnamed: 0,installation_id,num_correct,num_incorrect,game_time
94,1b38b81a,0.0,33.317308,129325.469952


In [69]:
comp_test_df_correction

Unnamed: 0,installation_id,num_correct,num_incorrect,game_time
0,00abaee7,0.029954,0.000000,63567.408986
1,01242218,0.503128,0.239234,75775.845786
2,017c5718,0.000000,0.000000,33017.233333
3,01a44906,0.000000,0.000000,41162.901709
4,01bc6cb6,0.000000,0.000000,147664.880252
...,...,...,...,...
995,fee254cf,0.483412,0.000000,56395.488152
996,ff57e602,0.092409,0.000000,44772.023102
997,ffc73fb2,0.220532,1.231939,61802.442966
998,ffe00ca8,0.405405,0.930502,25213.420849


In [70]:
comp_test_tail_df = comp_test_tail_df.merge(comp_test_df_correction, on='installation_id', how='left')

In [72]:
del comp_test_tail_df['num_correct_x']

In [73]:
del comp_test_tail_df['num_incorrect_x']

In [74]:
del comp_test_tail_df['game_time_x']

In [75]:
comp_test_tail_df = comp_test_tail_df.rename(columns={"num_correct_y": "num_correct", "num_incorrect_y": "num_incorrect", "game_time_y": "game_time"})

In [76]:
comp_test_tail_df

Unnamed: 0,event_id,game_session,installation_id,event_count,event_code,title,type,world,month,hour,year,dayofweek,weekofyear,dayofyear,quarter,num_correct,num_incorrect,game_time
0,216,348d7f09f96af313,00abaee7,1,2000,9,1,1,9,13,2019,3,37,255,3,0.029954,0.000000,63567.408986
1,183,1fef5d54cb4b775a,01242218,1,2000,8,1,0,10,20,2019,2,41,282,4,0.503128,0.239234,75775.845786
2,83,4b165a330a0bdd6c,017c5718,1,2000,30,1,3,9,11,2019,5,38,264,3,0.000000,0.000000,33017.233333
3,83,be0b655ad1fee30c,01a44906,1,2000,30,1,3,7,16,2019,5,30,208,3,0.000000,0.000000,41162.901709
4,183,46e8bbed71df7520,01bc6cb6,1,2000,8,1,0,9,18,2019,4,36,249,3,0.000000,0.000000,147664.880252
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,216,d0ea3550fd72f6c1,fee254cf,1,2000,9,1,1,9,1,2019,5,39,271,3,0.483412,0.000000,56395.488152
996,132,d22d3b1442967ba7,ff57e602,1,2000,10,1,0,10,17,2019,6,41,286,4,0.092409,0.000000,44772.023102
997,183,fd5e0fd3fe28f907,ffc73fb2,1,2000,8,1,0,10,20,2019,0,41,280,4,0.220532,1.231939,61802.442966
998,183,8d0fdec0ad44aefb,ffe00ca8,1,2000,8,1,0,9,19,2019,1,37,253,3,0.405405,0.930502,25213.420849


In [77]:
def run_predictions(models, df):
    y_pred = np.zeros((len(df), 4))
    for i, model in enumerate(models):
        y_pred += model.predict(df[features])
        print(f'Ran {i}th model ')
    return y_pred / num_splits

In [78]:
y_pred = run_predictions(all_models[:], comp_test_tail_df)

Ran 0th model 
Ran 1th model 
Ran 2th model 


In [79]:
# y_pred = all_models[best_qw3_index].predict(comp_test_df[features])

In [80]:
assert comp_test_tail_df.shape[0] == y_pred.shape[0]

In [81]:
np.unique(y_pred.argmax(-1), return_counts=True)

(array([0, 1, 2, 3]), array([ 40,  51, 340, 569]))

In [82]:
comp_test_tail_df['accuracy_group'] = y_pred.argmax(-1)

In [83]:
comp_test_tail_df.shape, comp_test_tail_df.columns

((1000, 19),
 Index(['event_id', 'game_session', 'installation_id', 'event_count',
        'event_code', 'title', 'type', 'world', 'month', 'hour', 'year',
        'dayofweek', 'weekofyear', 'dayofyear', 'quarter', 'num_correct',
        'num_incorrect', 'game_time', 'accuracy_group'],
       dtype='object'))

### Create Submission

In [84]:
def prepare_submission(submission_df, y_pred):
    sample_submission_df = pd.read_csv(path/'sample_submission.csv')
    sample_submission_df.drop('accuracy_group', inplace = True, axis = 1)
    sample_submission_df = sample_submission_df.merge(submission_df, on = 'installation_id')
    sample_submission_df = sample_submission_df[['installation_id', 'accuracy_group']]
    sample_submission_df.to_csv('submission.csv', index = False)

In [85]:
prepare_submission(comp_test_tail_df, y_pred)

In [86]:
!head submission.csv

OSError: [Errno 12] Cannot allocate memory

In [None]:
!cat submission.csv | wc -l