# v101
## v200124

* Fixing spead

* Adding regressor and thresholds
* Amending features which performed below 0 by Permutation Importance
* Cleaning ' from feature column - title_pirate'stale

* Upgrades:
* n_estimators update, v53 n_estimators=1534
* Checking columns ordering - wrong placement of previous_assessments_count
* Updated train set to include same column ordering by adding previous_assessments_count above forecasted assessment
* Match with PBest got slightly worse

* Archive:
* Testing if test set picks up correct FC assessment - it was fine
* Permutation importance w eli5 - adjusted top=105 that is all features
* Same 1500 n_estimators 

# **Accessing working environment Kaggle**

In [1]:
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

/kaggle/input/data-science-bowl-2019/train.csv
/kaggle/input/data-science-bowl-2019/train_labels.csv
/kaggle/input/data-science-bowl-2019/specs.csv
/kaggle/input/data-science-bowl-2019/test.csv
/kaggle/input/data-science-bowl-2019/sample_submission.csv


# **Importing libraries**

In [2]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 200)
from time import time
import datetime as dt
import gc # clear garbage

# Debugging f-ions

In [3]:
def debugging_ids(df):
    return print(f'Debugging submitted dataframe: \nUnique installation_ids: {len(set(df.installation_id))} \nRows & columns count {df.shape}')

# **Loading data**

In [4]:
load_columns = ['event_id',
                'game_session',
                'timestamp',                
                'installation_id',
                'event_count',
                'event_code',
                'game_time',
                'title',
                'type',
                'world',
                'event_data']

path = '/kaggle/input/data-science-bowl-2019/' # create url path to the datasets

t0 = time()

print('Loading datasets...')
X_train = pd.read_csv(path + 'train.csv', usecols = load_columns)
X_labels = pd.read_csv(path + 'train_labels.csv')
# specs = pd.read_csv(path + 'specs.csv')
#X_test = pd.read_csv(path + 'test.csv', usecols = load_columns)
submission = pd.read_csv(path + 'sample_submission.csv')
print("Datasets loaded successfully! \nLoading time:", round(time() - t0, 3), "s")

Loading datasets...
Datasets loaded successfully! 
Loading time: 73.837 s


# **Data preparation**

### **(T) Reducing train df with users having accuracy scores (17000 -> 3614 installation_ids)**

In [5]:
# X_train has 17000 installation_id's, however there are only for 3614 installation_id's (X_labels and X_train) with Assessment attempt
# Reducing X_train to 17000 -> 3614 installation_ids
X_train = X_train[X_train['installation_id'].isin(set(X_labels.installation_id))]

### **Extracting accuracy of previous Assessment attempts**

* Preparing train set which is identical to train_labels except:
* accuracy differs for 46 observations due to saving in more floating points (16 ours vs 9 train_labels.csv)
* removed the last assessment's (target) row

#### (T) Create X_train_gt by extracting only rows with assessments events

In [6]:
# Creating X_train_gt to hold only rows with assessment attempts

X_train_gt = pd.DataFrame(data=None)

# X_train_gt will be used only for accuracy features extraction
# First, filter assessment events only

X_train_gt = X_train[((X_train['event_code'] == 4100) & 
                 (X_train['title'].isin(['Cart Balancer (Assessment)', 
                                    'Cauldron Filler (Assessment)', 
                                    'Mushroom Sorter (Assessment)', 
                                    'Chest Sorter (Assessment)']))) | 
                ((X_train['event_code'] == 4110) & 
                 (X_train['title'] == 'Bird Measurer (Assessment)'))].copy(deep=True)   

In [7]:
# debugging
debugging_ids(X_train_gt)

Debugging submitted dataframe: 
Unique installation_ids: 3614 
Rows & columns count (41549, 11)


In [8]:
X_train_gt[X_train_gt['installation_id'] == '0006c192']

Unnamed: 0,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title,type,world
6900,392e14df,197a373a77101924,2019-09-14T15:35:54.361Z,"{""buckets"":[0,0,0],""correct"":true,""buckets_pla...",0006c192,20,4100,12635,Cauldron Filler (Assessment),Assessment,MAGMAPEAK
7279,25fa8af4,b2297d292892745a,2019-10-01T00:53:30.849Z,"{""correct"":false,""stumps"":[0,0,0],""event_count...",0006c192,43,4100,32388,Mushroom Sorter (Assessment),Assessment,TREETOPCITY
7292,25fa8af4,b2297d292892745a,2019-10-01T00:53:36.596Z,"{""correct"":false,""stumps"":[0,0,0],""event_count...",0006c192,56,4100,38139,Mushroom Sorter (Assessment),Assessment,TREETOPCITY
7312,25fa8af4,b2297d292892745a,2019-10-01T00:53:53.430Z,"{""correct"":false,""stumps"":[2,5,3],""event_count...",0006c192,76,4100,54974,Mushroom Sorter (Assessment),Assessment,TREETOPCITY
7341,25fa8af4,b2297d292892745a,2019-10-01T00:54:18.450Z,"{""correct"":false,""stumps"":[3,5,2],""event_count...",0006c192,105,4100,79992,Mushroom Sorter (Assessment),Assessment,TREETOPCITY
7387,17113b36,957406a905d59afd,2019-10-01T00:59:35.003Z,"{""correct"":false,""caterpillars"":[3,8,5],""event...",0006c192,27,4110,24847,Bird Measurer (Assessment),Assessment,TREETOPCITY
7393,17113b36,957406a905d59afd,2019-10-01T00:59:42.289Z,"{""correct"":true,""caterpillars"":[4,8,5],""event_...",0006c192,33,4110,32131,Bird Measurer (Assessment),Assessment,TREETOPCITY


#### (T) Drop columns which will be processed later

In [9]:
# Fourth, drop columns which will be processed separately

X_train_gt.drop(['event_id', 
                 'timestamp', 
                 'event_count', 
                 'event_code', 
                 'game_time',
                 'type',
                 'world',], axis=1, inplace=True)

In [10]:
# debugging
debugging_ids(X_train_gt)

Debugging submitted dataframe: 
Unique installation_ids: 3614 
Rows & columns count (41549, 4)


#### (T) Extract accuracy features from 'event_data'

In [11]:
# Fifths, extract correct and incorrect assessment attempts per user from 'event_data'
# Create num_correct and num_incorrect columns

corr = '"correct":true'
incorr = '"correct":false'

X_train_gt['num_correct'] = X_train_gt['event_data'].apply(lambda x: 1 if corr in x else 0)
X_train_gt['num_incorrect'] = X_train_gt['event_data'].apply(lambda x: 1 if incorr in x else 0)

# Would work with as well:
# X_train_gt_test_accuracy['num_correct'] = X_train_gt_test_accuracy['event_data'].apply(lambda x: 1 if 'true' in x else 0)
# X_train_gt_test_accuracy['num_incorrect'] = X_train_gt_test_accuracy['event_data'].apply(lambda x: 1 if 'false' in x else 0)


In [12]:
# debugging
debugging_ids(X_train_gt)

Debugging submitted dataframe: 
Unique installation_ids: 3614 
Rows & columns count (41549, 6)


In [13]:
X_train_gt

Unnamed: 0,game_session,event_data,installation_id,title,num_correct,num_incorrect
2228,901acc108f55a5a1,"{""correct"":true,""stumps"":[1,2,4],""event_count""...",0006a69f,Mushroom Sorter (Assessment),1,0
2709,77b8ee947eb84b4e,"{""correct"":false,""caterpillars"":[11,8,3],""even...",0006a69f,Bird Measurer (Assessment),0,1
2715,77b8ee947eb84b4e,"{""correct"":false,""caterpillars"":[11,8,11],""eve...",0006a69f,Bird Measurer (Assessment),0,1
2720,77b8ee947eb84b4e,"{""correct"":false,""caterpillars"":[11,8,5],""even...",0006a69f,Bird Measurer (Assessment),0,1
2725,77b8ee947eb84b4e,"{""correct"":false,""caterpillars"":[11,8,7],""even...",0006a69f,Bird Measurer (Assessment),0,1
...,...,...,...,...,...,...
11337238,dadd1a4d8ac68ab0,"{""buckets"":[0,0,0],""correct"":true,""buckets_pla...",ffeb0b1b,Cauldron Filler (Assessment),1,0
11337675,a6885ab824fbc32c,"{""correct"":false,""stumps"":[0,0,0],""event_count...",ffeb0b1b,Mushroom Sorter (Assessment),0,1
11337779,5448d652309a6324,"{""buckets"":[0,0,0],""correct"":false,""buckets_pl...",ffeb0b1b,Cauldron Filler (Assessment),0,1
11337794,5448d652309a6324,"{""buckets"":[0,0,0],""correct"":false,""buckets_pl...",ffeb0b1b,Cauldron Filler (Assessment),0,1


In [14]:
# Sixths, aggregate (sum) correct and incorrect assessment attempts 
# per 'game_session', 'installation_id' and assessment 'title'
# As provided in grount truth (labels.csv)

# previous aggregation was made together with sorting to match train_labels format
#X_train_gt = X_train_gt.sort_values(['installation_id', 'game_session'], ascending=True).groupby(['game_session', 'installation_id', 'title'], as_index=False, sort=False).agg(sum)
# a) difficult to extract last assessment, 
# b) difficult to truncate, 
# c) difficult to accumulate actions before assessment
X_train_gt = X_train_gt.groupby(['game_session', 'installation_id', 'title'], as_index=False, sort=False).agg(sum)

In [15]:
X_train_gt

Unnamed: 0,game_session,installation_id,title,num_correct,num_incorrect
0,901acc108f55a5a1,0006a69f,Mushroom Sorter (Assessment),1,0
1,77b8ee947eb84b4e,0006a69f,Bird Measurer (Assessment),0,11
2,6bdf9623adc94d89,0006a69f,Mushroom Sorter (Assessment),1,0
3,9501794defd84e4d,0006a69f,Mushroom Sorter (Assessment),1,1
4,a9ef3ecb3d1acc6a,0006a69f,Bird Measurer (Assessment),1,0
...,...,...,...,...,...
17685,460e8bdc2822b340,ffc90c32,Chest Sorter (Assessment),1,0
17686,b05a02b52d5c1f4c,ffd2871d,Cauldron Filler (Assessment),1,0
17687,dadd1a4d8ac68ab0,ffeb0b1b,Cauldron Filler (Assessment),1,2
17688,a6885ab824fbc32c,ffeb0b1b,Mushroom Sorter (Assessment),0,1


In [16]:
X_labels

Unnamed: 0,game_session,installation_id,title,num_correct,num_incorrect,accuracy,accuracy_group
0,6bdf9623adc94d89,0006a69f,Mushroom Sorter (Assessment),1,0,1.000000,3
1,77b8ee947eb84b4e,0006a69f,Bird Measurer (Assessment),0,11,0.000000,0
2,901acc108f55a5a1,0006a69f,Mushroom Sorter (Assessment),1,0,1.000000,3
3,9501794defd84e4d,0006a69f,Mushroom Sorter (Assessment),1,1,0.500000,2
4,a9ef3ecb3d1acc6a,0006a69f,Bird Measurer (Assessment),1,0,1.000000,3
...,...,...,...,...,...,...,...
17685,c996482b11d149dd,ffc90c32,Bird Measurer (Assessment),1,0,1.000000,3
17686,b05a02b52d5c1f4c,ffd2871d,Cauldron Filler (Assessment),1,0,1.000000,3
17687,5448d652309a6324,ffeb0b1b,Cauldron Filler (Assessment),1,2,0.333333,1
17688,a6885ab824fbc32c,ffeb0b1b,Mushroom Sorter (Assessment),0,1,0.000000,0


In [17]:
# Great, because w/o sorting by game_session and installation_id 
# we preserve the original order of events by timestamp 
X_train_gt[X_train_gt['installation_id'] == '0006c192']

Unnamed: 0,game_session,installation_id,title,num_correct,num_incorrect
5,197a373a77101924,0006c192,Cauldron Filler (Assessment),1,0
6,b2297d292892745a,0006c192,Mushroom Sorter (Assessment),0,4
7,957406a905d59afd,0006c192,Bird Measurer (Assessment),1,1


In [18]:
X_labels[X_labels['installation_id'] == '0006c192']

Unnamed: 0,game_session,installation_id,title,num_correct,num_incorrect,accuracy,accuracy_group
5,197a373a77101924,0006c192,Cauldron Filler (Assessment),1,0,1.0,3
6,957406a905d59afd,0006c192,Bird Measurer (Assessment),1,1,0.5,2
7,b2297d292892745a,0006c192,Mushroom Sorter (Assessment),0,4,0.0,0


In [19]:
X_train[(X_train['installation_id'] == '0006c192') & ((X_train['event_code'] == 4100) | (X_train['event_code'] == 4110))]

Unnamed: 0,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title,type,world
6900,392e14df,197a373a77101924,2019-09-14T15:35:54.361Z,"{""buckets"":[0,0,0],""correct"":true,""buckets_pla...",0006c192,20,4100,12635,Cauldron Filler (Assessment),Assessment,MAGMAPEAK
7279,25fa8af4,b2297d292892745a,2019-10-01T00:53:30.849Z,"{""correct"":false,""stumps"":[0,0,0],""event_count...",0006c192,43,4100,32388,Mushroom Sorter (Assessment),Assessment,TREETOPCITY
7292,25fa8af4,b2297d292892745a,2019-10-01T00:53:36.596Z,"{""correct"":false,""stumps"":[0,0,0],""event_count...",0006c192,56,4100,38139,Mushroom Sorter (Assessment),Assessment,TREETOPCITY
7312,25fa8af4,b2297d292892745a,2019-10-01T00:53:53.430Z,"{""correct"":false,""stumps"":[2,5,3],""event_count...",0006c192,76,4100,54974,Mushroom Sorter (Assessment),Assessment,TREETOPCITY
7341,25fa8af4,b2297d292892745a,2019-10-01T00:54:18.450Z,"{""correct"":false,""stumps"":[3,5,2],""event_count...",0006c192,105,4100,79992,Mushroom Sorter (Assessment),Assessment,TREETOPCITY
7387,17113b36,957406a905d59afd,2019-10-01T00:59:35.003Z,"{""correct"":false,""caterpillars"":[3,8,5],""event...",0006c192,27,4110,24847,Bird Measurer (Assessment),Assessment,TREETOPCITY
7393,17113b36,957406a905d59afd,2019-10-01T00:59:42.289Z,"{""correct"":true,""caterpillars"":[4,8,5],""event_...",0006c192,33,4110,32131,Bird Measurer (Assessment),Assessment,TREETOPCITY
7414,070a5291,957406a905d59afd,2019-10-01T01:00:04.842Z,"{""correct"":false,""hats"":[5,4,8],""event_count"":...",0006c192,54,4100,54682,Bird Measurer (Assessment),Assessment,TREETOPCITY


In [20]:
# Sevenths, create 'accuracy' feature = corr / (corre + incorr)

X_train_gt['accuracy'] = X_train_gt['num_correct'] / (X_train_gt['num_correct'] + X_train_gt['num_incorrect'])

# Eighths, create 'accuracy_group' feature
# 3: the assessment was solved on the first attempt
# 2: the assessment was solved on the second attempt
# 1: the assessment was solved after 3 or more attempts
# 0: the assessment was never solved

# If accuracy is 0.0 (no correct attempts), accuracy group is 0 as all observations in X_train_gt by now has at least one attempt
# If accuracy is 1.0 (that is no incorrect attempts), accuracy group is 3
# If accuracy is 0.5 (there is equal amount of correct and incorrect attempts), accuracy group is 2
# Any other case means that accuracy group equals 1, that is 3 or more attempts were needed to make a correct attempt    

X_train_gt['accuracy_group'] = X_train_gt['accuracy'].apply(lambda x: 0 if x == 0.0 else (3 if x == 1.0 else (2 if x == 0.5 else 1)))

In [21]:
# debugging
debugging_ids(X_train_gt)

Debugging submitted dataframe: 
Unique installation_ids: 3614 
Rows & columns count (17690, 7)


### (T) Accuracy groups

In [22]:
X_train_gt['acc_0'] = X_train_gt['accuracy_group'].apply(lambda x: 1 if x == 0 else 0)
X_train_gt['acc_1'] = X_train_gt['accuracy_group'].apply(lambda x: 1 if x == 1 else 0)
X_train_gt['acc_2'] = X_train_gt['accuracy_group'].apply(lambda x: 1 if x == 2 else 0)
X_train_gt['acc_3'] = X_train_gt['accuracy_group'].apply(lambda x: 1 if x == 3 else 0)

In [23]:
# debugging
# X_train_gt[X_train_gt['installation_id'] == '0006a69f']

In [24]:
# debugging
debugging_ids(X_train_gt)

Debugging submitted dataframe: 
Unique installation_ids: 3614 
Rows & columns count (17690, 11)


### (T) Accuracy groups per assessment 'title'

In [25]:
# Accuracy group per assessment title
# Ref: https://stackoverflow.com/questions/27474921/compare-two-columns-using-pandas/27475029
# (condition, output value, else)

X_train_gt['bird_accg_0'] = np.where((X_train_gt['title'] == 'Bird Measurer (Assessment)') & (X_train_gt['accuracy_group'] == 0), 1, 0)
X_train_gt['bird_accg_1'] = np.where((X_train_gt['title'] == 'Bird Measurer (Assessment)') & (X_train_gt['accuracy_group'] == 1), 1, 0)
X_train_gt['bird_accg_2'] = np.where((X_train_gt['title'] == 'Bird Measurer (Assessment)') & (X_train_gt['accuracy_group'] == 2), 1, 0)
X_train_gt['bird_accg_3'] = np.where((X_train_gt['title'] == 'Bird Measurer (Assessment)') & (X_train_gt['accuracy_group'] == 3), 1, 0)

X_train_gt['cart_accg_0'] = np.where((X_train_gt['title'] == 'Cart Balancer (Assessment)') & (X_train_gt['accuracy_group'] == 0), 1, 0)
X_train_gt['cart_accg_1'] = np.where((X_train_gt['title'] == 'Cart Balancer (Assessment)') & (X_train_gt['accuracy_group'] == 1), 1, 0)
X_train_gt['cart_accg_2'] = np.where((X_train_gt['title'] == 'Cart Balancer (Assessment)') & (X_train_gt['accuracy_group'] == 2), 1, 0)
X_train_gt['cart_accg_3'] = np.where((X_train_gt['title'] == 'Cart Balancer (Assessment)') & (X_train_gt['accuracy_group'] == 3), 1, 0)

X_train_gt['cauldron_accg_0'] = np.where((X_train_gt['title'] == 'Cauldron Filler (Assessment)') & (X_train_gt['accuracy_group'] == 0), 1, 0)
X_train_gt['cauldron_accg_1'] = np.where((X_train_gt['title'] == 'Cauldron Filler (Assessment)') & (X_train_gt['accuracy_group'] == 1), 1, 0)
X_train_gt['cauldron_accg_2'] = np.where((X_train_gt['title'] == 'Cauldron Filler (Assessment)') & (X_train_gt['accuracy_group'] == 2), 1, 0)
X_train_gt['cauldron_accg_3'] = np.where((X_train_gt['title'] == 'Cauldron Filler (Assessment)') & (X_train_gt['accuracy_group'] == 3), 1, 0)

X_train_gt['chest_accg_0'] = np.where((X_train_gt['title'] == 'Chest Sorter (Assessment)') & (X_train_gt['accuracy_group'] == 0), 1, 0)
X_train_gt['chest_accg_1'] = np.where((X_train_gt['title'] == 'Chest Sorter (Assessment)') & (X_train_gt['accuracy_group'] == 1), 1, 0)
X_train_gt['chest_accg_2'] = np.where((X_train_gt['title'] == 'Chest Sorter (Assessment)') & (X_train_gt['accuracy_group'] == 2), 1, 0)
X_train_gt['chest_accg_3'] = np.where((X_train_gt['title'] == 'Chest Sorter (Assessment)') & (X_train_gt['accuracy_group'] == 3), 1, 0)

X_train_gt['mushroom_accg_0'] = np.where((X_train_gt['title'] == 'Mushroom Sorter (Assessment)') & (X_train_gt['accuracy_group'] == 0), 1, 0)
X_train_gt['mushroom_accg_1'] = np.where((X_train_gt['title'] == 'Mushroom Sorter (Assessment)') & (X_train_gt['accuracy_group'] == 1), 1, 0)
X_train_gt['mushroom_accg_2'] = np.where((X_train_gt['title'] == 'Mushroom Sorter (Assessment)') & (X_train_gt['accuracy_group'] == 2), 1, 0)
X_train_gt['mushroom_accg_3'] = np.where((X_train_gt['title'] == 'Mushroom Sorter (Assessment)') & (X_train_gt['accuracy_group'] == 3), 1, 0)


In [26]:
# debugging
debugging_ids(X_train_gt)

Debugging submitted dataframe: 
Unique installation_ids: 3614 
Rows & columns count (17690, 31)


### (T) Accuracy (corr, incorr, accuracy) per assessment

In [27]:
# Accuracy group per assessment title
# Ref: https://stackoverflow.com/questions/27474921/compare-two-columns-using-pandas/27475029
# (condition, output value, else)
# E.g. if Bird Measurer has num_correct = 1, add 1, elsewise add 0

X_train_gt['bird_correct'] = np.where((X_train_gt['title'] == 'Bird Measurer (Assessment)') & (X_train_gt['num_correct'] == 1), 1, 0)
X_train_gt['bird_incorrect'] = np.where((X_train_gt['title'] == 'Bird Measurer (Assessment)') & (X_train_gt['num_incorrect'] > 0), X_train_gt['num_incorrect'], 0)
X_train_gt['bird_accuracy'] = np.where((X_train_gt['title'] == 'Bird Measurer (Assessment)'), X_train_gt['accuracy'], 0)

X_train_gt['cart_correct'] = np.where((X_train_gt['title'] == 'Cart Balancer (Assessment)') & (X_train_gt['num_correct'] == 1), 1, 0)
X_train_gt['cart_incorrect'] = np.where((X_train_gt['title'] == 'Cart Balancer (Assessment)') & (X_train_gt['num_incorrect'] > 0), X_train_gt['num_incorrect'], 0)
X_train_gt['cart_accuracy'] = np.where((X_train_gt['title'] == 'Cart Balancer (Assessment)'), X_train_gt['accuracy'], 0)

X_train_gt['cauldron_correct'] = np.where((X_train_gt['title'] == 'Cauldron Filler (Assessment)') & (X_train_gt['num_correct'] == 1), 1, 0)
X_train_gt['cauldron_incorrect'] = np.where((X_train_gt['title'] == 'Cauldron Filler (Assessment)') & (X_train_gt['num_incorrect'] > 0), X_train_gt['num_incorrect'], 0)
X_train_gt['cauldron_accuracy'] = np.where((X_train_gt['title'] == 'Cauldron Filler (Assessment)'), X_train_gt['accuracy'], 0)

X_train_gt['chest_correct'] = np.where((X_train_gt['title'] == 'Chest Sorter (Assessment)') & (X_train_gt['num_correct'] == 1), 1, 0)
X_train_gt['chest_incorrect'] = np.where((X_train_gt['title'] == 'Chest Sorter (Assessment)') & (X_train_gt['num_incorrect'] > 0), X_train_gt['num_incorrect'], 0)
X_train_gt['chest_accuracy'] = np.where((X_train_gt['title'] == 'Chest Sorter (Assessment)'), X_train_gt['accuracy'], 0)

X_train_gt['mushroom_correct'] = np.where((X_train_gt['title'] == 'Mushroom Sorter (Assessment)') & (X_train_gt['num_correct'] == 1), 1, 0)
X_train_gt['mushroom_incorrect'] = np.where((X_train_gt['title'] == 'Mushroom Sorter (Assessment)') & (X_train_gt['num_incorrect'] > 0), X_train_gt['num_incorrect'], 0)
X_train_gt['mushroom_accuracy'] = np.where((X_train_gt['title'] == 'Mushroom Sorter (Assessment)'), X_train_gt['accuracy'], 0)

In [28]:
# debugging
# X_train_gt[X_train_gt['installation_id'] == '0006a69f']

In [29]:
# debugging
debugging_ids(X_train_gt)

Debugging submitted dataframe: 
Unique installation_ids: 3614 
Rows & columns count (17690, 46)


#### (fixing) Removing last assessment from train set

* X_train_gt at this point has 41549 assessments
* Can not remove just last one before aggregation 

In [30]:
# Saving whole df for debugging
X_train_gt_check = X_train_gt.copy(deep=True)

In [31]:
# Second, remove the last assessment's attempt from train (new from 200115)

# Build temporary df which holds last assessment
X_train_gt_last = X_train_gt.groupby('installation_id').tail(1).copy(deep=True)
X_train_gt_last_index_list = list(X_train_gt_last.index)

# Removing last assessment attempt from test set
# 'installation_id's drop 3614->3021 as we have users who had just single attempt
X_train_gt = X_train_gt.loc[~X_train_gt.index.isin(X_train_gt_last_index_list)]

In [32]:
# debugging
debugging_ids(X_train_gt_last)

Debugging submitted dataframe: 
Unique installation_ids: 3614 
Rows & columns count (3614, 46)


In [33]:
# debugging
X_train_gt_last.head(5)

Unnamed: 0,game_session,installation_id,title,num_correct,num_incorrect,accuracy,accuracy_group,acc_0,acc_1,acc_2,acc_3,bird_accg_0,bird_accg_1,bird_accg_2,bird_accg_3,cart_accg_0,cart_accg_1,cart_accg_2,cart_accg_3,cauldron_accg_0,cauldron_accg_1,cauldron_accg_2,cauldron_accg_3,chest_accg_0,chest_accg_1,chest_accg_2,chest_accg_3,mushroom_accg_0,mushroom_accg_1,mushroom_accg_2,mushroom_accg_3,bird_correct,bird_incorrect,bird_accuracy,cart_correct,cart_incorrect,cart_accuracy,cauldron_correct,cauldron_incorrect,cauldron_accuracy,chest_correct,chest_incorrect,chest_accuracy,mushroom_correct,mushroom_incorrect,mushroom_accuracy
4,a9ef3ecb3d1acc6a,0006a69f,Bird Measurer (Assessment),1,0,1.0,3,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1.0,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0
7,957406a905d59afd,0006c192,Bird Measurer (Assessment),1,1,0.5,2,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0.5,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0
8,ae691ec5ad5652cf,00129856,Bird Measurer (Assessment),1,0,1.0,3,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1.0,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0
13,8fdd5d389d0e272e,001d0ed0,Chest Sorter (Assessment),0,1,0.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0.0,0,0,0.0,0,0,0.0,0,1,0.0,0,0,0.0
14,619b9c069cf790ca,00225f67,Bird Measurer (Assessment),0,2,0.0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0.0,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0


In [34]:
# debugging, good case of 0006c192
X_train[(X_train['installation_id'] == '0006c192') & ((X_train['event_code'] == 4100) | (X_train['event_code'] == 4110))]

Unnamed: 0,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title,type,world
6900,392e14df,197a373a77101924,2019-09-14T15:35:54.361Z,"{""buckets"":[0,0,0],""correct"":true,""buckets_pla...",0006c192,20,4100,12635,Cauldron Filler (Assessment),Assessment,MAGMAPEAK
7279,25fa8af4,b2297d292892745a,2019-10-01T00:53:30.849Z,"{""correct"":false,""stumps"":[0,0,0],""event_count...",0006c192,43,4100,32388,Mushroom Sorter (Assessment),Assessment,TREETOPCITY
7292,25fa8af4,b2297d292892745a,2019-10-01T00:53:36.596Z,"{""correct"":false,""stumps"":[0,0,0],""event_count...",0006c192,56,4100,38139,Mushroom Sorter (Assessment),Assessment,TREETOPCITY
7312,25fa8af4,b2297d292892745a,2019-10-01T00:53:53.430Z,"{""correct"":false,""stumps"":[2,5,3],""event_count...",0006c192,76,4100,54974,Mushroom Sorter (Assessment),Assessment,TREETOPCITY
7341,25fa8af4,b2297d292892745a,2019-10-01T00:54:18.450Z,"{""correct"":false,""stumps"":[3,5,2],""event_count...",0006c192,105,4100,79992,Mushroom Sorter (Assessment),Assessment,TREETOPCITY
7387,17113b36,957406a905d59afd,2019-10-01T00:59:35.003Z,"{""correct"":false,""caterpillars"":[3,8,5],""event...",0006c192,27,4110,24847,Bird Measurer (Assessment),Assessment,TREETOPCITY
7393,17113b36,957406a905d59afd,2019-10-01T00:59:42.289Z,"{""correct"":true,""caterpillars"":[4,8,5],""event_...",0006c192,33,4110,32131,Bird Measurer (Assessment),Assessment,TREETOPCITY
7414,070a5291,957406a905d59afd,2019-10-01T01:00:04.842Z,"{""correct"":false,""hats"":[5,4,8],""event_count"":...",0006c192,54,4100,54682,Bird Measurer (Assessment),Assessment,TREETOPCITY


In [35]:
X_train_gt_last[(X_train_gt_last['installation_id'] == '0006c192')]

Unnamed: 0,game_session,installation_id,title,num_correct,num_incorrect,accuracy,accuracy_group,acc_0,acc_1,acc_2,acc_3,bird_accg_0,bird_accg_1,bird_accg_2,bird_accg_3,cart_accg_0,cart_accg_1,cart_accg_2,cart_accg_3,cauldron_accg_0,cauldron_accg_1,cauldron_accg_2,cauldron_accg_3,chest_accg_0,chest_accg_1,chest_accg_2,chest_accg_3,mushroom_accg_0,mushroom_accg_1,mushroom_accg_2,mushroom_accg_3,bird_correct,bird_incorrect,bird_accuracy,cart_correct,cart_incorrect,cart_accuracy,cauldron_correct,cauldron_incorrect,cauldron_accuracy,chest_correct,chest_incorrect,chest_accuracy,mushroom_correct,mushroom_incorrect,mushroom_accuracy
7,957406a905d59afd,0006c192,Bird Measurer (Assessment),1,1,0.5,2,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0.5,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0


In [36]:
# debugging
debugging_ids(X_train_gt)

Debugging submitted dataframe: 
Unique installation_ids: 2587 
Rows & columns count (14076, 46)


### (~T) Aggregation

Tested the build, updated avoiding extra df, but haven't double-checked sample means or sums

In [37]:
X_train_gt_sum_list = ['num_correct', 'num_incorrect', 
       'bird_correct', 'bird_incorrect',
       'cart_correct', 'cart_incorrect', 'cauldron_correct',
       'cauldron_incorrect', 'chest_correct',
       'chest_incorrect', 'mushroom_correct',
       'mushroom_incorrect', 'acc_0',
       'acc_1', 'acc_2', 'acc_3', 'bird_accg_0', 'bird_accg_1', 'bird_accg_2',
       'bird_accg_3', 'cart_accg_0', 'cart_accg_1', 'cart_accg_2',
       'cart_accg_3', 'cauldron_accg_0', 'cauldron_accg_1', 'cauldron_accg_2',
       'cauldron_accg_3', 'chest_accg_0', 'chest_accg_1', 'chest_accg_2',
       'chest_accg_3', 'mushroom_accg_0', 'mushroom_accg_1', 'mushroom_accg_2',
       'mushroom_accg_3']

X_train_gt_mean_list = ['accuracy',
       'accuracy_group', 'bird_accuracy',
       'cart_accuracy', 'cauldron_accuracy', 'chest_accuracy', 'mushroom_accuracy']

#X_train_gt_unchanged_list = []
#X_train_gt_unchanged_list = ['game_session', 'title'] 
# X_train_gt_remainder_unchanged_list = ['Y_target', 'previous_assessments_count', 'sessions_with_assessment_count'] 

In [38]:
#len(X_train_gt_sum_list), len(X_train_gt_mean_list)

In [39]:
# # debugging, where 3 features disappeared
# temp_list = set(X_train_gt.columns)
# temp_list2 = list(set(X_train_gt_sum_list)) + list(set(X_train_gt_mean_list))
# temp_list3 = (temp_list).difference(temp_list2)

In [40]:
X_train_gt_sum_df = X_train_gt.groupby(['installation_id'], as_index=False, sort=False)[X_train_gt_sum_list].agg(sum)

In [41]:
X_train_gt_sum_df

Unnamed: 0,installation_id,num_correct,num_incorrect,bird_correct,bird_incorrect,cart_correct,cart_incorrect,cauldron_correct,cauldron_incorrect,chest_correct,chest_incorrect,mushroom_correct,mushroom_incorrect,acc_0,acc_1,acc_2,acc_3,bird_accg_0,bird_accg_1,bird_accg_2,bird_accg_3,cart_accg_0,cart_accg_1,cart_accg_2,cart_accg_3,cauldron_accg_0,cauldron_accg_1,cauldron_accg_2,cauldron_accg_3,chest_accg_0,chest_accg_1,chest_accg_2,chest_accg_3,mushroom_accg_0,mushroom_accg_1,mushroom_accg_2,mushroom_accg_3
0,0006a69f,3,12,0,11,0,0,0,0,0,0,3,1,1,0,1,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2
1,0006c192,1,4,0,0,0,0,1,0,0,0,0,4,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0
2,001d0ed0,3,6,0,5,1,0,0,0,0,0,2,1,1,0,1,2,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1
3,002db7e3,6,24,2,11,1,0,1,0,0,12,2,1,2,1,2,3,0,1,1,0,0,0,0,1,0,0,0,1,2,0,0,0,0,0,1,1
4,003372b0,4,5,1,1,2,0,0,0,0,4,1,0,1,0,1,3,0,0,1,0,0,0,0,2,0,0,0,0,1,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2582,ff90db99,6,2,1,2,2,0,1,0,1,0,1,0,0,1,0,5,0,1,0,0,0,0,0,2,0,0,0,1,0,0,0,1,0,0,0,1
2583,ff9305d7,1,18,1,12,0,0,0,0,0,6,0,0,2,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0
2584,ff9715db,7,10,0,0,1,6,1,0,2,4,3,0,1,1,1,5,0,0,0,0,1,0,0,1,0,0,0,1,0,1,1,0,0,0,0,3
2585,ffc90c32,5,2,1,0,1,0,1,0,0,0,2,2,0,1,0,4,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,1


In [42]:
X_train_gt_mean_df = X_train_gt.groupby(['installation_id'], as_index=False, sort=False)[X_train_gt_mean_list].agg('mean')

In [43]:
X_train_gt_mean_df

Unnamed: 0,installation_id,accuracy,accuracy_group,bird_accuracy,cart_accuracy,cauldron_accuracy,chest_accuracy,mushroom_accuracy
0,0006a69f,0.625000,2.000000,0.000000,0.000000,0.000000,0.000000,0.625000
1,0006c192,0.500000,1.500000,0.000000,0.000000,0.500000,0.000000,0.000000
2,001d0ed0,0.625000,2.000000,0.000000,0.250000,0.000000,0.000000,0.375000
3,002db7e3,0.511364,1.750000,0.073864,0.125000,0.125000,0.000000,0.187500
4,003372b0,0.700000,2.200000,0.100000,0.400000,0.000000,0.000000,0.200000
...,...,...,...,...,...,...,...,...
2582,ff90db99,0.888889,2.666667,0.055556,0.333333,0.166667,0.166667,0.166667
2583,ff9305d7,0.025641,0.333333,0.025641,0.000000,0.000000,0.000000,0.000000
2584,ff9715db,0.718750,2.250000,0.000000,0.125000,0.125000,0.093750,0.375000
2585,ffc90c32,0.866667,2.600000,0.200000,0.200000,0.200000,0.000000,0.266667


In [44]:
#X_train_gt_unchaged_df = X_train_gt.groupby(['installation_id'], as_index=False, sort=False)[X_train_gt_unchanged_list].last()

In [45]:
X_train_gt = pd.merge(X_train_gt_sum_df, X_train_gt_mean_df, how='left', on=['installation_id'])

In [46]:
X_train_gt

Unnamed: 0,installation_id,num_correct,num_incorrect,bird_correct,bird_incorrect,cart_correct,cart_incorrect,cauldron_correct,cauldron_incorrect,chest_correct,chest_incorrect,mushroom_correct,mushroom_incorrect,acc_0,acc_1,acc_2,acc_3,bird_accg_0,bird_accg_1,bird_accg_2,bird_accg_3,cart_accg_0,cart_accg_1,cart_accg_2,cart_accg_3,cauldron_accg_0,cauldron_accg_1,cauldron_accg_2,cauldron_accg_3,chest_accg_0,chest_accg_1,chest_accg_2,chest_accg_3,mushroom_accg_0,mushroom_accg_1,mushroom_accg_2,mushroom_accg_3,accuracy,accuracy_group,bird_accuracy,cart_accuracy,cauldron_accuracy,chest_accuracy,mushroom_accuracy
0,0006a69f,3,12,0,11,0,0,0,0,0,0,3,1,1,0,1,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2,0.625000,2.000000,0.000000,0.000000,0.000000,0.000000,0.625000
1,0006c192,1,4,0,0,0,0,1,0,0,0,0,4,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0.500000,1.500000,0.000000,0.000000,0.500000,0.000000,0.000000
2,001d0ed0,3,6,0,5,1,0,0,0,0,0,2,1,1,0,1,2,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0.625000,2.000000,0.000000,0.250000,0.000000,0.000000,0.375000
3,002db7e3,6,24,2,11,1,0,1,0,0,12,2,1,2,1,2,3,0,1,1,0,0,0,0,1,0,0,0,1,2,0,0,0,0,0,1,1,0.511364,1.750000,0.073864,0.125000,0.125000,0.000000,0.187500
4,003372b0,4,5,1,1,2,0,0,0,0,4,1,0,1,0,1,3,0,0,1,0,0,0,0,2,0,0,0,0,1,0,0,0,0,0,0,1,0.700000,2.200000,0.100000,0.400000,0.000000,0.000000,0.200000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2582,ff90db99,6,2,1,2,2,0,1,0,1,0,1,0,0,1,0,5,0,1,0,0,0,0,0,2,0,0,0,1,0,0,0,1,0,0,0,1,0.888889,2.666667,0.055556,0.333333,0.166667,0.166667,0.166667
2583,ff9305d7,1,18,1,12,0,0,0,0,0,6,0,0,2,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0.025641,0.333333,0.025641,0.000000,0.000000,0.000000,0.000000
2584,ff9715db,7,10,0,0,1,6,1,0,2,4,3,0,1,1,1,5,0,0,0,0,1,0,0,1,0,0,0,1,0,1,1,0,0,0,0,3,0.718750,2.250000,0.000000,0.125000,0.125000,0.093750,0.375000
2585,ffc90c32,5,2,1,0,1,0,1,0,0,0,2,2,0,1,0,4,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,1,0.866667,2.600000,0.200000,0.200000,0.200000,0.000000,0.266667


In [47]:
# NB! Returning the installation_ids which had no previous assessments before the forecasted one
#X_train_gt = pd.concat([X_train_gt_remainder, X_train_gt_last]).sort_index().reset_index(drop=True) index got broken while grouping by

#NB! Important comment - we do not add gt_last at this stage
#X_train_gt = X_train_gt_remainder.append(X_train_gt_last, ignore_index=True)
#X_train_gt = X_train_gt_remainder

In [48]:
# # depreciated! Works - just to test
# X_train_gt.drop(last_assessments_index, axis=0, inplace=True)

In [49]:
# debugging
debugging_ids(X_train_gt)

Debugging submitted dataframe: 
Unique installation_ids: 2587 
Rows & columns count (2587, 44)


## Adding users w/o previous assessment attempts

In [50]:
train_features_list = X_train_gt.columns

In [51]:
print(f'X_train iids: {len(set(X_train.installation_id))} \nX_train_gt iids: {len(set(X_train_gt.installation_id))} \nX_labels iids: {len(set(X_labels.installation_id))}')

X_train iids: 3614 
X_train_gt iids: 2587 
X_labels iids: 3614


In [52]:
train_users_wo_assessments = set(X_train.installation_id) - set(X_train_gt.installation_id)
len(train_users_wo_assessments)

1027

### Creating empty df matching test's columns

In [53]:
train_users_wo_assessments_df = pd.DataFrame(0, index=np.arange(len(train_users_wo_assessments)), columns=train_features_list)

In [54]:
train_users_wo_assessments_df

Unnamed: 0,installation_id,num_correct,num_incorrect,bird_correct,bird_incorrect,cart_correct,cart_incorrect,cauldron_correct,cauldron_incorrect,chest_correct,chest_incorrect,mushroom_correct,mushroom_incorrect,acc_0,acc_1,acc_2,acc_3,bird_accg_0,bird_accg_1,bird_accg_2,bird_accg_3,cart_accg_0,cart_accg_1,cart_accg_2,cart_accg_3,cauldron_accg_0,cauldron_accg_1,cauldron_accg_2,cauldron_accg_3,chest_accg_0,chest_accg_1,chest_accg_2,chest_accg_3,mushroom_accg_0,mushroom_accg_1,mushroom_accg_2,mushroom_accg_3,accuracy,accuracy_group,bird_accuracy,cart_accuracy,cauldron_accuracy,chest_accuracy,mushroom_accuracy
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1022,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1023,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1024,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1025,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


### Adding 'installation_id's w/o prior assessments

In [55]:
# We have created installation_id column with zero values. Now will assign missing 'installation_id's:
train_users_wo_assessments_df['installation_id'] = train_users_wo_assessments

In [56]:
train_users_wo_assessments_df

Unnamed: 0,installation_id,num_correct,num_incorrect,bird_correct,bird_incorrect,cart_correct,cart_incorrect,cauldron_correct,cauldron_incorrect,chest_correct,chest_incorrect,mushroom_correct,mushroom_incorrect,acc_0,acc_1,acc_2,acc_3,bird_accg_0,bird_accg_1,bird_accg_2,bird_accg_3,cart_accg_0,cart_accg_1,cart_accg_2,cart_accg_3,cauldron_accg_0,cauldron_accg_1,cauldron_accg_2,cauldron_accg_3,chest_accg_0,chest_accg_1,chest_accg_2,chest_accg_3,mushroom_accg_0,mushroom_accg_1,mushroom_accg_2,mushroom_accg_3,accuracy,accuracy_group,bird_accuracy,cart_accuracy,cauldron_accuracy,chest_accuracy,mushroom_accuracy
0,92eafa1a,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,229905d3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1ada7dc9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,93f5d648,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,17e0d33c,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1022,379c5ab6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1023,3ef19ad9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1024,232e7bd4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1025,b32bc738,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


### Merging 'installation_id's with and w/o assessments

In [57]:
X_train_gt = X_train_gt.append(train_users_wo_assessments_df, ignore_index=True)

In [58]:
X_train_gt

Unnamed: 0,installation_id,num_correct,num_incorrect,bird_correct,bird_incorrect,cart_correct,cart_incorrect,cauldron_correct,cauldron_incorrect,chest_correct,chest_incorrect,mushroom_correct,mushroom_incorrect,acc_0,acc_1,acc_2,acc_3,bird_accg_0,bird_accg_1,bird_accg_2,bird_accg_3,cart_accg_0,cart_accg_1,cart_accg_2,cart_accg_3,cauldron_accg_0,cauldron_accg_1,cauldron_accg_2,cauldron_accg_3,chest_accg_0,chest_accg_1,chest_accg_2,chest_accg_3,mushroom_accg_0,mushroom_accg_1,mushroom_accg_2,mushroom_accg_3,accuracy,accuracy_group,bird_accuracy,cart_accuracy,cauldron_accuracy,chest_accuracy,mushroom_accuracy
0,0006a69f,3,12,0,11,0,0,0,0,0,0,3,1,1,0,1,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2,0.625000,2.00,0.000000,0.000,0.000,0.0,0.6250
1,0006c192,1,4,0,0,0,0,1,0,0,0,0,4,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0.500000,1.50,0.000000,0.000,0.500,0.0,0.0000
2,001d0ed0,3,6,0,5,1,0,0,0,0,0,2,1,1,0,1,2,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0.625000,2.00,0.000000,0.250,0.000,0.0,0.3750
3,002db7e3,6,24,2,11,1,0,1,0,0,12,2,1,2,1,2,3,0,1,1,0,0,0,0,1,0,0,0,1,2,0,0,0,0,0,1,1,0.511364,1.75,0.073864,0.125,0.125,0.0,0.1875
4,003372b0,4,5,1,1,2,0,0,0,0,4,1,0,1,0,1,3,0,0,1,0,0,0,0,2,0,0,0,0,1,0,0,0,0,0,0,1,0.700000,2.20,0.100000,0.400,0.000,0.0,0.2000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3609,379c5ab6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.000000,0.00,0.000000,0.000,0.000,0.0,0.0000
3610,3ef19ad9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.000000,0.00,0.000000,0.000,0.000,0.0,0.0000
3611,232e7bd4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.000000,0.00,0.000000,0.000,0.000,0.0,0.0000
3612,b32bc738,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.000000,0.00,0.000000,0.000,0.000,0.0,0.0000


In [59]:
# debugging
debugging_ids(X_train_gt)

Debugging submitted dataframe: 
Unique installation_ids: 3614 
Rows & columns count (3614, 44)


In [60]:
# # debugging
# # we lost the order of 'installation_id', but submission is sorted ascending
# booltrain_label = X_train_gt.installation_id.sort_values(ascending=True).reset_index(drop=True) == X_labels.installation_id
# set(booltrain_label)

### (T) Sorting to match order of initial train set
* Because after merger of users with previous assessments and without we lost the initial ordering

In [61]:
X_train_gt = X_train_gt.sort_values('installation_id', ascending=True).reset_index(drop=True)

In [62]:
X_train_gt

Unnamed: 0,installation_id,num_correct,num_incorrect,bird_correct,bird_incorrect,cart_correct,cart_incorrect,cauldron_correct,cauldron_incorrect,chest_correct,chest_incorrect,mushroom_correct,mushroom_incorrect,acc_0,acc_1,acc_2,acc_3,bird_accg_0,bird_accg_1,bird_accg_2,bird_accg_3,cart_accg_0,cart_accg_1,cart_accg_2,cart_accg_3,cauldron_accg_0,cauldron_accg_1,cauldron_accg_2,cauldron_accg_3,chest_accg_0,chest_accg_1,chest_accg_2,chest_accg_3,mushroom_accg_0,mushroom_accg_1,mushroom_accg_2,mushroom_accg_3,accuracy,accuracy_group,bird_accuracy,cart_accuracy,cauldron_accuracy,chest_accuracy,mushroom_accuracy
0,0006a69f,3,12,0,11,0,0,0,0,0,0,3,1,1,0,1,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2,0.625000,2.000000,0.000000,0.000,0.000000,0.00000,0.625000
1,0006c192,1,4,0,0,0,0,1,0,0,0,0,4,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0.500000,1.500000,0.000000,0.000,0.500000,0.00000,0.000000
2,00129856,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.000000,0.000000,0.000000,0.000,0.000000,0.00000,0.000000
3,001d0ed0,3,6,0,5,1,0,0,0,0,0,2,1,1,0,1,2,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0.625000,2.000000,0.000000,0.250,0.000000,0.00000,0.375000
4,00225f67,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.000000,0.000000,0.000000,0.000,0.000000,0.00000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3609,ff9305d7,1,18,1,12,0,0,0,0,0,6,0,0,2,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0.025641,0.333333,0.025641,0.000,0.000000,0.00000,0.000000
3610,ff9715db,7,10,0,0,1,6,1,0,2,4,3,0,1,1,1,5,0,0,0,0,1,0,0,1,0,0,0,1,0,1,1,0,0,0,0,3,0.718750,2.250000,0.000000,0.125,0.125000,0.09375,0.375000
3611,ffc90c32,5,2,1,0,1,0,1,0,0,0,2,2,0,1,0,4,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,1,0.866667,2.600000,0.200000,0.200,0.200000,0.00000,0.266667
3612,ffd2871d,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.000000,0.000000,0.000000,0.000,0.000000,0.00000,0.000000


In [63]:
# X_labels

In [64]:
# debugging
# check if sorting of 'installation_id's matches train_labels sorting
# for this need to drop duplicates in X_labels as it contain 17690 rows with 'installation_id's
# ref: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.drop_duplicates.html
# reseting index and dropping old index via reset_index(drop=True)
# does not lose the sorting
# THIS PART TO BE UNCOMMENTED:
X_labels_unique_installation_id = X_labels.installation_id.drop_duplicates().reset_index(drop=True)
booltrain_label = X_train_gt.installation_id == X_labels_unique_installation_id
set(booltrain_label)

{True}

In [65]:
# debugging
debugging_ids(X_train_gt)

Debugging submitted dataframe: 
Unique installation_ids: 3614 
Rows & columns count (3614, 44)


### Previous assessments count

In [66]:
X_train_gt['previous_assessments_count'] = X_train_gt['num_correct'] + X_train_gt['num_incorrect']
X_train_gt

Unnamed: 0,installation_id,num_correct,num_incorrect,bird_correct,bird_incorrect,cart_correct,cart_incorrect,cauldron_correct,cauldron_incorrect,chest_correct,chest_incorrect,mushroom_correct,mushroom_incorrect,acc_0,acc_1,acc_2,acc_3,bird_accg_0,bird_accg_1,bird_accg_2,bird_accg_3,cart_accg_0,cart_accg_1,cart_accg_2,cart_accg_3,cauldron_accg_0,cauldron_accg_1,cauldron_accg_2,cauldron_accg_3,chest_accg_0,chest_accg_1,chest_accg_2,chest_accg_3,mushroom_accg_0,mushroom_accg_1,mushroom_accg_2,mushroom_accg_3,accuracy,accuracy_group,bird_accuracy,cart_accuracy,cauldron_accuracy,chest_accuracy,mushroom_accuracy,previous_assessments_count
0,0006a69f,3,12,0,11,0,0,0,0,0,0,3,1,1,0,1,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2,0.625000,2.000000,0.000000,0.000,0.000000,0.00000,0.625000,15
1,0006c192,1,4,0,0,0,0,1,0,0,0,0,4,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0.500000,1.500000,0.000000,0.000,0.500000,0.00000,0.000000,5
2,00129856,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.000000,0.000000,0.000000,0.000,0.000000,0.00000,0.000000,0
3,001d0ed0,3,6,0,5,1,0,0,0,0,0,2,1,1,0,1,2,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0.625000,2.000000,0.000000,0.250,0.000000,0.00000,0.375000,9
4,00225f67,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.000000,0.000000,0.000000,0.000,0.000000,0.00000,0.000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3609,ff9305d7,1,18,1,12,0,0,0,0,0,6,0,0,2,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0.025641,0.333333,0.025641,0.000,0.000000,0.00000,0.000000,19
3610,ff9715db,7,10,0,0,1,6,1,0,2,4,3,0,1,1,1,5,0,0,0,0,1,0,0,1,0,0,0,1,0,1,1,0,0,0,0,3,0.718750,2.250000,0.000000,0.125,0.125000,0.09375,0.375000,17
3611,ffc90c32,5,2,1,0,1,0,1,0,0,0,2,2,0,1,0,4,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,1,0.866667,2.600000,0.200000,0.200,0.200000,0.00000,0.266667,7
3612,ffd2871d,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.000000,0.000000,0.000000,0.000,0.000000,0.00000,0.000000,0


### Adding 'forecasted_assessment' feature

In [67]:
X_train_gt.shape, X_train_gt_last.shape

((3614, 45), (3614, 46))

In [68]:
X_train_gt_last

Unnamed: 0,game_session,installation_id,title,num_correct,num_incorrect,accuracy,accuracy_group,acc_0,acc_1,acc_2,acc_3,bird_accg_0,bird_accg_1,bird_accg_2,bird_accg_3,cart_accg_0,cart_accg_1,cart_accg_2,cart_accg_3,cauldron_accg_0,cauldron_accg_1,cauldron_accg_2,cauldron_accg_3,chest_accg_0,chest_accg_1,chest_accg_2,chest_accg_3,mushroom_accg_0,mushroom_accg_1,mushroom_accg_2,mushroom_accg_3,bird_correct,bird_incorrect,bird_accuracy,cart_correct,cart_incorrect,cart_accuracy,cauldron_correct,cauldron_incorrect,cauldron_accuracy,chest_correct,chest_incorrect,chest_accuracy,mushroom_correct,mushroom_incorrect,mushroom_accuracy
4,a9ef3ecb3d1acc6a,0006a69f,Bird Measurer (Assessment),1,0,1.000000,3,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1.00,0,0,0.0,0,0,0.000000,0,0,0.0,0,0,0.0
7,957406a905d59afd,0006c192,Bird Measurer (Assessment),1,1,0.500000,2,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0.50,0,0,0.0,0,0,0.000000,0,0,0.0,0,0,0.0
8,ae691ec5ad5652cf,00129856,Bird Measurer (Assessment),1,0,1.000000,3,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1.00,0,0,0.0,0,0,0.000000,0,0,0.0,0,0,0.0
13,8fdd5d389d0e272e,001d0ed0,Chest Sorter (Assessment),0,1,0.000000,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0.00,0,0,0.0,0,0,0.000000,0,1,0.0,0,0,0.0
14,619b9c069cf790ca,00225f67,Bird Measurer (Assessment),0,2,0.000000,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0.00,0,0,0.0,0,0,0.000000,0,0,0.0,0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17670,967f050913e025e7,ff9305d7,Bird Measurer (Assessment),1,3,0.250000,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,3,0.25,0,0,0.0,0,0,0.000000,0,0,0.0,0,0,0.0
17679,4767c079c810f31b,ff9715db,Cart Balancer (Assessment),1,0,1.000000,3,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.00,1,0,1.0,0,0,0.000000,0,0,0.0,0,0,0.0
17685,460e8bdc2822b340,ffc90c32,Chest Sorter (Assessment),1,0,1.000000,3,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0.00,0,0,0.0,0,0,0.000000,1,0,1.0,0,0,0.0
17686,b05a02b52d5c1f4c,ffd2871d,Cauldron Filler (Assessment),1,0,1.000000,3,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0.00,0,0,0.0,1,0,1.000000,0,0,0.0,0,0,0.0


In [69]:
# X_train[(X_train['installation_id'] == '0006c192') & ((X_train['event_code'] == 4100) | (X_train['event_code'] == 4110))]

In [70]:
# # X_train_gt_last is taking X_train index 4137->11337808
# train_forecasted_assessment_df = X_train_gt_last.sort_values('installation_id', ascending=True).reset_index(drop=True)
# train_forecasted_assessment_df

In [71]:
# # check if last df had the right 'title' for forecasted assessment
# X_labels.head(20)

In [72]:
# # double-check sorting - OK
# boollast_label = train_forecasted_assessment_df.installation_id == X_labels_unique_installation_id
# set(boollast_label)

In [73]:
# train_forecasted_assessment_df.shape

In [74]:
X_train_gt

Unnamed: 0,installation_id,num_correct,num_incorrect,bird_correct,bird_incorrect,cart_correct,cart_incorrect,cauldron_correct,cauldron_incorrect,chest_correct,chest_incorrect,mushroom_correct,mushroom_incorrect,acc_0,acc_1,acc_2,acc_3,bird_accg_0,bird_accg_1,bird_accg_2,bird_accg_3,cart_accg_0,cart_accg_1,cart_accg_2,cart_accg_3,cauldron_accg_0,cauldron_accg_1,cauldron_accg_2,cauldron_accg_3,chest_accg_0,chest_accg_1,chest_accg_2,chest_accg_3,mushroom_accg_0,mushroom_accg_1,mushroom_accg_2,mushroom_accg_3,accuracy,accuracy_group,bird_accuracy,cart_accuracy,cauldron_accuracy,chest_accuracy,mushroom_accuracy,previous_assessments_count
0,0006a69f,3,12,0,11,0,0,0,0,0,0,3,1,1,0,1,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2,0.625000,2.000000,0.000000,0.000,0.000000,0.00000,0.625000,15
1,0006c192,1,4,0,0,0,0,1,0,0,0,0,4,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0.500000,1.500000,0.000000,0.000,0.500000,0.00000,0.000000,5
2,00129856,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.000000,0.000000,0.000000,0.000,0.000000,0.00000,0.000000,0
3,001d0ed0,3,6,0,5,1,0,0,0,0,0,2,1,1,0,1,2,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0.625000,2.000000,0.000000,0.250,0.000000,0.00000,0.375000,9
4,00225f67,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.000000,0.000000,0.000000,0.000,0.000000,0.00000,0.000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3609,ff9305d7,1,18,1,12,0,0,0,0,0,6,0,0,2,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0.025641,0.333333,0.025641,0.000,0.000000,0.00000,0.000000,19
3610,ff9715db,7,10,0,0,1,6,1,0,2,4,3,0,1,1,1,5,0,0,0,0,1,0,0,1,0,0,0,1,0,1,1,0,0,0,0,3,0.718750,2.250000,0.000000,0.125,0.125000,0.09375,0.375000,17
3611,ffc90c32,5,2,1,0,1,0,1,0,0,0,2,2,0,1,0,4,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,1,0.866667,2.600000,0.200000,0.200,0.200000,0.00000,0.266667,7
3612,ffd2871d,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.000000,0.000000,0.000000,0.000,0.000000,0.00000,0.000000,0


In [75]:
# Need to reset X_train_gt_last index for boolean comparison
X_train_gt_last

Unnamed: 0,game_session,installation_id,title,num_correct,num_incorrect,accuracy,accuracy_group,acc_0,acc_1,acc_2,acc_3,bird_accg_0,bird_accg_1,bird_accg_2,bird_accg_3,cart_accg_0,cart_accg_1,cart_accg_2,cart_accg_3,cauldron_accg_0,cauldron_accg_1,cauldron_accg_2,cauldron_accg_3,chest_accg_0,chest_accg_1,chest_accg_2,chest_accg_3,mushroom_accg_0,mushroom_accg_1,mushroom_accg_2,mushroom_accg_3,bird_correct,bird_incorrect,bird_accuracy,cart_correct,cart_incorrect,cart_accuracy,cauldron_correct,cauldron_incorrect,cauldron_accuracy,chest_correct,chest_incorrect,chest_accuracy,mushroom_correct,mushroom_incorrect,mushroom_accuracy
4,a9ef3ecb3d1acc6a,0006a69f,Bird Measurer (Assessment),1,0,1.000000,3,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1.00,0,0,0.0,0,0,0.000000,0,0,0.0,0,0,0.0
7,957406a905d59afd,0006c192,Bird Measurer (Assessment),1,1,0.500000,2,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0.50,0,0,0.0,0,0,0.000000,0,0,0.0,0,0,0.0
8,ae691ec5ad5652cf,00129856,Bird Measurer (Assessment),1,0,1.000000,3,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1.00,0,0,0.0,0,0,0.000000,0,0,0.0,0,0,0.0
13,8fdd5d389d0e272e,001d0ed0,Chest Sorter (Assessment),0,1,0.000000,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0.00,0,0,0.0,0,0,0.000000,0,1,0.0,0,0,0.0
14,619b9c069cf790ca,00225f67,Bird Measurer (Assessment),0,2,0.000000,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0.00,0,0,0.0,0,0,0.000000,0,0,0.0,0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17670,967f050913e025e7,ff9305d7,Bird Measurer (Assessment),1,3,0.250000,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,3,0.25,0,0,0.0,0,0,0.000000,0,0,0.0,0,0,0.0
17679,4767c079c810f31b,ff9715db,Cart Balancer (Assessment),1,0,1.000000,3,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.00,1,0,1.0,0,0,0.000000,0,0,0.0,0,0,0.0
17685,460e8bdc2822b340,ffc90c32,Chest Sorter (Assessment),1,0,1.000000,3,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0.00,0,0,0.0,0,0,0.000000,1,0,1.0,0,0,0.0
17686,b05a02b52d5c1f4c,ffd2871d,Cauldron Filler (Assessment),1,0,1.000000,3,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0.00,0,0,0.0,1,0,1.000000,0,0,0.0,0,0,0.0


In [76]:
# Debugging - double-check sorting of X_train_gt_last & X_train_gt
X_train_gt_last = X_train_gt_last.reset_index(drop=True)
# Above we updated the X_train_gt_last index to match 0-3613 (total of 3614)
booltrain_last = X_train_gt.installation_id == X_train_gt_last.installation_id
set(booltrain_last)

{True}

In [77]:
# Updated index:
X_train_gt_last

Unnamed: 0,game_session,installation_id,title,num_correct,num_incorrect,accuracy,accuracy_group,acc_0,acc_1,acc_2,acc_3,bird_accg_0,bird_accg_1,bird_accg_2,bird_accg_3,cart_accg_0,cart_accg_1,cart_accg_2,cart_accg_3,cauldron_accg_0,cauldron_accg_1,cauldron_accg_2,cauldron_accg_3,chest_accg_0,chest_accg_1,chest_accg_2,chest_accg_3,mushroom_accg_0,mushroom_accg_1,mushroom_accg_2,mushroom_accg_3,bird_correct,bird_incorrect,bird_accuracy,cart_correct,cart_incorrect,cart_accuracy,cauldron_correct,cauldron_incorrect,cauldron_accuracy,chest_correct,chest_incorrect,chest_accuracy,mushroom_correct,mushroom_incorrect,mushroom_accuracy
0,a9ef3ecb3d1acc6a,0006a69f,Bird Measurer (Assessment),1,0,1.000000,3,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1.00,0,0,0.0,0,0,0.000000,0,0,0.0,0,0,0.0
1,957406a905d59afd,0006c192,Bird Measurer (Assessment),1,1,0.500000,2,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0.50,0,0,0.0,0,0,0.000000,0,0,0.0,0,0,0.0
2,ae691ec5ad5652cf,00129856,Bird Measurer (Assessment),1,0,1.000000,3,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1.00,0,0,0.0,0,0,0.000000,0,0,0.0,0,0,0.0
3,8fdd5d389d0e272e,001d0ed0,Chest Sorter (Assessment),0,1,0.000000,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0.00,0,0,0.0,0,0,0.000000,0,1,0.0,0,0,0.0
4,619b9c069cf790ca,00225f67,Bird Measurer (Assessment),0,2,0.000000,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0.00,0,0,0.0,0,0,0.000000,0,0,0.0,0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3609,967f050913e025e7,ff9305d7,Bird Measurer (Assessment),1,3,0.250000,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,3,0.25,0,0,0.0,0,0,0.000000,0,0,0.0,0,0,0.0
3610,4767c079c810f31b,ff9715db,Cart Balancer (Assessment),1,0,1.000000,3,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.00,1,0,1.0,0,0,0.000000,0,0,0.0,0,0,0.0
3611,460e8bdc2822b340,ffc90c32,Chest Sorter (Assessment),1,0,1.000000,3,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0.00,0,0,0.0,0,0,0.000000,1,0,1.0,0,0,0.0
3612,b05a02b52d5c1f4c,ffd2871d,Cauldron Filler (Assessment),1,0,1.000000,3,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0.00,0,0,0.0,1,0,1.000000,0,0,0.0,0,0,0.0


In [78]:
X_train_gt['forecasted_assessment'] = X_train_gt_last['title'].map({'Bird Measurer (Assessment)': 0,
                                                                            'Cart Balancer (Assessment)': 1, 
                                                                            'Cauldron Filler (Assessment)': 2, 
                                                                            'Chest Sorter (Assessment)': 3, 
                                                                            'Mushroom Sorter (Assessment)': 4})

In [79]:
# X_train_gt['forecasted_assessment'] = train_forecasted_assessment_df['title'].map({'Bird Measurer (Assessment)': 0,
#                                                                             'Cart Balancer (Assessment)': 1, 
#                                                                             'Cauldron Filler (Assessment)': 2, 
#                                                                             'Chest Sorter (Assessment)': 3, 
#                                                                             'Mushroom Sorter (Assessment)': 4})

In [80]:
# debugging
debugging_ids(X_train_gt)

Debugging submitted dataframe: 
Unique installation_ids: 3614 
Rows & columns count (3614, 46)


In [81]:
set(X_train_gt.forecasted_assessment), X_train_gt.forecasted_assessment.count()

({0, 1, 2, 3, 4}, 3614)

# (~T) Adding non accuracy features
## bugs:
### - data is not truncated after forecasted_event
### - we take last assessment, which might better off be random

Given that test set contains almost half of installation_ids without previous assessments, we need to add other than accuracy features for model to pick up

## (~T) timestamp

In [82]:
# bug - taking the last even, which might be not assessment
# could replace with mean

import datetime as dt

def timestamp_split(df):
    df['timestamp'] = pd.to_datetime(df['timestamp']) # converting argument to pandas datetime
#    df['year'] = df['timestamp'].dt.year # all are in 2019
    df['month'] = (df['timestamp'].dt.month).astype(int)
    #df['day'] = (df['timestamp'].dt.day).astype(int) # returns day of the month 1-31
    df['hour'] = (df['timestamp'].dt.hour).astype(int) 
    df['minute'] = (df['timestamp'].dt.minute).astype(int)
#    df['second'] = df['timestamp'].dt.second # doubt it could give anything
    df['dayofweek'] = (df['timestamp'].dt.dayofweek).astype(int) # returns day of week in 0-6 integer format
    df['dayofyear'] = (df['timestamp'].dt.dayofyear).astype(int) # returns numeric day of year, might be useful for summer holidays
    df['quarter'] = (df['timestamp'].dt.quarter).astype(int)
    df['is_weekend'] = (np.where(df['dayofweek'].isin(['Sunday','Saturday']), 1, 0)).astype(int)
    df.drop(['timestamp'], axis=1, inplace=True)
    return df

# RAM 8.7->10->9.3
# Create new X_train_timefeat which holds time only features  
feat_time = X_train.filter(['installation_id', 'timestamp'], axis=1)
# Prepare time features from given timestamp 
feat_time = timestamp_split(feat_time)

# Defining as last (bug)
feat_time = feat_time.groupby('installation_id', as_index=False).last()

# Merging to the main train set
X_train_gt = pd.merge(X_train_gt, feat_time, on=['installation_id'])

# Deleting
del feat_time
gc.collect()

  mask |= (ar1 == a)


49

## (~T) Title, type and world

In [83]:
# Uses RAM 9.1->13.8->8.7
def title_type_world(df):
    df = pd.get_dummies(data=df, columns=['title', 'type', 'world'])
    df.columns = df.columns.str.lower()
    df.columns = df.columns.str.replace(' |-|!|\)|\(', '')
    df = df.groupby(['installation_id'], as_index=False, sort=False).agg(sum) 
    return df

# Create new X_train_titletypeworldfeat which holds time only title, type and world features
X_train_titletypeworldfeat = X_train.filter(['installation_id', 'title', 'type', 'world'], axis=1)
X_train_titletypeworldfeat = title_type_world(X_train_titletypeworldfeat)

# Add title, type and world features to the main dataframe
X_train_gt = pd.merge(X_train_gt, X_train_titletypeworldfeat, on=['installation_id'])
# # Count nan in df for debugging purposes
# X_train_gt.isna().sum()

del X_train_titletypeworldfeat
gc.collect()

# debugging
debugging_ids(X_train_gt)

Debugging submitted dataframe: 
Unique installation_ids: 3614 
Rows & columns count (3614, 105)


## Adding train target

In [84]:
X_train_gt

Unnamed: 0,installation_id,num_correct,num_incorrect,bird_correct,bird_incorrect,cart_correct,cart_incorrect,cauldron_correct,cauldron_incorrect,chest_correct,chest_incorrect,mushroom_correct,mushroom_incorrect,acc_0,acc_1,acc_2,acc_3,bird_accg_0,bird_accg_1,bird_accg_2,bird_accg_3,cart_accg_0,cart_accg_1,cart_accg_2,cart_accg_3,cauldron_accg_0,cauldron_accg_1,cauldron_accg_2,cauldron_accg_3,chest_accg_0,chest_accg_1,chest_accg_2,chest_accg_3,mushroom_accg_0,mushroom_accg_1,mushroom_accg_2,mushroom_accg_3,accuracy,accuracy_group,bird_accuracy,cart_accuracy,cauldron_accuracy,chest_accuracy,mushroom_accuracy,previous_assessments_count,forecasted_assessment,month,hour,minute,dayofweek,dayofyear,quarter,is_weekend,title_12monkeys,title_airshow,title_allstarsorting,title_balancingact,title_birdmeasurerassessment,title_bottlefilleractivity,title_bubblebath,title_bugmeasureractivity,title_cartbalancerassessment,title_cauldronfillerassessment,title_chestsorterassessment,title_chickenbalanceractivity,title_chowtime,title_costumebox,title_crystalcaveslevel1,title_crystalcaveslevel2,title_crystalcaveslevel3,title_crystalsrule,title_dinodive,title_dinodrink,title_eggdropperactivity,title_fireworksactivity,title_flowerwatereractivity,title_happycamel,"title_heavy,heavier,heaviest",title_honeycake,title_leafleader,title_liftingheavythings,title_magmapeaklevel1,title_magmapeaklevel2,title_mushroomsorterassessment,title_orderingspheres,title_panbalance,title_pirate'stale,title_rulers,title_sandcastlebuilderactivity,title_scrubadub,title_slopproblem,title_treasuremap,title_treetopcitylevel1,title_treetopcitylevel2,title_treetopcitylevel3,title_wateringholeactivity,title_welcometolostlagoon,type_activity,type_assessment,type_clip,type_game,world_crystalcaves,world_magmapeak,world_none,world_treetopcity
0,0006a69f,3,12,0,11,0,0,0,0,0,0,3,1,1,0,1,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2,0.625000,2.000000,0.000000,0.000,0.000000,0.00000,0.625000,15,0,8,16,39,3,241,3,0,2.0,295.0,203.0,0.0,119.0,293.0,248.0,319.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,212.0,64.0,286.0,0.0,299.0,278.0,0.0,0.0,0.0,0.0,0.0,4.0,3.0,142.0,2.0,0.0,2.0,2.0,434.0,424.0,6.0,3.0,2.0,2.0,2.0,148.0,4.0,1771.0,261.0,37.0,1732.0,0.0,1910.0,4.0,1887.0
1,0006c192,1,4,0,0,0,0,1,0,0,0,0,4,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0.500000,1.500000,0.000000,0.000,0.500000,0.00000,0.000000,5,0,10,1,4,1,274,4,0,1.0,0.0,0.0,2.0,200.0,250.0,145.0,134.0,0.0,27.0,0.0,111.0,82.0,0.0,3.0,3.0,1.0,6.0,185.0,0.0,0.0,0.0,201.0,0.0,0.0,0.0,0.0,1.0,2.0,3.0,116.0,1.0,4.0,1.0,3.0,467.0,221.0,0.0,1.0,3.0,1.0,2.0,43.0,4.0,1206.0,343.0,32.0,643.0,207.0,1343.0,4.0,670.0
2,00129856,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.000000,0.000000,0.000000,0.000,0.000000,0.00000,0.000000,0,0,9,16,55,6,265,3,0,0.0,0.0,0.0,1.0,40.0,219.0,0.0,267.0,0.0,0.0,0.0,200.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,786.0,43.0,1.0,0.0,201.0,319.0,0.0,310.0
3,001d0ed0,3,6,0,5,1,0,0,0,0,0,2,1,1,0,1,2,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0.625000,2.000000,0.000000,0.250,0.000000,0.00000,0.375000,9,3,9,19,24,4,256,3,0,0.0,0.0,0.0,2.0,51.0,0.0,0.0,0.0,21.0,0.0,29.0,0.0,184.0,1.0,1.0,2.0,2.0,139.0,0.0,0.0,12.0,64.0,0.0,45.0,2.0,4.0,0.0,5.0,1.0,0.0,101.0,7.0,64.0,2.0,2.0,0.0,280.0,0.0,1.0,1.0,2.0,2.0,0.0,1.0,76.0,202.0,38.0,712.0,373.0,281.0,1.0,373.0
4,00225f67,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.000000,0.000000,0.000000,0.000,0.000000,0.00000,0.000000,0,0,10,0,12,6,279,4,0,1.0,37.0,122.0,1.0,37.0,0.0,0.0,3.0,0.0,0.0,0.0,80.0,231.0,1.0,1.0,1.0,0.0,196.0,0.0,0.0,0.0,86.0,122.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,28.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,291.0,65.0,15.0,586.0,316.0,0.0,1.0,640.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3609,ff9305d7,1,18,1,12,0,0,0,0,0,6,0,0,2,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0.025641,0.333333,0.025641,0.000,0.000000,0.00000,0.000000,19,0,10,12,51,2,275,4,0,0.0,0.0,62.0,1.0,257.0,0.0,0.0,266.0,0.0,0.0,138.0,24.0,237.0,1.0,1.0,6.0,4.0,261.0,248.0,164.0,0.0,1889.0,40.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,68.0,4.0,0.0,0.0,0.0,267.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,2486.0,463.0,25.0,972.0,414.0,680.0,1.0,2851.0
3610,ff9715db,7,10,0,0,1,6,1,0,2,4,3,0,1,1,1,5,0,0,0,0,1,0,0,1,0,0,0,1,0,1,1,0,0,0,0,3,0.718750,2.250000,0.000000,0.125,0.125000,0.09375,0.375000,17,1,8,12,58,3,241,3,0,0.0,0.0,59.0,0.0,0.0,251.0,0.0,0.0,120.0,50.0,77.0,0.0,69.0,0.0,1.0,3.0,2.0,181.0,177.0,0.0,23.0,39.0,150.0,60.0,0.0,2.0,125.0,0.0,0.0,4.0,111.0,0.0,107.0,0.0,0.0,400.0,0.0,0.0,0.0,1.0,1.0,2.0,46.0,0.0,909.0,358.0,16.0,778.0,589.0,928.0,0.0,544.0
3611,ffc90c32,5,2,1,0,1,0,1,0,0,0,2,2,0,1,0,4,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,1,0.866667,2.600000,0.200000,0.200,0.200000,0.00000,0.266667,7,3,7,12,15,1,211,3,0,2.0,87.0,56.0,1.0,48.0,106.0,107.0,0.0,35.0,24.0,38.0,83.0,61.0,1.0,1.0,1.0,1.0,237.0,129.0,103.0,40.0,85.0,60.0,60.0,1.0,1.0,39.0,1.0,1.0,2.0,97.0,6.0,126.0,1.0,1.0,73.0,249.0,1.0,1.0,2.0,1.0,1.0,32.0,2.0,479.0,242.0,29.0,1254.0,489.0,827.0,2.0,686.0
3612,ffd2871d,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.000000,0.000000,0.000000,0.000,0.000000,0.00000,0.000000,0,2,8,13,20,2,219,3,0,0.0,0.0,0.0,2.0,0.0,638.0,200.0,0.0,0.0,40.0,0.0,206.0,167.0,0.0,1.0,1.0,1.0,0.0,109.0,223.0,128.0,0.0,0.0,169.0,0.0,1.0,0.0,0.0,1.0,2.0,0.0,1.0,84.0,0.0,0.0,410.0,63.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1382.0,40.0,13.0,1015.0,760.0,1687.0,1.0,2.0


In [85]:
# Update 200117, major bug fix
X_train_gt['Y_target'] = X_train_gt_last['accuracy_group']

In [86]:
# debugging
debugging_ids(X_train_gt)

Debugging submitted dataframe: 
Unique installation_ids: 3614 
Rows & columns count (3614, 106)


In [87]:
X_train_gt

Unnamed: 0,installation_id,num_correct,num_incorrect,bird_correct,bird_incorrect,cart_correct,cart_incorrect,cauldron_correct,cauldron_incorrect,chest_correct,chest_incorrect,mushroom_correct,mushroom_incorrect,acc_0,acc_1,acc_2,acc_3,bird_accg_0,bird_accg_1,bird_accg_2,bird_accg_3,cart_accg_0,cart_accg_1,cart_accg_2,cart_accg_3,cauldron_accg_0,cauldron_accg_1,cauldron_accg_2,cauldron_accg_3,chest_accg_0,chest_accg_1,chest_accg_2,chest_accg_3,mushroom_accg_0,mushroom_accg_1,mushroom_accg_2,mushroom_accg_3,accuracy,accuracy_group,bird_accuracy,cart_accuracy,cauldron_accuracy,chest_accuracy,mushroom_accuracy,previous_assessments_count,forecasted_assessment,month,hour,minute,dayofweek,dayofyear,quarter,is_weekend,title_12monkeys,title_airshow,title_allstarsorting,title_balancingact,title_birdmeasurerassessment,title_bottlefilleractivity,title_bubblebath,title_bugmeasureractivity,title_cartbalancerassessment,title_cauldronfillerassessment,title_chestsorterassessment,title_chickenbalanceractivity,title_chowtime,title_costumebox,title_crystalcaveslevel1,title_crystalcaveslevel2,title_crystalcaveslevel3,title_crystalsrule,title_dinodive,title_dinodrink,title_eggdropperactivity,title_fireworksactivity,title_flowerwatereractivity,title_happycamel,"title_heavy,heavier,heaviest",title_honeycake,title_leafleader,title_liftingheavythings,title_magmapeaklevel1,title_magmapeaklevel2,title_mushroomsorterassessment,title_orderingspheres,title_panbalance,title_pirate'stale,title_rulers,title_sandcastlebuilderactivity,title_scrubadub,title_slopproblem,title_treasuremap,title_treetopcitylevel1,title_treetopcitylevel2,title_treetopcitylevel3,title_wateringholeactivity,title_welcometolostlagoon,type_activity,type_assessment,type_clip,type_game,world_crystalcaves,world_magmapeak,world_none,world_treetopcity,Y_target
0,0006a69f,3,12,0,11,0,0,0,0,0,0,3,1,1,0,1,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2,0.625000,2.000000,0.000000,0.000,0.000000,0.00000,0.625000,15,0,8,16,39,3,241,3,0,2.0,295.0,203.0,0.0,119.0,293.0,248.0,319.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,212.0,64.0,286.0,0.0,299.0,278.0,0.0,0.0,0.0,0.0,0.0,4.0,3.0,142.0,2.0,0.0,2.0,2.0,434.0,424.0,6.0,3.0,2.0,2.0,2.0,148.0,4.0,1771.0,261.0,37.0,1732.0,0.0,1910.0,4.0,1887.0,3
1,0006c192,1,4,0,0,0,0,1,0,0,0,0,4,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0.500000,1.500000,0.000000,0.000,0.500000,0.00000,0.000000,5,0,10,1,4,1,274,4,0,1.0,0.0,0.0,2.0,200.0,250.0,145.0,134.0,0.0,27.0,0.0,111.0,82.0,0.0,3.0,3.0,1.0,6.0,185.0,0.0,0.0,0.0,201.0,0.0,0.0,0.0,0.0,1.0,2.0,3.0,116.0,1.0,4.0,1.0,3.0,467.0,221.0,0.0,1.0,3.0,1.0,2.0,43.0,4.0,1206.0,343.0,32.0,643.0,207.0,1343.0,4.0,670.0,2
2,00129856,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.000000,0.000000,0.000000,0.000,0.000000,0.00000,0.000000,0,0,9,16,55,6,265,3,0,0.0,0.0,0.0,1.0,40.0,219.0,0.0,267.0,0.0,0.0,0.0,200.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,786.0,43.0,1.0,0.0,201.0,319.0,0.0,310.0,3
3,001d0ed0,3,6,0,5,1,0,0,0,0,0,2,1,1,0,1,2,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0.625000,2.000000,0.000000,0.250,0.000000,0.00000,0.375000,9,3,9,19,24,4,256,3,0,0.0,0.0,0.0,2.0,51.0,0.0,0.0,0.0,21.0,0.0,29.0,0.0,184.0,1.0,1.0,2.0,2.0,139.0,0.0,0.0,12.0,64.0,0.0,45.0,2.0,4.0,0.0,5.0,1.0,0.0,101.0,7.0,64.0,2.0,2.0,0.0,280.0,0.0,1.0,1.0,2.0,2.0,0.0,1.0,76.0,202.0,38.0,712.0,373.0,281.0,1.0,373.0,0
4,00225f67,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.000000,0.000000,0.000000,0.000,0.000000,0.00000,0.000000,0,0,10,0,12,6,279,4,0,1.0,37.0,122.0,1.0,37.0,0.0,0.0,3.0,0.0,0.0,0.0,80.0,231.0,1.0,1.0,1.0,0.0,196.0,0.0,0.0,0.0,86.0,122.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,28.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,291.0,65.0,15.0,586.0,316.0,0.0,1.0,640.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3609,ff9305d7,1,18,1,12,0,0,0,0,0,6,0,0,2,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0.025641,0.333333,0.025641,0.000,0.000000,0.00000,0.000000,19,0,10,12,51,2,275,4,0,0.0,0.0,62.0,1.0,257.0,0.0,0.0,266.0,0.0,0.0,138.0,24.0,237.0,1.0,1.0,6.0,4.0,261.0,248.0,164.0,0.0,1889.0,40.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,68.0,4.0,0.0,0.0,0.0,267.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,2486.0,463.0,25.0,972.0,414.0,680.0,1.0,2851.0,1
3610,ff9715db,7,10,0,0,1,6,1,0,2,4,3,0,1,1,1,5,0,0,0,0,1,0,0,1,0,0,0,1,0,1,1,0,0,0,0,3,0.718750,2.250000,0.000000,0.125,0.125000,0.09375,0.375000,17,1,8,12,58,3,241,3,0,0.0,0.0,59.0,0.0,0.0,251.0,0.0,0.0,120.0,50.0,77.0,0.0,69.0,0.0,1.0,3.0,2.0,181.0,177.0,0.0,23.0,39.0,150.0,60.0,0.0,2.0,125.0,0.0,0.0,4.0,111.0,0.0,107.0,0.0,0.0,400.0,0.0,0.0,0.0,1.0,1.0,2.0,46.0,0.0,909.0,358.0,16.0,778.0,589.0,928.0,0.0,544.0,3
3611,ffc90c32,5,2,1,0,1,0,1,0,0,0,2,2,0,1,0,4,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,1,0.866667,2.600000,0.200000,0.200,0.200000,0.00000,0.266667,7,3,7,12,15,1,211,3,0,2.0,87.0,56.0,1.0,48.0,106.0,107.0,0.0,35.0,24.0,38.0,83.0,61.0,1.0,1.0,1.0,1.0,237.0,129.0,103.0,40.0,85.0,60.0,60.0,1.0,1.0,39.0,1.0,1.0,2.0,97.0,6.0,126.0,1.0,1.0,73.0,249.0,1.0,1.0,2.0,1.0,1.0,32.0,2.0,479.0,242.0,29.0,1254.0,489.0,827.0,2.0,686.0,3
3612,ffd2871d,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.000000,0.000000,0.000000,0.000,0.000000,0.00000,0.000000,0,2,8,13,20,2,219,3,0,0.0,0.0,0.0,2.0,0.0,638.0,200.0,0.0,0.0,40.0,0.0,206.0,167.0,0.0,1.0,1.0,1.0,0.0,109.0,223.0,128.0,0.0,0.0,169.0,0.0,1.0,0.0,0.0,1.0,2.0,0.0,1.0,84.0,0.0,0.0,410.0,63.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1382.0,40.0,13.0,1015.0,760.0,1687.0,1.0,2.0,3


# Setting target

In [88]:
X_train_model = X_train_gt.copy(deep=True)
#del X_train_gt
#X_train_model

In [89]:
X_train_model.isna().sum()

installation_id       0
num_correct           0
num_incorrect         0
bird_correct          0
bird_incorrect        0
                     ..
world_crystalcaves    0
world_magmapeak       0
world_none            0
world_treetopcity     0
Y_target              0
Length: 106, dtype: int64

In [90]:
# Depreciated, we already have Y_target
# X_train_model['Y_target'] = X_labels.groupby('installation_id')['accuracy_group'].transform('last')

##### Cleaning column names

In [91]:
# Elsewise LightGBMError: Do not support special JSON characters in feature name.
X_train_model.columns = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in X_train_model.columns]

##### Removing non performing features

In [92]:
non_performing_feat = ['title_wateringhole_activity', 'bird_accg_3', 'title_magmapeaklevel2', 'mushroom_accg_1', 'cauldron_correct', 'mushroom_incorrect', 'title_pirate_stale', 'dayofweek', 'cauldron_accg_1', 'title_12monkeys', 'cart_accg_3', 'title_crystalsrule', 'bird_accg_0', 'title_heavy_heavier_heaviest', 'title_honeycake', 'accuracy', 'mushroom_accg_0', 'title_balancingact', 'title_treasuremap', 'cauldron_accg_3', 'month', 'title_slopproblem', 'acc_0', 'title_eggdropper_activity', 'quarter', 'previous_assessments_count', 'chest_correct', 'title_liftingheavythings', 'cart_incorrect', 'world_none', 'mushroom_correct', 'chest_accg_1', 'mushroom_accg_3', 'title_cauldronfiller_assessment', 'title_orderingspheres', 'acc_3', 'title_scrubadub', 'title_sandcastlebuilder_activity', 'cauldron_accuracy', 'world_crystalcaves', 'cauldron_accg_2', 'title_magmapeaklevel1', 'cauldron_incorrect', 'title_dinodive', 'cart_accg_0', 'acc_2', 'chest_incorrect', 'title_treetopcitylevel3', 'cauldron_accg_0', 'type_clip', 'world_magmapeak', 'title_fireworks_activity', 'mushroom_accuracy', 'title_treetopcitylevel2', 'title_treetopcitylevel1', 'title_welcometolostlagoon', 'dayofyear', 'world_treetopcity', 'title_chowtime', 'title_bottlefiller_activity', 'title_leafleader', 'day', 'num_incorrect', 'title_rulers', 'chest_accg_0', 'title_bubblebath', 'title_happycamel', 'title_crystalcaveslevel3', 'type_activity', 'title_bugmeasurer_activity', 'accuracy_group', 'title_crystalcaveslevel2', 'title_crystalcaveslevel1', 'hour']

# # # Drop non performing features
# X_train_model.columns.drop(non_performing_feat)

In [93]:
# Setting target & features
y = X_train_model.Y_target
feature_names = X_train_model.columns.drop(['Y_target', 'installation_id'])
X = X_train_model[feature_names]

# Model w Catboost

In [94]:
list(X_train_model.columns)

['installation_id',
 'num_correct',
 'num_incorrect',
 'bird_correct',
 'bird_incorrect',
 'cart_correct',
 'cart_incorrect',
 'cauldron_correct',
 'cauldron_incorrect',
 'chest_correct',
 'chest_incorrect',
 'mushroom_correct',
 'mushroom_incorrect',
 'acc_0',
 'acc_1',
 'acc_2',
 'acc_3',
 'bird_accg_0',
 'bird_accg_1',
 'bird_accg_2',
 'bird_accg_3',
 'cart_accg_0',
 'cart_accg_1',
 'cart_accg_2',
 'cart_accg_3',
 'cauldron_accg_0',
 'cauldron_accg_1',
 'cauldron_accg_2',
 'cauldron_accg_3',
 'chest_accg_0',
 'chest_accg_1',
 'chest_accg_2',
 'chest_accg_3',
 'mushroom_accg_0',
 'mushroom_accg_1',
 'mushroom_accg_2',
 'mushroom_accg_3',
 'accuracy',
 'accuracy_group',
 'bird_accuracy',
 'cart_accuracy',
 'cauldron_accuracy',
 'chest_accuracy',
 'mushroom_accuracy',
 'previous_assessments_count',
 'forecasted_assessment',
 'month',
 'hour',
 'minute',
 'dayofweek',
 'dayofyear',
 'quarter',
 'is_weekend',
 'title_12monkeys',
 'title_airshow',
 'title_allstarsorting',
 'title_balancin

In [95]:
X_train_model.head(10)

Unnamed: 0,installation_id,num_correct,num_incorrect,bird_correct,bird_incorrect,cart_correct,cart_incorrect,cauldron_correct,cauldron_incorrect,chest_correct,chest_incorrect,mushroom_correct,mushroom_incorrect,acc_0,acc_1,acc_2,acc_3,bird_accg_0,bird_accg_1,bird_accg_2,bird_accg_3,cart_accg_0,cart_accg_1,cart_accg_2,cart_accg_3,cauldron_accg_0,cauldron_accg_1,cauldron_accg_2,cauldron_accg_3,chest_accg_0,chest_accg_1,chest_accg_2,chest_accg_3,mushroom_accg_0,mushroom_accg_1,mushroom_accg_2,mushroom_accg_3,accuracy,accuracy_group,bird_accuracy,cart_accuracy,cauldron_accuracy,chest_accuracy,mushroom_accuracy,previous_assessments_count,forecasted_assessment,month,hour,minute,dayofweek,dayofyear,quarter,is_weekend,title_12monkeys,title_airshow,title_allstarsorting,title_balancingact,title_birdmeasurerassessment,title_bottlefilleractivity,title_bubblebath,title_bugmeasureractivity,title_cartbalancerassessment,title_cauldronfillerassessment,title_chestsorterassessment,title_chickenbalanceractivity,title_chowtime,title_costumebox,title_crystalcaveslevel1,title_crystalcaveslevel2,title_crystalcaveslevel3,title_crystalsrule,title_dinodive,title_dinodrink,title_eggdropperactivity,title_fireworksactivity,title_flowerwatereractivity,title_happycamel,title_heavy_heavier_heaviest,title_honeycake,title_leafleader,title_liftingheavythings,title_magmapeaklevel1,title_magmapeaklevel2,title_mushroomsorterassessment,title_orderingspheres,title_panbalance,title_pirate_stale,title_rulers,title_sandcastlebuilderactivity,title_scrubadub,title_slopproblem,title_treasuremap,title_treetopcitylevel1,title_treetopcitylevel2,title_treetopcitylevel3,title_wateringholeactivity,title_welcometolostlagoon,type_activity,type_assessment,type_clip,type_game,world_crystalcaves,world_magmapeak,world_none,world_treetopcity,Y_target
0,0006a69f,3,12,0,11,0,0,0,0,0,0,3,1,1,0,1,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2,0.625,2.0,0.0,0.0,0.0,0.0,0.625,15,0,8,16,39,3,241,3,0,2.0,295.0,203.0,0.0,119.0,293.0,248.0,319.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,212.0,64.0,286.0,0.0,299.0,278.0,0.0,0.0,0.0,0.0,0.0,4.0,3.0,142.0,2.0,0.0,2.0,2.0,434.0,424.0,6.0,3.0,2.0,2.0,2.0,148.0,4.0,1771.0,261.0,37.0,1732.0,0.0,1910.0,4.0,1887.0,3
1,0006c192,1,4,0,0,0,0,1,0,0,0,0,4,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0.5,1.5,0.0,0.0,0.5,0.0,0.0,5,0,10,1,4,1,274,4,0,1.0,0.0,0.0,2.0,200.0,250.0,145.0,134.0,0.0,27.0,0.0,111.0,82.0,0.0,3.0,3.0,1.0,6.0,185.0,0.0,0.0,0.0,201.0,0.0,0.0,0.0,0.0,1.0,2.0,3.0,116.0,1.0,4.0,1.0,3.0,467.0,221.0,0.0,1.0,3.0,1.0,2.0,43.0,4.0,1206.0,343.0,32.0,643.0,207.0,1343.0,4.0,670.0,2
2,00129856,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,9,16,55,6,265,3,0,0.0,0.0,0.0,1.0,40.0,219.0,0.0,267.0,0.0,0.0,0.0,200.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,786.0,43.0,1.0,0.0,201.0,319.0,0.0,310.0,3
3,001d0ed0,3,6,0,5,1,0,0,0,0,0,2,1,1,0,1,2,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0.625,2.0,0.0,0.25,0.0,0.0,0.375,9,3,9,19,24,4,256,3,0,0.0,0.0,0.0,2.0,51.0,0.0,0.0,0.0,21.0,0.0,29.0,0.0,184.0,1.0,1.0,2.0,2.0,139.0,0.0,0.0,12.0,64.0,0.0,45.0,2.0,4.0,0.0,5.0,1.0,0.0,101.0,7.0,64.0,2.0,2.0,0.0,280.0,0.0,1.0,1.0,2.0,2.0,0.0,1.0,76.0,202.0,38.0,712.0,373.0,281.0,1.0,373.0,0
4,00225f67,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,10,0,12,6,279,4,0,1.0,37.0,122.0,1.0,37.0,0.0,0.0,3.0,0.0,0.0,0.0,80.0,231.0,1.0,1.0,1.0,0.0,196.0,0.0,0.0,0.0,86.0,122.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,28.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,291.0,65.0,15.0,586.0,316.0,0.0,1.0,640.0,0
5,00279ac5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,3,9,23,13,3,255,3,0,1.0,274.0,36.0,0.0,0.0,302.0,281.0,707.0,0.0,0.0,140.0,120.0,790.0,2.0,3.0,3.0,3.0,137.0,292.0,189.0,53.0,382.0,177.0,185.0,0.0,0.0,168.0,0.0,3.0,5.0,0.0,2.0,34.0,0.0,1.0,531.0,464.0,1.0,1.0,3.0,9.0,8.0,175.0,4.0,2447.0,140.0,49.0,2850.0,1499.0,2243.0,4.0,1740.0,0
6,002db7e3,6,24,2,11,1,0,1,0,0,12,2,1,2,1,2,3,0,1,1,0,0,0,0,1,0,0,0,1,2,0,0,0,0,0,1,1,0.511364,1.75,0.073864,0.125,0.125,0.0,0.1875,30,0,8,20,4,0,224,3,0,2.0,119.0,113.0,0.0,208.0,757.0,226.0,209.0,35.0,26.0,120.0,216.0,302.0,2.0,2.0,2.0,1.0,0.0,253.0,103.0,0.0,27.0,62.0,47.0,0.0,1.0,0.0,2.0,4.0,3.0,85.0,1.0,78.0,1.0,3.0,200.0,161.0,0.0,1.0,4.0,4.0,2.0,63.0,4.0,1534.0,474.0,39.0,1402.0,806.0,1796.0,4.0,843.0,1
7,003372b0,4,5,1,1,2,0,0,0,0,4,1,0,1,0,1,3,0,0,1,0,0,0,0,2,0,0,0,0,1,0,0,0,0,0,0,1,0.7,2.2,0.1,0.4,0.0,0.0,0.2,9,2,9,13,22,0,273,3,0,2.0,72.0,107.0,1.0,41.0,73.0,89.0,105.0,63.0,37.0,62.0,81.0,134.0,2.0,1.0,1.0,1.0,248.0,76.0,132.0,53.0,107.0,24.0,66.0,1.0,2.0,12.0,4.0,1.0,1.0,35.0,4.0,71.0,1.0,1.0,90.0,31.0,0.0,2.0,2.0,1.0,1.0,5.0,2.0,538.0,238.0,31.0,1038.0,553.0,535.0,2.0,755.0,3
8,004c2091,2,1,0,1,0,0,2,0,0,0,0,0,1,0,0,2,1,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0.666667,2.0,0.0,0.0,0.666667,0.0,0.0,3,0,8,13,25,2,219,3,0,0.0,0.0,0.0,0.0,42.0,45.0,0.0,4.0,19.0,59.0,45.0,0.0,0.0,0.0,2.0,7.0,4.0,0.0,8.0,0.0,0.0,92.0,0.0,0.0,2.0,0.0,7.0,7.0,1.0,3.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,6.0,0.0,147.0,174.0,27.0,15.0,93.0,122.0,0.0,148.0,0
9,00634433,2,0,0,0,1,0,1,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1.0,3.0,0.0,0.5,0.5,0.0,0.0,2,3,10,20,42,2,282,4,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,27.0,63.0,70.0,0.0,99.0,0.0,1.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,160.0,7.0,99.0,200.0,65.0,1.0,0.0,0


In [96]:
X_train_model.filter(items=['installation_id', 'num_correct', 'num_incorrect', 'forecasted_assessment'])

Unnamed: 0,installation_id,num_correct,num_incorrect,forecasted_assessment
0,0006a69f,3,12,0
1,0006c192,1,4,0
2,00129856,0,0,0
3,001d0ed0,3,6,3
4,00225f67,0,0,0
...,...,...,...,...
3609,ff9305d7,1,18,0
3610,ff9715db,7,10,1
3611,ffc90c32,5,2,3
3612,ffd2871d,0,0,2


In [97]:
X_train_gt_check.head(20)

Unnamed: 0,game_session,installation_id,title,num_correct,num_incorrect,accuracy,accuracy_group,acc_0,acc_1,acc_2,acc_3,bird_accg_0,bird_accg_1,bird_accg_2,bird_accg_3,cart_accg_0,cart_accg_1,cart_accg_2,cart_accg_3,cauldron_accg_0,cauldron_accg_1,cauldron_accg_2,cauldron_accg_3,chest_accg_0,chest_accg_1,chest_accg_2,chest_accg_3,mushroom_accg_0,mushroom_accg_1,mushroom_accg_2,mushroom_accg_3,bird_correct,bird_incorrect,bird_accuracy,cart_correct,cart_incorrect,cart_accuracy,cauldron_correct,cauldron_incorrect,cauldron_accuracy,chest_correct,chest_incorrect,chest_accuracy,mushroom_correct,mushroom_incorrect,mushroom_accuracy
0,901acc108f55a5a1,0006a69f,Mushroom Sorter (Assessment),1,0,1.0,3,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0,1,0,1.0
1,77b8ee947eb84b4e,0006a69f,Bird Measurer (Assessment),0,11,0.0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,11,0.0,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0
2,6bdf9623adc94d89,0006a69f,Mushroom Sorter (Assessment),1,0,1.0,3,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0,1,0,1.0
3,9501794defd84e4d,0006a69f,Mushroom Sorter (Assessment),1,1,0.5,2,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0,1,1,0.5
4,a9ef3ecb3d1acc6a,0006a69f,Bird Measurer (Assessment),1,0,1.0,3,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1.0,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0
5,197a373a77101924,0006c192,Cauldron Filler (Assessment),1,0,1.0,3,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0.0,1,0,1.0,0,0,0.0,0,0,0.0
6,b2297d292892745a,0006c192,Mushroom Sorter (Assessment),0,4,0.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0,0,4,0.0
7,957406a905d59afd,0006c192,Bird Measurer (Assessment),1,1,0.5,2,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0.5,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0
8,ae691ec5ad5652cf,00129856,Bird Measurer (Assessment),1,0,1.0,3,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1.0,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0
9,c046a858e7c8bf03,001d0ed0,Mushroom Sorter (Assessment),1,1,0.5,2,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0,1,1,0.5


In [98]:
# # Catboost Classification
# # 0.5788845026009162 accuracy on cb
# # Skikit-learn method: Cohen Kappa Score of predictions is (if not quadratic) 0.5788845026009162
# # Skikit-learn method with weights='quadratic': Cohen Kappa Score of predictions is (if quadratic) 0.6394643567367756
# # 2nd method quadKappa: 0.6394643567367756
# # 3rd method QWK3: 0.63946436

# # Cat features: Skikit-learn Cohen Kappa Score (Quadratic) of predictions is: 0.6552048470987987

# from sklearn.model_selection import train_test_split
# train_X, val_X, train_y, val_y = train_test_split(X, y, random_state = 0)

# from catboost import CatBoostClassifier
# from sklearn.metrics import accuracy_score

# params_cb = {
#             'max_depth' : 5,
#             'learning_rate' : 0.01,
#             'n_estimators' : 5000,
#             'verbose' : 200,
#             'loss_function' : 'MultiClass' #200109 new
#             }

# cbc_model = CatBoostClassifier(**params_cb)
# #cbc_model.fit(train_X, train_y)
# cbc_model.fit(train_X, train_y, eval_set=(val_X, val_y), early_stopping_rounds=10, use_best_model=True) #200119 use_best suggestion for bestIteration = 2679, Shrink model to first 2680 iterations
# cbc_preds = cbc_model.predict(val_X)

# # Save Catboost accuracy
# cbc_score = accuracy_score(val_y, cbc_preds)
# print(f'\n****')
# print(f'Accuracy of predictions is: {accuracy_score(val_y, cbc_preds)}')

# # Check Cohen Kappa Score:
# from sklearn.metrics import cohen_kappa_score
# cbc_kappa_score = cohen_kappa_score(val_y, cbc_preds, weights='quadratic')

# # NB! Add weights='quadratic' to get same result as QWK 
# print(f'Skikit-learn Cohen Kappa Score (Quadratic) of predictions is: {cohen_kappa_score(val_y, cbc_preds, weights="quadratic")}')

# Model w LightGBM

In [99]:
# Ref: https://github.com/microsoft/LightGBM/blob/master/examples/python-guide/simple_example.py

from sklearn.model_selection import train_test_split
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state = 0)

import lightgbm as lgb
# from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error

# create dataset for lightgbm
lgb_train = lgb.Dataset(train_X, label=train_y)
lgb_eval = lgb.Dataset(val_X, label=val_y, reference=lgb_train)

# specify parameters
params_lgb = {'n_estimators':2000,
            'boosting_type': 'gbdt',
            'objective': 'regression',
            'metric': 'rmse',
            'subsample': 0.75,
            'subsample_freq': 1,
            'learning_rate': 0.04,
            'feature_fraction': 0.9,
         'max_depth': 15,
            'lambda_l1': 1,  
            'lambda_l2': 1,
            'verbose': 100,
            'early_stopping_rounds': 100, 'eval_metric': 'cappa'
            }

# #new200120
# params_lgb = {'n_estimators': 5000,
#                     'boosting_type': 'gbdt',
#                     'objective': 'regression',
#                     'metric': 'rmse',
#                     'subsample': 0.75,
#                     'subsample_freq': 1,
#                     'learning_rate': 0.01,
#                     'feature_fraction': 0.9,
#                     'max_depth': 15,
#                     'lambda_l1': 1,  
#                     'lambda_l2': 1,
#                     'early_stopping_rounds': 100
#                     }

print('Starting training...')
# train
#gbm_model = lgb.train(params_lgb, lgb_train, num_boost_round=20, valid_sets=lgb_eval, early_stopping_rounds=5)
gbm_model = lgb.train(params_lgb, lgb_train, valid_sets=lgb_eval) #new200120 , verbose_eval=verbosity

print('Starting predicting...')
# predict
gbm_pred = gbm_model.predict(val_X, num_iteration=gbm_model.best_iteration)
# eval
print(':', )
print(f'The rmse of prediction is: {mean_squared_error(val_y, gbm_pred) ** 0.5}')
#print(f'Skikit-learn Cohen Kappa Score (Quadratic) of predictions is: {cohen_kappa_score(val_y, gbm_pred, weights="quadratic")}')

Starting training...
[1]	valid_0's rmse: 1.26496
Training until validation scores don't improve for 100 rounds
[2]	valid_0's rmse: 1.24926
[3]	valid_0's rmse: 1.23472
[4]	valid_0's rmse: 1.22127
[5]	valid_0's rmse: 1.20782
[6]	valid_0's rmse: 1.1957
[7]	valid_0's rmse: 1.1848
[8]	valid_0's rmse: 1.17373
[9]	valid_0's rmse: 1.16439
[10]	valid_0's rmse: 1.15553
[11]	valid_0's rmse: 1.14587
[12]	valid_0's rmse: 1.13983




[13]	valid_0's rmse: 1.13151
[14]	valid_0's rmse: 1.12486
[15]	valid_0's rmse: 1.11813
[16]	valid_0's rmse: 1.11206
[17]	valid_0's rmse: 1.10509
[18]	valid_0's rmse: 1.09776
[19]	valid_0's rmse: 1.09063
[20]	valid_0's rmse: 1.08527
[21]	valid_0's rmse: 1.08107
[22]	valid_0's rmse: 1.07705
[23]	valid_0's rmse: 1.07455
[24]	valid_0's rmse: 1.0699
[25]	valid_0's rmse: 1.06717
[26]	valid_0's rmse: 1.06471
[27]	valid_0's rmse: 1.06187
[28]	valid_0's rmse: 1.0567
[29]	valid_0's rmse: 1.05404
[30]	valid_0's rmse: 1.05048
[31]	valid_0's rmse: 1.04815
[32]	valid_0's rmse: 1.04502
[33]	valid_0's rmse: 1.04176
[34]	valid_0's rmse: 1.03897
[35]	valid_0's rmse: 1.03591
[36]	valid_0's rmse: 1.03393
[37]	valid_0's rmse: 1.03255
[38]	valid_0's rmse: 1.02964
[39]	valid_0's rmse: 1.02812
[40]	valid_0's rmse: 1.02673
[41]	valid_0's rmse: 1.02622
[42]	valid_0's rmse: 1.02516
[43]	valid_0's rmse: 1.02361
[44]	valid_0's rmse: 1.0232
[45]	valid_0's rmse: 1.02224
[46]	valid_0's rmse: 1.02129
[47]	valid_0's rm

In [100]:
# # CV to assess model's quality
# # Ref: https://scikit-learn.org/stable/modules/model_evaluation.html

# # from sklearn import svm, datasets
# from sklearn.model_selection import cross_val_score
# clf_cbc = CatBoostClassifier(**params_cb)
# #cross_val_score(clf_cbc, X, y, cv=5, scoring='accuracy')
# #scores = cross_val_score(clf_cbc, X, y, cv=5, scoring='accuracy')
# cross_val_score(clf_cbc, X, y, cv=5, scoring=cohen_kappa_score)

# #cross_val_score(clf, X, y, cv=5, scoring='recall_macro')
# #array([0.96..., 0.96..., 0.96..., 0.93..., 1.        ])
# #>>> model = svm.SVC()
# #>>> cross_val_score(model, X, y, cv=5, scoring='wrong_choice')
# #Traceback (most recent call last):

In [101]:
# # Permutation Importance Catboost

# import eli5
# from eli5.sklearn import PermutationImportance

# perm = PermutationImportance(cbc_model, random_state=1).fit(val_X, val_y)
# eli5.explain_weights(perm, )
# eli5.show_weights(perm, top=105, feature_names = val_X.columns.tolist())

In [102]:
# # Permutation Importance LightGBM

# import eli5
# from eli5.sklearn import PermutationImportance

# perm = PermutationImportance(gbm_model, random_state=1).fit(val_X, val_y)
# eli5.explain_weights(perm, )
# eli5.show_weights(perm, top=105, feature_names = val_X.columns.tolist())

In [103]:
#del X_train_model
gc.collect()

233

In [104]:
# Just to check
#X_train_model_iids_list = X_train_model.installation_id
#check_df = pd.DataFrame(data=X_train_model.installation_id)
#cbc_train_preds = cbc_model.predict(X_train_model)
# submission['accuracy_group'] = cbc_preds.astype(int)
# submission.to_csv("submission.csv", index = False)
# submission.head()
# submission.accuracy_group.value_counts()

In [105]:
# len(X_train_model.columns)

# Preparing test set

In [106]:
# Preparing test set
X_test = pd.read_csv(path + 'test.csv', usecols = load_columns)

In [107]:
def extract_accuracy_set_test(df):
    X_test_gt = pd.DataFrame(data=None)
    
    # X_test_gt will be used only for accuracy features extraction
    # First, filter assessment events only
    # Second, drop columns which will be processed separately
    
    X_test_gt = df[((df['event_code'] == 4100) & 
                     (df['title'].isin(['Cart Balancer (Assessment)', 
                                        'Cauldron Filler (Assessment)', 
                                        'Mushroom Sorter (Assessment)', 
                                        'Chest Sorter (Assessment)']))) | 
                    ((df['event_code'] == 4110) & 
                     (df['title'] == 'Bird Measurer (Assessment)'))].copy(deep=True)
    
    X_test_gt.drop(['event_id', 
                     'timestamp', 
                     'event_count', 
                     'event_code', 
                     'game_time',
                     'type',
                     'world',], axis=1, inplace=True)
    
    # Third, extract correct and incorrect assessment attempts per user from 'event_data'
    # Create num_correct and num_incorrect columns
    
    corr = '"correct":true'
    incorr = '"correct":false'
    
    X_test_gt['num_correct'] = X_test_gt['event_data'].apply(lambda x: 1 if corr in x else 0)
    X_test_gt['num_incorrect'] = X_test_gt['event_data'].apply(lambda x: 1 if incorr in x else 0)
    
    # Fourth, aggregate (sum) correct and incorrect assessment attempts 
    # per 'game_session', 'installation_id' and assessment 'title'
    # As provided in grount truth (labels.csv)
    
    X_test_gt = X_test_gt.sort_values(['installation_id', 'game_session'], ascending=True).groupby(['game_session', 'installation_id', 'title'], as_index=False, sort=False).agg(sum)
    
    # Fifths, create 'accuracy' feature = corr / (corre + incorr)
    
    X_test_gt['accuracy'] = X_test_gt['num_correct'] / (X_test_gt['num_correct'] + X_test_gt['num_incorrect'])
    
    # Sixths, create 'accuracy_group' feature
    # 3: the assessment was solved on the first attempt
    # 2: the assessment was solved on the second attempt
    # 1: the assessment was solved after 3 or more attempts
    # 0: the assessment was never solved

    # If accuracy is 0.0 (no correct attempts), accuracy group is 0 as all observations in X_test_gt by now has at least one attempt
    # If accuracy is 1.0 (that is no incorrect attempts), accuracy group is 3
    # If accuracy is 0.5 (there is equal amount of correct and incorrect attempts), accuracy group is 2
    # Any other case means that accuracy group equals 1, that is 3 or more attempts were needed to make a correct attempt    

    X_test_gt['accuracy_group'] = X_test_gt['accuracy'].apply(lambda x: 0 if x == 0.0 else (3 if x == 1.0 else (2 if x == 0.5 else 1)))
   
    return X_test_gt

X_test_gt = extract_accuracy_set_test(X_test)

In [108]:
# debugging
debugging_ids(X_test_gt)

Debugging submitted dataframe: 
Unique installation_ids: 557 
Rows & columns count (2018, 7)


### Creating target

No target for the test set

In [109]:
# Creating column with forecast target to preserve it when cleaning leak
# X_test_gt['Y_target'] = X_test_gt.groupby('installation_id')['accuracy_group'].transform('last')

### Fixing the leak in X_test_gt

No leak in test df

### (down) Forecasted assessment

In [110]:
# #Forecasted assessment
# # Creating the last assessment coll
# # Bug: X_test_gt['forecasted_assessment'] = X_test_gt.groupby('installation_id')['title'].transform('last')
# X_test_gt['forecasted_assessment'] = X_test.groupby('installation_id')['title'].transform('last')

In [111]:
# X_test[X_test['installation_id'] == '01242218']

### (down) Encoding forecasted assessment

In [112]:
# # Casting 5 assessment titles to categorical type and assign unique integer   
# #X_test_gt['forecasted_assessment'] = X_test_gt['forecasted_assessment'].astype('category').cat.codes
# X_test_gt['forecasted_assessment'] = X_test_gt['forecasted_assessment'].map({'Bird Measurer (Assessment)': 0,
#                                                                             'Cart Balancer (Assessment)': 1, 
#                                                                             'Cauldron Filler (Assessment)': 2, 
#                                                                             'Chest Sorter (Assessment)': 3, 
#                                                                             'Mushroom Sorter (Assessment)': 4})

In [113]:
# X_test_gt

### (T) Assessment count
**Adjusted** for test set as:
* not all users took assessment
* in test.csv our forecasted assessment is not under 4100 or 4110 code, therefore does not include in gt df
* feature shows how many unique assessments user took before, not total count of non-unique assessments

In [114]:
# Creating the last assessment coll
X_test_gt['previous_assessments_count'] = X_test_gt.groupby('installation_id')['title'].transform('count')
# Difference with train prep:
# No need to reduce by one as last one under 4100 or 4110 code is not the one we are forecasting
# X_test_gt['previous_assessments_count'] = X_test_gt['previous_assessments_count'].apply(lambda x: x -1 if x > 1 else 0)

In [115]:
X_test_gt.head(2)

Unnamed: 0,game_session,installation_id,title,num_correct,num_incorrect,accuracy,accuracy_group,previous_assessments_count
0,8b38fc0d2fd315dc,00abaee7,Cart Balancer (Assessment),1,0,1.0,3,1
1,009c890ce6c4f3e3,01242218,Cauldron Filler (Assessment),1,1,0.5,2,5


In [116]:
# X_test[(X_test['installation_id'] == '01242218') & ((X_test['event_code'] == 4100) | (X_test['event_code'] == 4110))]

In [117]:
# debugging
debugging_ids(X_test_gt)

Debugging submitted dataframe: 
Unique installation_ids: 557 
Rows & columns count (2018, 8)


### (~T) Accuracy groups

* Should be fine as we do not have forecasted assessment's, that is do not count additional 0 accuracy_group

In [118]:
#Accuracy groups
X_test_gt['acc_0'] = X_test_gt['accuracy_group'].apply(lambda x: 1 if x == 0 else 0)
X_test_gt['acc_1'] = X_test_gt['accuracy_group'].apply(lambda x: 1 if x == 1 else 0)
X_test_gt['acc_2'] = X_test_gt['accuracy_group'].apply(lambda x: 1 if x == 2 else 0)
X_test_gt['acc_3'] = X_test_gt['accuracy_group'].apply(lambda x: 1 if x == 3 else 0)

In [119]:
# X_test_gt.head(5)

In [120]:
# debugging
debugging_ids(X_test_gt)

Debugging submitted dataframe: 
Unique installation_ids: 557 
Rows & columns count (2018, 12)


### (T) accuracy_group per assessment title

In [121]:
# 'accuracy_group' per assessment 'title'
# Ref: https://stackoverflow.com/questions/27474921/compare-two-columns-using-pandas/27475029
# (condition, output value, else)

X_test_gt['bird_accg_0'] = np.where((X_test_gt['title'] == 'Bird Measurer (Assessment)') & (X_test_gt['accuracy_group'] == 0), 1, 0)
X_test_gt['bird_accg_1'] = np.where((X_test_gt['title'] == 'Bird Measurer (Assessment)') & (X_test_gt['accuracy_group'] == 1), 1, 0)
X_test_gt['bird_accg_2'] = np.where((X_test_gt['title'] == 'Bird Measurer (Assessment)') & (X_test_gt['accuracy_group'] == 2), 1, 0)
X_test_gt['bird_accg_3'] = np.where((X_test_gt['title'] == 'Bird Measurer (Assessment)') & (X_test_gt['accuracy_group'] == 3), 1, 0)

X_test_gt['cart_accg_0'] = np.where((X_test_gt['title'] == 'Cart Balancer (Assessment)') & (X_test_gt['accuracy_group'] == 0), 1, 0)
X_test_gt['cart_accg_1'] = np.where((X_test_gt['title'] == 'Cart Balancer (Assessment)') & (X_test_gt['accuracy_group'] == 1), 1, 0)
X_test_gt['cart_accg_2'] = np.where((X_test_gt['title'] == 'Cart Balancer (Assessment)') & (X_test_gt['accuracy_group'] == 2), 1, 0)
X_test_gt['cart_accg_3'] = np.where((X_test_gt['title'] == 'Cart Balancer (Assessment)') & (X_test_gt['accuracy_group'] == 3), 1, 0)

X_test_gt['cauldron_accg_0'] = np.where((X_test_gt['title'] == 'Cauldron Filler (Assessment)') & (X_test_gt['accuracy_group'] == 0), 1, 0)
X_test_gt['cauldron_accg_1'] = np.where((X_test_gt['title'] == 'Cauldron Filler (Assessment)') & (X_test_gt['accuracy_group'] == 1), 1, 0)
X_test_gt['cauldron_accg_2'] = np.where((X_test_gt['title'] == 'Cauldron Filler (Assessment)') & (X_test_gt['accuracy_group'] == 2), 1, 0)
X_test_gt['cauldron_accg_3'] = np.where((X_test_gt['title'] == 'Cauldron Filler (Assessment)') & (X_test_gt['accuracy_group'] == 3), 1, 0)

X_test_gt['chest_accg_0'] = np.where((X_test_gt['title'] == 'Chest Sorter (Assessment)') & (X_test_gt['accuracy_group'] == 0), 1, 0)
X_test_gt['chest_accg_1'] = np.where((X_test_gt['title'] == 'Chest Sorter (Assessment)') & (X_test_gt['accuracy_group'] == 1), 1, 0)
X_test_gt['chest_accg_2'] = np.where((X_test_gt['title'] == 'Chest Sorter (Assessment)') & (X_test_gt['accuracy_group'] == 2), 1, 0)
X_test_gt['chest_accg_3'] = np.where((X_test_gt['title'] == 'Chest Sorter (Assessment)') & (X_test_gt['accuracy_group'] == 3), 1, 0)

X_test_gt['mushroom_accg_0'] = np.where((X_test_gt['title'] == 'Mushroom Sorter (Assessment)') & (X_test_gt['accuracy_group'] == 0), 1, 0)
X_test_gt['mushroom_accg_1'] = np.where((X_test_gt['title'] == 'Mushroom Sorter (Assessment)') & (X_test_gt['accuracy_group'] == 1), 1, 0)
X_test_gt['mushroom_accg_2'] = np.where((X_test_gt['title'] == 'Mushroom Sorter (Assessment)') & (X_test_gt['accuracy_group'] == 2), 1, 0)
X_test_gt['mushroom_accg_3'] = np.where((X_test_gt['title'] == 'Mushroom Sorter (Assessment)') & (X_test_gt['accuracy_group'] == 3), 1, 0)

In [122]:
# X_test_gt.head(5)

In [123]:
# debugging
debugging_ids(X_test_gt)

Debugging submitted dataframe: 
Unique installation_ids: 557 
Rows & columns count (2018, 32)


### (T) Accuracy (num_correct, num_incorrect, accuracy) per assessment

In [124]:
# {title}_correct, {title}_incorrect, {title}_accuracy per 'installation_id' per assessment 'title'
# Ref: https://stackoverflow.com/questions/27474921/compare-two-columns-using-pandas/27475029
# (condition, output value, else)
# E.g. if Bird Measurer has num_correct = 1, add 1, elsewise add 0
# If Bird Measurer has num_incorrect = 12, add 12, elsewise add 0

X_test_gt['bird_correct'] = np.where((X_test_gt['title'] == 'Bird Measurer (Assessment)') & (X_test_gt['num_correct'] == 1), 1, 0)
X_test_gt['bird_incorrect'] = np.where((X_test_gt['title'] == 'Bird Measurer (Assessment)') & (X_test_gt['num_incorrect'] > 0), X_test_gt['num_incorrect'], 0)
X_test_gt['bird_accuracy'] = np.where((X_test_gt['title'] == 'Bird Measurer (Assessment)'), X_test_gt['accuracy'], 0)

X_test_gt['cart_correct'] = np.where((X_test_gt['title'] == 'Cart Balancer (Assessment)') & (X_test_gt['num_correct'] == 1), 1, 0)
X_test_gt['cart_incorrect'] = np.where((X_test_gt['title'] == 'Cart Balancer (Assessment)') & (X_test_gt['num_incorrect'] > 0), X_test_gt['num_incorrect'], 0)
X_test_gt['cart_accuracy'] = np.where((X_test_gt['title'] == 'Cart Balancer (Assessment)'), X_test_gt['accuracy'], 0)

X_test_gt['cauldron_correct'] = np.where((X_test_gt['title'] == 'Cauldron Filler (Assessment)') & (X_test_gt['num_correct'] == 1), 1, 0)
X_test_gt['cauldron_incorrect'] = np.where((X_test_gt['title'] == 'Cauldron Filler (Assessment)') & (X_test_gt['num_incorrect'] > 0), X_test_gt['num_incorrect'], 0)
X_test_gt['cauldron_accuracy'] = np.where((X_test_gt['title'] == 'Cauldron Filler (Assessment)'), X_test_gt['accuracy'], 0)

X_test_gt['chest_correct'] = np.where((X_test_gt['title'] == 'Chest Sorter (Assessment)') & (X_test_gt['num_correct'] == 1), 1, 0)
X_test_gt['chest_incorrect'] = np.where((X_test_gt['title'] == 'Chest Sorter (Assessment)') & (X_test_gt['num_incorrect'] > 0), X_test_gt['num_incorrect'], 0)
X_test_gt['chest_accuracy'] = np.where((X_test_gt['title'] == 'Chest Sorter (Assessment)'), X_test_gt['accuracy'], 0)

X_test_gt['mushroom_correct'] = np.where((X_test_gt['title'] == 'Mushroom Sorter (Assessment)') & (X_test_gt['num_correct'] == 1), 1, 0)
X_test_gt['mushroom_incorrect'] = np.where((X_test_gt['title'] == 'Mushroom Sorter (Assessment)') & (X_test_gt['num_incorrect'] > 0), X_test_gt['num_incorrect'], 0)
X_test_gt['mushroom_accuracy'] = np.where((X_test_gt['title'] == 'Mushroom Sorter (Assessment)'), X_test_gt['accuracy'], 0)

In [125]:
# X_test_gt.head(5)

In [126]:
# debugging
debugging_ids(X_test_gt)

Debugging submitted dataframe: 
Unique installation_ids: 557 
Rows & columns count (2018, 47)


### (T) Aggregation of features

* Leaving single row per 'installation_id'

##### Headline in train: Saving the index of last (forecasted) assessment

* No need to separate FC assessments row from the rest as it is not included in test set
* Will perform only aggregation

In [127]:
X_test_gt.head(2)

Unnamed: 0,game_session,installation_id,title,num_correct,num_incorrect,accuracy,accuracy_group,previous_assessments_count,acc_0,acc_1,acc_2,acc_3,bird_accg_0,bird_accg_1,bird_accg_2,bird_accg_3,cart_accg_0,cart_accg_1,cart_accg_2,cart_accg_3,cauldron_accg_0,cauldron_accg_1,cauldron_accg_2,cauldron_accg_3,chest_accg_0,chest_accg_1,chest_accg_2,chest_accg_3,mushroom_accg_0,mushroom_accg_1,mushroom_accg_2,mushroom_accg_3,bird_correct,bird_incorrect,bird_accuracy,cart_correct,cart_incorrect,cart_accuracy,cauldron_correct,cauldron_incorrect,cauldron_accuracy,chest_correct,chest_incorrect,chest_accuracy,mushroom_correct,mushroom_incorrect,mushroom_accuracy
0,8b38fc0d2fd315dc,00abaee7,Cart Balancer (Assessment),1,0,1.0,3,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,1,0,1.0,0,0,0.0,0,0,0.0,0,0,0.0
1,009c890ce6c4f3e3,01242218,Cauldron Filler (Assessment),1,1,0.5,2,5,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0.0,1,1,0.5,0,0,0.0,0,0,0.0


In [128]:
X_test_gt

Unnamed: 0,game_session,installation_id,title,num_correct,num_incorrect,accuracy,accuracy_group,previous_assessments_count,acc_0,acc_1,acc_2,acc_3,bird_accg_0,bird_accg_1,bird_accg_2,bird_accg_3,cart_accg_0,cart_accg_1,cart_accg_2,cart_accg_3,cauldron_accg_0,cauldron_accg_1,cauldron_accg_2,cauldron_accg_3,chest_accg_0,chest_accg_1,chest_accg_2,chest_accg_3,mushroom_accg_0,mushroom_accg_1,mushroom_accg_2,mushroom_accg_3,bird_correct,bird_incorrect,bird_accuracy,cart_correct,cart_incorrect,cart_accuracy,cauldron_correct,cauldron_incorrect,cauldron_accuracy,chest_correct,chest_incorrect,chest_accuracy,mushroom_correct,mushroom_incorrect,mushroom_accuracy
0,8b38fc0d2fd315dc,00abaee7,Cart Balancer (Assessment),1,0,1.000000,3,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.000000,1,0,1.0,0,0,0.00,0,0,0.0,0,0,0.0
1,009c890ce6c4f3e3,01242218,Cauldron Filler (Assessment),1,1,0.500000,2,5,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0.000000,0,0,0.0,1,1,0.50,0,0,0.0,0,0,0.0
2,31423dbcd717919e,01242218,Mushroom Sorter (Assessment),1,1,0.500000,2,5,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0.000000,0,0,0.0,0,0,0.00,0,0,0.0,1,1,0.5
3,597a8839a5a3468d,01242218,Bird Measurer (Assessment),1,2,0.333333,1,5,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2,0.333333,0,0,0.0,0,0,0.00,0,0,0.0,0,0,0.0
4,ab61cae5e3215355,01242218,Chest Sorter (Assessment),0,3,0.000000,0,5,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0.000000,0,0,0.0,0,0,0.00,0,3,0.0,0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2013,905d31d016c7a685,ffc73fb2,Cauldron Filler (Assessment),1,0,1.000000,3,4,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0.000000,0,0,0.0,1,0,1.00,0,0,0.0,0,0,0.0
2014,b46072c5e2b3d8c5,ffc73fb2,Cauldron Filler (Assessment),1,0,1.000000,3,4,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0.000000,0,0,0.0,1,0,1.00,0,0,0.0,0,0,0.0
2015,70336ec581799feb,ffe00ca8,Cauldron Filler (Assessment),1,1,0.500000,2,2,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0.000000,0,0,0.0,1,1,0.50,0,0,0.0,0,0,0.0
2016,c116d9e6f8cf85c3,ffe00ca8,Cauldron Filler (Assessment),1,3,0.250000,1,2,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0.000000,0,0,0.0,1,3,0.25,0,0,0.0,0,0,0.0


In [129]:
# Not applicable to test set:
# # We prepare a dataframe which stores the index of last assessment of each installation_id with assessment attempt
# last_observations_index_df = X_test_gt.reset_index().groupby('installation_id', as_index=False)['index'].last()
# last_observations_index_list = list(last_observations_index_df['index']) 
# X_test_gt.drop(['game_session', 'title'], axis=1, inplace=True)
# # Creating a copy dataframe with last_observations and without them
# X_test_gt_last = X_test_gt.loc[X_test_gt.index.isin(last_observations_index_list)]
# X_test_gt_remainder = X_test_gt.loc[~X_test_gt.index.isin(last_observations_index_list)]

X_test_gt_remainder_sum_list = X_train_gt_sum_list

# X_test_gt_remainder_sum_list = ['num_correct', 'num_incorrect', 
#        'bird_correct', 'bird_incorrect',
#        'cart_correct', 'cart_incorrect', 'cauldron_correct',
#        'cauldron_incorrect', 'chest_correct',
#        'chest_incorrect', 'mushroom_correct',
#        'mushroom_incorrect', 'acc_0',
#        'acc_1', 'acc_2', 'acc_3', 'bird_accg_0', 'bird_accg_1', 'bird_accg_2',
#        'bird_accg_3', 'cart_accg_0', 'cart_accg_1', 'cart_accg_2',
#        'cart_accg_3', 'cauldron_accg_0', 'cauldron_accg_1', 'cauldron_accg_2',
#        'cauldron_accg_3', 'chest_accg_0', 'chest_accg_1', 'chest_accg_2',
#        'chest_accg_3', 'mushroom_accg_0', 'mushroom_accg_1', 'mushroom_accg_2',
#        'mushroom_accg_3']

X_test_gt_remainder_mean_list = X_train_gt_mean_list

# X_test_gt_remainder_mean_list = ['accuracy',
#        'accuracy_group', 'bird_accuracy',
#        'cart_accuracy', 'cauldron_accuracy', 'chest_accuracy', 'mushroom_accuracy']

# !!! Should add 'forecasted_assessment'
# Removed 'sessions_with_assessment_count'
X_test_gt_remainder_unchanged_list = ['previous_assessments_count']

# Difference in train set:
# X_test_gt_remainder_unchanged_list = ['Y_target', 'forecasted_assessment', 'previous_assessments_count', 'sessions_with_assessment_count'] 

# Difference in train set:
# We do not define X_test_gt_remainder and take all in X_test_gt
X_test_gt_sum = X_test_gt.groupby(['installation_id'], as_index=False, sort=False)[X_test_gt_remainder_sum_list].agg(sum)
X_test_gt_mean = X_test_gt.groupby(['installation_id'], as_index=False, sort=False)[X_test_gt_remainder_mean_list].agg('mean')
X_test_gt_unchaged = X_test_gt.groupby(['installation_id'], as_index=False, sort=False)[X_test_gt_remainder_unchanged_list].last()

# Merge both
X_test_gt_remainder = pd.merge(X_test_gt_sum, X_test_gt_mean, how='left', on=['installation_id'])
X_test_gt = pd.merge(X_test_gt_remainder, X_test_gt_unchaged, how='left', on=['installation_id'])

# Not applicable to test set:
# # Returning the installation_ids which had no previous assessments before the forecasted one
# #X_test_gt = pd.concat([X_test_gt_remainder, X_test_gt_last]).sort_index().reset_index(drop=True) index got broken while grouping by
# X_test_gt = X_test_gt_remainder.append(X_test_gt_last, ignore_index=True)

# # Questionable re sorting as it drops installation_id, need to test
# X_test_gt = pd.concat([X_test_gt_remainder, X_test_gt_last]).drop_duplicates('installation_id').reset_index(drop=True)

In [130]:
X_test_gt.head(5)

Unnamed: 0,installation_id,num_correct,num_incorrect,bird_correct,bird_incorrect,cart_correct,cart_incorrect,cauldron_correct,cauldron_incorrect,chest_correct,chest_incorrect,mushroom_correct,mushroom_incorrect,acc_0,acc_1,acc_2,acc_3,bird_accg_0,bird_accg_1,bird_accg_2,bird_accg_3,cart_accg_0,cart_accg_1,cart_accg_2,cart_accg_3,cauldron_accg_0,cauldron_accg_1,cauldron_accg_2,cauldron_accg_3,chest_accg_0,chest_accg_1,chest_accg_2,chest_accg_3,mushroom_accg_0,mushroom_accg_1,mushroom_accg_2,mushroom_accg_3,accuracy,accuracy_group,bird_accuracy,cart_accuracy,cauldron_accuracy,chest_accuracy,mushroom_accuracy,previous_assessments_count
0,00abaee7,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1.0,3.0,0.0,1.0,0.0,0.0,0.0,1
1,01242218,4,7,1,2,1,0,1,1,0,3,1,1,1,1,2,1,0,1,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,1,0,0.466667,1.6,0.066667,0.2,0.1,0.0,0.1,5
2,02256298,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1.0,3.0,0.0,1.0,0.0,0.0,0.0,1
3,027e7ce5,7,2,1,0,1,0,2,2,1,0,2,0,0,1,0,6,0,0,0,1,0,0,0,1,0,1,0,1,0,0,0,1,0,0,0,2,0.904762,2.714286,0.142857,0.142857,0.190476,0.142857,0.285714,7
4,02a29f99,1,14,0,13,0,0,1,1,0,0,0,0,2,0,1,0,2,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0.166667,0.666667,0.0,0.0,0.166667,0.0,0.0,3


In [131]:
# debugging
debugging_ids(X_test_gt)

Debugging submitted dataframe: 
Unique installation_ids: 557 
Rows & columns count (557, 45)


In [132]:
# !debugging, finding heavy user
X_test_gt[X_test_gt['num_correct'] == X_test_gt.num_correct.max()]

Unnamed: 0,installation_id,num_correct,num_incorrect,bird_correct,bird_incorrect,cart_correct,cart_incorrect,cauldron_correct,cauldron_incorrect,chest_correct,chest_incorrect,mushroom_correct,mushroom_incorrect,acc_0,acc_1,acc_2,acc_3,bird_accg_0,bird_accg_1,bird_accg_2,bird_accg_3,cart_accg_0,cart_accg_1,cart_accg_2,cart_accg_3,cauldron_accg_0,cauldron_accg_1,cauldron_accg_2,cauldron_accg_3,chest_accg_0,chest_accg_1,chest_accg_2,chest_accg_3,mushroom_accg_0,mushroom_accg_1,mushroom_accg_2,mushroom_accg_3,accuracy,accuracy_group,bird_accuracy,cart_accuracy,cauldron_accuracy,chest_accuracy,mushroom_accuracy,previous_assessments_count
172,56a739ec,52,81,12,21,10,0,11,43,7,16,12,1,1,9,4,39,1,3,1,8,0,0,0,10,0,2,0,9,0,4,2,1,0,0,1,11,0.810861,2.528302,0.176101,0.188679,0.17344,0.05566,0.216981,53


In [133]:
# !debugging on heavy user
X_test[(X_test['installation_id'] == '56a739ec') & (X_test['event_code'] == 4100) & (X_test['title'] == 'Cart Balancer (Assessment)')]

Unnamed: 0,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title,type,world
362782,d122731b,e067ade55df3569d,2019-07-26T15:28:32.280Z,"{""correct"":true,""left"":[{""id"":""gem08"",""weight""...",56a739ec,9,4100,9119,Cart Balancer (Assessment),Assessment,CRYSTALCAVES
364586,d122731b,5a801de07332be41,2019-07-27T22:51:21.311Z,"{""correct"":true,""left"":[{""id"":""gem07"",""weight""...",56a739ec,9,4100,12898,Cart Balancer (Assessment),Assessment,CRYSTALCAVES
365396,d122731b,8dce74e3c35ae87a,2019-07-28T01:15:24.644Z,"{""correct"":true,""left"":[{""id"":""gem07"",""weight""...",56a739ec,9,4100,11518,Cart Balancer (Assessment),Assessment,CRYSTALCAVES
366269,d122731b,ae58ce5ecbe1923f,2019-07-28T03:44:08.364Z,"{""correct"":true,""left"":[{""id"":""gem07"",""weight""...",56a739ec,9,4100,13686,Cart Balancer (Assessment),Assessment,CRYSTALCAVES
366924,d122731b,1e404ed424f3b59a,2019-07-31T23:24:41.013Z,"{""correct"":true,""left"":[{""id"":""gem07"",""weight""...",56a739ec,14,4100,14782,Cart Balancer (Assessment),Assessment,CRYSTALCAVES
368112,d122731b,4f35b863ff26ee87,2019-08-07T22:28:11.503Z,"{""correct"":true,""left"":[{""id"":""gem07"",""weight""...",56a739ec,17,4100,17658,Cart Balancer (Assessment),Assessment,CRYSTALCAVES
368896,d122731b,3a3b1b6f49633249,2019-08-12T16:15:15.125Z,"{""correct"":true,""left"":[{""id"":""gem07"",""weight""...",56a739ec,13,4100,14641,Cart Balancer (Assessment),Assessment,CRYSTALCAVES
368915,d122731b,589cd72c36133bde,2019-08-12T16:16:10.436Z,"{""correct"":true,""left"":[{""id"":""gem07"",""weight""...",56a739ec,14,4100,20739,Cart Balancer (Assessment),Assessment,CRYSTALCAVES
369139,d122731b,2c3179b736f68fad,2019-08-12T21:07:44.292Z,"{""correct"":true,""left"":[{""id"":""gem07"",""weight""...",56a739ec,19,4100,22615,Cart Balancer (Assessment),Assessment,CRYSTALCAVES
370078,d122731b,83a252a993493728,2019-10-07T00:37:18.302Z,"{""correct"":true,""left"":[{""id"":""gem07"",""weight""...",56a739ec,26,4100,34496,Cart Balancer (Assessment),Assessment,CRYSTALCAVES


### Adding users w/o previous assessment attempts

* Test set specific as in train set we used only 'intallation_id's with at least one assessment attempt 

In [134]:
test_features_list = X_test_gt.columns
X_test_gt.columns

Index(['installation_id', 'num_correct', 'num_incorrect', 'bird_correct',
       'bird_incorrect', 'cart_correct', 'cart_incorrect', 'cauldron_correct',
       'cauldron_incorrect', 'chest_correct', 'chest_incorrect',
       'mushroom_correct', 'mushroom_incorrect', 'acc_0', 'acc_1', 'acc_2',
       'acc_3', 'bird_accg_0', 'bird_accg_1', 'bird_accg_2', 'bird_accg_3',
       'cart_accg_0', 'cart_accg_1', 'cart_accg_2', 'cart_accg_3',
       'cauldron_accg_0', 'cauldron_accg_1', 'cauldron_accg_2',
       'cauldron_accg_3', 'chest_accg_0', 'chest_accg_1', 'chest_accg_2',
       'chest_accg_3', 'mushroom_accg_0', 'mushroom_accg_1', 'mushroom_accg_2',
       'mushroom_accg_3', 'accuracy', 'accuracy_group', 'bird_accuracy',
       'cart_accuracy', 'cauldron_accuracy', 'chest_accuracy',
       'mushroom_accuracy', 'previous_assessments_count'],
      dtype='object')

In [135]:
test_users_wo_assessments = set(X_test.installation_id) - set(X_test_gt.installation_id)
len(test_users_wo_assessments)

443

### Creating empty df matching test's columns

* Filled with 0
* Alternatively could test with Nan, None or -1

In [136]:
test_users_wo_assessments_df = pd.DataFrame(0, index=np.arange(len(test_users_wo_assessments)), columns=test_features_list)

In [137]:
test_users_wo_assessments_df

Unnamed: 0,installation_id,num_correct,num_incorrect,bird_correct,bird_incorrect,cart_correct,cart_incorrect,cauldron_correct,cauldron_incorrect,chest_correct,chest_incorrect,mushroom_correct,mushroom_incorrect,acc_0,acc_1,acc_2,acc_3,bird_accg_0,bird_accg_1,bird_accg_2,bird_accg_3,cart_accg_0,cart_accg_1,cart_accg_2,cart_accg_3,cauldron_accg_0,cauldron_accg_1,cauldron_accg_2,cauldron_accg_3,chest_accg_0,chest_accg_1,chest_accg_2,chest_accg_3,mushroom_accg_0,mushroom_accg_1,mushroom_accg_2,mushroom_accg_3,accuracy,accuracy_group,bird_accuracy,cart_accuracy,cauldron_accuracy,chest_accuracy,mushroom_accuracy,previous_assessments_count
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
438,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
439,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
440,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
441,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


### Adding 'installation_id's w/o prior assessments

In [138]:
# We have created installation_id column with zero values. Now will assign missing installation_id
test_users_wo_assessments_df['installation_id'] = test_users_wo_assessments

In [139]:
test_users_wo_assessments_df.head(2)

Unnamed: 0,installation_id,num_correct,num_incorrect,bird_correct,bird_incorrect,cart_correct,cart_incorrect,cauldron_correct,cauldron_incorrect,chest_correct,chest_incorrect,mushroom_correct,mushroom_incorrect,acc_0,acc_1,acc_2,acc_3,bird_accg_0,bird_accg_1,bird_accg_2,bird_accg_3,cart_accg_0,cart_accg_1,cart_accg_2,cart_accg_3,cauldron_accg_0,cauldron_accg_1,cauldron_accg_2,cauldron_accg_3,chest_accg_0,chest_accg_1,chest_accg_2,chest_accg_3,mushroom_accg_0,mushroom_accg_1,mushroom_accg_2,mushroom_accg_3,accuracy,accuracy_group,bird_accuracy,cart_accuracy,cauldron_accuracy,chest_accuracy,mushroom_accuracy,previous_assessments_count
0,eb98a24a,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,f47ef997,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


### (~T) Merging 'installation_id's with and w/o assessments

In [140]:
X_test_gt = X_test_gt.append(test_users_wo_assessments_df, ignore_index=True)

In [141]:
# debugging
debugging_ids(X_test_gt)

Debugging submitted dataframe: 
Unique installation_ids: 1000 
Rows & columns count (1000, 45)


In [142]:
# debugging
len(set(X_test_gt.installation_id))

1000

In [143]:
# debugging
# we lost the order of 'installation_id', but submission is sorted ascending
booltest_sub = X_test_gt.installation_id.sort_values(ascending=True).reset_index(drop=True) == submission.installation_id
set(booltest_sub)

{True}

### (T) Sorting to match order of submission

In [144]:
X_test_gt = X_test_gt.sort_values('installation_id', ascending=True).reset_index(drop=True)

In [145]:
X_test_gt.head(10)

Unnamed: 0,installation_id,num_correct,num_incorrect,bird_correct,bird_incorrect,cart_correct,cart_incorrect,cauldron_correct,cauldron_incorrect,chest_correct,chest_incorrect,mushroom_correct,mushroom_incorrect,acc_0,acc_1,acc_2,acc_3,bird_accg_0,bird_accg_1,bird_accg_2,bird_accg_3,cart_accg_0,cart_accg_1,cart_accg_2,cart_accg_3,cauldron_accg_0,cauldron_accg_1,cauldron_accg_2,cauldron_accg_3,chest_accg_0,chest_accg_1,chest_accg_2,chest_accg_3,mushroom_accg_0,mushroom_accg_1,mushroom_accg_2,mushroom_accg_3,accuracy,accuracy_group,bird_accuracy,cart_accuracy,cauldron_accuracy,chest_accuracy,mushroom_accuracy,previous_assessments_count
0,00abaee7,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1.0,3.0,0.0,1.0,0.0,0.0,0.0,1
1,01242218,4,7,1,2,1,0,1,1,0,3,1,1,1,1,2,1,0,1,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,1,0,0.466667,1.6,0.066667,0.2,0.1,0.0,0.1,5
2,017c5718,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,01a44906,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,01bc6cb6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
5,02256298,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1.0,3.0,0.0,1.0,0.0,0.0,0.0,1
6,0267757a,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
7,027e7ce5,7,2,1,0,1,0,2,2,1,0,2,0,0,1,0,6,0,0,0,1,0,0,0,1,0,1,0,1,0,0,0,1,0,0,0,2,0.904762,2.714286,0.142857,0.142857,0.190476,0.142857,0.285714,7
8,02a29f99,1,14,0,13,0,0,1,1,0,0,0,0,2,0,1,0,2,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0.166667,0.666667,0.0,0.0,0.166667,0.0,0.0,3
9,0300c576,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [146]:
# debugging sorting
booltest_train = X_test_gt.installation_id == submission.installation_id
set(booltest_train)

{True}

In [147]:
# debugging
debugging_ids(X_test_gt)

Debugging submitted dataframe: 
Unique installation_ids: 1000 
Rows & columns count (1000, 45)


### (T) Adding 'forecasted_assessment' feature

* To both 'installation_id's with and w/o assessment attempt
* It fixes initial bug where 'installation_id's w/o assessment attempt got their last attempted assessment as their 'forecasted_assessment' 

In [148]:
# Create the forecasted_assessment_df which will contain all test set's installation_ids last forecasted_assessment

forecasted_assessment_df = X_test.groupby(['installation_id'], as_index=False, sort=False).agg('last')

# Reduce forecasted_assessment_df to users only w/o assessment (1000 -> 443):
# forecasted_assessment_df = forecasted_assessment_df[forecasted_assessment_df.installation_id.isin(test_users_wo_assessments)]
# Reseting the index, otherwise will get Nans when mapping:
# forecasted_assessment_df.reset_index()

In [149]:
#forecasted_assessment_df[forecasted_assessment_df['installation_id'] == '00abaee7']

* Add 'forecasted_assessment' feature to the test set

In [150]:
# Add forecasted_assessment number to X_test_gt:
# Map is how train set has assigned values to assessment titles:
# 0 Bird Measurer (Assessment)
# 1 Cart Balancer (Assessment)
# 2 Cauldron Filler (Assessment)
# 3 Chest Sorter (Assessment)
# 4 Mushroom Sorter (Assessment)
X_test_gt['forecasted_assessment'] = forecasted_assessment_df['title'].map({'Bird Measurer (Assessment)': 0,
                                                                                               'Cart Balancer (Assessment)': 1, 
                                                                                               'Cauldron Filler (Assessment)': 2, 
                                                                                               'Chest Sorter (Assessment)': 3, 
                                                                                    'Mushroom Sorter (Assessment)': 4})

In [151]:
X_test_gt.head(2)

Unnamed: 0,installation_id,num_correct,num_incorrect,bird_correct,bird_incorrect,cart_correct,cart_incorrect,cauldron_correct,cauldron_incorrect,chest_correct,chest_incorrect,mushroom_correct,mushroom_incorrect,acc_0,acc_1,acc_2,acc_3,bird_accg_0,bird_accg_1,bird_accg_2,bird_accg_3,cart_accg_0,cart_accg_1,cart_accg_2,cart_accg_3,cauldron_accg_0,cauldron_accg_1,cauldron_accg_2,cauldron_accg_3,chest_accg_0,chest_accg_1,chest_accg_2,chest_accg_3,mushroom_accg_0,mushroom_accg_1,mushroom_accg_2,mushroom_accg_3,accuracy,accuracy_group,bird_accuracy,cart_accuracy,cauldron_accuracy,chest_accuracy,mushroom_accuracy,previous_assessments_count,forecasted_assessment
0,00abaee7,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1.0,3.0,0.0,1.0,0.0,0.0,0.0,1,2
1,01242218,4,7,1,2,1,0,1,1,0,3,1,1,1,1,2,1,0,1,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,1,0,0.466667,1.6,0.066667,0.2,0.1,0.0,0.1,5,1


In [152]:
# debugging
set(X_test_gt.forecasted_assessment), X_test_gt.forecasted_assessment.count()

({0, 1, 2, 3, 4}, 1000)

In [153]:
# debugging
debugging_ids(X_test_gt)

Debugging submitted dataframe: 
Unique installation_ids: 1000 
Rows & columns count (1000, 46)


In [154]:
# debugging
X_test_gt.loc[441, ['forecasted_assessment']]

forecasted_assessment    3
Name: 441, dtype: object

In [155]:
# debugging
X_test_gt.loc[441,]

installation_id               779b71a3
num_correct                          1
num_incorrect                        0
bird_correct                         0
bird_incorrect                       0
cart_correct                         1
cart_incorrect                       0
cauldron_correct                     0
cauldron_incorrect                   0
chest_correct                        0
chest_incorrect                      0
mushroom_correct                     0
mushroom_incorrect                   0
acc_0                                0
acc_1                                0
acc_2                                0
acc_3                                1
bird_accg_0                          0
bird_accg_1                          0
bird_accg_2                          0
bird_accg_3                          0
cart_accg_0                          0
cart_accg_1                          0
cart_accg_2                          0
cart_accg_3                          1
cauldron_accg_0          

In [156]:
# debugging OK - 'forecasted_assessment' of '779b71a3' is 'Chest Sorter (Assessment)' or encoded 3 
X_test[X_test['installation_id'] == '779b71a3'].tail()

Unnamed: 0,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title,type,world
492159,b2e5b0f1,a1c5bb957e51257c,2019-08-10T12:28:20.719Z,"{""session_duration"":16735,""event_count"":13,""ga...",779b71a3,13,2010,16735,Cart Balancer (Assessment),Assessment,CRYSTALCAVES
492160,27253bdc,84479499d0e82123,2019-08-10T12:28:35.802Z,"{""event_code"": 2000, ""event_count"": 1}",779b71a3,1,2000,0,Crystal Caves - Level 3,Clip,CRYSTALCAVES
492161,27253bdc,b7e1ca2ec56fa214,2019-08-10T12:29:04.539Z,"{""event_code"": 2000, ""event_count"": 1}",779b71a3,1,2000,0,"Heavy, Heavier, Heaviest",Clip,CRYSTALCAVES
492162,9c5ef70c,03ff73e98f3cdd74,2019-08-10T12:30:19.242Z,"{""version"":""1.0"",""round"":0,""event_count"":1,""ga...",779b71a3,1,2000,0,Pan Balance,Game,CRYSTALCAVES
492163,5b49460a,1156e1b072cf4725,2019-08-10T12:30:37.913Z,"{""version"":""1.0"",""event_count"":1,""game_time"":0...",779b71a3,1,2000,0,Chest Sorter (Assessment),Assessment,CRYSTALCAVES


# Adding none acc features to the test set

### (T) timestamp

* Many other features could be extracted

In [157]:
# Re-using f-ion used in train
# Create new X_test_timefeat which holds time only timestamp features  
X_test_timefeat = X_test.filter(['installation_id', 'timestamp'], axis=1)
# Prepare time features from given timestamp 
X_test_timefeat = timestamp_split(X_test_timefeat)
X_test_timefeat

  mask |= (ar1 == a)


Unnamed: 0,installation_id,month,hour,minute,dayofweek,dayofyear,quarter,is_weekend
0,00abaee7,9,16,50,1,253,3,0
1,00abaee7,9,16,50,1,253,3,0
2,00abaee7,9,16,51,1,253,3,0
3,00abaee7,9,16,53,1,253,3,0
4,00abaee7,9,16,54,1,253,3,0
...,...,...,...,...,...,...,...,...
1156409,ffe774cc,9,21,20,5,271,3,0
1156410,ffe774cc,9,21,20,5,271,3,0
1156411,ffe774cc,9,21,20,5,271,3,0
1156412,ffe774cc,9,21,21,5,271,3,0


In [158]:
# Taking only the last installation_id's row's value of timestamp 
X_test_timefeat = X_test_timefeat.groupby('installation_id', as_index=False).last()

In [159]:
X_test_timefeat

Unnamed: 0,installation_id,month,hour,minute,dayofweek,dayofyear,quarter,is_weekend
0,00abaee7,9,13,52,3,255,3,0
1,01242218,10,20,23,2,282,4,0
2,017c5718,9,11,28,5,264,3,0
3,01a44906,7,16,28,5,208,3,0
4,01bc6cb6,9,18,5,4,249,3,0
...,...,...,...,...,...,...,...,...
995,fee254cf,9,1,44,5,271,3,0
996,ff57e602,10,17,18,6,286,4,0
997,ffc73fb2,10,20,36,0,280,4,0
998,ffe00ca8,9,19,0,1,253,3,0


In [160]:
# debugging OK
X_test[X_test['installation_id'] == '01242218'].tail()

Unnamed: 0,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title,type,world
3582,36fa3ebe,b721d4026ce65fd9,2019-10-09T20:22:47.259Z,"{""duration"":10217,""misses"":0,""round"":3,""event_...",1242218,59,2030,68691,Happy Camel,Game,CRYSTALCAVES
3583,c7fe2a55,b721d4026ce65fd9,2019-10-09T20:22:49.167Z,"{""description"":""You have amazing powers!"",""ide...",1242218,60,3021,70557,Happy Camel,Game,CRYSTALCAVES
3584,a8a78786,b721d4026ce65fd9,2019-10-09T20:22:52.930Z,"{""description"":""You have amazing powers!"",""ide...",1242218,61,3121,74358,Happy Camel,Game,CRYSTALCAVES
3585,d51b1749,b721d4026ce65fd9,2019-10-09T20:22:55.447Z,"{""movie_id"":""Outro"",""duration"":10466,""descript...",1242218,62,2080,76879,Happy Camel,Game,CRYSTALCAVES
3586,7ad3efc6,1fef5d54cb4b775a,2019-10-09T20:23:16.209Z,"{""version"":""1.0"",""event_count"":1,""game_time"":0...",1242218,1,2000,0,Cart Balancer (Assessment),Assessment,CRYSTALCAVES


In [161]:
type(X_test_timefeat.month[0])

numpy.int64

### (T) title, type and world

In [162]:
# Re-using f-ion used in train
# Create new titletypeworldfeat which holds time only title, type and world features  
X_test_titletypeworldfeat = X_test.filter(['installation_id', 'title', 'type', 'world'], axis=1)
# Prepare title, type and world features from given timestamp 
X_test_titletypeworldfeat = title_type_world(X_test_titletypeworldfeat)
X_test_titletypeworldfeat

Unnamed: 0,installation_id,title_12monkeys,title_airshow,title_allstarsorting,title_balancingact,title_birdmeasurerassessment,title_bottlefilleractivity,title_bubblebath,title_bugmeasureractivity,title_cartbalancerassessment,title_cauldronfillerassessment,title_chestsorterassessment,title_chickenbalanceractivity,title_chowtime,title_costumebox,title_crystalcaveslevel1,title_crystalcaveslevel2,title_crystalcaveslevel3,title_crystalsrule,title_dinodive,title_dinodrink,title_eggdropperactivity,title_fireworksactivity,title_flowerwatereractivity,title_happycamel,"title_heavy,heavier,heaviest",title_honeycake,title_leafleader,title_liftingheavythings,title_magmapeaklevel1,title_magmapeaklevel2,title_mushroomsorterassessment,title_orderingspheres,title_panbalance,title_pirate'stale,title_rulers,title_sandcastlebuilderactivity,title_scrubadub,title_slopproblem,title_treasuremap,title_treetopcitylevel1,title_treetopcitylevel2,title_treetopcitylevel3,title_wateringholeactivity,title_welcometolostlagoon,type_activity,type_assessment,type_clip,type_game,world_crystalcaves,world_magmapeak,world_none,world_treetopcity
0,00abaee7,2.0,0.0,79.0,1.0,0.0,0.0,0.0,26.0,26.0,1.0,0.0,0.0,159.0,1.0,1.0,2.0,1.0,0.0,135.0,0.0,61.0,264.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,103.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,454.0,27.0,14.0,373.0,253.0,241.0,1.0,373.0
1,01242218,1.0,72.0,56.0,3.0,61.0,221.0,51.0,146.0,14.0,36.0,83.0,226.0,77.0,1.0,1.0,2.0,1.0,230.0,79.0,111.0,186.0,197.0,160.0,123.0,1.0,2.0,40.0,2.0,2.0,2.0,51.0,1.0,124.0,1.0,1.0,179.0,126.0,0.0,1.0,2.0,2.0,1.0,41.0,2.0,1356.0,245.0,29.0,1089.0,885.0,848.0,2.0,984.0
2,017c5718,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,72.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,4.0,143.0,1.0,6.0,0.0,0.0,0.0,4.0,146.0
3,01a44906,1.0,0.0,78.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,37.0,108.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,3.0,145.0,1.0,10.0,78.0,0.0,0.0,3.0,231.0
4,01bc6cb6,0.0,0.0,420.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,226.0,158.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,130.0,1.0,1.0,0.0,1.0,2.0,1.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,3.0,226.0,1.0,17.0,708.0,522.0,3.0,3.0,424.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,fee254cf,0.0,0.0,92.0,0.0,82.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,33.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,116.0,4.0,92.0,0.0,1.0,1.0,210.0
996,ff57e602,1.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,28.0,0.0,1.0,75.0,57.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,52.0,0.0,0.0,1.0,0.0,79.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,127.0,29.0,11.0,136.0,246.0,0.0,1.0,56.0
997,ffc73fb2,1.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,1.0,70.0,139.0,0.0,0.0,1.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,182.0,0.0,0.0,1.0,3.0,0.0,4.0,1.0,4.0,46.0,1.0,0.0,1.0,3.0,56.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,238.0,256.0,32.0,0.0,154.0,132.0,1.0,239.0
998,ffe00ca8,0.0,0.0,0.0,1.0,0.0,0.0,0.0,110.0,1.0,109.0,0.0,0.0,0.0,0.0,1.0,2.0,0.0,0.0,0.0,15.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,11.0,0.0,0.0,0.0,1.0,2.0,1.0,2.0,1.0,123.0,110.0,11.0,15.0,5.0,139.0,1.0,114.0


In [163]:
# debugging OK, 'ffe00ca8' has 5 rows in 'world' 'CRYSTALCAVES'
X_test[(X_test['installation_id'] == 'ffe00ca8') & (X_test['world'] == 'CRYSTALCAVES')]

Unnamed: 0,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title,type,world
1156130,27253bdc,956906880a4a58da,2019-09-09T18:47:01.102Z,"{""event_code"": 2000, ""event_count"": 1}",ffe00ca8,1,2000,0,Crystal Caves - Level 1,Clip,CRYSTALCAVES
1156131,27253bdc,2a8ea0f319d282ae,2019-09-09T18:47:35.068Z,"{""event_code"": 2000, ""event_count"": 1}",ffe00ca8,1,2000,0,Balancing Act,Clip,CRYSTALCAVES
1156366,27253bdc,aff0416f95256304,2019-09-10T18:58:43.430Z,"{""event_code"": 2000, ""event_count"": 1}",ffe00ca8,1,2000,0,Crystal Caves - Level 2,Clip,CRYSTALCAVES
1156367,27253bdc,c1e0b0b06a77139f,2019-09-10T18:59:39.881Z,"{""event_code"": 2000, ""event_count"": 1}",ffe00ca8,1,2000,0,Crystal Caves - Level 2,Clip,CRYSTALCAVES
1156368,7ad3efc6,8d0fdec0ad44aefb,2019-09-10T19:00:42.792Z,"{""version"":""1.0"",""event_count"":1,""game_time"":0...",ffe00ca8,1,2000,0,Cart Balancer (Assessment),Assessment,CRYSTALCAVES


### (T) merge of timestamp, type, title and world features to main test set

##### debugging index before merger

* to avoid incorrectly assigning features from another 'installation_id's  

In [164]:
# debugging sorting of timefeat

booltest_timefeat = X_test_gt.installation_id == X_test_timefeat.installation_id
set(booltest_timefeat)

{True}

In [165]:
# debugging sorting of X_test_titletypeworldfeat

booltest_titletypeworldfeat = X_test_gt.installation_id == X_test_titletypeworldfeat.installation_id
set(booltest_titletypeworldfeat)

{True}

In [166]:
# debugging
debugging_ids(X_test_gt)

Debugging submitted dataframe: 
Unique installation_ids: 1000 
Rows & columns count (1000, 46)


##### merging time features

In [167]:
# debugging
debugging_ids(X_test_timefeat)

Debugging submitted dataframe: 
Unique installation_ids: 1000 
Rows & columns count (1000, 8)


In [168]:
# Merging new features to main test set

# Add time features to the main dataframe
X_test_gt = pd.merge(X_test_gt, X_test_timefeat, on=['installation_id'])

In [169]:
len(set(X_test_gt.installation_id)), X_test_gt.shape

(1000, (1000, 53))

In [170]:
# # debugging - count nan in df - OK
# X_test_gt.isna().sum()

In [171]:
# Add title, type and world features to the main dataframe
X_test_gt = pd.merge(X_test_gt, X_test_titletypeworldfeat, on=['installation_id'])

In [172]:
len(set(X_test_gt.installation_id)), X_test_gt.shape

(1000, (1000, 105))

In [173]:
# # Count nan in df for debugging purposes
#set(X_test_gt.isna().sum())

In [174]:
# debugging sorting
booltest_sub = X_test_gt.installation_id == submission.installation_id
set(booltest_sub)

{True}

#### Cleaning unused dfs and variables

In [175]:
#del X_test, X_test_gt_remainder_sum_list, X_test_gt_remainder_mean_list, X_test_gt_remainder_unchanged_list, X_test_gt_sum, X_test_gt_mean, X_test_gt_unchaged, test_features_list, test_users_wo_assessments, test_users_wo_assessments_df, forecasted_assessment_df, X_test_timefeat, X_test_titletypeworldfeat
gc.collect()

78

# Submission

In [176]:
len(set(X_test_gt.installation_id)), X_test_gt.shape

(1000, (1000, 105))

In [177]:
# debugging - check if df feature types
X_test_gt.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 0 to 999
Columns: 105 entries, installation_id to world_treetopcity
dtypes: float64(59), int64(45), object(1)
memory usage: 828.1+ KB


In [178]:
# debugging sorting
booltest_sub = X_test_gt.installation_id == submission.installation_id
set(booltest_sub)

{True}

In [179]:
# drop installation_id
X_test_gt = X_test_gt.drop(['installation_id'], axis=1)

In [180]:
len(set(X_test_gt.index)), X_test_gt.shape

(1000, (1000, 104))

In [181]:
list(X_test_gt.columns)

['num_correct',
 'num_incorrect',
 'bird_correct',
 'bird_incorrect',
 'cart_correct',
 'cart_incorrect',
 'cauldron_correct',
 'cauldron_incorrect',
 'chest_correct',
 'chest_incorrect',
 'mushroom_correct',
 'mushroom_incorrect',
 'acc_0',
 'acc_1',
 'acc_2',
 'acc_3',
 'bird_accg_0',
 'bird_accg_1',
 'bird_accg_2',
 'bird_accg_3',
 'cart_accg_0',
 'cart_accg_1',
 'cart_accg_2',
 'cart_accg_3',
 'cauldron_accg_0',
 'cauldron_accg_1',
 'cauldron_accg_2',
 'cauldron_accg_3',
 'chest_accg_0',
 'chest_accg_1',
 'chest_accg_2',
 'chest_accg_3',
 'mushroom_accg_0',
 'mushroom_accg_1',
 'mushroom_accg_2',
 'mushroom_accg_3',
 'accuracy',
 'accuracy_group',
 'bird_accuracy',
 'cart_accuracy',
 'cauldron_accuracy',
 'chest_accuracy',
 'mushroom_accuracy',
 'previous_assessments_count',
 'forecasted_assessment',
 'month',
 'hour',
 'minute',
 'dayofweek',
 'dayofyear',
 'quarter',
 'is_weekend',
 'title_12monkeys',
 'title_airshow',
 'title_allstarsorting',
 'title_balancingact',
 'title_birdm

##### Cleaning column names


In [182]:
# Elsewise LightGBMError: Do not support special JSON characters in feature name.
X_test_gt.columns = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in X_test_gt.columns]

##### Removing non performing features

In [183]:
# # # Drop non performing features
# X_test_gt.columns.drop(non_performing_feat)

In [184]:
# # Catboost Classifier submission
# cbc_preds = cbc_model.predict(X_test_gt)
# submission['accuracy_group'] = cbc_preds.astype(int)
# submission.to_csv("submission.csv", index = False)
# submission.head()

###### Weighting & submission

In [185]:
# LightGBM

submission = pd.read_csv(path + 'sample_submission.csv')
gbm_preds = gbm_model.predict(X_test_gt)

submission['accuracy_group'] = gbm_preds

submission['accuracy_group_weight0'] = np.where((submission['accuracy_group'] <= 1.58391901), 0, 0)
submission['accuracy_group_weight1'] = np.where((submission['accuracy_group'] > 1.58391901) & (submission['accuracy_group'] <= 1.79932900), 1, 0)
submission['accuracy_group_weight2'] = np.where((submission['accuracy_group'] > 1.79932900) & (submission['accuracy_group'] <= 1.99473900), 2, 0)
submission['accuracy_group_weight3'] = np.where((submission['accuracy_group'] > 1.99473900), 3, 0)
submission['accuracy_group'] = submission['accuracy_group_weight0'] + submission['accuracy_group_weight1'] + submission['accuracy_group_weight2'] + submission['accuracy_group_weight3']
submission = submission.drop(['accuracy_group_weight0', 'accuracy_group_weight1', 'accuracy_group_weight2', 'accuracy_group_weight3'], axis=1)

submission.to_csv("submission.csv", index = False)

submission.accuracy_group.value_counts()

3    445
0    288
1    134
2    133
Name: accuracy_group, dtype: int64

In [186]:
#submission[submission['accuracy_group_weight3'] == 3]
submission[submission['accuracy_group'] == 0]

Unnamed: 0,installation_id,accuracy_group
6,0267757a,0
8,02a29f99,0
9,0300c576,0
10,03885368,0
14,04a7bc3f,0
...,...,...
974,f86a6ed4,0
976,f8dacbde,0
985,fbe1fea6,0
992,fe5f7da8,0
