# vX3

# **Accessing working environment Kaggle**

In [1]:
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

/kaggle/input/data-science-bowl-2019/sample_submission.csv
/kaggle/input/data-science-bowl-2019/specs.csv
/kaggle/input/data-science-bowl-2019/train_labels.csv
/kaggle/input/data-science-bowl-2019/test.csv
/kaggle/input/data-science-bowl-2019/train.csv


# **Importing libraries**

In [2]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 200)
from time import time
import datetime as dt
import gc # clear garbage

# Debugging f-ions

In [3]:
def debugging_ids(df):
    return print(f'Debugging submitted dataframe: \nUnique installation_ids: {len(set(df.installation_id))} \nRows & columns count {df.shape}')

# **Loading data**

In [4]:
load_columns = ['event_id',
                'game_session',
                'timestamp',                
                'installation_id',
                'event_count',
                'event_code',
                'game_time',
                'title',
                'type',
                'world',
                'event_data']

path = '/kaggle/input/data-science-bowl-2019/' # create url path to the datasets

t0 = time()

print('Loading datasets...')
X_train = pd.read_csv(path + 'train.csv', usecols = load_columns)
X_labels = pd.read_csv(path + 'train_labels.csv')
# specs = pd.read_csv(path + 'specs.csv')
#X_test = pd.read_csv(path + 'test.csv', usecols = load_columns)
#submission = pd.read_csv(path + 'sample_submission.csv')
print("Datasets loaded successfully! \nLoading time:", round(time() - t0, 3), "s")

Loading datasets...
Datasets loaded successfully! 
Loading time: 68.926 s


# **Data preparation**

### **(T) Reducing train df with users having accuracy scores (17000 -> 3614 installation_ids)**

In [5]:
# X_train has 17000 installation_id's, however there are only for 3614 installation_id's (X_labels and X_train) with Assessment attempt
# Reducing X_train to 17000 -> 3614 installation_ids
X_train = X_train[X_train['installation_id'].isin(set(X_labels.installation_id))]

### **Extracting accuracy of previous Assessment attempts**

* Preparing train set which is identical to train_labels except:
* accuracy differs for 46 observations due to saving in more floating points (16 ours vs 9 train_labels.csv)
* removed the last assessment's (target) row

#### (T) Create X_train_gt by extracting only rows with assessments events

In [6]:
# Creating X_train_gt to hold only rows with assessment attempts

X_train_gt = pd.DataFrame(data=None)

# X_train_gt will be used only for accuracy features extraction
# First, filter assessment events only

X_train_gt = X_train[((X_train['event_code'] == 4100) & 
                 (X_train['title'].isin(['Cart Balancer (Assessment)', 
                                    'Cauldron Filler (Assessment)', 
                                    'Mushroom Sorter (Assessment)', 
                                    'Chest Sorter (Assessment)']))) | 
                ((X_train['event_code'] == 4110) & 
                 (X_train['title'] == 'Bird Measurer (Assessment)'))].copy(deep=True)   

In [7]:
# debugging
debugging_ids(X_train_gt)

Debugging submitted dataframe: 
Unique installation_ids: 3614 
Rows & columns count (41549, 11)


In [8]:
#X_train_gt[X_train_gt['installation_id'] == '0006c192']

#### (T) Drop columns which will be processed later

In [9]:
# Fourth, drop columns which will be processed separately

X_train_gt.drop(['event_id', 
                 'timestamp', 
                 'event_count', 
                 'event_code', 
                 'game_time',
                 'type',
                 'world',], axis=1, inplace=True)

In [10]:
gc.collect()

0

In [11]:
# debugging
debugging_ids(X_train_gt)

Debugging submitted dataframe: 
Unique installation_ids: 3614 
Rows & columns count (41549, 4)


#### (T) Extract accuracy features from 'event_data'

In [12]:
# Fifths, extract correct and incorrect assessment attempts per user from 'event_data'
# Create num_correct and num_incorrect columns

corr = '"correct":true'
incorr = '"correct":false'

X_train_gt['num_correct'] = X_train_gt['event_data'].apply(lambda x: 1 if corr in x else 0)
X_train_gt['num_incorrect'] = X_train_gt['event_data'].apply(lambda x: 1 if incorr in x else 0)

In [13]:
# debugging
debugging_ids(X_train_gt)

Debugging submitted dataframe: 
Unique installation_ids: 3614 
Rows & columns count (41549, 6)


In [14]:
#X_train_gt

In [15]:
# Sixths, aggregate (sum) correct and incorrect assessment attempts 
# per 'game_session', 'installation_id' and assessment 'title'
# As provided in grount truth (labels.csv)

# previous aggregation was made together with sorting to match train_labels format
#X_train_gt = X_train_gt.sort_values(['installation_id', 'game_session'], ascending=True).groupby(['game_session', 'installation_id', 'title'], as_index=False, sort=False).agg(sum)
# a) difficult to extract last assessment
# b) difficult to truncate
# c) difficult to accumulate actions before assessment
X_train_gt = X_train_gt.groupby(['game_session', 'installation_id', 'title'], as_index=False, sort=False).agg(sum)

In [16]:
#X_train_gt

In [17]:
#X_labels

In [18]:
# # Great, because w/o sorting by game_session and installation_id 
# # we preserve the original order of events by timestamp 
# X_train_gt[X_train_gt['installation_id'] == '0006c192']

In [19]:
#X_labels[X_labels['installation_id'] == '0006c192']

In [20]:
#X_train[(X_train['installation_id'] == '0006c192') & ((X_train['event_code'] == 4100) | (X_train['event_code'] == 4110))]

In [21]:
# Sevenths, create 'accuracy' feature = corr / (corre + incorr)

X_train_gt['accuracy'] = X_train_gt['num_correct'] / (X_train_gt['num_correct'] + X_train_gt['num_incorrect'])

# Eighths, create 'accuracy_group' feature
# 3: the assessment was solved on the first attempt
# 2: the assessment was solved on the second attempt
# 1: the assessment was solved after 3 or more attempts
# 0: the assessment was never solved

# If accuracy is 0.0 (no correct attempts), accuracy group is 0 as all observations in X_train_gt by now has at least one attempt
# If accuracy is 1.0 (that is no incorrect attempts), accuracy group is 3
# If accuracy is 0.5 (there is equal amount of correct and incorrect attempts), accuracy group is 2
# Any other case means that accuracy group equals 1, that is 3 or more attempts were needed to make a correct attempt    

X_train_gt['accuracy_group'] = X_train_gt['accuracy'].apply(lambda x: 0 if x == 0.0 else (3 if x == 1.0 else (2 if x == 0.5 else 1)))

In [22]:
# debugging
debugging_ids(X_train_gt)

Debugging submitted dataframe: 
Unique installation_ids: 3614 
Rows & columns count (17690, 7)


In [23]:
# # task is to forecast 'accuracy_group' in the last 'game_session' of single 'installation_id'
# # E.g. 'installation_id' '0006a69f' last assessment
# # in last 'game_session' 'a9ef3ecb3d1acc6a' was 'Bird Measurer (Assessment)'
# # our task is to forecast that his 'accuracy_group' was '3' 
# X_train_gt

In [24]:
# # Comparing with ground truth sample:
# X_labels.head(8)
# # As we removed sorting, only overall count should match

In [25]:
# # Double check accuracy figures in X_train_gt and X_labels

# print(f'SUM (OK)')
# print(f'X_train_gt has accuracy_group sum of {sum(X_train_gt["accuracy_group"])} \nX_labels has accuracy_group sum of {sum(X_labels["accuracy_group"])}')

# print(f'\nTYPE (OK)')
# print(f'Type of X_train_gt num_correct is {type(X_train_gt["num_correct"][0])} \nType of X_labels num_correct is {type(X_labels["num_correct"][0])}')
# print(f'Type of X_train_gt num_incorrect is {type(X_train_gt["num_incorrect"][0])} \nType of X_labels num_incorrect is {type(X_labels["num_incorrect"][0])}') 
# print(f'Type of X_train_gt accuracy is {type(X_train_gt["accuracy"][0])} \nType of X_labels accuracy is {type(X_labels["accuracy"][0])}') 
# print(f'Type of X_train_gt accuracy_group is {type(X_train_gt["accuracy_group"][0])} \nType of X_labels accuracy_group is {type(X_labels["accuracy_group"][0])}')

# print(f'\nDIFFERENCES')
# print(f'Difference between accuracy column in X_train_gt and X_labels is: {set(X_train_gt["accuracy"] - X_labels["accuracy"])}')
# print(f'Difference between accuracy_group column in X_train_gt and X_labels is: {set(X_train_gt["accuracy_group"] - X_labels["accuracy_group"])}')
# print(f'Accuracy set len in X_train_gt is: {len(set(X_train_gt["accuracy"]))}')
# print(f'Accuracy set len in X_labels is: {len(set(X_labels["accuracy"]))}')
# print(f'Difference between num_correct column in X_train_gt and X_labels is: {set(X_train_gt["num_correct"] - X_labels["num_correct"])}')
# print(f'Difference between num_incorrect column in X_train_gt and X_labels is: {set(X_train_gt["num_incorrect"] - X_labels["num_incorrect"])}')

# print(f'\nEQUAL VALUES ROW BY ROW')

# booltest_session = X_train_gt.game_session == X_labels.game_session
# booltest_ids = X_train_gt.installation_id == X_labels.installation_id
# booltest_title = X_train_gt.title == X_labels.title
# booltest_num_correct = X_train_gt.num_correct == X_labels.num_correct
# booltest_num_incorrect = X_train_gt.num_incorrect == X_labels.num_incorrect
# booltest_accuracy = X_train_gt.accuracy == X_labels.accuracy
# booltest_accuracy_group = X_train_gt.accuracy_group == X_labels.accuracy_group

# print(f'Equal values (TRUE) of game_session in X_train_gt and X_labels: \n{booltest_session.value_counts()}')
# print(f'Equal values (TRUE) of installation_id in X_train_gt and X_labels: \n{booltest_ids.value_counts()}')
# print(f'Equal values (TRUE) of title in X_train_gt and X_labels: \n{booltest_title.value_counts()}')
# print(f'Equal values (TRUE) of num_correct in X_train_gt and X_labels: \n{booltest_num_correct.value_counts()}')
# print(f'Equal values (TRUE) of num_incorrect in X_train_gt and X_labels: \n{booltest_num_incorrect.value_counts()}')
# print(f'Equal values (TRUE) of accuracy in X_train_gt and X_labels: \n{booltest_accuracy.value_counts()}')
# print(f'Equal values (TRUE) of accuracy_group in X_train_gt and X_labels: \n{booltest_accuracy_group.value_counts()}')

# # Changelog:
# # Index was fixed by applying .sort_values(['installation_id', 'game_session'], ascending=True) in the groupby part
# # Now difference between accuracy_group columns in X_train_gt and X_labels should be {0}

In [26]:
## Debugging 46 accuracy scores which do not match.
# not_matching_accuracy_df = X_train_gt.accuracy - X_labels.accuracy
# not_matching_accuracy_df = not_matching_accuracy_df[not_matching_accuracy_df != 0]
# #len(not_matching_accuracy_df) = 46
# X_train_gt[X_train_gt.index.isin(not_matching_accuracy_df.index)]
# # X_labels[X_labels.index.isin(not_matching_accuracy_df.index)]
# # Conclusion: We produce 16 digits after comma, train_labels.csv has 9
#X_train_gt[X_train_gt.index.isin(not_matching_accuracy_df.index)].to_csv("different_accuracies.csv", index = False)

### (T) Accuracy groups

In [27]:
X_train_gt['acc_0'] = X_train_gt['accuracy_group'].apply(lambda x: 1 if x == 0 else 0)
X_train_gt['acc_1'] = X_train_gt['accuracy_group'].apply(lambda x: 1 if x == 1 else 0)
X_train_gt['acc_2'] = X_train_gt['accuracy_group'].apply(lambda x: 1 if x == 2 else 0)
X_train_gt['acc_3'] = X_train_gt['accuracy_group'].apply(lambda x: 1 if x == 3 else 0)

In [28]:
# debugging
# X_train_gt[X_train_gt['installation_id'] == '0006a69f']

In [29]:
# debugging
debugging_ids(X_train_gt)

Debugging submitted dataframe: 
Unique installation_ids: 3614 
Rows & columns count (17690, 11)


### (T) Accuracy groups per assessment 'title'

In [30]:
# Accuracy group per assessment title
# Ref: https://stackoverflow.com/questions/27474921/compare-two-columns-using-pandas/27475029
# (condition, output value, else)

X_train_gt['bird_accg_0'] = np.where((X_train_gt['title'] == 'Bird Measurer (Assessment)') & (X_train_gt['accuracy_group'] == 0), 1, 0)
X_train_gt['bird_accg_1'] = np.where((X_train_gt['title'] == 'Bird Measurer (Assessment)') & (X_train_gt['accuracy_group'] == 1), 1, 0)
X_train_gt['bird_accg_2'] = np.where((X_train_gt['title'] == 'Bird Measurer (Assessment)') & (X_train_gt['accuracy_group'] == 2), 1, 0)
X_train_gt['bird_accg_3'] = np.where((X_train_gt['title'] == 'Bird Measurer (Assessment)') & (X_train_gt['accuracy_group'] == 3), 1, 0)

X_train_gt['cart_accg_0'] = np.where((X_train_gt['title'] == 'Cart Balancer (Assessment)') & (X_train_gt['accuracy_group'] == 0), 1, 0)
X_train_gt['cart_accg_1'] = np.where((X_train_gt['title'] == 'Cart Balancer (Assessment)') & (X_train_gt['accuracy_group'] == 1), 1, 0)
X_train_gt['cart_accg_2'] = np.where((X_train_gt['title'] == 'Cart Balancer (Assessment)') & (X_train_gt['accuracy_group'] == 2), 1, 0)
X_train_gt['cart_accg_3'] = np.where((X_train_gt['title'] == 'Cart Balancer (Assessment)') & (X_train_gt['accuracy_group'] == 3), 1, 0)

X_train_gt['cauldron_accg_0'] = np.where((X_train_gt['title'] == 'Cauldron Filler (Assessment)') & (X_train_gt['accuracy_group'] == 0), 1, 0)
X_train_gt['cauldron_accg_1'] = np.where((X_train_gt['title'] == 'Cauldron Filler (Assessment)') & (X_train_gt['accuracy_group'] == 1), 1, 0)
X_train_gt['cauldron_accg_2'] = np.where((X_train_gt['title'] == 'Cauldron Filler (Assessment)') & (X_train_gt['accuracy_group'] == 2), 1, 0)
X_train_gt['cauldron_accg_3'] = np.where((X_train_gt['title'] == 'Cauldron Filler (Assessment)') & (X_train_gt['accuracy_group'] == 3), 1, 0)

X_train_gt['chest_accg_0'] = np.where((X_train_gt['title'] == 'Chest Sorter (Assessment)') & (X_train_gt['accuracy_group'] == 0), 1, 0)
X_train_gt['chest_accg_1'] = np.where((X_train_gt['title'] == 'Chest Sorter (Assessment)') & (X_train_gt['accuracy_group'] == 1), 1, 0)
X_train_gt['chest_accg_2'] = np.where((X_train_gt['title'] == 'Chest Sorter (Assessment)') & (X_train_gt['accuracy_group'] == 2), 1, 0)
X_train_gt['chest_accg_3'] = np.where((X_train_gt['title'] == 'Chest Sorter (Assessment)') & (X_train_gt['accuracy_group'] == 3), 1, 0)

X_train_gt['mushroom_accg_0'] = np.where((X_train_gt['title'] == 'Mushroom Sorter (Assessment)') & (X_train_gt['accuracy_group'] == 0), 1, 0)
X_train_gt['mushroom_accg_1'] = np.where((X_train_gt['title'] == 'Mushroom Sorter (Assessment)') & (X_train_gt['accuracy_group'] == 1), 1, 0)
X_train_gt['mushroom_accg_2'] = np.where((X_train_gt['title'] == 'Mushroom Sorter (Assessment)') & (X_train_gt['accuracy_group'] == 2), 1, 0)
X_train_gt['mushroom_accg_3'] = np.where((X_train_gt['title'] == 'Mushroom Sorter (Assessment)') & (X_train_gt['accuracy_group'] == 3), 1, 0)

In [31]:
# debugging
#X_train_gt['mushroom_accg_0'][17688]

In [32]:
# debugging
#X_train_gt[X_train_gt['installation_id'] == '0006a69f']

In [33]:
# debugging
debugging_ids(X_train_gt)

Debugging submitted dataframe: 
Unique installation_ids: 3614 
Rows & columns count (17690, 31)


### (T) Accuracy (corr, incorr, accuracy) per assessment

In [34]:
# Accuracy group per assessment title
# Ref: https://stackoverflow.com/questions/27474921/compare-two-columns-using-pandas/27475029
# (condition, output value, else)
# E.g. if Bird Measurer has num_correct = 1, add 1, elsewise add 0

X_train_gt['bird_correct'] = np.where((X_train_gt['title'] == 'Bird Measurer (Assessment)') & (X_train_gt['num_correct'] == 1), 1, 0)
X_train_gt['bird_incorrect'] = np.where((X_train_gt['title'] == 'Bird Measurer (Assessment)') & (X_train_gt['num_incorrect'] > 0), X_train_gt['num_incorrect'], 0)
X_train_gt['bird_accuracy'] = np.where((X_train_gt['title'] == 'Bird Measurer (Assessment)'), X_train_gt['accuracy'], 0)

X_train_gt['cart_correct'] = np.where((X_train_gt['title'] == 'Cart Balancer (Assessment)') & (X_train_gt['num_correct'] == 1), 1, 0)
X_train_gt['cart_incorrect'] = np.where((X_train_gt['title'] == 'Cart Balancer (Assessment)') & (X_train_gt['num_incorrect'] > 0), X_train_gt['num_incorrect'], 0)
X_train_gt['cart_accuracy'] = np.where((X_train_gt['title'] == 'Cart Balancer (Assessment)'), X_train_gt['accuracy'], 0)

X_train_gt['cauldron_correct'] = np.where((X_train_gt['title'] == 'Cauldron Filler (Assessment)') & (X_train_gt['num_correct'] == 1), 1, 0)
X_train_gt['cauldron_incorrect'] = np.where((X_train_gt['title'] == 'Cauldron Filler (Assessment)') & (X_train_gt['num_incorrect'] > 0), X_train_gt['num_incorrect'], 0)
X_train_gt['cauldron_accuracy'] = np.where((X_train_gt['title'] == 'Cauldron Filler (Assessment)'), X_train_gt['accuracy'], 0)

X_train_gt['chest_correct'] = np.where((X_train_gt['title'] == 'Chest Sorter (Assessment)') & (X_train_gt['num_correct'] == 1), 1, 0)
X_train_gt['chest_incorrect'] = np.where((X_train_gt['title'] == 'Chest Sorter (Assessment)') & (X_train_gt['num_incorrect'] > 0), X_train_gt['num_incorrect'], 0)
X_train_gt['chest_accuracy'] = np.where((X_train_gt['title'] == 'Chest Sorter (Assessment)'), X_train_gt['accuracy'], 0)

X_train_gt['mushroom_correct'] = np.where((X_train_gt['title'] == 'Mushroom Sorter (Assessment)') & (X_train_gt['num_correct'] == 1), 1, 0)
X_train_gt['mushroom_incorrect'] = np.where((X_train_gt['title'] == 'Mushroom Sorter (Assessment)') & (X_train_gt['num_incorrect'] > 0), X_train_gt['num_incorrect'], 0)
X_train_gt['mushroom_accuracy'] = np.where((X_train_gt['title'] == 'Mushroom Sorter (Assessment)'), X_train_gt['accuracy'], 0)

In [35]:
# debugging
# X_train_gt[X_train_gt['installation_id'] == '0006a69f']

In [36]:
# debugging
debugging_ids(X_train_gt)

Debugging submitted dataframe: 
Unique installation_ids: 3614 
Rows & columns count (17690, 46)


### Removing last assessment from train set

* X_train_gt at this point has 41549 assessments
* Can not remove just last one before aggregation 

In [37]:
# Remove the last assessment's attempt from train (new from 200115)

# Build temporary df which holds last assessment
X_train_gt_last = X_train_gt.groupby('installation_id').tail(1).copy(deep=True)
X_train_gt_last_index_list = list(X_train_gt_last.index)

# Removing last assessment attempt from test set
# 'installation_id's drop 3614->3021 as we have users who had just single attempt
X_train_gt = X_train_gt.loc[~X_train_gt.index.isin(X_train_gt_last_index_list)]

In [38]:
# debugging
debugging_ids(X_train_gt_last)

Debugging submitted dataframe: 
Unique installation_ids: 3614 
Rows & columns count (3614, 46)


In [39]:
# # debugging
# X_train_gt_last.head(5)

In [40]:
# # debugging, good case of 0006c192
# X_train[(X_train['installation_id'] == '0006c192') & ((X_train['event_code'] == 4100) | (X_train['event_code'] == 4110))]

In [41]:
# X_train_gt_last[(X_train_gt_last['installation_id'] == '0006c192')]

In [42]:
# debugging
debugging_ids(X_train_gt)

Debugging submitted dataframe: 
Unique installation_ids: 2587 
Rows & columns count (14076, 46)


### (~T) Aggregation

Tested the build, updated avoiding extra df, but haven't double-checked sample means or sums

In [43]:
X_train_gt_sum_list = ['num_correct', 'num_incorrect', 
       'bird_correct', 'bird_incorrect',
       'cart_correct', 'cart_incorrect', 'cauldron_correct',
       'cauldron_incorrect', 'chest_correct',
       'chest_incorrect', 'mushroom_correct',
       'mushroom_incorrect', 'acc_0',
       'acc_1', 'acc_2', 'acc_3', 'bird_accg_0', 'bird_accg_1', 'bird_accg_2',
       'bird_accg_3', 'cart_accg_0', 'cart_accg_1', 'cart_accg_2',
       'cart_accg_3', 'cauldron_accg_0', 'cauldron_accg_1', 'cauldron_accg_2',
       'cauldron_accg_3', 'chest_accg_0', 'chest_accg_1', 'chest_accg_2',
       'chest_accg_3', 'mushroom_accg_0', 'mushroom_accg_1', 'mushroom_accg_2',
       'mushroom_accg_3']

X_train_gt_mean_list = ['accuracy',
       'accuracy_group', 'bird_accuracy',
       'cart_accuracy', 'cauldron_accuracy', 'chest_accuracy', 'mushroom_accuracy']

In [44]:
#len(X_train_gt_sum_list), len(X_train_gt_mean_list)

In [45]:
X_train_gt_sum_df = X_train_gt.groupby(['installation_id'], as_index=False, sort=False)[X_train_gt_sum_list].agg(sum)

In [46]:
#X_train_gt_sum_df

In [47]:
X_train_gt_mean_df = X_train_gt.groupby(['installation_id'], as_index=False, sort=False)[X_train_gt_mean_list].agg('mean')

In [48]:
#X_train_gt_mean_df

In [49]:
#X_train_gt_unchaged_df = X_train_gt.groupby(['installation_id'], as_index=False, sort=False)[X_train_gt_unchanged_list].last()

In [50]:
X_train_gt = pd.merge(X_train_gt_sum_df, X_train_gt_mean_df, how='left', on=['installation_id'])

In [51]:
del X_train_gt_sum_df, X_train_gt_mean_df
gc.collect()

0

In [52]:
#X_train_gt

In [53]:
# debugging
debugging_ids(X_train_gt)

Debugging submitted dataframe: 
Unique installation_ids: 2587 
Rows & columns count (2587, 44)


## Adding users w/o previous assessment attempts

In [54]:
train_features_list = X_train_gt.columns

In [55]:
print(f'X_train iids: {len(set(X_train.installation_id))} \nX_train_gt iids: {len(set(X_train_gt.installation_id))} \nX_labels iids: {len(set(X_labels.installation_id))}')

X_train iids: 3614 
X_train_gt iids: 2587 
X_labels iids: 3614


In [56]:
train_users_wo_assessments = set(X_train.installation_id) - set(X_train_gt.installation_id)
len(train_users_wo_assessments)

1027

### Creating empty df matching test's columns

In [57]:
train_users_wo_assessments_df = pd.DataFrame(0, index=np.arange(len(train_users_wo_assessments)), columns=train_features_list)

In [58]:
train_users_wo_assessments_df

Unnamed: 0,installation_id,num_correct,num_incorrect,bird_correct,bird_incorrect,cart_correct,cart_incorrect,cauldron_correct,cauldron_incorrect,chest_correct,chest_incorrect,mushroom_correct,mushroom_incorrect,acc_0,acc_1,acc_2,acc_3,bird_accg_0,bird_accg_1,bird_accg_2,bird_accg_3,cart_accg_0,cart_accg_1,cart_accg_2,cart_accg_3,cauldron_accg_0,cauldron_accg_1,cauldron_accg_2,cauldron_accg_3,chest_accg_0,chest_accg_1,chest_accg_2,chest_accg_3,mushroom_accg_0,mushroom_accg_1,mushroom_accg_2,mushroom_accg_3,accuracy,accuracy_group,bird_accuracy,cart_accuracy,cauldron_accuracy,chest_accuracy,mushroom_accuracy
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1022,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1023,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1024,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1025,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


### Adding 'installation_id's w/o prior assessments

In [59]:
# We have created installation_id column with zero values. Now will assign missing 'installation_id's:
train_users_wo_assessments_df['installation_id'] = train_users_wo_assessments

In [60]:
train_users_wo_assessments_df

Unnamed: 0,installation_id,num_correct,num_incorrect,bird_correct,bird_incorrect,cart_correct,cart_incorrect,cauldron_correct,cauldron_incorrect,chest_correct,chest_incorrect,mushroom_correct,mushroom_incorrect,acc_0,acc_1,acc_2,acc_3,bird_accg_0,bird_accg_1,bird_accg_2,bird_accg_3,cart_accg_0,cart_accg_1,cart_accg_2,cart_accg_3,cauldron_accg_0,cauldron_accg_1,cauldron_accg_2,cauldron_accg_3,chest_accg_0,chest_accg_1,chest_accg_2,chest_accg_3,mushroom_accg_0,mushroom_accg_1,mushroom_accg_2,mushroom_accg_3,accuracy,accuracy_group,bird_accuracy,cart_accuracy,cauldron_accuracy,chest_accuracy,mushroom_accuracy
0,211c5b5b,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,10629430,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,de0a0baf,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,26ab8f19,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,d596f097,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1022,aa38f420,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1023,9bf2c48f,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1024,172eb0d1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1025,89da02a3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


### Merging 'installation_id's with and w/o assessments

In [61]:
X_train_gt = X_train_gt.append(train_users_wo_assessments_df, ignore_index=True)

In [62]:
X_train_gt

Unnamed: 0,installation_id,num_correct,num_incorrect,bird_correct,bird_incorrect,cart_correct,cart_incorrect,cauldron_correct,cauldron_incorrect,chest_correct,chest_incorrect,mushroom_correct,mushroom_incorrect,acc_0,acc_1,acc_2,acc_3,bird_accg_0,bird_accg_1,bird_accg_2,bird_accg_3,cart_accg_0,cart_accg_1,cart_accg_2,cart_accg_3,cauldron_accg_0,cauldron_accg_1,cauldron_accg_2,cauldron_accg_3,chest_accg_0,chest_accg_1,chest_accg_2,chest_accg_3,mushroom_accg_0,mushroom_accg_1,mushroom_accg_2,mushroom_accg_3,accuracy,accuracy_group,bird_accuracy,cart_accuracy,cauldron_accuracy,chest_accuracy,mushroom_accuracy
0,0006a69f,3,12,0,11,0,0,0,0,0,0,3,1,1,0,1,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2,0.625000,2.00,0.000000,0.000,0.000,0.0,0.6250
1,0006c192,1,4,0,0,0,0,1,0,0,0,0,4,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0.500000,1.50,0.000000,0.000,0.500,0.0,0.0000
2,001d0ed0,3,6,0,5,1,0,0,0,0,0,2,1,1,0,1,2,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0.625000,2.00,0.000000,0.250,0.000,0.0,0.3750
3,002db7e3,6,24,2,11,1,0,1,0,0,12,2,1,2,1,2,3,0,1,1,0,0,0,0,1,0,0,0,1,2,0,0,0,0,0,1,1,0.511364,1.75,0.073864,0.125,0.125,0.0,0.1875
4,003372b0,4,5,1,1,2,0,0,0,0,4,1,0,1,0,1,3,0,0,1,0,0,0,0,2,0,0,0,0,1,0,0,0,0,0,0,1,0.700000,2.20,0.100000,0.400,0.000,0.0,0.2000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3609,aa38f420,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.000000,0.00,0.000000,0.000,0.000,0.0,0.0000
3610,9bf2c48f,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.000000,0.00,0.000000,0.000,0.000,0.0,0.0000
3611,172eb0d1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.000000,0.00,0.000000,0.000,0.000,0.0,0.0000
3612,89da02a3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.000000,0.00,0.000000,0.000,0.000,0.0,0.0000


In [63]:
# debugging
debugging_ids(X_train_gt)

Debugging submitted dataframe: 
Unique installation_ids: 3614 
Rows & columns count (3614, 44)


In [64]:
# # debugging
# # we lost the order of 'installation_id', but submission is sorted ascending
# booltrain_label = X_train_gt.installation_id.sort_values(ascending=True).reset_index(drop=True) == X_labels.installation_id
# set(booltrain_label)

In [65]:
del train_users_wo_assessments_df
gc.collect()

0

### (T) Sorting to match order of initial train set
* Because after merger of users with previous assessments and without we lost the initial ordering

In [66]:
X_train_gt = X_train_gt.sort_values('installation_id', ascending=True).reset_index(drop=True)

In [67]:
# X_labels

In [68]:
# # debugging
# # check if sorting of 'installation_id's matches train_labels sorting
# # for this need to drop duplicates in X_labels as it contain 17690 rows with 'installation_id's
# # ref: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.drop_duplicates.html
# # reseting index and dropping old index via reset_index(drop=True)
# # does not lose the sorting
# # THIS PART TO BE UNCOMMENTED:
# X_labels_unique_installation_id = X_labels.installation_id.drop_duplicates().reset_index(drop=True)
# booltrain_label = X_train_gt.installation_id == X_labels_unique_installation_id
# set(booltrain_label)

In [69]:
# del X_labels_unique_installation_id, booltrain_label
# gc.collect()

In [70]:
# debugging
debugging_ids(X_train_gt)

Debugging submitted dataframe: 
Unique installation_ids: 3614 
Rows & columns count (3614, 44)


### Adding previous assessments count

In [71]:
X_train_gt['previous_assessments_count'] = X_train_gt['num_correct'] + X_train_gt['num_incorrect']

# debugging
debugging_ids(X_train_gt)

Debugging submitted dataframe: 
Unique installation_ids: 3614 
Rows & columns count (3614, 45)


### Adding 'forecasted_assessment' feature

In [72]:
X_train_gt.shape, X_train_gt_last.shape

((3614, 45), (3614, 46))

In [73]:
#X_train_gt_last

In [74]:
# X_train[(X_train['installation_id'] == '0006c192') & ((X_train['event_code'] == 4100) | (X_train['event_code'] == 4110))]

In [75]:
# # X_train_gt_last is taking X_train index 4137->11337808
# train_forecasted_assessment_df = X_train_gt_last.sort_values('installation_id', ascending=True).reset_index(drop=True)
# train_forecasted_assessment_df

In [76]:
# # check if last df had the right 'title' for forecasted assessment
# X_labels.head(20)

In [77]:
# # double-check sorting - OK
# boollast_label = train_forecasted_assessment_df.installation_id == X_labels_unique_installation_id
# set(boollast_label)

In [78]:
# train_forecasted_assessment_df.shape

In [79]:
#X_train_gt

In [80]:
# # Need to reset X_train_gt_last index for boolean comparison
# X_train_gt_last

In [81]:
# Debugging - double-check sorting of X_train_gt_last & X_train_gt
X_train_gt_last = X_train_gt_last.reset_index(drop=True)
# Above we updated the X_train_gt_last index to match 0-3613 (total of 3614)
booltrain_last = X_train_gt.installation_id == X_train_gt_last.installation_id
set(booltrain_last)

{True}

In [82]:
del booltrain_last
gc.collect()

0

In [83]:
# # Updated index:
# X_train_gt_last

In [84]:
X_train_gt['forecasted_assessment'] = X_train_gt_last['title'].map({'Bird Measurer (Assessment)': 0,
                                                                            'Cart Balancer (Assessment)': 1, 
                                                                            'Cauldron Filler (Assessment)': 2, 
                                                                            'Chest Sorter (Assessment)': 3, 
                                                                            'Mushroom Sorter (Assessment)': 4})

In [85]:
# debugging
debugging_ids(X_train_gt)

Debugging submitted dataframe: 
Unique installation_ids: 3614 
Rows & columns count (3614, 46)


In [86]:
set(X_train_gt.forecasted_assessment), X_train_gt.forecasted_assessment.count()

({0, 1, 2, 3, 4}, 3614)

# (~T) Adding non accuracy features
### bugs:
#### - data is not truncated after forecasted_event
#### - we take last assessment, which might better off be random

Given that test set contains almost half of installation_ids without previous assessments, we need to add other than accuracy features for model to pick up

## (~T) event_code

#### Preparing event_code features

In [87]:
#X_train

In [88]:
def event_code(df):
    df = pd.get_dummies(data=df, columns=['event_code'])
    df.columns = df.columns.str.lower()
    df.columns = df.columns.str.replace(' |-|!|\)|\(', '')
    df = df.groupby(['installation_id'], as_index=False, sort=False).agg(sum)  
    return df

In [89]:
# Uses ~3 GB of RAM for this operation (9->12->9)
X_train_eventcode = X_train.filter(['installation_id', 'event_code'], axis=1)
X_train_eventcode = event_code(X_train)

In [90]:
#X_train_eventcode

#### Merging event_code features to the main train set

In [91]:
# Add event_code features to the main dataframe
X_train_gt = pd.merge(X_train_gt, X_train_eventcode, on=['installation_id'])
# # Count nan in df for debugging purposes
# X_train_gt.isna().sum()

del X_train_eventcode
gc.collect()

# debugging
debugging_ids(X_train_gt)

Debugging submitted dataframe: 
Unique installation_ids: 3614 
Rows & columns count (3614, 90)


In [92]:
X_train_gt

Unnamed: 0,installation_id,num_correct,num_incorrect,bird_correct,bird_incorrect,cart_correct,cart_incorrect,cauldron_correct,cauldron_incorrect,chest_correct,chest_incorrect,mushroom_correct,mushroom_incorrect,acc_0,acc_1,acc_2,acc_3,bird_accg_0,bird_accg_1,bird_accg_2,bird_accg_3,cart_accg_0,cart_accg_1,cart_accg_2,cart_accg_3,cauldron_accg_0,cauldron_accg_1,cauldron_accg_2,cauldron_accg_3,chest_accg_0,chest_accg_1,chest_accg_2,chest_accg_3,mushroom_accg_0,mushroom_accg_1,mushroom_accg_2,mushroom_accg_3,accuracy,accuracy_group,bird_accuracy,cart_accuracy,cauldron_accuracy,chest_accuracy,mushroom_accuracy,previous_assessments_count,forecasted_assessment,event_count,game_time,event_code_2000,event_code_2010,event_code_2020,event_code_2025,event_code_2030,event_code_2035,event_code_2040,event_code_2050,event_code_2060,event_code_2070,event_code_2075,event_code_2080,event_code_2081,event_code_2083,event_code_3010,event_code_3020,event_code_3021,event_code_3110,event_code_3120,event_code_3121,event_code_4010,event_code_4020,event_code_4021,event_code_4022,event_code_4025,event_code_4030,event_code_4031,event_code_4035,event_code_4040,event_code_4045,event_code_4050,event_code_4070,event_code_4080,event_code_4090,event_code_4095,event_code_4100,event_code_4110,event_code_4220,event_code_4230,event_code_4235,event_code_5000,event_code_5010
0,0006a69f,3,12,0,11,0,0,0,0,0,0,3,1,1,0,1,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2,0.625000,2.000000,0.000000,0.000,0.000000,0.00000,0.625000,15,0,226162,313548319,80.0,4.0,112.0,12.0,97.0,8.0,21.0,18.0,7.0,2.0,3.0,17.0,6.0,10.0,620.0,34.0,89.0,610.0,34.0,89.0,19.0,404.0,67.0,45.0,128.0,471.0,25.0,35.0,18.0,4.0,0.0,592.0,0.0,4.0,2.0,13.0,14.0,13.0,25.0,25.0,12.0,12.0
1,0006c192,1,4,0,0,0,0,1,0,0,0,0,4,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0.500000,1.500000,0.000000,0.000,0.500000,0.00000,0.000000,5,0,195452,201045937,50.0,0.0,52.0,2.0,45.0,2.0,6.0,5.0,1.0,1.0,0.0,4.0,1.0,3.0,251.0,29.0,22.0,245.0,29.0,20.0,7.0,197.0,65.0,37.0,34.0,340.0,0.0,49.0,20.0,1.0,0.0,676.0,0.0,4.0,1.0,6.0,2.0,5.0,6.0,6.0,0.0,0.0
2,00129856,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.000000,0.000000,0.000000,0.000,0.000000,0.00000,0.000000,0,0,71329,82287318,9.0,1.0,21.0,1.0,19.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,184.0,0.0,1.0,183.0,0.0,1.0,0.0,87.0,12.0,8.0,22.0,164.0,0.0,33.0,0.0,0.0,0.0,64.0,6.0,12.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
3,001d0ed0,3,6,0,5,1,0,0,0,0,0,2,1,1,0,1,2,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0.625000,2.000000,0.000000,0.250,0.000000,0.00000,0.375000,9,3,68233,79805035,52.0,3.0,52.0,2.0,44.0,2.0,10.0,9.0,0.0,0.0,0.0,8.0,2.0,6.0,124.0,39.0,34.0,121.0,38.0,33.0,6.0,96.0,0.0,0.0,26.0,96.0,0.0,15.0,6.0,0.0,0.0,186.0,0.0,2.0,2.0,9.0,5.0,0.0,0.0,0.0,0.0,0.0
4,00225f67,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.000000,0.000000,0.000000,0.000,0.000000,0.00000,0.000000,0,0,69922,67371483,25.0,0.0,12.0,3.0,9.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,92.0,55.0,10.0,91.0,53.0,8.0,4.0,101.0,0.0,18.0,21.0,150.0,0.0,31.0,0.0,0.0,2.0,267.0,0.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3609,ff9305d7,1,18,1,12,0,0,0,0,0,6,0,0,2,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0.025641,0.333333,0.025641,0.000,0.000000,0.00000,0.000000,19,0,698911,418496299,47.0,0.0,35.0,2.0,26.0,1.0,0.0,0.0,2.0,2.0,0.0,0.0,0.0,0.0,565.0,61.0,44.0,558.0,60.0,43.0,6.0,541.0,24.0,0.0,86.0,713.0,15.0,48.0,36.0,0.0,0.0,1005.0,0.0,0.0,2.0,7.0,17.0,0.0,0.0,0.0,0.0,0.0
3610,ff9715db,7,10,0,0,1,6,1,0,2,4,3,0,1,1,1,5,0,0,0,0,1,0,0,1,0,0,0,1,0,1,1,0,0,0,0,3,0.718750,2.250000,0.000000,0.125,0.125000,0.09375,0.375000,17,1,112546,130804117,43.0,7.0,60.0,4.0,51.0,3.0,0.0,0.0,2.0,2.0,0.0,1.0,0.0,1.0,219.0,36.0,45.0,215.0,34.0,43.0,7.0,248.0,54.0,31.0,40.0,316.0,0.0,38.0,22.0,0.0,0.0,504.0,0.0,0.0,1.0,28.0,0.0,0.0,0.0,0.0,3.0,3.0
3611,ffc90c32,5,2,1,0,1,0,1,0,0,0,2,2,0,1,0,4,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,1,0.866667,2.600000,0.200000,0.200,0.200000,0.00000,0.266667,7,3,120923,158478188,53.0,5.0,63.0,5.0,56.0,3.0,4.0,3.0,4.0,1.0,3.0,6.0,2.0,4.0,269.0,16.0,68.0,266.0,16.0,66.0,12.0,195.0,13.0,11.0,37.0,216.0,11.0,21.0,20.0,2.0,0.0,517.0,0.0,0.0,2.0,19.0,1.0,5.0,2.0,2.0,3.0,2.0
3612,ffd2871d,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.000000,0.000000,0.000000,0.000,0.000000,0.00000,0.000000,0,2,222002,424476883,32.0,1.0,83.0,4.0,73.0,2.0,4.0,3.0,3.0,3.0,0.0,8.0,1.0,7.0,261.0,32.0,61.0,258.0,32.0,59.0,11.0,272.0,30.0,1.0,29.0,373.0,24.0,79.0,32.0,7.0,0.0,627.0,0.0,3.0,5.0,2.0,0.0,4.0,12.0,12.0,0.0,0.0


## (~T) Title, type, world and event_code

#### Preparing title, type and world features

In [93]:
gc.collect()

0

In [94]:
# Uses RAM 9.1->13.8->8.7
def title_type_world(df):
    df = pd.get_dummies(data=df, columns=['title', 'type', 'world'])
    df.columns = df.columns.str.lower()
    df.columns = df.columns.str.replace(' |-|!|\)|\(', '')
    df = df.groupby(['installation_id'], as_index=False, sort=False).agg(sum) 
    return df

In [95]:
# Create new X_train_titletypeworldfeat which holds time only title, type and world features
X_train_titletypeworldfeat = X_train.filter(['installation_id', 'title', 'type', 'world'], axis=1)
X_train_titletypeworldfeat = title_type_world(X_train_titletypeworldfeat)

In [96]:
#X_train_titletypeworldfeat

#### Merging title, type and world features to the main train set

In [97]:
# Add title, type and world features to the main dataframe
X_train_gt = pd.merge(X_train_gt, X_train_titletypeworldfeat, on=['installation_id'])
# # Count nan in df for debugging purposes
# X_train_gt.isna().sum()

del X_train_titletypeworldfeat
gc.collect()

# debugging
debugging_ids(X_train_gt)

Debugging submitted dataframe: 
Unique installation_ids: 3614 
Rows & columns count (3614, 142)


## (~T) Other features

* all_actions_time

*     Aggregate amount (in ms) of time spent on Assessments, Activities and Games
*     Clips do not have time spent feature

* all_actions_time
* action_duration_mean (!!!)
* event_code_count_mean
* number_of_sessions_nu
* event_count_mean (!!!)

###### (T) all_actions_time

In [98]:
# nstallation_id	game_time
# 0	0006a69f	36368
# 1	0006c192	216374
# 2	00129856	39701
# 3	001d0ed0	38115
# 4	00225f67	26517
# ...	...	...
# 3609	ff9305d7	59417
# 3610	ff9715db	28408
# 3611	ffc90c32	43142
# 3612	ffd2871d	54533
# 3613	ffeb0b1b	71511

# vs
# installation_id	game_time
# 0	0006a69f	36368
# 1	0006c192	216374
# 2	00129856	39701
# 3	001d0ed0	38115
# 4	00225f67	26517
# ...	...	...
# 3609	ff9305d7	59417
# 3610	ff9715db	28408
# 3611	ffc90c32	43142
# 3612	ffd2871d	54533
# 3613	ffeb0b1b	71511


In [99]:
# Tested, works well, except truncation after last assessment
# Creating all_actions_time (games, activities and assessments)
# RAM: 8.7->8.5-8.7 GB
feat_gametime = X_train[X_train['type'].isin(['Assessment', 'Game', 'Activity'])]

# Extracting last assessment's time
feat_gametime_last = feat_gametime.groupby(['installation_id', 'game_session'], as_index=False, sort=False)[['game_time', 'type']].last()
feat_gametime_last = feat_gametime_last[feat_gametime_last['type'] == 'Assessment'].groupby('installation_id', as_index=False, sort=False)['game_time'].last()

# Finalizing the whole time
feat_gametime = feat_gametime.groupby(['installation_id', 'game_session'], as_index=False, sort=False)['game_time'].last()
feat_gametime = feat_gametime.groupby('installation_id', as_index=False, sort=False)['game_time'].sum()

# Removing last assessments time which is not available in test set
feat_gametime['game_time'] = feat_gametime['game_time'] - feat_gametime_last['game_time']
# Difference is correct, tested

# Merging to the main train set
X_train_gt['all_actions_time'] = feat_gametime['game_time']

# Deleting
del feat_gametime, feat_gametime_last 
gc.collect()

# debugging
debugging_ids(X_train_gt)

Debugging submitted dataframe: 
Unique installation_ids: 3614 
Rows & columns count (3614, 143)


###### action_duration_mean

In [100]:
# Creating action_duration_mean (games, activities and assessments) (!!!)
# RAM: 8.7->9.6->8.7 GB
feat_gametimemean = X_train[X_train['type'].isin(['Assessment', 'Game', 'Activity'])]
feat_gametimemean = feat_gametimemean.groupby(['installation_id', 'game_session'], as_index=False, sort=False)['game_time'].last()
feat_gametimemean = feat_gametimemean.groupby('installation_id', as_index=False, sort=False)['game_time'].mean()

# Merging to the main train set
X_train_gt['action_duration_mean'] = feat_gametimemean['game_time']

# Deleting
del feat_gametimemean
gc.collect()

# debugging
debugging_ids(X_train_gt)

Debugging submitted dataframe: 
Unique installation_ids: 3614 
Rows & columns count (3614, 144)


###### event_code_count_mean

In [101]:
# Creating event_code_count_mean (!!!)
# RAM: OK, flat
feat_eventcodecountmean = X_train.groupby(['installation_id', 'game_session'], as_index=False, sort=False)['event_code'].count()
feat_eventcodecountmean = feat_eventcodecountmean.groupby('installation_id', as_index=False, sort=False)['event_code'].mean()

# Merging to the main train set
X_train_gt['event_code_count_mean'] = feat_eventcodecountmean['event_code']

# Deleting
del feat_eventcodecountmean
gc.collect()

# debugging
debugging_ids(X_train_gt)

Debugging submitted dataframe: 
Unique installation_ids: 3614 
Rows & columns count (3614, 145)


##### number_of_sessions_nu

In [102]:
# Creating event_code_count_mean
# RAM: OK, flat
feat_numberofsessions = X_train.groupby(['installation_id'], as_index=False, sort=False)['game_session'].count()

# Merging to the main train set
X_train_gt['number_of_sessions_nu'] = feat_numberofsessions['game_session']

# Deleting
del feat_numberofsessions
gc.collect()

# debugging
debugging_ids(X_train_gt)

Debugging submitted dataframe: 
Unique installation_ids: 3614 
Rows & columns count (3614, 146)


##### event_count_mean

In [103]:
# Creating event_count_mean (!!!)
# RAM: OK, flat
feat_eventcountmean = X_train.groupby(['installation_id', 'game_session'], as_index=False, sort=False)['event_count'].last()
feat_eventcountmean = feat_eventcountmean.groupby('installation_id', as_index=False, sort=False)['event_count'].mean()

# Merging to the main train set
X_train_gt['event_count_mean'] = feat_eventcountmean['event_count']

# Deleting
del feat_eventcountmean
gc.collect()

# debugging
debugging_ids(X_train_gt)

Debugging submitted dataframe: 
Unique installation_ids: 3614 
Rows & columns count (3614, 147)


## (~T) timestamp

In [104]:
# bug - taking the last even, which might be not assessment
# could replace with mean

import datetime as dt

def timestamp_split(df):
    df['timestamp'] = pd.to_datetime(df['timestamp']) # converting argument to pandas datetime
#    df['year'] = df['timestamp'].dt.year # all are in 2019
    df['month'] = (df['timestamp'].dt.month).astype(int)
    df['day'] = (df['timestamp'].dt.day).astype(int) # returns day of the month 1-31
    df['hour'] = (df['timestamp'].dt.hour).astype(int) 
    df['minute'] = (df['timestamp'].dt.minute).astype(int)
#    df['second'] = df['timestamp'].dt.second # doubt it could give anything
    df['dayofweek'] = (df['timestamp'].dt.dayofweek).astype(int) # returns day of week in 0-6 integer format
    df['dayofyear'] = (df['timestamp'].dt.dayofyear).astype(int) # returns numeric day of year, might be useful for summer holidays
    df['quarter'] = (df['timestamp'].dt.quarter).astype(int)
    df['is_weekend'] = (np.where(df['dayofweek'].isin(['Sunday','Saturday']), 1, 0)).astype(int)
    df.drop(['timestamp'], axis=1, inplace=True)
    return df

In [105]:
# RAM 8.7->10->9.3
# Create new X_train_timefeat which holds time only features  
feat_time = X_train.filter(['installation_id', 'timestamp'], axis=1)
# Prepare time features from given timestamp 
feat_time = timestamp_split(feat_time)

  mask |= (ar1 == a)


In [106]:
# Defining as last (bug)
feat_time = feat_time.groupby('installation_id', as_index=False).last()

# Merging to the main train set
X_train_gt = pd.merge(X_train_gt, feat_time, on=['installation_id'])

# Deleting
del feat_time
gc.collect()

82

In [107]:
del X_train, X_labels
gc.collect()

# debugging
debugging_ids(X_train_gt)

Debugging submitted dataframe: 
Unique installation_ids: 3614 
Rows & columns count (3614, 155)


## Adding train target

In [108]:
# Update 200117, major bug fix
X_train_gt['Y_target'] = X_train_gt_last['accuracy_group']

# debugging
debugging_ids(X_train_gt)

Debugging submitted dataframe: 
Unique installation_ids: 3614 
Rows & columns count (3614, 156)


## Preparing X, y

In [109]:
X_train_model = X_train_gt.copy(deep=True)

#del X_train_gt
#gc.collect()

In [110]:
X_train_model.isna().sum()

installation_id    0
num_correct        0
num_incorrect      0
bird_correct       0
bird_incorrect     0
                  ..
dayofweek          0
dayofyear          0
quarter            0
is_weekend         0
Y_target           0
Length: 156, dtype: int64

In [111]:
# # Casting categorical features to str (must in Catboost & Eli5)
# X_train_model['forecasted_assessment'] = X_train_model['forecasted_assessment'].astype(str)
# categorical_features = ['forecasted_assessment'] #200119
# type(X_train_model.forecasted_assessment[3612])

In [112]:
# Elsewise LightGBMError: Do not support special JSON characters in feature name.
X_train_model.columns = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in X_train_model.columns]

##### StandardScaler

In [113]:
# Dropping non numeric column 'installation_id'
X_train_model = X_train_model.drop(['installation_id'], axis=1)

In [114]:
# Defining scaler
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

##### Targets

In [115]:
# Setting target & features
y = X_train_model.Y_target
feature_names = X_train_model.columns.drop(['Y_target'])
X = X_train_model[feature_names]

In [116]:
# Scaling
X_scaled = scaler.fit_transform(X.astype(np.float64))

##### Resampling

In [117]:
# import tensorflow as tf
# # from collections import Counter
# # from sklearn.datasets import make_classification
# # from imblearn.over_sampling import SMOTE # doctest: +NORMALIZE_WHITESPACE
# from imblearn import undersampling, oversampling
# from imblearn import under_sampling 
# from imblearn import over_sampling
# from imblearn.over_sampling import SMOTE

In [118]:
# # Ref: https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.over_sampling.SMOTE.html
# # from collections import Counter
# # from sklearn.datasets import make_classification
# from imblearn.over_sampling import SMOTE
# # X, y = make_classification(n_classes=2, 
# #                            class_sep=2,
# #                            weights=[0.1, 0.9], 
# #                            n_informative=3, 
# #                            n_redundant=1, 
# #                            flip_y=0,
# #                            n_features=20, 
# #                            n_clusters_per_class=1, 
# #                            n_samples=1000, 
# #                            random_state=10)

# # print('Original dataset shape %s' % Counter(y))

# sm = SMOTE(random_state=42)
# X_res, y_res = sm.fit_resample(X_scaled, y)
# print('Resampled dataset shape %s' % Counter(y_res))

# Metric

In [119]:
# Check Cohen Kappa Score:
from sklearn.metrics import cohen_kappa_score

# Model w XGBoost

In [120]:
# from sklearn.model_selection import train_test_split
# train_X, val_X, train_y, val_y = train_test_split(X, y, test_size=0.2, random_state=42)

# from sklearn.metrics import accuracy_score
# import xgboost as xgb

# xgb_clf = xgb.XGBClassifier(learning_rate=0.5,
#                     n_estimators=2000,
#                     max_depth=6,
#                     min_child_weight=0,
#                     gamma=0,
#                     reg_lambda=1,
#                     subsample=1,
#                     colsample_bytree=0.75,
#                     scale_pos_weight=1,
#                     objective='multi:softprob',
#                     num_class=4,
#                     verbose=200,
#                     random_state=42,
#                     early_stopping_rounds=10,
#                     verbose_eval=True)

# xgb_model = xgb_clf.fit(train_X, train_y)
# xgb_preds = xgb_model.predict(val_X)
# xgb_proba = xgb_model.predict_proba(val_X)

# xgb_kappa_score = cohen_kappa_score(val_y, xgb_preds, weights='quadratic')

# print(f'\n****')
# print(f'Accuracy of predictions is: {accuracy_score(val_y, xgb_preds)}')
# # NB! Add weights='quadratic' to get same result as QWK 
# print(f'Skikit-learn Cohen Kappa Score (Quadratic) of predictions is: {cohen_kappa_score(val_y, xgb_preds, weights="quadratic")}')

# Model w Catboost

In [121]:
#X_train_model.head(10)

In [122]:
#X_train_model.filter(items=['installation_id', 'num_correct', 'num_incorrect', 'forecasted_assessment'])

In [123]:
list(X_train_model.columns)

['num_correct',
 'num_incorrect',
 'bird_correct',
 'bird_incorrect',
 'cart_correct',
 'cart_incorrect',
 'cauldron_correct',
 'cauldron_incorrect',
 'chest_correct',
 'chest_incorrect',
 'mushroom_correct',
 'mushroom_incorrect',
 'acc_0',
 'acc_1',
 'acc_2',
 'acc_3',
 'bird_accg_0',
 'bird_accg_1',
 'bird_accg_2',
 'bird_accg_3',
 'cart_accg_0',
 'cart_accg_1',
 'cart_accg_2',
 'cart_accg_3',
 'cauldron_accg_0',
 'cauldron_accg_1',
 'cauldron_accg_2',
 'cauldron_accg_3',
 'chest_accg_0',
 'chest_accg_1',
 'chest_accg_2',
 'chest_accg_3',
 'mushroom_accg_0',
 'mushroom_accg_1',
 'mushroom_accg_2',
 'mushroom_accg_3',
 'accuracy',
 'accuracy_group',
 'bird_accuracy',
 'cart_accuracy',
 'cauldron_accuracy',
 'chest_accuracy',
 'mushroom_accuracy',
 'previous_assessments_count',
 'forecasted_assessment',
 'event_count',
 'game_time',
 'event_code_2000',
 'event_code_2010',
 'event_code_2020',
 'event_code_2025',
 'event_code_2030',
 'event_code_2035',
 'event_code_2040',
 'event_code_2

In [124]:
# # Catboost Classification
# # Important: X_scaled added 200121
# from sklearn.model_selection import train_test_split
# train_X, val_X, train_y, val_y = train_test_split(X_scaled, y, random_state = 0)

# from catboost import CatBoostClassifier
# from sklearn.metrics import accuracy_score

# params_cb = {
#             'max_depth' : 5,
#             'learning_rate' : 0.01,
#             'n_estimators' : 1493,
#             'verbose' : 200,
# #            'od_type': 'Iter',
#             'loss_function' : 'MultiClass' #200109 new
#             }

# cbc_model = CatBoostClassifier(**params_cb)
# cbc_model.fit(train_X, train_y)
# #cbc_model.fit(train_X, train_y, eval_set=(val_X, val_y), early_stopping_rounds=10, use_best_model=True) #200119 use_best suggestion for bestIteration = 2679, Shrink model to first 2680 iterations
# cbc_preds = cbc_model.predict(val_X)

# # Save Catboost accuracy
# cbc_score = accuracy_score(val_y, cbc_preds)
# print(f'\n****')
# print(f'Accuracy of predictions is: {accuracy_score(val_y, cbc_preds)}')

# # Check Cohen Kappa Score:
# from sklearn.metrics import cohen_kappa_score
# cbc_kappa_score = cohen_kappa_score(val_y, cbc_preds, weights='quadratic')

# # NB! Add weights='quadratic' to get same result as QWK 
# print(f'Skikit-learn Cohen Kappa Score (Quadratic) of predictions is: {cohen_kappa_score(val_y, cbc_preds, weights="quadratic")}')

# Model w Catboost regressor

In [125]:
# # CatBoostRegressor
# # Stopped by overfitting detector  (10 iterations wait)
# # bestTest = 0.8568023128
# # bestIteration = 1141
# # Shrink model to first 1142 iterations.
# # ****
# # Accuracy of predictions is: 0.734110203229486


# from sklearn.model_selection import train_test_split
# train_X, val_X, train_y, val_y = train_test_split(X_scaled, y, random_state = 0)

# from catboost import CatBoostRegressor
# #from sklearn.metrics import accuracy_score
# from sklearn.metrics import mean_squared_error

# params_cb = {
#             'max_depth' : 5,
#             'learning_rate' : 0.01,
#             'n_estimators' : 1142,
#             'verbose' : 200,
# #            'od_type': 'Iter',
#             'loss_function' : 'RMSE' #200109 new
#             }

# cbc_model = CatBoostRegressor(**params_cb)
# cbc_model.fit(train_X, train_y)
# #cbc_model.fit(train_X, train_y, eval_set=(val_X, val_y), early_stopping_rounds=10, use_best_model=True) #200119 use_best suggestion for bestIteration = 2679, Shrink model to first 2680 iterations
# cbc_preds = cbc_model.predict(val_X)

# # Save Catboost accuracy
# cbc_score = mean_squared_error(val_y, cbc_preds)
# print(f'\n****')
# print(f'Accuracy of predictions is: {mean_squared_error(val_y, cbc_preds)}')

# # # Check Cohen Kappa Score:
# # from sklearn.metrics import cohen_kappa_score
# # cbc_kappa_score = cohen_kappa_score(val_y, cbc_preds, weights='quadratic')

# # # NB! Add weights='quadratic' to get same result as QWK 
# # print(f'Skikit-learn Cohen Kappa Score (Quadratic) of predictions is: {cohen_kappa_score(val_y, cbc_preds, weights="quadratic")}')

In [126]:
# # GridSearchCV
# from sklearn.metrics import cohen_kappa_score, make_scorer
# from sklearn.model_selection import GridSearchCV

# kappa_scorer = make_scorer(cohen_kappa_score()
# grid = GridSearchCV(CatBoostClassifier(), param_grid={'C': [1, 10]}, scoring=kappa_scorer)

In [127]:
# # CV to assess model's quality
# # Ref: https://scikit-learn.org/stable/modules/model_evaluation.html

# # Creat 
# from sklearn.metrics import cohen_kappa_score, make_scorer
# kappa_scorer = make_scorer(cohen_kappa_score)

# # from sklearn import svm, datasets
# from sklearn.model_selection import cross_val_score
# clf_cbc = CatBoostClassifier(**params_cb)
# #cross_val_score(clf_cbc, X, y, cv=5, scoring='accuracy')
# #scores = cross_val_score(clf_cbc, X, y, cv=5, scoring='accuracy')
# cross_val_score(clf_cbc, X, y, cv=5, scoring=kappa_scorer) # scoring=cohen_kappa_score

# #cross_val_score(clf, X, y, cv=5, scoring='recall_macro')
# #array([0.96..., 0.96..., 0.96..., 0.93..., 1.        ])
# #>>> model = svm.SVC()
# #>>> cross_val_score(model, X, y, cv=5, scoring='wrong_choice')
# #Traceback (most recent call last):

In [128]:
# # Permutation Importance

# import eli5
# from eli5.sklearn import PermutationImportance

# perm = PermutationImportance(cbc_model, random_state=1).fit(val_X, val_y)
# eli5.show_weights(perm, top=160, feature_names = list(feature_names)) #val_X.columns.tolist() -> list(feature_names)

In [129]:
# # Permutation Importance XGBoost

# import eli5
# from eli5.sklearn import PermutationImportance

# perm = PermutationImportance(xgb_model, random_state=1).fit(val_X, val_y)
# eli5.show_weights(perm, top=150, feature_names = val_X.columns.tolist())

In [130]:
#type(val_X.forecasted_assessment[1087])

# PCA

In [131]:
# Apply PCA for dimension reduction
from sklearn.decomposition import PCA
pca = PCA(n_components=10).fit(X)
X_pca = pca.transform(X)
print(sum(pca.explained_variance_ratio_))

0.9999999999995036


# Model w LightGBM
* First - Classifier

In [132]:
# len(X), len(y), len(train_X), len(train_y), len(val_X), len(val_y)

In [133]:
# # Light GBM Classifier

# from sklearn.model_selection import train_test_split
# train_X, val_X, train_y, val_y = train_test_split(X, y, test_size=0.2765, random_state = 0)

# import lightgbm as lgb
# # from sklearn.metrics import accuracy_score
# from sklearn.metrics import log_loss
# # create dataset for lightgbm
# lgb_train = lgb.Dataset(train_X, train_y)
# lgb_eval = lgb.Dataset(val_X, val_y)

# # specify parameters
# params_lgb = {
#             'boosting_type': 'gbdt',
#             'objective': 'multiclass', #            'objective': 'multiclass',
#             'num_class': 4,
#             'metric': '',
#             'num_leaves': 31,
#             'learning_rate': 0.01,
#             'feature_fraction': 0.9,
#             'bagging_fraction': 0.8,
#             'bagging_freq': 5,
#             'verbose': 0,
#            'is_unbalance': True,
#             'num_iterations': 3000
#             }

# print('Starting training...')
# # train
# gbm_model = lgb.train(params_lgb,
#                       lgb_train,
#                      num_boost_round=20,
#                      valid_sets=lgb_eval,
#                      early_stopping_rounds=5
#                      )

# print('Starting predicting...')
# # predict
# gbm_pred = gbm_model.predict(val_X, num_iteration=gbm_model.best_iteration)
# # eval
# print(':', )
# print(f'log_loss of predictions is: {log_loss(val_y, gbm_preds)}')
# #print(f'Accuracy of predictions is: {accuracy_score(val_y, gbm_preds)}')
# #print(f'Skikit-learn Cohen Kappa Score (Quadratic) of predictions is: {cohen_kappa_score(val_y, gbm_pred, weights="quadratic")}')

In [134]:
# LGBM with PCA X_pca

from sklearn.model_selection import train_test_split
train_X, val_X, train_y, val_y = train_test_split(X_pca, y, random_state = 0)

import lightgbm as lgb
# from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error

# create dataset for lightgbm
lgb_train = lgb.Dataset(train_X, label=train_y)
lgb_eval = lgb.Dataset(val_X, label=val_y, reference=lgb_train)

# specify parameters
params_lgb = {'n_estimators': 10000,
            'boosting_type': 'gbdt',
            'objective': 'regression',
            'metric': 'rmse',
            'subsample': 0.75,
            'subsample_freq': 1,
            'learning_rate': 0.04,
            'feature_fraction': 0.9,
             'max_depth': 15,
            'lambda_l1': 1,  
            'lambda_l2': 1,
            'verbose': 100,
#            'early_stopping_rounds': 100, 
            'eval_metric': 'cappa'
            }

print('Starting training...')
# train
gbm_model = lgb.train(params_lgb, lgb_train, num_boost_round=20, valid_sets=lgb_eval) #, early_stopping_rounds=100)
#gbm_model = lgb.train(params_lgb, lgb_train, valid_sets=lgb_eval) #new200120 , verbose_eval=verbosity

print('Starting predicting...')
# predict
gbm_pred = gbm_model.predict(val_X, num_iteration=gbm_model.best_iteration)
# eval
print(':', )
print(f'The rmse of prediction is: {mean_squared_error(val_y, gbm_pred) ** 0.5}')
#print(f'Skikit-learn Cohen Kappa Score (Quadratic) of predictions is: {cohen_kappa_score(val_y, gbm_pred, weights="quadratic")}')

Starting training...
[1]	valid_0's rmse: 1.27862
[2]	valid_0's rmse: 1.27524
[3]	valid_0's rmse: 1.27347
[4]	valid_0's rmse: 1.2713
[5]	valid_0's rmse: 1.26926
[6]	valid_0's rmse: 1.2666
[7]	valid_0's rmse: 1.26432
[8]	valid_0's rmse: 1.26331
[9]	valid_0's rmse: 1.2619
[10]	valid_0's rmse: 1.26033
[11]	valid_0's rmse: 1.2592
[12]	valid_0's rmse: 1.25777
[13]	valid_0's rmse: 1.2565
[14]	valid_0's rmse: 1.25572
[15]	valid_0's rmse: 1.25553
[16]	valid_0's rmse: 1.25507
[17]	valid_0's rmse: 1.25425
[18]	valid_0's rmse: 1.25271
[19]	valid_0's rmse: 1.25166
[20]	valid_0's rmse: 1.25066
[21]	valid_0's rmse: 1.25073
[22]	valid_0's rmse: 1.24959
[23]	valid_0's rmse: 1.24846
[24]	valid_0's rmse: 1.24811
[25]	valid_0's rmse: 1.24766
[26]	valid_0's rmse: 1.24761
[27]	valid_0's rmse: 1.24788
[28]	valid_0's rmse: 1.2479
[29]	valid_0's rmse: 1.24749
[30]	valid_0's rmse: 1.24642
[31]	valid_0's rmse: 1.24672
[32]	valid_0's rmse: 1.24688
[33]	valid_0's rmse: 1.2459
[34]	valid_0's rmse: 1.24513
[35]	vali



[44]	valid_0's rmse: 1.24549
[45]	valid_0's rmse: 1.24557
[46]	valid_0's rmse: 1.24567
[47]	valid_0's rmse: 1.24595
[48]	valid_0's rmse: 1.24495
[49]	valid_0's rmse: 1.2447
[50]	valid_0's rmse: 1.24545
[51]	valid_0's rmse: 1.24559
[52]	valid_0's rmse: 1.24543
[53]	valid_0's rmse: 1.24643
[54]	valid_0's rmse: 1.24666
[55]	valid_0's rmse: 1.24631
[56]	valid_0's rmse: 1.24679
[57]	valid_0's rmse: 1.246
[58]	valid_0's rmse: 1.24577
[59]	valid_0's rmse: 1.24599
[60]	valid_0's rmse: 1.2463
[61]	valid_0's rmse: 1.24582
[62]	valid_0's rmse: 1.24646
[63]	valid_0's rmse: 1.24607
[64]	valid_0's rmse: 1.24583
[65]	valid_0's rmse: 1.24648
[66]	valid_0's rmse: 1.24674
[67]	valid_0's rmse: 1.24755
[68]	valid_0's rmse: 1.24759
[69]	valid_0's rmse: 1.24757
[70]	valid_0's rmse: 1.24742
[71]	valid_0's rmse: 1.24745
[72]	valid_0's rmse: 1.2481
[73]	valid_0's rmse: 1.24783
[74]	valid_0's rmse: 1.24804
[75]	valid_0's rmse: 1.24721
[76]	valid_0's rmse: 1.2476
[77]	valid_0's rmse: 1.24789
[78]	valid_0's rmse:

[345]	valid_0's rmse: 1.28648
[346]	valid_0's rmse: 1.28657
[347]	valid_0's rmse: 1.28641
[348]	valid_0's rmse: 1.28633
[349]	valid_0's rmse: 1.28648
[350]	valid_0's rmse: 1.28642
[351]	valid_0's rmse: 1.28654
[352]	valid_0's rmse: 1.2867
[353]	valid_0's rmse: 1.28674
[354]	valid_0's rmse: 1.28665
[355]	valid_0's rmse: 1.28681
[356]	valid_0's rmse: 1.28653
[357]	valid_0's rmse: 1.28648
[358]	valid_0's rmse: 1.28644
[359]	valid_0's rmse: 1.28696
[360]	valid_0's rmse: 1.28682
[361]	valid_0's rmse: 1.28701
[362]	valid_0's rmse: 1.28734
[363]	valid_0's rmse: 1.28747
[364]	valid_0's rmse: 1.28735
[365]	valid_0's rmse: 1.28727
[366]	valid_0's rmse: 1.28743
[367]	valid_0's rmse: 1.28783
[368]	valid_0's rmse: 1.28825
[369]	valid_0's rmse: 1.28885
[370]	valid_0's rmse: 1.28926
[371]	valid_0's rmse: 1.28931
[372]	valid_0's rmse: 1.28964
[373]	valid_0's rmse: 1.28971
[374]	valid_0's rmse: 1.29029
[375]	valid_0's rmse: 1.29066
[376]	valid_0's rmse: 1.29063
[377]	valid_0's rmse: 1.29043
[378]	valid

[653]	valid_0's rmse: 1.31596
[654]	valid_0's rmse: 1.31604
[655]	valid_0's rmse: 1.31611
[656]	valid_0's rmse: 1.31625
[657]	valid_0's rmse: 1.31647
[658]	valid_0's rmse: 1.3167
[659]	valid_0's rmse: 1.31673
[660]	valid_0's rmse: 1.31697
[661]	valid_0's rmse: 1.31699
[662]	valid_0's rmse: 1.31719
[663]	valid_0's rmse: 1.3173
[664]	valid_0's rmse: 1.31724
[665]	valid_0's rmse: 1.31723
[666]	valid_0's rmse: 1.31718
[667]	valid_0's rmse: 1.31719
[668]	valid_0's rmse: 1.31709
[669]	valid_0's rmse: 1.31707
[670]	valid_0's rmse: 1.31718
[671]	valid_0's rmse: 1.31716
[672]	valid_0's rmse: 1.31709
[673]	valid_0's rmse: 1.31704
[674]	valid_0's rmse: 1.3171
[675]	valid_0's rmse: 1.31728
[676]	valid_0's rmse: 1.31754
[677]	valid_0's rmse: 1.3174
[678]	valid_0's rmse: 1.31745
[679]	valid_0's rmse: 1.31751
[680]	valid_0's rmse: 1.31726
[681]	valid_0's rmse: 1.31734
[682]	valid_0's rmse: 1.31751
[683]	valid_0's rmse: 1.31759
[684]	valid_0's rmse: 1.31762
[685]	valid_0's rmse: 1.31772
[686]	valid_0'

[965]	valid_0's rmse: 1.33167
[966]	valid_0's rmse: 1.33173
[967]	valid_0's rmse: 1.3318
[968]	valid_0's rmse: 1.33171
[969]	valid_0's rmse: 1.33162
[970]	valid_0's rmse: 1.3318
[971]	valid_0's rmse: 1.33188
[972]	valid_0's rmse: 1.33206
[973]	valid_0's rmse: 1.33204
[974]	valid_0's rmse: 1.33217
[975]	valid_0's rmse: 1.33218
[976]	valid_0's rmse: 1.33226
[977]	valid_0's rmse: 1.33231
[978]	valid_0's rmse: 1.33237
[979]	valid_0's rmse: 1.33249
[980]	valid_0's rmse: 1.3325
[981]	valid_0's rmse: 1.33266
[982]	valid_0's rmse: 1.33256
[983]	valid_0's rmse: 1.33261
[984]	valid_0's rmse: 1.33281
[985]	valid_0's rmse: 1.3328
[986]	valid_0's rmse: 1.33282
[987]	valid_0's rmse: 1.33295
[988]	valid_0's rmse: 1.33299
[989]	valid_0's rmse: 1.33303
[990]	valid_0's rmse: 1.33299
[991]	valid_0's rmse: 1.33305
[992]	valid_0's rmse: 1.33304
[993]	valid_0's rmse: 1.33311
[994]	valid_0's rmse: 1.33319
[995]	valid_0's rmse: 1.33321
[996]	valid_0's rmse: 1.33317
[997]	valid_0's rmse: 1.33318
[998]	valid_0'

[1262]	valid_0's rmse: 1.34187
[1263]	valid_0's rmse: 1.34184
[1264]	valid_0's rmse: 1.34188
[1265]	valid_0's rmse: 1.34181
[1266]	valid_0's rmse: 1.34177
[1267]	valid_0's rmse: 1.34163
[1268]	valid_0's rmse: 1.34171
[1269]	valid_0's rmse: 1.34195
[1270]	valid_0's rmse: 1.34195
[1271]	valid_0's rmse: 1.34193
[1272]	valid_0's rmse: 1.3419
[1273]	valid_0's rmse: 1.3419
[1274]	valid_0's rmse: 1.34184
[1275]	valid_0's rmse: 1.34191
[1276]	valid_0's rmse: 1.34186
[1277]	valid_0's rmse: 1.3419
[1278]	valid_0's rmse: 1.34194
[1279]	valid_0's rmse: 1.34196
[1280]	valid_0's rmse: 1.34183
[1281]	valid_0's rmse: 1.34182
[1282]	valid_0's rmse: 1.34178
[1283]	valid_0's rmse: 1.34175
[1284]	valid_0's rmse: 1.34175
[1285]	valid_0's rmse: 1.34192
[1286]	valid_0's rmse: 1.34195
[1287]	valid_0's rmse: 1.34195
[1288]	valid_0's rmse: 1.34206
[1289]	valid_0's rmse: 1.34209
[1290]	valid_0's rmse: 1.3421
[1291]	valid_0's rmse: 1.34221
[1292]	valid_0's rmse: 1.34229
[1293]	valid_0's rmse: 1.34242
[1294]	valid

[1565]	valid_0's rmse: 1.34817
[1566]	valid_0's rmse: 1.34822
[1567]	valid_0's rmse: 1.34819
[1568]	valid_0's rmse: 1.34828
[1569]	valid_0's rmse: 1.34835
[1570]	valid_0's rmse: 1.34837
[1571]	valid_0's rmse: 1.3483
[1572]	valid_0's rmse: 1.34835
[1573]	valid_0's rmse: 1.34826
[1574]	valid_0's rmse: 1.34835
[1575]	valid_0's rmse: 1.34843
[1576]	valid_0's rmse: 1.34843
[1577]	valid_0's rmse: 1.34837
[1578]	valid_0's rmse: 1.34849
[1579]	valid_0's rmse: 1.34851
[1580]	valid_0's rmse: 1.34855
[1581]	valid_0's rmse: 1.34852
[1582]	valid_0's rmse: 1.34861
[1583]	valid_0's rmse: 1.34864
[1584]	valid_0's rmse: 1.34869
[1585]	valid_0's rmse: 1.34864
[1586]	valid_0's rmse: 1.34861
[1587]	valid_0's rmse: 1.34861
[1588]	valid_0's rmse: 1.34864
[1589]	valid_0's rmse: 1.34864
[1590]	valid_0's rmse: 1.34859
[1591]	valid_0's rmse: 1.34859
[1592]	valid_0's rmse: 1.34858
[1593]	valid_0's rmse: 1.34864
[1594]	valid_0's rmse: 1.34865
[1595]	valid_0's rmse: 1.34872
[1596]	valid_0's rmse: 1.34868
[1597]	va

[1884]	valid_0's rmse: 1.35313
[1885]	valid_0's rmse: 1.35322
[1886]	valid_0's rmse: 1.3532
[1887]	valid_0's rmse: 1.35323
[1888]	valid_0's rmse: 1.3533
[1889]	valid_0's rmse: 1.35339
[1890]	valid_0's rmse: 1.35338
[1891]	valid_0's rmse: 1.35343
[1892]	valid_0's rmse: 1.35348
[1893]	valid_0's rmse: 1.3535
[1894]	valid_0's rmse: 1.35354
[1895]	valid_0's rmse: 1.35356
[1896]	valid_0's rmse: 1.35353
[1897]	valid_0's rmse: 1.35354
[1898]	valid_0's rmse: 1.35355
[1899]	valid_0's rmse: 1.35356
[1900]	valid_0's rmse: 1.35362
[1901]	valid_0's rmse: 1.3536
[1902]	valid_0's rmse: 1.35358
[1903]	valid_0's rmse: 1.35362
[1904]	valid_0's rmse: 1.35363
[1905]	valid_0's rmse: 1.35358
[1906]	valid_0's rmse: 1.35359
[1907]	valid_0's rmse: 1.35365
[1908]	valid_0's rmse: 1.35367
[1909]	valid_0's rmse: 1.35365
[1910]	valid_0's rmse: 1.35368
[1911]	valid_0's rmse: 1.3537
[1912]	valid_0's rmse: 1.35374
[1913]	valid_0's rmse: 1.35379
[1914]	valid_0's rmse: 1.35375
[1915]	valid_0's rmse: 1.3538
[1916]	valid_0

[2201]	valid_0's rmse: 1.35726
[2202]	valid_0's rmse: 1.35726
[2203]	valid_0's rmse: 1.35726
[2204]	valid_0's rmse: 1.35731
[2205]	valid_0's rmse: 1.35731
[2206]	valid_0's rmse: 1.35733
[2207]	valid_0's rmse: 1.35735
[2208]	valid_0's rmse: 1.35735
[2209]	valid_0's rmse: 1.35733
[2210]	valid_0's rmse: 1.35738
[2211]	valid_0's rmse: 1.35742
[2212]	valid_0's rmse: 1.35741
[2213]	valid_0's rmse: 1.35745
[2214]	valid_0's rmse: 1.35748
[2215]	valid_0's rmse: 1.3575
[2216]	valid_0's rmse: 1.35746
[2217]	valid_0's rmse: 1.35746
[2218]	valid_0's rmse: 1.35748
[2219]	valid_0's rmse: 1.35748
[2220]	valid_0's rmse: 1.35743
[2221]	valid_0's rmse: 1.35743
[2222]	valid_0's rmse: 1.35743
[2223]	valid_0's rmse: 1.35745
[2224]	valid_0's rmse: 1.35747
[2225]	valid_0's rmse: 1.35744
[2226]	valid_0's rmse: 1.35742
[2227]	valid_0's rmse: 1.35743
[2228]	valid_0's rmse: 1.35744
[2229]	valid_0's rmse: 1.35742
[2230]	valid_0's rmse: 1.3574
[2231]	valid_0's rmse: 1.3574
[2232]	valid_0's rmse: 1.35748
[2233]	vali

[2517]	valid_0's rmse: 1.36007
[2518]	valid_0's rmse: 1.36008
[2519]	valid_0's rmse: 1.36008
[2520]	valid_0's rmse: 1.36008
[2521]	valid_0's rmse: 1.36012
[2522]	valid_0's rmse: 1.36007
[2523]	valid_0's rmse: 1.36009
[2524]	valid_0's rmse: 1.36004
[2525]	valid_0's rmse: 1.36008
[2526]	valid_0's rmse: 1.36005
[2527]	valid_0's rmse: 1.36005
[2528]	valid_0's rmse: 1.36005
[2529]	valid_0's rmse: 1.36007
[2530]	valid_0's rmse: 1.36005
[2531]	valid_0's rmse: 1.36004
[2532]	valid_0's rmse: 1.3601
[2533]	valid_0's rmse: 1.36012
[2534]	valid_0's rmse: 1.36013
[2535]	valid_0's rmse: 1.36015
[2536]	valid_0's rmse: 1.36014
[2537]	valid_0's rmse: 1.36015
[2538]	valid_0's rmse: 1.36019
[2539]	valid_0's rmse: 1.36018
[2540]	valid_0's rmse: 1.3602
[2541]	valid_0's rmse: 1.36021
[2542]	valid_0's rmse: 1.36026
[2543]	valid_0's rmse: 1.36026
[2544]	valid_0's rmse: 1.36028
[2545]	valid_0's rmse: 1.36027
[2546]	valid_0's rmse: 1.36028
[2547]	valid_0's rmse: 1.3603
[2548]	valid_0's rmse: 1.36032
[2549]	vali

[2837]	valid_0's rmse: 1.36196
[2838]	valid_0's rmse: 1.36199
[2839]	valid_0's rmse: 1.36202
[2840]	valid_0's rmse: 1.36201
[2841]	valid_0's rmse: 1.36202
[2842]	valid_0's rmse: 1.36202
[2843]	valid_0's rmse: 1.36199
[2844]	valid_0's rmse: 1.36199
[2845]	valid_0's rmse: 1.36203
[2846]	valid_0's rmse: 1.36204
[2847]	valid_0's rmse: 1.36206
[2848]	valid_0's rmse: 1.36209
[2849]	valid_0's rmse: 1.36211
[2850]	valid_0's rmse: 1.3621
[2851]	valid_0's rmse: 1.36211
[2852]	valid_0's rmse: 1.36208
[2853]	valid_0's rmse: 1.36211
[2854]	valid_0's rmse: 1.36209
[2855]	valid_0's rmse: 1.36209
[2856]	valid_0's rmse: 1.36206
[2857]	valid_0's rmse: 1.36207
[2858]	valid_0's rmse: 1.36208
[2859]	valid_0's rmse: 1.36209
[2860]	valid_0's rmse: 1.36209
[2861]	valid_0's rmse: 1.36212
[2862]	valid_0's rmse: 1.36213
[2863]	valid_0's rmse: 1.36213
[2864]	valid_0's rmse: 1.3622
[2865]	valid_0's rmse: 1.36219
[2866]	valid_0's rmse: 1.36219
[2867]	valid_0's rmse: 1.36214
[2868]	valid_0's rmse: 1.36216
[2869]	val

[3155]	valid_0's rmse: 1.36305
[3156]	valid_0's rmse: 1.36309
[3157]	valid_0's rmse: 1.36308
[3158]	valid_0's rmse: 1.36312
[3159]	valid_0's rmse: 1.36315
[3160]	valid_0's rmse: 1.36315
[3161]	valid_0's rmse: 1.36316
[3162]	valid_0's rmse: 1.36316
[3163]	valid_0's rmse: 1.36316
[3164]	valid_0's rmse: 1.36317
[3165]	valid_0's rmse: 1.36315
[3166]	valid_0's rmse: 1.36316
[3167]	valid_0's rmse: 1.36315
[3168]	valid_0's rmse: 1.36315
[3169]	valid_0's rmse: 1.36314
[3170]	valid_0's rmse: 1.36314
[3171]	valid_0's rmse: 1.36314
[3172]	valid_0's rmse: 1.36314
[3173]	valid_0's rmse: 1.36313
[3174]	valid_0's rmse: 1.36315
[3175]	valid_0's rmse: 1.36315
[3176]	valid_0's rmse: 1.36315
[3177]	valid_0's rmse: 1.36316
[3178]	valid_0's rmse: 1.36314
[3179]	valid_0's rmse: 1.36315
[3180]	valid_0's rmse: 1.36316
[3181]	valid_0's rmse: 1.36316
[3182]	valid_0's rmse: 1.36316
[3183]	valid_0's rmse: 1.36315
[3184]	valid_0's rmse: 1.36317
[3185]	valid_0's rmse: 1.36316
[3186]	valid_0's rmse: 1.36316
[3187]	v

[3428]	valid_0's rmse: 1.36422
[3429]	valid_0's rmse: 1.36422
[3430]	valid_0's rmse: 1.36421
[3431]	valid_0's rmse: 1.36421
[3432]	valid_0's rmse: 1.36421
[3433]	valid_0's rmse: 1.36423
[3434]	valid_0's rmse: 1.36419
[3435]	valid_0's rmse: 1.36419
[3436]	valid_0's rmse: 1.36419
[3437]	valid_0's rmse: 1.36419
[3438]	valid_0's rmse: 1.36421
[3439]	valid_0's rmse: 1.36419
[3440]	valid_0's rmse: 1.36421
[3441]	valid_0's rmse: 1.3642
[3442]	valid_0's rmse: 1.36421
[3443]	valid_0's rmse: 1.36421
[3444]	valid_0's rmse: 1.36421
[3445]	valid_0's rmse: 1.36423
[3446]	valid_0's rmse: 1.36423
[3447]	valid_0's rmse: 1.36424
[3448]	valid_0's rmse: 1.36423
[3449]	valid_0's rmse: 1.3642
[3450]	valid_0's rmse: 1.36422
[3451]	valid_0's rmse: 1.36421
[3452]	valid_0's rmse: 1.36421
[3453]	valid_0's rmse: 1.3642
[3454]	valid_0's rmse: 1.36421
[3455]	valid_0's rmse: 1.36422
[3456]	valid_0's rmse: 1.36424
[3457]	valid_0's rmse: 1.36423
[3458]	valid_0's rmse: 1.36422
[3459]	valid_0's rmse: 1.36423
[3460]	vali

[3729]	valid_0's rmse: 1.36495
[3730]	valid_0's rmse: 1.36495
[3731]	valid_0's rmse: 1.36494
[3732]	valid_0's rmse: 1.36494
[3733]	valid_0's rmse: 1.36495
[3734]	valid_0's rmse: 1.36496
[3735]	valid_0's rmse: 1.36493
[3736]	valid_0's rmse: 1.36493
[3737]	valid_0's rmse: 1.36494
[3738]	valid_0's rmse: 1.36495
[3739]	valid_0's rmse: 1.36493
[3740]	valid_0's rmse: 1.36495
[3741]	valid_0's rmse: 1.36494
[3742]	valid_0's rmse: 1.36494
[3743]	valid_0's rmse: 1.36494
[3744]	valid_0's rmse: 1.36494
[3745]	valid_0's rmse: 1.36493
[3746]	valid_0's rmse: 1.36494
[3747]	valid_0's rmse: 1.36493
[3748]	valid_0's rmse: 1.36492
[3749]	valid_0's rmse: 1.36494
[3750]	valid_0's rmse: 1.36494
[3751]	valid_0's rmse: 1.36493
[3752]	valid_0's rmse: 1.36494
[3753]	valid_0's rmse: 1.36495
[3754]	valid_0's rmse: 1.36497
[3755]	valid_0's rmse: 1.36497
[3756]	valid_0's rmse: 1.36497
[3757]	valid_0's rmse: 1.36498
[3758]	valid_0's rmse: 1.36499
[3759]	valid_0's rmse: 1.365
[3760]	valid_0's rmse: 1.365
[3761]	valid

[4018]	valid_0's rmse: 1.36568
[4019]	valid_0's rmse: 1.36568
[4020]	valid_0's rmse: 1.36567
[4021]	valid_0's rmse: 1.36566
[4022]	valid_0's rmse: 1.36566
[4023]	valid_0's rmse: 1.36565
[4024]	valid_0's rmse: 1.36564
[4025]	valid_0's rmse: 1.36563
[4026]	valid_0's rmse: 1.36563
[4027]	valid_0's rmse: 1.36562
[4028]	valid_0's rmse: 1.36561
[4029]	valid_0's rmse: 1.36561
[4030]	valid_0's rmse: 1.36561
[4031]	valid_0's rmse: 1.36561
[4032]	valid_0's rmse: 1.36561
[4033]	valid_0's rmse: 1.36562
[4034]	valid_0's rmse: 1.36563
[4035]	valid_0's rmse: 1.36562
[4036]	valid_0's rmse: 1.3656
[4037]	valid_0's rmse: 1.36564
[4038]	valid_0's rmse: 1.36561
[4039]	valid_0's rmse: 1.3656
[4040]	valid_0's rmse: 1.3656
[4041]	valid_0's rmse: 1.3656
[4042]	valid_0's rmse: 1.3656
[4043]	valid_0's rmse: 1.36559
[4044]	valid_0's rmse: 1.3656
[4045]	valid_0's rmse: 1.36559
[4046]	valid_0's rmse: 1.36559
[4047]	valid_0's rmse: 1.36561
[4048]	valid_0's rmse: 1.36562
[4049]	valid_0's rmse: 1.36563
[4050]	valid_0

[4343]	valid_0's rmse: 1.36619
[4344]	valid_0's rmse: 1.36618
[4345]	valid_0's rmse: 1.3662
[4346]	valid_0's rmse: 1.3662
[4347]	valid_0's rmse: 1.36621
[4348]	valid_0's rmse: 1.3662
[4349]	valid_0's rmse: 1.3662
[4350]	valid_0's rmse: 1.36619
[4351]	valid_0's rmse: 1.36618
[4352]	valid_0's rmse: 1.36619
[4353]	valid_0's rmse: 1.36621
[4354]	valid_0's rmse: 1.3662
[4355]	valid_0's rmse: 1.36621
[4356]	valid_0's rmse: 1.3662
[4357]	valid_0's rmse: 1.36619
[4358]	valid_0's rmse: 1.36619
[4359]	valid_0's rmse: 1.36618
[4360]	valid_0's rmse: 1.36619
[4361]	valid_0's rmse: 1.3662
[4362]	valid_0's rmse: 1.36621
[4363]	valid_0's rmse: 1.36621
[4364]	valid_0's rmse: 1.36621
[4365]	valid_0's rmse: 1.36621
[4366]	valid_0's rmse: 1.36621
[4367]	valid_0's rmse: 1.36622
[4368]	valid_0's rmse: 1.36622
[4369]	valid_0's rmse: 1.36622
[4370]	valid_0's rmse: 1.36623
[4371]	valid_0's rmse: 1.36624
[4372]	valid_0's rmse: 1.36625
[4373]	valid_0's rmse: 1.36625
[4374]	valid_0's rmse: 1.36625
[4375]	valid_0'

[4621]	valid_0's rmse: 1.36655
[4622]	valid_0's rmse: 1.36655
[4623]	valid_0's rmse: 1.36654
[4624]	valid_0's rmse: 1.36655
[4625]	valid_0's rmse: 1.36656
[4626]	valid_0's rmse: 1.36657
[4627]	valid_0's rmse: 1.36658
[4628]	valid_0's rmse: 1.36658
[4629]	valid_0's rmse: 1.36659
[4630]	valid_0's rmse: 1.36659
[4631]	valid_0's rmse: 1.3666
[4632]	valid_0's rmse: 1.36659
[4633]	valid_0's rmse: 1.36658
[4634]	valid_0's rmse: 1.36658
[4635]	valid_0's rmse: 1.36657
[4636]	valid_0's rmse: 1.36657
[4637]	valid_0's rmse: 1.36658
[4638]	valid_0's rmse: 1.36658
[4639]	valid_0's rmse: 1.36658
[4640]	valid_0's rmse: 1.36658
[4641]	valid_0's rmse: 1.36657
[4642]	valid_0's rmse: 1.36656
[4643]	valid_0's rmse: 1.36656
[4644]	valid_0's rmse: 1.36657
[4645]	valid_0's rmse: 1.36658
[4646]	valid_0's rmse: 1.36658
[4647]	valid_0's rmse: 1.36656
[4648]	valid_0's rmse: 1.36657
[4649]	valid_0's rmse: 1.36656
[4650]	valid_0's rmse: 1.36656
[4651]	valid_0's rmse: 1.36654
[4652]	valid_0's rmse: 1.36654
[4653]	va

[4939]	valid_0's rmse: 1.36679
[4940]	valid_0's rmse: 1.3668
[4941]	valid_0's rmse: 1.3668
[4942]	valid_0's rmse: 1.3668
[4943]	valid_0's rmse: 1.3668
[4944]	valid_0's rmse: 1.36679
[4945]	valid_0's rmse: 1.36679
[4946]	valid_0's rmse: 1.36678
[4947]	valid_0's rmse: 1.36679
[4948]	valid_0's rmse: 1.3668
[4949]	valid_0's rmse: 1.36681
[4950]	valid_0's rmse: 1.36683
[4951]	valid_0's rmse: 1.36683
[4952]	valid_0's rmse: 1.36684
[4953]	valid_0's rmse: 1.36684
[4954]	valid_0's rmse: 1.36685
[4955]	valid_0's rmse: 1.36683
[4956]	valid_0's rmse: 1.36684
[4957]	valid_0's rmse: 1.36684
[4958]	valid_0's rmse: 1.36683
[4959]	valid_0's rmse: 1.36681
[4960]	valid_0's rmse: 1.36682
[4961]	valid_0's rmse: 1.3668
[4962]	valid_0's rmse: 1.3668
[4963]	valid_0's rmse: 1.36681
[4964]	valid_0's rmse: 1.3668
[4965]	valid_0's rmse: 1.3668
[4966]	valid_0's rmse: 1.3668
[4967]	valid_0's rmse: 1.36678
[4968]	valid_0's rmse: 1.3668
[4969]	valid_0's rmse: 1.3668
[4970]	valid_0's rmse: 1.36682
[4971]	valid_0's rms

[5267]	valid_0's rmse: 1.36713
[5268]	valid_0's rmse: 1.36715
[5269]	valid_0's rmse: 1.36714
[5270]	valid_0's rmse: 1.36714
[5271]	valid_0's rmse: 1.36713
[5272]	valid_0's rmse: 1.36712
[5273]	valid_0's rmse: 1.36711
[5274]	valid_0's rmse: 1.36711
[5275]	valid_0's rmse: 1.36711
[5276]	valid_0's rmse: 1.36711
[5277]	valid_0's rmse: 1.3671
[5278]	valid_0's rmse: 1.36711
[5279]	valid_0's rmse: 1.36711
[5280]	valid_0's rmse: 1.36711
[5281]	valid_0's rmse: 1.36711
[5282]	valid_0's rmse: 1.36712
[5283]	valid_0's rmse: 1.36711
[5284]	valid_0's rmse: 1.36712
[5285]	valid_0's rmse: 1.36711
[5286]	valid_0's rmse: 1.36712
[5287]	valid_0's rmse: 1.36713
[5288]	valid_0's rmse: 1.36714
[5289]	valid_0's rmse: 1.36714
[5290]	valid_0's rmse: 1.36715
[5291]	valid_0's rmse: 1.36714
[5292]	valid_0's rmse: 1.36715
[5293]	valid_0's rmse: 1.36715
[5294]	valid_0's rmse: 1.36716
[5295]	valid_0's rmse: 1.36716
[5296]	valid_0's rmse: 1.36715
[5297]	valid_0's rmse: 1.36715
[5298]	valid_0's rmse: 1.36715
[5299]	va

[5555]	valid_0's rmse: 1.36749
[5556]	valid_0's rmse: 1.36748
[5557]	valid_0's rmse: 1.36748
[5558]	valid_0's rmse: 1.36749
[5559]	valid_0's rmse: 1.36748
[5560]	valid_0's rmse: 1.36748
[5561]	valid_0's rmse: 1.36747
[5562]	valid_0's rmse: 1.36748
[5563]	valid_0's rmse: 1.36749
[5564]	valid_0's rmse: 1.3675
[5565]	valid_0's rmse: 1.36751
[5566]	valid_0's rmse: 1.36751
[5567]	valid_0's rmse: 1.36751
[5568]	valid_0's rmse: 1.36751
[5569]	valid_0's rmse: 1.36753
[5570]	valid_0's rmse: 1.36753
[5571]	valid_0's rmse: 1.36754
[5572]	valid_0's rmse: 1.36754
[5573]	valid_0's rmse: 1.36754
[5574]	valid_0's rmse: 1.36754
[5575]	valid_0's rmse: 1.36754
[5576]	valid_0's rmse: 1.36753
[5577]	valid_0's rmse: 1.36752
[5578]	valid_0's rmse: 1.36751
[5579]	valid_0's rmse: 1.3675
[5580]	valid_0's rmse: 1.36749
[5581]	valid_0's rmse: 1.3675
[5582]	valid_0's rmse: 1.36749
[5583]	valid_0's rmse: 1.36752
[5584]	valid_0's rmse: 1.36751
[5585]	valid_0's rmse: 1.36751
[5586]	valid_0's rmse: 1.36751
[5587]	vali

[5833]	valid_0's rmse: 1.36772
[5834]	valid_0's rmse: 1.36773
[5835]	valid_0's rmse: 1.36773
[5836]	valid_0's rmse: 1.36773
[5837]	valid_0's rmse: 1.36772
[5838]	valid_0's rmse: 1.36773
[5839]	valid_0's rmse: 1.36773
[5840]	valid_0's rmse: 1.36773
[5841]	valid_0's rmse: 1.36774
[5842]	valid_0's rmse: 1.36776
[5843]	valid_0's rmse: 1.36774
[5844]	valid_0's rmse: 1.36774
[5845]	valid_0's rmse: 1.36774
[5846]	valid_0's rmse: 1.36774
[5847]	valid_0's rmse: 1.36775
[5848]	valid_0's rmse: 1.36775
[5849]	valid_0's rmse: 1.36775
[5850]	valid_0's rmse: 1.36775
[5851]	valid_0's rmse: 1.36774
[5852]	valid_0's rmse: 1.36775
[5853]	valid_0's rmse: 1.36776
[5854]	valid_0's rmse: 1.36777
[5855]	valid_0's rmse: 1.36776
[5856]	valid_0's rmse: 1.36778
[5857]	valid_0's rmse: 1.36778
[5858]	valid_0's rmse: 1.36777
[5859]	valid_0's rmse: 1.36777
[5860]	valid_0's rmse: 1.36776
[5861]	valid_0's rmse: 1.36778
[5862]	valid_0's rmse: 1.36778
[5863]	valid_0's rmse: 1.36779
[5864]	valid_0's rmse: 1.3678
[5865]	va

[6108]	valid_0's rmse: 1.36795
[6109]	valid_0's rmse: 1.36796
[6110]	valid_0's rmse: 1.36795
[6111]	valid_0's rmse: 1.36795
[6112]	valid_0's rmse: 1.36795
[6113]	valid_0's rmse: 1.36794
[6114]	valid_0's rmse: 1.36795
[6115]	valid_0's rmse: 1.36795
[6116]	valid_0's rmse: 1.36795
[6117]	valid_0's rmse: 1.36796
[6118]	valid_0's rmse: 1.36797
[6119]	valid_0's rmse: 1.36797
[6120]	valid_0's rmse: 1.36795
[6121]	valid_0's rmse: 1.36795
[6122]	valid_0's rmse: 1.36796
[6123]	valid_0's rmse: 1.36796
[6124]	valid_0's rmse: 1.36796
[6125]	valid_0's rmse: 1.36796
[6126]	valid_0's rmse: 1.36796
[6127]	valid_0's rmse: 1.36796
[6128]	valid_0's rmse: 1.36797
[6129]	valid_0's rmse: 1.36797
[6130]	valid_0's rmse: 1.36797
[6131]	valid_0's rmse: 1.36796
[6132]	valid_0's rmse: 1.36797
[6133]	valid_0's rmse: 1.36797
[6134]	valid_0's rmse: 1.36798
[6135]	valid_0's rmse: 1.36799
[6136]	valid_0's rmse: 1.368
[6137]	valid_0's rmse: 1.36799
[6138]	valid_0's rmse: 1.36799
[6139]	valid_0's rmse: 1.36799
[6140]	val

[6386]	valid_0's rmse: 1.36821
[6387]	valid_0's rmse: 1.36822
[6388]	valid_0's rmse: 1.36822
[6389]	valid_0's rmse: 1.36821
[6390]	valid_0's rmse: 1.36821
[6391]	valid_0's rmse: 1.36821
[6392]	valid_0's rmse: 1.36822
[6393]	valid_0's rmse: 1.36822
[6394]	valid_0's rmse: 1.36821
[6395]	valid_0's rmse: 1.36821
[6396]	valid_0's rmse: 1.36823
[6397]	valid_0's rmse: 1.36824
[6398]	valid_0's rmse: 1.36823
[6399]	valid_0's rmse: 1.36824
[6400]	valid_0's rmse: 1.36824
[6401]	valid_0's rmse: 1.36825
[6402]	valid_0's rmse: 1.36824
[6403]	valid_0's rmse: 1.36825
[6404]	valid_0's rmse: 1.36826
[6405]	valid_0's rmse: 1.36827
[6406]	valid_0's rmse: 1.36827
[6407]	valid_0's rmse: 1.36825
[6408]	valid_0's rmse: 1.36826
[6409]	valid_0's rmse: 1.36826
[6410]	valid_0's rmse: 1.36826
[6411]	valid_0's rmse: 1.36826
[6412]	valid_0's rmse: 1.36826
[6413]	valid_0's rmse: 1.36826
[6414]	valid_0's rmse: 1.36827
[6415]	valid_0's rmse: 1.36827
[6416]	valid_0's rmse: 1.36827
[6417]	valid_0's rmse: 1.36829
[6418]	v

[6693]	valid_0's rmse: 1.36844
[6694]	valid_0's rmse: 1.36844
[6695]	valid_0's rmse: 1.36844
[6696]	valid_0's rmse: 1.36844
[6697]	valid_0's rmse: 1.36843
[6698]	valid_0's rmse: 1.36843
[6699]	valid_0's rmse: 1.36843
[6700]	valid_0's rmse: 1.36843
[6701]	valid_0's rmse: 1.36843
[6702]	valid_0's rmse: 1.36844
[6703]	valid_0's rmse: 1.36843
[6704]	valid_0's rmse: 1.36844
[6705]	valid_0's rmse: 1.36843
[6706]	valid_0's rmse: 1.36844
[6707]	valid_0's rmse: 1.36843
[6708]	valid_0's rmse: 1.36844
[6709]	valid_0's rmse: 1.36844
[6710]	valid_0's rmse: 1.36845
[6711]	valid_0's rmse: 1.36844
[6712]	valid_0's rmse: 1.36844
[6713]	valid_0's rmse: 1.36843
[6714]	valid_0's rmse: 1.36843
[6715]	valid_0's rmse: 1.36842
[6716]	valid_0's rmse: 1.36843
[6717]	valid_0's rmse: 1.36843
[6718]	valid_0's rmse: 1.36843
[6719]	valid_0's rmse: 1.36841
[6720]	valid_0's rmse: 1.36842
[6721]	valid_0's rmse: 1.36843
[6722]	valid_0's rmse: 1.36843
[6723]	valid_0's rmse: 1.36844
[6724]	valid_0's rmse: 1.36842
[6725]	v

[7040]	valid_0's rmse: 1.36868
[7041]	valid_0's rmse: 1.36868
[7042]	valid_0's rmse: 1.36868
[7043]	valid_0's rmse: 1.36867
[7044]	valid_0's rmse: 1.36866
[7045]	valid_0's rmse: 1.36865
[7046]	valid_0's rmse: 1.36865
[7047]	valid_0's rmse: 1.36865
[7048]	valid_0's rmse: 1.36866
[7049]	valid_0's rmse: 1.36865
[7050]	valid_0's rmse: 1.36864
[7051]	valid_0's rmse: 1.36864
[7052]	valid_0's rmse: 1.36864
[7053]	valid_0's rmse: 1.36864
[7054]	valid_0's rmse: 1.36865
[7055]	valid_0's rmse: 1.36865
[7056]	valid_0's rmse: 1.36865
[7057]	valid_0's rmse: 1.36866
[7058]	valid_0's rmse: 1.36865
[7059]	valid_0's rmse: 1.36865
[7060]	valid_0's rmse: 1.36865
[7061]	valid_0's rmse: 1.36864
[7062]	valid_0's rmse: 1.36863
[7063]	valid_0's rmse: 1.36863
[7064]	valid_0's rmse: 1.36863
[7065]	valid_0's rmse: 1.36864
[7066]	valid_0's rmse: 1.36864
[7067]	valid_0's rmse: 1.36864
[7068]	valid_0's rmse: 1.36864
[7069]	valid_0's rmse: 1.36863
[7070]	valid_0's rmse: 1.36863
[7071]	valid_0's rmse: 1.36862
[7072]	v

[7351]	valid_0's rmse: 1.36867
[7352]	valid_0's rmse: 1.36867
[7353]	valid_0's rmse: 1.36867
[7354]	valid_0's rmse: 1.36868
[7355]	valid_0's rmse: 1.36869
[7356]	valid_0's rmse: 1.3687
[7357]	valid_0's rmse: 1.3687
[7358]	valid_0's rmse: 1.3687
[7359]	valid_0's rmse: 1.3687
[7360]	valid_0's rmse: 1.36871
[7361]	valid_0's rmse: 1.36871
[7362]	valid_0's rmse: 1.36871
[7363]	valid_0's rmse: 1.3687
[7364]	valid_0's rmse: 1.3687
[7365]	valid_0's rmse: 1.3687
[7366]	valid_0's rmse: 1.3687
[7367]	valid_0's rmse: 1.3687
[7368]	valid_0's rmse: 1.36871
[7369]	valid_0's rmse: 1.36871
[7370]	valid_0's rmse: 1.36871
[7371]	valid_0's rmse: 1.3687
[7372]	valid_0's rmse: 1.3687
[7373]	valid_0's rmse: 1.3687
[7374]	valid_0's rmse: 1.3687
[7375]	valid_0's rmse: 1.3687
[7376]	valid_0's rmse: 1.3687
[7377]	valid_0's rmse: 1.3687
[7378]	valid_0's rmse: 1.36869
[7379]	valid_0's rmse: 1.36869
[7380]	valid_0's rmse: 1.3687
[7381]	valid_0's rmse: 1.36869
[7382]	valid_0's rmse: 1.3687
[7383]	valid_0's rmse: 1.3

[7648]	valid_0's rmse: 1.36887
[7649]	valid_0's rmse: 1.36887
[7650]	valid_0's rmse: 1.36887
[7651]	valid_0's rmse: 1.36887
[7652]	valid_0's rmse: 1.36888
[7653]	valid_0's rmse: 1.36888
[7654]	valid_0's rmse: 1.36889
[7655]	valid_0's rmse: 1.36889
[7656]	valid_0's rmse: 1.36888
[7657]	valid_0's rmse: 1.36887
[7658]	valid_0's rmse: 1.36887
[7659]	valid_0's rmse: 1.36888
[7660]	valid_0's rmse: 1.36888
[7661]	valid_0's rmse: 1.36889
[7662]	valid_0's rmse: 1.36889
[7663]	valid_0's rmse: 1.36889
[7664]	valid_0's rmse: 1.36888
[7665]	valid_0's rmse: 1.36889
[7666]	valid_0's rmse: 1.36889
[7667]	valid_0's rmse: 1.36889
[7668]	valid_0's rmse: 1.36888
[7669]	valid_0's rmse: 1.36888
[7670]	valid_0's rmse: 1.36888
[7671]	valid_0's rmse: 1.36888
[7672]	valid_0's rmse: 1.36889
[7673]	valid_0's rmse: 1.36889
[7674]	valid_0's rmse: 1.3689
[7675]	valid_0's rmse: 1.3689
[7676]	valid_0's rmse: 1.3689
[7677]	valid_0's rmse: 1.3689
[7678]	valid_0's rmse: 1.3689
[7679]	valid_0's rmse: 1.36891
[7680]	valid_

[7929]	valid_0's rmse: 1.36902
[7930]	valid_0's rmse: 1.36902
[7931]	valid_0's rmse: 1.36903
[7932]	valid_0's rmse: 1.36903
[7933]	valid_0's rmse: 1.36903
[7934]	valid_0's rmse: 1.36902
[7935]	valid_0's rmse: 1.36903
[7936]	valid_0's rmse: 1.36903
[7937]	valid_0's rmse: 1.36904
[7938]	valid_0's rmse: 1.36903
[7939]	valid_0's rmse: 1.36902
[7940]	valid_0's rmse: 1.36902
[7941]	valid_0's rmse: 1.36901
[7942]	valid_0's rmse: 1.36901
[7943]	valid_0's rmse: 1.36901
[7944]	valid_0's rmse: 1.369
[7945]	valid_0's rmse: 1.36901
[7946]	valid_0's rmse: 1.36902
[7947]	valid_0's rmse: 1.36901
[7948]	valid_0's rmse: 1.36901
[7949]	valid_0's rmse: 1.36901
[7950]	valid_0's rmse: 1.36901
[7951]	valid_0's rmse: 1.36901
[7952]	valid_0's rmse: 1.369
[7953]	valid_0's rmse: 1.369
[7954]	valid_0's rmse: 1.369
[7955]	valid_0's rmse: 1.369
[7956]	valid_0's rmse: 1.369
[7957]	valid_0's rmse: 1.369
[7958]	valid_0's rmse: 1.36901
[7959]	valid_0's rmse: 1.36901
[7960]	valid_0's rmse: 1.36901
[7961]	valid_0's rmse:

[8261]	valid_0's rmse: 1.36911
[8262]	valid_0's rmse: 1.36911
[8263]	valid_0's rmse: 1.36911
[8264]	valid_0's rmse: 1.36911
[8265]	valid_0's rmse: 1.36911
[8266]	valid_0's rmse: 1.36911
[8267]	valid_0's rmse: 1.36912
[8268]	valid_0's rmse: 1.36912
[8269]	valid_0's rmse: 1.36912
[8270]	valid_0's rmse: 1.3691
[8271]	valid_0's rmse: 1.3691
[8272]	valid_0's rmse: 1.36911
[8273]	valid_0's rmse: 1.36911
[8274]	valid_0's rmse: 1.3691
[8275]	valid_0's rmse: 1.3691
[8276]	valid_0's rmse: 1.3691
[8277]	valid_0's rmse: 1.3691
[8278]	valid_0's rmse: 1.3691
[8279]	valid_0's rmse: 1.36911
[8280]	valid_0's rmse: 1.36911
[8281]	valid_0's rmse: 1.36911
[8282]	valid_0's rmse: 1.36911
[8283]	valid_0's rmse: 1.36911
[8284]	valid_0's rmse: 1.3691
[8285]	valid_0's rmse: 1.3691
[8286]	valid_0's rmse: 1.3691
[8287]	valid_0's rmse: 1.3691
[8288]	valid_0's rmse: 1.36909
[8289]	valid_0's rmse: 1.3691
[8290]	valid_0's rmse: 1.36911
[8291]	valid_0's rmse: 1.36911
[8292]	valid_0's rmse: 1.36912
[8293]	valid_0's rms

[8563]	valid_0's rmse: 1.36923
[8564]	valid_0's rmse: 1.36922
[8565]	valid_0's rmse: 1.36924
[8566]	valid_0's rmse: 1.36924
[8567]	valid_0's rmse: 1.36924
[8568]	valid_0's rmse: 1.36924
[8569]	valid_0's rmse: 1.36925
[8570]	valid_0's rmse: 1.36925
[8571]	valid_0's rmse: 1.36926
[8572]	valid_0's rmse: 1.36925
[8573]	valid_0's rmse: 1.36926
[8574]	valid_0's rmse: 1.36925
[8575]	valid_0's rmse: 1.36926
[8576]	valid_0's rmse: 1.36925
[8577]	valid_0's rmse: 1.36925
[8578]	valid_0's rmse: 1.36925
[8579]	valid_0's rmse: 1.36926
[8580]	valid_0's rmse: 1.36926
[8581]	valid_0's rmse: 1.36927
[8582]	valid_0's rmse: 1.36928
[8583]	valid_0's rmse: 1.36928
[8584]	valid_0's rmse: 1.36928
[8585]	valid_0's rmse: 1.36928
[8586]	valid_0's rmse: 1.36928
[8587]	valid_0's rmse: 1.36928
[8588]	valid_0's rmse: 1.36928
[8589]	valid_0's rmse: 1.36927
[8590]	valid_0's rmse: 1.36928
[8591]	valid_0's rmse: 1.36929
[8592]	valid_0's rmse: 1.36929
[8593]	valid_0's rmse: 1.36929
[8594]	valid_0's rmse: 1.36929
[8595]	v

[8883]	valid_0's rmse: 1.36933
[8884]	valid_0's rmse: 1.36933
[8885]	valid_0's rmse: 1.36933
[8886]	valid_0's rmse: 1.36933
[8887]	valid_0's rmse: 1.36933
[8888]	valid_0's rmse: 1.36933
[8889]	valid_0's rmse: 1.36933
[8890]	valid_0's rmse: 1.36933
[8891]	valid_0's rmse: 1.36933
[8892]	valid_0's rmse: 1.36933
[8893]	valid_0's rmse: 1.36933
[8894]	valid_0's rmse: 1.36933
[8895]	valid_0's rmse: 1.36933
[8896]	valid_0's rmse: 1.36933
[8897]	valid_0's rmse: 1.36933
[8898]	valid_0's rmse: 1.36933
[8899]	valid_0's rmse: 1.36933
[8900]	valid_0's rmse: 1.36933
[8901]	valid_0's rmse: 1.36933
[8902]	valid_0's rmse: 1.36933
[8903]	valid_0's rmse: 1.36933
[8904]	valid_0's rmse: 1.36933
[8905]	valid_0's rmse: 1.36933
[8906]	valid_0's rmse: 1.36933
[8907]	valid_0's rmse: 1.36933
[8908]	valid_0's rmse: 1.36933
[8909]	valid_0's rmse: 1.36933
[8910]	valid_0's rmse: 1.36933
[8911]	valid_0's rmse: 1.36933
[8912]	valid_0's rmse: 1.36933
[8913]	valid_0's rmse: 1.36933
[8914]	valid_0's rmse: 1.36933
[8915]	v

[9191]	valid_0's rmse: 1.36933
[9192]	valid_0's rmse: 1.36933
[9193]	valid_0's rmse: 1.36933
[9194]	valid_0's rmse: 1.36933
[9195]	valid_0's rmse: 1.36933
[9196]	valid_0's rmse: 1.36933
[9197]	valid_0's rmse: 1.36933
[9198]	valid_0's rmse: 1.36933
[9199]	valid_0's rmse: 1.36933
[9200]	valid_0's rmse: 1.36933
[9201]	valid_0's rmse: 1.36933
[9202]	valid_0's rmse: 1.36933
[9203]	valid_0's rmse: 1.36933
[9204]	valid_0's rmse: 1.36933
[9205]	valid_0's rmse: 1.36933
[9206]	valid_0's rmse: 1.36933
[9207]	valid_0's rmse: 1.36933
[9208]	valid_0's rmse: 1.36933
[9209]	valid_0's rmse: 1.36933
[9210]	valid_0's rmse: 1.36933
[9211]	valid_0's rmse: 1.36933
[9212]	valid_0's rmse: 1.36933
[9213]	valid_0's rmse: 1.36933
[9214]	valid_0's rmse: 1.36933
[9215]	valid_0's rmse: 1.36933
[9216]	valid_0's rmse: 1.36933
[9217]	valid_0's rmse: 1.36933
[9218]	valid_0's rmse: 1.36933
[9219]	valid_0's rmse: 1.36933
[9220]	valid_0's rmse: 1.36933
[9221]	valid_0's rmse: 1.36933
[9222]	valid_0's rmse: 1.36933
[9223]	v

[9490]	valid_0's rmse: 1.36933
[9491]	valid_0's rmse: 1.36933
[9492]	valid_0's rmse: 1.36933
[9493]	valid_0's rmse: 1.36933
[9494]	valid_0's rmse: 1.36933
[9495]	valid_0's rmse: 1.36933
[9496]	valid_0's rmse: 1.36933
[9497]	valid_0's rmse: 1.36933
[9498]	valid_0's rmse: 1.36933
[9499]	valid_0's rmse: 1.36933
[9500]	valid_0's rmse: 1.36933
[9501]	valid_0's rmse: 1.36933
[9502]	valid_0's rmse: 1.36933
[9503]	valid_0's rmse: 1.36933
[9504]	valid_0's rmse: 1.36933
[9505]	valid_0's rmse: 1.36933
[9506]	valid_0's rmse: 1.36933
[9507]	valid_0's rmse: 1.36933
[9508]	valid_0's rmse: 1.36933
[9509]	valid_0's rmse: 1.36933
[9510]	valid_0's rmse: 1.36933
[9511]	valid_0's rmse: 1.36933
[9512]	valid_0's rmse: 1.36933
[9513]	valid_0's rmse: 1.36933
[9514]	valid_0's rmse: 1.36933
[9515]	valid_0's rmse: 1.36933
[9516]	valid_0's rmse: 1.36933
[9517]	valid_0's rmse: 1.36933
[9518]	valid_0's rmse: 1.36933
[9519]	valid_0's rmse: 1.36933
[9520]	valid_0's rmse: 1.36933
[9521]	valid_0's rmse: 1.36933
[9522]	v

[9838]	valid_0's rmse: 1.36933
[9839]	valid_0's rmse: 1.36933
[9840]	valid_0's rmse: 1.36933
[9841]	valid_0's rmse: 1.36933
[9842]	valid_0's rmse: 1.36933
[9843]	valid_0's rmse: 1.36933
[9844]	valid_0's rmse: 1.36933
[9845]	valid_0's rmse: 1.36933
[9846]	valid_0's rmse: 1.36933
[9847]	valid_0's rmse: 1.36933
[9848]	valid_0's rmse: 1.36933
[9849]	valid_0's rmse: 1.36933
[9850]	valid_0's rmse: 1.36933
[9851]	valid_0's rmse: 1.36933
[9852]	valid_0's rmse: 1.36933
[9853]	valid_0's rmse: 1.36933
[9854]	valid_0's rmse: 1.36933
[9855]	valid_0's rmse: 1.36933
[9856]	valid_0's rmse: 1.36933
[9857]	valid_0's rmse: 1.36933
[9858]	valid_0's rmse: 1.36933
[9859]	valid_0's rmse: 1.36933
[9860]	valid_0's rmse: 1.36933
[9861]	valid_0's rmse: 1.36933
[9862]	valid_0's rmse: 1.36933
[9863]	valid_0's rmse: 1.36933
[9864]	valid_0's rmse: 1.36933
[9865]	valid_0's rmse: 1.36933
[9866]	valid_0's rmse: 1.36933
[9867]	valid_0's rmse: 1.36933
[9868]	valid_0's rmse: 1.36933
[9869]	valid_0's rmse: 1.36933
[9870]	v

In [135]:
# # Ref: https://github.com/microsoft/LightGBM/blob/master/examples/python-guide/simple_example.py

# from sklearn.model_selection import train_test_split
# train_X, val_X, train_y, val_y = train_test_split(X, y, random_state = 0)

# import lightgbm as lgb
# # from sklearn.metrics import accuracy_score
# from sklearn.metrics import mean_squared_error

# # create dataset for lightgbm
# lgb_train = lgb.Dataset(train_X, label=train_y)
# lgb_eval = lgb.Dataset(val_X, label=val_y, reference=lgb_train)

# # specify parameters
# params_lgb = {'n_estimators': 10000,
#             'boosting_type': 'gbdt',
#             'objective': 'regression',
#             'metric': 'rmse',
#             'subsample': 0.75,
#             'subsample_freq': 1,
#             'learning_rate': 0.04,
#             'feature_fraction': 0.9,
#              'max_depth': 15,
#             'lambda_l1': 1,  
#             'lambda_l2': 1,
#             'verbose': 100,
#             'early_stopping_rounds': 100, 'eval_metric': 'cappa'
#             }

# print('Starting training...')
# # train
# #gbm_model = lgb.train(params_lgb, lgb_train, num_boost_round=20, valid_sets=lgb_eval, early_stopping_rounds=5)
# gbm_model = lgb.train(params_lgb, lgb_train, valid_sets=lgb_eval) #new200120 , verbose_eval=verbosity

# print('Starting predicting...')
# # predict
# gbm_pred = gbm_model.predict(val_X, num_iteration=gbm_model.best_iteration)
# # eval
# print(':', )
# print(f'The rmse of prediction is: {mean_squared_error(val_y, gbm_pred) ** 0.5}')
# #print(f'Skikit-learn Cohen Kappa Score (Quadratic) of predictions is: {cohen_kappa_score(val_y, gbm_pred, weights="quadratic")}')

In [136]:
# # Train on all dataset LightGBM Reg

# import lightgbm as lgb
# from sklearn.metrics import mean_squared_error

# # Create dataset for lightgbm on full train set
# lgb_train = lgb.Dataset(X, label=y)

# # specify parameters
# params_lgb = {'n_estimators': 142,
#             'boosting_type': 'gbdt',
#             'objective': 'regression',
#             'metric': 'rmse',
#             'subsample': 0.75,
#             'subsample_freq': 1,
#             'learning_rate': 0.04,
#             'feature_fraction': 0.9,
#              'max_depth': 15,
#             'lambda_l1': 1,  
#             'lambda_l2': 1,
#             'verbose': 100,
# #            'early_stopping_rounds': 100, 
#             'eval_metric': 'cappa'
#             }

# print('Starting training...')
# # train
# #gbm_model = lgb.train(params_lgb, lgb_train, num_boost_round=20, valid_sets=lgb_eval, early_stopping_rounds=5)
# gbm_model = lgb.train(params_lgb, lgb_train) #new200120 , verbose_eval=verbosity
# print('Training done...')

# print('***')
# print('Starting predicting...')
# # predict
# gbm_pred = gbm_model.predict(X, num_iteration=gbm_model.best_iteration)
# # eval
# print('***')
# print(f'The rmse of prediction is: {mean_squared_error(y, gbm_pred) ** 0.5}')
# #print(f'Skikit-learn Cohen Kappa Score (Quadratic) of predictions is: {cohen_kappa_score(y, gbm_pred, weights="quadratic")}')

In [137]:
# # LightGBM

# submission = pd.read_csv(path + 'sample_submission.csv')
# gbm_preds = gbm_model.predict(X_test_gt)

# submission['accuracy_group'] = gbm_preds

# submission['accuracy_group_weight0'] = np.where((submission['accuracy_group'] <= 0.81387600), 0, 0)
# submission['accuracy_group_weight1'] = np.where((submission['accuracy_group'] > 0.81387600) & (submission['accuracy_group'] <= 1.09392700), 1, 0)
# submission['accuracy_group_weight2'] = np.where((submission['accuracy_group'] > 1.09392700) & (submission['accuracy_group'] <= 1.42779600), 2, 0)
# submission['accuracy_group_weight3'] = np.where((submission['accuracy_group'] > 1.42779600), 3, 0)
# submission['accuracy_group'] = submission['accuracy_group_weight0'] + submission['accuracy_group_weight1'] + submission['accuracy_group_weight2'] + submission['accuracy_group_weight3']
# submission = submission.drop(['accuracy_group_weight0', 'accuracy_group_weight1', 'accuracy_group_weight2', 'accuracy_group_weight3'], axis=1)

# submission.to_csv("submission.csv", index = False)

# submission.accuracy_group.value_counts()

In [138]:
# gbm_preds = gbm_model.predict(X_test_gt)
# submission['accuracy_group'] = np.round(gbm_preds).astype(int)
# submission.to_csv("submission.csv", index = False)
# submission.head()
# submission.accuracy_group.value_counts()

In [139]:
del X_train_model
gc.collect()

26

# Preparing test set

In [140]:
# Preparing test set
X_test = pd.read_csv(path + 'test.csv', usecols = load_columns)
submission = pd.read_csv(path + 'sample_submission.csv')

In [141]:
def extract_accuracy_set_test(df):
    X_test_gt = pd.DataFrame(data=None)
    
    # X_test_gt will be used only for accuracy features extraction
    # First, filter assessment events only
    # Second, drop columns which will be processed separately
    
    X_test_gt = df[((df['event_code'] == 4100) & 
                     (df['title'].isin(['Cart Balancer (Assessment)', 
                                        'Cauldron Filler (Assessment)', 
                                        'Mushroom Sorter (Assessment)', 
                                        'Chest Sorter (Assessment)']))) | 
                    ((df['event_code'] == 4110) & 
                     (df['title'] == 'Bird Measurer (Assessment)'))].copy(deep=True)

    
#     #quick add of assessments_time
    
#     X_test_game_time = X_test_gt.groupby(['installation_id', 'game_session'], as_index=False, sort=False)['game_time'].last()
#     X_test_game_time = X_test_game_time.groupby('installation_id', as_index=False, sort=False)['game_time'].sum()
    
    X_test_gt.drop(['event_id', 
                     'timestamp', 
                     'event_count', 
                     'event_code', 
                     'game_time',
                     'type',
                     'world',], axis=1, inplace=True)
    
    # Third, extract correct and incorrect assessment attempts per user from 'event_data'
    # Create num_correct and num_incorrect columns
    
    corr = '"correct":true'
    incorr = '"correct":false'
    
    X_test_gt['num_correct'] = X_test_gt['event_data'].apply(lambda x: 1 if corr in x else 0)
    X_test_gt['num_incorrect'] = X_test_gt['event_data'].apply(lambda x: 1 if incorr in x else 0)
    
    # Fourth, aggregate (sum) correct and incorrect assessment attempts 
    # per 'game_session', 'installation_id' and assessment 'title'
    # As provided in grount truth (labels.csv)
    
    X_test_gt = X_test_gt.sort_values(['installation_id', 'game_session'], ascending=True).groupby(['game_session', 'installation_id', 'title'], as_index=False, sort=False).agg(sum)
    
    # Fifths, create 'accuracy' feature = corr / (corre + incorr)
    
    X_test_gt['accuracy'] = X_test_gt['num_correct'] / (X_test_gt['num_correct'] + X_test_gt['num_incorrect'])
    
    # Sixths, create 'accuracy_group' feature
    # 3: the assessment was solved on the first attempt
    # 2: the assessment was solved on the second attempt
    # 1: the assessment was solved after 3 or more attempts
    # 0: the assessment was never solved

    # If accuracy is 0.0 (no correct attempts), accuracy group is 0 as all observations in X_test_gt by now has at least one attempt
    # If accuracy is 1.0 (that is no incorrect attempts), accuracy group is 3
    # If accuracy is 0.5 (there is equal amount of correct and incorrect attempts), accuracy group is 2
    # Any other case means that accuracy group equals 1, that is 3 or more attempts were needed to make a correct attempt    

    X_test_gt['accuracy_group'] = X_test_gt['accuracy'].apply(lambda x: 0 if x == 0.0 else (3 if x == 1.0 else (2 if x == 0.5 else 1)))
   
    return X_test_gt

X_test_gt = extract_accuracy_set_test(X_test)

In [142]:
# debugging
debugging_ids(X_test_gt)

Debugging submitted dataframe: 
Unique installation_ids: 557 
Rows & columns count (2018, 7)


### (T) Assessment count
**Adjusted** for test set as:
* not all users took assessment
* in test.csv our forecasted assessment is not under 4100 or 4110 code, therefore does not include in gt df
* feature shows how many unique assessments user took before, not total count of non-unique assessments

In [143]:
# Creating the last assessment coll
X_test_gt['previous_assessments_count'] = X_test_gt.groupby('installation_id')['title'].transform('count')
# Difference with train prep:
# No need to reduce by one as last one under 4100 or 4110 code is not the one we are forecasting
# X_test_gt['previous_assessments_count'] = X_test_gt['previous_assessments_count'].apply(lambda x: x -1 if x > 1 else 0)

In [144]:
#X_test_gt.head(2)

In [145]:
# X_test[(X_test['installation_id'] == '01242218') & ((X_test['event_code'] == 4100) | (X_test['event_code'] == 4110))]

In [146]:
# debugging
debugging_ids(X_test_gt)

Debugging submitted dataframe: 
Unique installation_ids: 557 
Rows & columns count (2018, 8)


### (~T) Accuracy groups

* Should be fine as we do not have forecasted assessment's, that is do not count additional 0 accuracy_group

In [147]:
#Accuracy groups
X_test_gt['acc_0'] = X_test_gt['accuracy_group'].apply(lambda x: 1 if x == 0 else 0)
X_test_gt['acc_1'] = X_test_gt['accuracy_group'].apply(lambda x: 1 if x == 1 else 0)
X_test_gt['acc_2'] = X_test_gt['accuracy_group'].apply(lambda x: 1 if x == 2 else 0)
X_test_gt['acc_3'] = X_test_gt['accuracy_group'].apply(lambda x: 1 if x == 3 else 0)

In [148]:
# X_test_gt.head(5)

In [149]:
# debugging
debugging_ids(X_test_gt)

Debugging submitted dataframe: 
Unique installation_ids: 557 
Rows & columns count (2018, 12)


### (T) accuracy_group per assessment title

In [150]:
# 'accuracy_group' per assessment 'title'
# Ref: https://stackoverflow.com/questions/27474921/compare-two-columns-using-pandas/27475029
# (condition, output value, else)

X_test_gt['bird_accg_0'] = np.where((X_test_gt['title'] == 'Bird Measurer (Assessment)') & (X_test_gt['accuracy_group'] == 0), 1, 0)
X_test_gt['bird_accg_1'] = np.where((X_test_gt['title'] == 'Bird Measurer (Assessment)') & (X_test_gt['accuracy_group'] == 1), 1, 0)
X_test_gt['bird_accg_2'] = np.where((X_test_gt['title'] == 'Bird Measurer (Assessment)') & (X_test_gt['accuracy_group'] == 2), 1, 0)
X_test_gt['bird_accg_3'] = np.where((X_test_gt['title'] == 'Bird Measurer (Assessment)') & (X_test_gt['accuracy_group'] == 3), 1, 0)

X_test_gt['cart_accg_0'] = np.where((X_test_gt['title'] == 'Cart Balancer (Assessment)') & (X_test_gt['accuracy_group'] == 0), 1, 0)
X_test_gt['cart_accg_1'] = np.where((X_test_gt['title'] == 'Cart Balancer (Assessment)') & (X_test_gt['accuracy_group'] == 1), 1, 0)
X_test_gt['cart_accg_2'] = np.where((X_test_gt['title'] == 'Cart Balancer (Assessment)') & (X_test_gt['accuracy_group'] == 2), 1, 0)
X_test_gt['cart_accg_3'] = np.where((X_test_gt['title'] == 'Cart Balancer (Assessment)') & (X_test_gt['accuracy_group'] == 3), 1, 0)

X_test_gt['cauldron_accg_0'] = np.where((X_test_gt['title'] == 'Cauldron Filler (Assessment)') & (X_test_gt['accuracy_group'] == 0), 1, 0)
X_test_gt['cauldron_accg_1'] = np.where((X_test_gt['title'] == 'Cauldron Filler (Assessment)') & (X_test_gt['accuracy_group'] == 1), 1, 0)
X_test_gt['cauldron_accg_2'] = np.where((X_test_gt['title'] == 'Cauldron Filler (Assessment)') & (X_test_gt['accuracy_group'] == 2), 1, 0)
X_test_gt['cauldron_accg_3'] = np.where((X_test_gt['title'] == 'Cauldron Filler (Assessment)') & (X_test_gt['accuracy_group'] == 3), 1, 0)

X_test_gt['chest_accg_0'] = np.where((X_test_gt['title'] == 'Chest Sorter (Assessment)') & (X_test_gt['accuracy_group'] == 0), 1, 0)
X_test_gt['chest_accg_1'] = np.where((X_test_gt['title'] == 'Chest Sorter (Assessment)') & (X_test_gt['accuracy_group'] == 1), 1, 0)
X_test_gt['chest_accg_2'] = np.where((X_test_gt['title'] == 'Chest Sorter (Assessment)') & (X_test_gt['accuracy_group'] == 2), 1, 0)
X_test_gt['chest_accg_3'] = np.where((X_test_gt['title'] == 'Chest Sorter (Assessment)') & (X_test_gt['accuracy_group'] == 3), 1, 0)

X_test_gt['mushroom_accg_0'] = np.where((X_test_gt['title'] == 'Mushroom Sorter (Assessment)') & (X_test_gt['accuracy_group'] == 0), 1, 0)
X_test_gt['mushroom_accg_1'] = np.where((X_test_gt['title'] == 'Mushroom Sorter (Assessment)') & (X_test_gt['accuracy_group'] == 1), 1, 0)
X_test_gt['mushroom_accg_2'] = np.where((X_test_gt['title'] == 'Mushroom Sorter (Assessment)') & (X_test_gt['accuracy_group'] == 2), 1, 0)
X_test_gt['mushroom_accg_3'] = np.where((X_test_gt['title'] == 'Mushroom Sorter (Assessment)') & (X_test_gt['accuracy_group'] == 3), 1, 0)

# debugging
debugging_ids(X_test_gt)

Debugging submitted dataframe: 
Unique installation_ids: 557 
Rows & columns count (2018, 32)


### (T) Accuracy (num_correct, num_incorrect, accuracy) per assessment

In [151]:
# {title}_correct, {title}_incorrect, {title}_accuracy per 'installation_id' per assessment 'title'
# Ref: https://stackoverflow.com/questions/27474921/compare-two-columns-using-pandas/27475029
# (condition, output value, else)
# E.g. if Bird Measurer has num_correct = 1, add 1, elsewise add 0
# If Bird Measurer has num_incorrect = 12, add 12, elsewise add 0

X_test_gt['bird_correct'] = np.where((X_test_gt['title'] == 'Bird Measurer (Assessment)') & (X_test_gt['num_correct'] == 1), 1, 0)
X_test_gt['bird_incorrect'] = np.where((X_test_gt['title'] == 'Bird Measurer (Assessment)') & (X_test_gt['num_incorrect'] > 0), X_test_gt['num_incorrect'], 0)
X_test_gt['bird_accuracy'] = np.where((X_test_gt['title'] == 'Bird Measurer (Assessment)'), X_test_gt['accuracy'], 0)

X_test_gt['cart_correct'] = np.where((X_test_gt['title'] == 'Cart Balancer (Assessment)') & (X_test_gt['num_correct'] == 1), 1, 0)
X_test_gt['cart_incorrect'] = np.where((X_test_gt['title'] == 'Cart Balancer (Assessment)') & (X_test_gt['num_incorrect'] > 0), X_test_gt['num_incorrect'], 0)
X_test_gt['cart_accuracy'] = np.where((X_test_gt['title'] == 'Cart Balancer (Assessment)'), X_test_gt['accuracy'], 0)

X_test_gt['cauldron_correct'] = np.where((X_test_gt['title'] == 'Cauldron Filler (Assessment)') & (X_test_gt['num_correct'] == 1), 1, 0)
X_test_gt['cauldron_incorrect'] = np.where((X_test_gt['title'] == 'Cauldron Filler (Assessment)') & (X_test_gt['num_incorrect'] > 0), X_test_gt['num_incorrect'], 0)
X_test_gt['cauldron_accuracy'] = np.where((X_test_gt['title'] == 'Cauldron Filler (Assessment)'), X_test_gt['accuracy'], 0)

X_test_gt['chest_correct'] = np.where((X_test_gt['title'] == 'Chest Sorter (Assessment)') & (X_test_gt['num_correct'] == 1), 1, 0)
X_test_gt['chest_incorrect'] = np.where((X_test_gt['title'] == 'Chest Sorter (Assessment)') & (X_test_gt['num_incorrect'] > 0), X_test_gt['num_incorrect'], 0)
X_test_gt['chest_accuracy'] = np.where((X_test_gt['title'] == 'Chest Sorter (Assessment)'), X_test_gt['accuracy'], 0)

X_test_gt['mushroom_correct'] = np.where((X_test_gt['title'] == 'Mushroom Sorter (Assessment)') & (X_test_gt['num_correct'] == 1), 1, 0)
X_test_gt['mushroom_incorrect'] = np.where((X_test_gt['title'] == 'Mushroom Sorter (Assessment)') & (X_test_gt['num_incorrect'] > 0), X_test_gt['num_incorrect'], 0)
X_test_gt['mushroom_accuracy'] = np.where((X_test_gt['title'] == 'Mushroom Sorter (Assessment)'), X_test_gt['accuracy'], 0)

# debugging
debugging_ids(X_test_gt)

Debugging submitted dataframe: 
Unique installation_ids: 557 
Rows & columns count (2018, 47)


### (T) Aggregation of features

* Leaving single row per 'installation_id'

##### Headline in train: Saving the index of last (forecasted) assessment

* No need to separate FC assessments row from the rest as it is not included in test set
* Will perform only aggregation

In [152]:
# Not applicable to test set:
# # We prepare a dataframe which stores the index of last assessment of each installation_id with assessment attempt
# last_observations_index_df = X_test_gt.reset_index().groupby('installation_id', as_index=False)['index'].last()
# last_observations_index_list = list(last_observations_index_df['index']) 
# X_test_gt.drop(['game_session', 'title'], axis=1, inplace=True)
# # Creating a copy dataframe with last_observations and without them
# X_test_gt_last = X_test_gt.loc[X_test_gt.index.isin(last_observations_index_list)]
# X_test_gt_remainder = X_test_gt.loc[~X_test_gt.index.isin(last_observations_index_list)]

X_test_gt_remainder_sum_list = X_train_gt_sum_list

# X_test_gt_remainder_sum_list = ['num_correct', 'num_incorrect', 
#        'bird_correct', 'bird_incorrect',
#        'cart_correct', 'cart_incorrect', 'cauldron_correct',
#        'cauldron_incorrect', 'chest_correct',
#        'chest_incorrect', 'mushroom_correct',
#        'mushroom_incorrect', 'acc_0',
#        'acc_1', 'acc_2', 'acc_3', 'bird_accg_0', 'bird_accg_1', 'bird_accg_2',
#        'bird_accg_3', 'cart_accg_0', 'cart_accg_1', 'cart_accg_2',
#        'cart_accg_3', 'cauldron_accg_0', 'cauldron_accg_1', 'cauldron_accg_2',
#        'cauldron_accg_3', 'chest_accg_0', 'chest_accg_1', 'chest_accg_2',
#        'chest_accg_3', 'mushroom_accg_0', 'mushroom_accg_1', 'mushroom_accg_2',
#        'mushroom_accg_3']

X_test_gt_remainder_mean_list = X_train_gt_mean_list

# X_test_gt_remainder_mean_list = ['accuracy',
#        'accuracy_group', 'bird_accuracy',
#        'cart_accuracy', 'cauldron_accuracy', 'chest_accuracy', 'mushroom_accuracy']

# !!! Should add 'forecasted_assessment'
# Removed 'sessions_with_assessment_count'
X_test_gt_remainder_unchanged_list = ['previous_assessments_count']

# Difference in train set:
# X_test_gt_remainder_unchanged_list = ['Y_target', 'forecasted_assessment', 'previous_assessments_count', 'sessions_with_assessment_count'] 

# Difference in train set:
# We do not define X_test_gt_remainder and take all in X_test_gt
X_test_gt_sum = X_test_gt.groupby(['installation_id'], as_index=False, sort=False)[X_test_gt_remainder_sum_list].agg(sum)
X_test_gt_mean = X_test_gt.groupby(['installation_id'], as_index=False, sort=False)[X_test_gt_remainder_mean_list].agg('mean')
X_test_gt_unchaged = X_test_gt.groupby(['installation_id'], as_index=False, sort=False)[X_test_gt_remainder_unchanged_list].last()

# Merge both
X_test_gt_remainder = pd.merge(X_test_gt_sum, X_test_gt_mean, how='left', on=['installation_id'])
X_test_gt = pd.merge(X_test_gt_remainder, X_test_gt_unchaged, how='left', on=['installation_id'])

# Not applicable to test set:
# # Returning the installation_ids which had no previous assessments before the forecasted one
# #X_test_gt = pd.concat([X_test_gt_remainder, X_test_gt_last]).sort_index().reset_index(drop=True) index got broken while grouping by
# X_test_gt = X_test_gt_remainder.append(X_test_gt_last, ignore_index=True)

# # Questionable re sorting as it drops installation_id, need to test
# X_test_gt = pd.concat([X_test_gt_remainder, X_test_gt_last]).drop_duplicates('installation_id').reset_index(drop=True)

In [153]:
X_test_gt.head(5)

Unnamed: 0,installation_id,num_correct,num_incorrect,bird_correct,bird_incorrect,cart_correct,cart_incorrect,cauldron_correct,cauldron_incorrect,chest_correct,chest_incorrect,mushroom_correct,mushroom_incorrect,acc_0,acc_1,acc_2,acc_3,bird_accg_0,bird_accg_1,bird_accg_2,bird_accg_3,cart_accg_0,cart_accg_1,cart_accg_2,cart_accg_3,cauldron_accg_0,cauldron_accg_1,cauldron_accg_2,cauldron_accg_3,chest_accg_0,chest_accg_1,chest_accg_2,chest_accg_3,mushroom_accg_0,mushroom_accg_1,mushroom_accg_2,mushroom_accg_3,accuracy,accuracy_group,bird_accuracy,cart_accuracy,cauldron_accuracy,chest_accuracy,mushroom_accuracy,previous_assessments_count
0,00abaee7,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1.0,3.0,0.0,1.0,0.0,0.0,0.0,1
1,01242218,4,7,1,2,1,0,1,1,0,3,1,1,1,1,2,1,0,1,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,1,0,0.466667,1.6,0.066667,0.2,0.1,0.0,0.1,5
2,02256298,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1.0,3.0,0.0,1.0,0.0,0.0,0.0,1
3,027e7ce5,7,2,1,0,1,0,2,2,1,0,2,0,0,1,0,6,0,0,0,1,0,0,0,1,0,1,0,1,0,0,0,1,0,0,0,2,0.904762,2.714286,0.142857,0.142857,0.190476,0.142857,0.285714,7
4,02a29f99,1,14,0,13,0,0,1,1,0,0,0,0,2,0,1,0,2,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0.166667,0.666667,0.0,0.0,0.166667,0.0,0.0,3


In [154]:
# debugging
debugging_ids(X_test_gt)

Debugging submitted dataframe: 
Unique installation_ids: 557 
Rows & columns count (557, 45)


In [155]:
# # !debugging, finding heavy user
# X_test_gt[X_test_gt['num_correct'] == X_test_gt.num_correct.max()]

In [156]:
# # !debugging on heavy user
# X_test[(X_test['installation_id'] == '56a739ec') & (X_test['event_code'] == 4100) & (X_test['title'] == 'Cart Balancer (Assessment)')]

### Adding users w/o previous assessment attempts

* Test set specific as in train set we used only 'intallation_id's with at least one assessment attempt 

In [157]:
test_features_list = X_test_gt.columns
X_test_gt.columns

Index(['installation_id', 'num_correct', 'num_incorrect', 'bird_correct',
       'bird_incorrect', 'cart_correct', 'cart_incorrect', 'cauldron_correct',
       'cauldron_incorrect', 'chest_correct', 'chest_incorrect',
       'mushroom_correct', 'mushroom_incorrect', 'acc_0', 'acc_1', 'acc_2',
       'acc_3', 'bird_accg_0', 'bird_accg_1', 'bird_accg_2', 'bird_accg_3',
       'cart_accg_0', 'cart_accg_1', 'cart_accg_2', 'cart_accg_3',
       'cauldron_accg_0', 'cauldron_accg_1', 'cauldron_accg_2',
       'cauldron_accg_3', 'chest_accg_0', 'chest_accg_1', 'chest_accg_2',
       'chest_accg_3', 'mushroom_accg_0', 'mushroom_accg_1', 'mushroom_accg_2',
       'mushroom_accg_3', 'accuracy', 'accuracy_group', 'bird_accuracy',
       'cart_accuracy', 'cauldron_accuracy', 'chest_accuracy',
       'mushroom_accuracy', 'previous_assessments_count'],
      dtype='object')

In [158]:
test_users_wo_assessments = set(X_test.installation_id) - set(X_test_gt.installation_id)
len(test_users_wo_assessments)

443

### Creating empty df matching test's columns

* Filled with 0
* Alternatively could test with Nan, None or -1

In [159]:
test_users_wo_assessments_df = pd.DataFrame(0, index=np.arange(len(test_users_wo_assessments)), columns=test_features_list)

In [160]:
test_users_wo_assessments_df

Unnamed: 0,installation_id,num_correct,num_incorrect,bird_correct,bird_incorrect,cart_correct,cart_incorrect,cauldron_correct,cauldron_incorrect,chest_correct,chest_incorrect,mushroom_correct,mushroom_incorrect,acc_0,acc_1,acc_2,acc_3,bird_accg_0,bird_accg_1,bird_accg_2,bird_accg_3,cart_accg_0,cart_accg_1,cart_accg_2,cart_accg_3,cauldron_accg_0,cauldron_accg_1,cauldron_accg_2,cauldron_accg_3,chest_accg_0,chest_accg_1,chest_accg_2,chest_accg_3,mushroom_accg_0,mushroom_accg_1,mushroom_accg_2,mushroom_accg_3,accuracy,accuracy_group,bird_accuracy,cart_accuracy,cauldron_accuracy,chest_accuracy,mushroom_accuracy,previous_assessments_count
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
438,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
439,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
440,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
441,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


### Adding 'installation_id's w/o prior assessments

In [161]:
# We have created installation_id column with zero values. Now will assign missing installation_id
test_users_wo_assessments_df['installation_id'] = test_users_wo_assessments

In [162]:
test_users_wo_assessments_df.head(2)

Unnamed: 0,installation_id,num_correct,num_incorrect,bird_correct,bird_incorrect,cart_correct,cart_incorrect,cauldron_correct,cauldron_incorrect,chest_correct,chest_incorrect,mushroom_correct,mushroom_incorrect,acc_0,acc_1,acc_2,acc_3,bird_accg_0,bird_accg_1,bird_accg_2,bird_accg_3,cart_accg_0,cart_accg_1,cart_accg_2,cart_accg_3,cauldron_accg_0,cauldron_accg_1,cauldron_accg_2,cauldron_accg_3,chest_accg_0,chest_accg_1,chest_accg_2,chest_accg_3,mushroom_accg_0,mushroom_accg_1,mushroom_accg_2,mushroom_accg_3,accuracy,accuracy_group,bird_accuracy,cart_accuracy,cauldron_accuracy,chest_accuracy,mushroom_accuracy,previous_assessments_count
0,4be715ec,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1423dc8f,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


### (~T) Merging 'installation_id's with and w/o assessments

In [163]:
X_test_gt = X_test_gt.append(test_users_wo_assessments_df, ignore_index=True)

In [164]:
# debugging
debugging_ids(X_test_gt)

Debugging submitted dataframe: 
Unique installation_ids: 1000 
Rows & columns count (1000, 45)


In [165]:
# debugging
len(set(X_test_gt.installation_id))

1000

In [166]:
# debugging
# we lost the order of 'installation_id', but submission is sorted ascending
booltest_sub = X_test_gt.installation_id.sort_values(ascending=True).reset_index(drop=True) == submission.installation_id
set(booltest_sub)

{True}

### (T) Sorting to match order of submission

In [167]:
X_test_gt = X_test_gt.sort_values('installation_id', ascending=True).reset_index(drop=True)

In [168]:
X_test_gt.head(10)

Unnamed: 0,installation_id,num_correct,num_incorrect,bird_correct,bird_incorrect,cart_correct,cart_incorrect,cauldron_correct,cauldron_incorrect,chest_correct,chest_incorrect,mushroom_correct,mushroom_incorrect,acc_0,acc_1,acc_2,acc_3,bird_accg_0,bird_accg_1,bird_accg_2,bird_accg_3,cart_accg_0,cart_accg_1,cart_accg_2,cart_accg_3,cauldron_accg_0,cauldron_accg_1,cauldron_accg_2,cauldron_accg_3,chest_accg_0,chest_accg_1,chest_accg_2,chest_accg_3,mushroom_accg_0,mushroom_accg_1,mushroom_accg_2,mushroom_accg_3,accuracy,accuracy_group,bird_accuracy,cart_accuracy,cauldron_accuracy,chest_accuracy,mushroom_accuracy,previous_assessments_count
0,00abaee7,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1.0,3.0,0.0,1.0,0.0,0.0,0.0,1
1,01242218,4,7,1,2,1,0,1,1,0,3,1,1,1,1,2,1,0,1,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,1,0,0.466667,1.6,0.066667,0.2,0.1,0.0,0.1,5
2,017c5718,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,01a44906,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,01bc6cb6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
5,02256298,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1.0,3.0,0.0,1.0,0.0,0.0,0.0,1
6,0267757a,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
7,027e7ce5,7,2,1,0,1,0,2,2,1,0,2,0,0,1,0,6,0,0,0,1,0,0,0,1,0,1,0,1,0,0,0,1,0,0,0,2,0.904762,2.714286,0.142857,0.142857,0.190476,0.142857,0.285714,7
8,02a29f99,1,14,0,13,0,0,1,1,0,0,0,0,2,0,1,0,2,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0.166667,0.666667,0.0,0.0,0.166667,0.0,0.0,3
9,0300c576,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [169]:
# debugging sorting
booltest_train = X_test_gt.installation_id == submission.installation_id
set(booltest_train)

{True}

In [170]:
# debugging
debugging_ids(X_test_gt)

Debugging submitted dataframe: 
Unique installation_ids: 1000 
Rows & columns count (1000, 45)


### (T) Adding 'forecasted_assessment' feature

* To both 'installation_id's with and w/o assessment attempt
* It fixes initial bug where 'installation_id's w/o assessment attempt got their last attempted assessment as their 'forecasted_assessment' 

In [171]:
# Create the forecasted_assessment_df which will contain all test set's installation_ids last forecasted_assessment

forecasted_assessment_df = X_test.groupby(['installation_id'], as_index=False, sort=False).agg('last')

# Reduce forecasted_assessment_df to users only w/o assessment (1000 -> 443):
# forecasted_assessment_df = forecasted_assessment_df[forecasted_assessment_df.installation_id.isin(test_users_wo_assessments)]
# Reseting the index, otherwise will get Nans when mapping:
# forecasted_assessment_df.reset_index()

In [172]:
# forecasted_assessment_df.shape

* Add 'forecasted_assessment' feature to the test set

In [173]:
# Add forecasted_assessment number to X_test_gt:
# Map is how train set has assigned values to assessment titles:
# 0 Bird Measurer (Assessment)
# 1 Cart Balancer (Assessment)
# 2 Cauldron Filler (Assessment)
# 3 Chest Sorter (Assessment)
# 4 Mushroom Sorter (Assessment)
X_test_gt['forecasted_assessment'] = forecasted_assessment_df['title'].map({'Bird Measurer (Assessment)': 0,
                                                                                               'Cart Balancer (Assessment)': 1, 
                                                                                               'Cauldron Filler (Assessment)': 2, 
                                                                                               'Chest Sorter (Assessment)': 3, 
                                                                                    'Mushroom Sorter (Assessment)': 4})

In [174]:
X_test_gt.head(2)

Unnamed: 0,installation_id,num_correct,num_incorrect,bird_correct,bird_incorrect,cart_correct,cart_incorrect,cauldron_correct,cauldron_incorrect,chest_correct,chest_incorrect,mushroom_correct,mushroom_incorrect,acc_0,acc_1,acc_2,acc_3,bird_accg_0,bird_accg_1,bird_accg_2,bird_accg_3,cart_accg_0,cart_accg_1,cart_accg_2,cart_accg_3,cauldron_accg_0,cauldron_accg_1,cauldron_accg_2,cauldron_accg_3,chest_accg_0,chest_accg_1,chest_accg_2,chest_accg_3,mushroom_accg_0,mushroom_accg_1,mushroom_accg_2,mushroom_accg_3,accuracy,accuracy_group,bird_accuracy,cart_accuracy,cauldron_accuracy,chest_accuracy,mushroom_accuracy,previous_assessments_count,forecasted_assessment
0,00abaee7,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1.0,3.0,0.0,1.0,0.0,0.0,0.0,1,2
1,01242218,4,7,1,2,1,0,1,1,0,3,1,1,1,1,2,1,0,1,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,1,0,0.466667,1.6,0.066667,0.2,0.1,0.0,0.1,5,1


In [175]:
# debugging
set(X_test_gt.forecasted_assessment), X_test_gt.forecasted_assessment.count()

({0, 1, 2, 3, 4}, 1000)

In [176]:
# debugging
debugging_ids(X_test_gt)

Debugging submitted dataframe: 
Unique installation_ids: 1000 
Rows & columns count (1000, 46)


In [177]:
# # debugging
# X_test_gt.loc[441, ['forecasted_assessment']]

In [178]:
# # debugging
# X_test_gt.loc[441,]

In [179]:
# # debugging OK - 'forecasted_assessment' of '779b71a3' is 'Chest Sorter (Assessment)' or encoded 3 
# X_test[X_test['installation_id'] == '779b71a3'].tail()

# Adding none acc features to the test set

## (~T) event_code

#### Preparing event_code features

In [180]:
# Uses ~3 GB of RAM for this operation (9->12->9)
X_test_eventcode = X_test.filter(['installation_id', 'event_code'], axis=1)
X_test_eventcode = event_code(X_test)

#### Merging event_code features to the main test set

# Add event_code features to the main dataframe
X_test_gt = pd.merge(X_test_gt, X_test_eventcode, on=['installation_id'])

del X_test_eventcode
gc.collect()

# debugging
debugging_ids(X_test_gt)

Debugging submitted dataframe: 
Unique installation_ids: 1000 
Rows & columns count (1000, 90)


## (~T) Title, type, world and event_code

#### Preparing title, type and world features

In [181]:
# Create new X_test_titletypeworldfeat which holds time only title, type and world features
X_test_titletypeworldfeat = X_test.filter(['installation_id', 'title', 'type', 'world'], axis=1)
X_test_titletypeworldfeat = title_type_world(X_test_titletypeworldfeat)

#### Merging title, type and world features to the main test set

# Add title, type and world features to the main dataframe
X_test_gt = pd.merge(X_test_gt, X_test_titletypeworldfeat, on=['installation_id'])
# # Count nan in df for debugging purposes
# X_test_gt.isna().sum()

del X_test_titletypeworldfeat
gc.collect()

# debugging
debugging_ids(X_test_gt)

Debugging submitted dataframe: 
Unique installation_ids: 1000 
Rows & columns count (1000, 142)


## (~T) Other features

* all_actions_time

* Aggregate amount (in ms) of time spent on Assessments, Activities and Games
* Clips do not have time spent feature

* all_actions_time
* action_duration_mean (!!!)
* event_code_count_mean
* number_of_sessions_nu
* event_count_mean (!!!)

###### all_actions_time

In [182]:
# Creating all_actions_time (games, activities and assessments)
# RAM: 8.7->8.5-8.7 GB
feat_gametime = X_test[X_test['type'].isin(['Assessment', 'Game', 'Activity'])]
feat_gametime = feat_gametime.groupby(['installation_id', 'game_session'], as_index=False, sort=False)['game_time'].last()
feat_gametime = feat_gametime.groupby('installation_id', as_index=False, sort=False)['game_time'].sum()

# Merging to the main test set
X_test_gt['all_actions_time'] = feat_gametime['game_time']

# Deleting
del feat_gametime
gc.collect()

# debugging
debugging_ids(X_test_gt)

Debugging submitted dataframe: 
Unique installation_ids: 1000 
Rows & columns count (1000, 143)


###### action_duration_mean

In [183]:
# Creating action_duration_mean (games, activities and assessments) (!!!)
# RAM: 8.7->9.6->8.7 GB
feat_gametimemean = X_test[X_test['type'].isin(['Assessment', 'Game', 'Activity'])]
feat_gametimemean = feat_gametimemean.groupby(['installation_id', 'game_session'], as_index=False, sort=False)['game_time'].last()
feat_gametimemean = feat_gametimemean.groupby('installation_id', as_index=False, sort=False)['game_time'].mean()

# Merging to the main test set
X_test_gt['action_duration_mean'] = feat_gametimemean['game_time']

# Deleting
del feat_gametimemean
gc.collect()

# debugging
debugging_ids(X_test_gt)

Debugging submitted dataframe: 
Unique installation_ids: 1000 
Rows & columns count (1000, 144)


###### event_code_count_mean

In [184]:
# Creating event_code_count_mean (!!!)
# RAM: OK, flat
feat_eventcodecountmean = X_test.groupby(['installation_id', 'game_session'], as_index=False, sort=False)['event_code'].count()
feat_eventcodecountmean = feat_eventcodecountmean.groupby('installation_id', as_index=False, sort=False)['event_code'].mean()

# Merging to the main test set
X_test_gt['event_code_count_mean'] = feat_eventcodecountmean['event_code']

# Deleting
del feat_eventcodecountmean
gc.collect()

# debugging
debugging_ids(X_test_gt)

Debugging submitted dataframe: 
Unique installation_ids: 1000 
Rows & columns count (1000, 145)


##### number_of_sessions_nu

In [185]:
# Creating event_code_count_mean
# RAM: OK, flat
feat_numberofsessions = X_test.groupby(['installation_id'], as_index=False, sort=False)['game_session'].count()

# Merging to the main test set
X_test_gt['number_of_sessions_nu'] = feat_numberofsessions['game_session']

# Deleting
del feat_numberofsessions
gc.collect()

# debugging
debugging_ids(X_test_gt)

Debugging submitted dataframe: 
Unique installation_ids: 1000 
Rows & columns count (1000, 146)


##### event_count_mean

In [186]:
# Creating event_count_mean (!!!)
# RAM: OK, flat
feat_eventcountmean = X_test.groupby(['installation_id', 'game_session'], as_index=False, sort=False)['event_count'].last()
feat_eventcountmean = feat_eventcountmean.groupby('installation_id', as_index=False, sort=False)['event_count'].mean()

# Merging to the main test set
X_test_gt['event_count_mean'] = feat_eventcountmean['event_count']

# Deleting
del feat_eventcountmean
gc.collect()

# debugging
debugging_ids(X_test_gt)

Debugging submitted dataframe: 
Unique installation_ids: 1000 
Rows & columns count (1000, 147)


### (~T) timestamp

#### bug - taking the last even, which might be not assessment
#### could replace with mean

In [187]:
# RAM 8.7->10->9.3
# Create new X_test_timefeat which holds time only features  
feat_time = X_test.filter(['installation_id', 'timestamp'], axis=1)
# Prepare time features from given timestamp 
feat_time = timestamp_split(feat_time)

# Defining as last (bug)
feat_time = feat_time.groupby('installation_id', as_index=False).last()

# Merging to the main test set
X_test_gt = pd.merge(X_test_gt, feat_time, on=['installation_id'])

# Deleting
del feat_time
gc.collect()

#del X_test
gc.collect()

# debugging
debugging_ids(X_test_gt)

  mask |= (ar1 == a)


Debugging submitted dataframe: 
Unique installation_ids: 1000 
Rows & columns count (1000, 155)


### (legacy) title, type and world

In [188]:
# # Re-using f-ion used in train
# # Create new titletypeworldfeat which holds time only title, type and world features  
# X_test_titletypeworldfeat = X_test.filter(['installation_id', 'title', 'type', 'world'], axis=1)
# # Prepare title, type and world features from given timestamp 
# X_test_titletypeworldfeat = title_type_world(X_test_titletypeworldfeat)
# X_test_titletypeworldfeat

In [189]:
# # debugging OK, 'ffe00ca8' has 5 rows in 'world' 'CRYSTALCAVES'
# X_test[(X_test['installation_id'] == 'ffe00ca8') & (X_test['world'] == 'CRYSTALCAVES')]

### (legacy) merge of timestamp, type, title and world features to main test set

##### debugging index before merger

* to avoid incorrectly assigning features from another 'installation_id's  

In [190]:
# # debugging sorting of timefeat

# booltest_timefeat = X_test_gt.installation_id == X_test_timefeat.installation_id
# set(booltest_timefeat)

In [191]:
# # debugging sorting of X_test_titletypeworldfeat

# booltest_titletypeworldfeat = X_test_gt.installation_id == X_test_titletypeworldfeat.installation_id
# set(booltest_titletypeworldfeat)

In [192]:
# # debugging
# debugging_ids(X_test_gt)

##### merging time features

In [193]:
# # debugging
# debugging_ids(X_test_timefeat)

In [194]:
# # Merging new features to main test set

# # Add time features to the main dataframe
# X_test_gt = pd.merge(X_test_gt, X_test_timefeat, on=['installation_id'])

In [195]:
# len(set(X_test_gt.installation_id)), X_test_gt.shape

##### merging titletypeworld features

In [196]:
# # debugging - count nan in df - OK
# X_test_gt.isna().sum()

In [197]:
# # Add title, type and world features to the main dataframe
# X_test_gt = pd.merge(X_test_gt, X_test_titletypeworldfeat, on=['installation_id'])

In [198]:
# len(set(X_test_gt.installation_id)), X_test_gt.shape

In [199]:
# # Count nan in df for debugging purposes
#set(X_test_gt.isna().sum())

In [200]:
# # debugging sorting
# booltest_sub = X_test_gt.installation_id == submission.installation_id
# set(booltest_sub)

#### Cleaning unused dfs and variables

In [201]:
#del X_test, X_test_gt_remainder_sum_list, X_test_gt_remainder_mean_list, X_test_gt_remainder_unchanged_list, X_test_gt_sum, X_test_gt_mean, X_test_gt_unchaged, test_features_list, test_users_wo_assessments, test_users_wo_assessments_df, forecasted_assessment_df, X_test_timefeat, X_test_titletypeworldfeat
gc.collect()

0

## (~T) all_actions_time

* Aggregate amount (in ms) of time spent on Assessments, Activities and Games
* Clips do not have time spent feature

In [202]:
# #### Adding feature all_actions_time 
# feat_gametime_test = X_test[X_test['type'].isin(['Assessment', 'Game', 'Activity'])]
# #feat_gametime_test

# feat_gametime_test = feat_gametime_test.groupby(['installation_id', 'game_session'], as_index=False, sort=False)['game_time'].last()
# #feat_gametime_test

# feat_gametime_test = feat_gametime_test.groupby('installation_id', as_index=False, sort=False)['game_time'].sum()
# feat_gametime_test

# # debugging
# #X_test[X_test['installation_id'] == 'b37e2b2d']

# #feat_gametime_test[feat_gametime_test['installation_id'] == 'b37e2b2d']

# Submission

In [203]:
len(set(X_test_gt.installation_id)), X_test_gt.shape

(1000, (1000, 155))

In [204]:
# debugging - check if df feature types
X_test_gt.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 0 to 999
Columns: 155 entries, installation_id to is_weekend
dtypes: float64(104), int64(50), object(1)
memory usage: 1.2+ MB


In [205]:
# debugging sorting
booltest_sub = X_test_gt.installation_id == submission.installation_id
set(booltest_sub)

{True}

In [206]:
# drop installation_id
X_test_gt = X_test_gt.drop(['installation_id'], axis=1)

In [207]:
len(set(X_test_gt.index)), X_test_gt.shape

(1000, (1000, 154))

In [208]:
# # cast forecasted assessment to str for cat_features
# X_test_gt['forecasted_assessment'] = X_test_gt['forecasted_assessment'].astype(str)
# type(X_test_gt.forecasted_assessment[0])
# X_train_gt.previous_assessments_count

In [209]:
# Elsewise LightGBMError: Do not support special JSON characters in feature name.
X_test_gt.columns = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in X_test_gt.columns]

In [210]:
list(X_test_gt.columns)

['num_correct',
 'num_incorrect',
 'bird_correct',
 'bird_incorrect',
 'cart_correct',
 'cart_incorrect',
 'cauldron_correct',
 'cauldron_incorrect',
 'chest_correct',
 'chest_incorrect',
 'mushroom_correct',
 'mushroom_incorrect',
 'acc_0',
 'acc_1',
 'acc_2',
 'acc_3',
 'bird_accg_0',
 'bird_accg_1',
 'bird_accg_2',
 'bird_accg_3',
 'cart_accg_0',
 'cart_accg_1',
 'cart_accg_2',
 'cart_accg_3',
 'cauldron_accg_0',
 'cauldron_accg_1',
 'cauldron_accg_2',
 'cauldron_accg_3',
 'chest_accg_0',
 'chest_accg_1',
 'chest_accg_2',
 'chest_accg_3',
 'mushroom_accg_0',
 'mushroom_accg_1',
 'mushroom_accg_2',
 'mushroom_accg_3',
 'accuracy',
 'accuracy_group',
 'bird_accuracy',
 'cart_accuracy',
 'cauldron_accuracy',
 'chest_accuracy',
 'mushroom_accuracy',
 'previous_assessments_count',
 'forecasted_assessment',
 'event_count',
 'game_time',
 'event_code_2000',
 'event_code_2010',
 'event_code_2020',
 'event_code_2025',
 'event_code_2030',
 'event_code_2035',
 'event_code_2040',
 'event_code_2

#### Scaler

In [211]:
X_test_gt_scaled = scaler.fit_transform(X_test_gt.astype(np.float64))

In [212]:
# # Catboost clf submission
# # Important! X_test_gt_scaled added
# cbc_preds = cbc_model.predict(X_test_gt_scaled)
# submission['accuracy_group'] = cbc_preds.astype(int)
# submission.to_csv("submission.csv", index = False)
# submission.head()

In [213]:
# #Catboost reg submission
# # Important! X_test_gt_scaled added
# cbc_preds = cbc_model.predict(X_test_gt_scaled)
# submission['accuracy_group'] = np.ceil(cbc_preds).astype(int)
# submission.to_csv("submission.csv", index = False)
# submission.head()

##### PCA for test set

In [214]:
# Apply PCA for dimension reduction
#from sklearn.decomposition import PCA
#pca_test = PCA(n_components=10).fit(X_test_gt)
X_test_gt = pca.transform(X_test_gt)
print(sum(pca.explained_variance_ratio_))

0.9999999999995036


##### Weighting and submission

In [215]:
# LightGBM

submission = pd.read_csv(path + 'sample_submission.csv')
gbm_preds = gbm_model.predict(X_test_gt)

submission['accuracy_group'] = gbm_preds

submission['accuracy_group_weight0'] = np.where((submission['accuracy_group'] <= 0.81387600), 0, 0)
submission['accuracy_group_weight1'] = np.where((submission['accuracy_group'] > 0.81387600) & (submission['accuracy_group'] <= 1.09392700), 1, 0)
submission['accuracy_group_weight2'] = np.where((submission['accuracy_group'] > 1.09392700) & (submission['accuracy_group'] <= 1.42779600), 2, 0)
submission['accuracy_group_weight3'] = np.where((submission['accuracy_group'] > 1.42779600), 3, 0)
submission['accuracy_group'] = submission['accuracy_group_weight0'] + submission['accuracy_group_weight1'] + submission['accuracy_group_weight2'] + submission['accuracy_group_weight3']
submission = submission.drop(['accuracy_group_weight0', 'accuracy_group_weight1', 'accuracy_group_weight2', 'accuracy_group_weight3'], axis=1)

submission.to_csv("submission.csv", index = False)

submission.accuracy_group.value_counts()

3    720
0    123
2     92
1     65
Name: accuracy_group, dtype: int64

In [216]:
submission.accuracy_group.value_counts()

3    720
0    123
2     92
1     65
Name: accuracy_group, dtype: int64

In [217]:
# # xgboost submission
# xgb_preds = xgb_model.predict(X_test_gt)
# submission['accuracy_group'] = xgb_preds.astype(int)
# submission.to_csv("submission.csv", index = False)
# submission.head()

In [218]:
# LightGBM submission
# gbm_preds = gbm_model.predict(X_test_gt)
# submission['accuracy_group'] = np.round(gbm_preds).astype(int)
# submission.to_csv("submission.csv", index = False)
# submission.head()
# submission.accuracy_group.value_counts()