In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
import lightgbm as lgb

In [3]:
path=Path('/kaggle/data_science_bowl')
path

PosixPath('/kaggle/data_science_bowl')

### Read Data

In [4]:
def read_data():
    train_df = pd.read_csv(path/'train.csv')
    test_df = pd.read_csv(path/'test.csv')
    train_labels_df = pd.read_csv(path/'train_labels.csv')
    specs_df = pd.read_csv(path/'specs.csv')
    return train_df, test_df, train_labels_df, specs_df

In [5]:
train_df, test_df, train_labels_df, specs_df = read_data()

In [6]:
train_df.head()

Unnamed: 0,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title,type,world
0,27253bdc,45bb1e1b6b50c07b,2019-09-06T17:53:46.937Z,"{""event_code"": 2000, ""event_count"": 1}",0001e90f,1,2000,0,Welcome to Lost Lagoon!,Clip,NONE
1,27253bdc,17eeb7f223665f53,2019-09-06T17:54:17.519Z,"{""event_code"": 2000, ""event_count"": 1}",0001e90f,1,2000,0,Magma Peak - Level 1,Clip,MAGMAPEAK
2,77261ab5,0848ef14a8dc6892,2019-09-06T17:54:56.302Z,"{""version"":""1.0"",""event_count"":1,""game_time"":0...",0001e90f,1,2000,0,Sandcastle Builder (Activity),Activity,MAGMAPEAK
3,b2dba42b,0848ef14a8dc6892,2019-09-06T17:54:56.387Z,"{""description"":""Let's build a sandcastle! Firs...",0001e90f,2,3010,53,Sandcastle Builder (Activity),Activity,MAGMAPEAK
4,1bb5fbdb,0848ef14a8dc6892,2019-09-06T17:55:03.253Z,"{""description"":""Let's build a sandcastle! Firs...",0001e90f,3,3110,6972,Sandcastle Builder (Activity),Activity,MAGMAPEAK


In [7]:
test_df.head()

Unnamed: 0,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title,type,world
0,27253bdc,0ea9ecc81a565215,2019-09-10T16:50:24.910Z,"{""event_code"": 2000, ""event_count"": 1}",00abaee7,1,2000,0,Welcome to Lost Lagoon!,Clip,NONE
1,27253bdc,c1ea43d8b8261d27,2019-09-10T16:50:55.503Z,"{""event_code"": 2000, ""event_count"": 1}",00abaee7,1,2000,0,Magma Peak - Level 1,Clip,MAGMAPEAK
2,27253bdc,7ed86c6b72e725e2,2019-09-10T16:51:51.805Z,"{""event_code"": 2000, ""event_count"": 1}",00abaee7,1,2000,0,Magma Peak - Level 2,Clip,MAGMAPEAK
3,27253bdc,7e516ace50e7fe67,2019-09-10T16:53:12.825Z,"{""event_code"": 2000, ""event_count"": 1}",00abaee7,1,2000,0,Crystal Caves - Level 1,Clip,CRYSTALCAVES
4,7d093bf9,a022c3f60ba547e7,2019-09-10T16:54:12.115Z,"{""version"":""1.0"",""round"":0,""event_count"":1,""ga...",00abaee7,1,2000,0,Chow Time,Game,CRYSTALCAVES


In [8]:
train_labels_df.head()

Unnamed: 0,game_session,installation_id,title,num_correct,num_incorrect,accuracy,accuracy_group
0,6bdf9623adc94d89,0006a69f,Mushroom Sorter (Assessment),1,0,1.0,3
1,77b8ee947eb84b4e,0006a69f,Bird Measurer (Assessment),0,11,0.0,0
2,901acc108f55a5a1,0006a69f,Mushroom Sorter (Assessment),1,0,1.0,3
3,9501794defd84e4d,0006a69f,Mushroom Sorter (Assessment),1,1,0.5,2
4,a9ef3ecb3d1acc6a,0006a69f,Bird Measurer (Assessment),1,0,1.0,3


## Feature Engineering

In [9]:
main_keys = ['installation_id', 'game_session', 'title']
merge_args = {'left_index':True, 'right_index':True}

In [10]:
merged_train_df = pd.merge(train_df, train_labels_df, on=main_keys)

In [11]:
merged_train_df

Unnamed: 0,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title,type,world,num_correct,num_incorrect,accuracy,accuracy_group
0,3bfd1a65,901acc108f55a5a1,2019-08-06T05:22:01.344Z,"{""version"":""1.0"",""event_count"":1,""game_time"":0...",0006a69f,1,2000,0,Mushroom Sorter (Assessment),Assessment,TREETOPCITY,1,0,1.000000,3
1,db02c830,901acc108f55a5a1,2019-08-06T05:22:01.400Z,"{""event_count"":2,""game_time"":37,""event_code"":2...",0006a69f,2,2025,37,Mushroom Sorter (Assessment),Assessment,TREETOPCITY,1,0,1.000000,3
2,a1e4395d,901acc108f55a5a1,2019-08-06T05:22:01.403Z,"{""description"":""Pull three mushrooms out of th...",0006a69f,3,3010,37,Mushroom Sorter (Assessment),Assessment,TREETOPCITY,1,0,1.000000,3
3,a52b92d5,901acc108f55a5a1,2019-08-06T05:22:05.242Z,"{""description"":""Pull three mushrooms out of th...",0006a69f,4,3110,3901,Mushroom Sorter (Assessment),Assessment,TREETOPCITY,1,0,1.000000,3
4,a1e4395d,901acc108f55a5a1,2019-08-06T05:22:05.244Z,"{""description"":""To pick a mushroom, pull it ou...",0006a69f,5,3010,3901,Mushroom Sorter (Assessment),Assessment,TREETOPCITY,1,0,1.000000,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
865442,28520915,5448d652309a6324,2019-09-22T02:07:27.562Z,"{""misses"":1,""prompt"":""holds least"",""mode"":""sel...",ffeb0b1b,58,2030,67094,Cauldron Filler (Assessment),Assessment,MAGMAPEAK,1,2,0.333333,1
865443,91561152,5448d652309a6324,2019-09-22T02:07:27.562Z,"{""bucket"":1,""buckets_placed"":[3,1,2],""target_b...",ffeb0b1b,57,4025,67094,Cauldron Filler (Assessment),Assessment,MAGMAPEAK,1,2,0.333333,1
865444,d3268efa,5448d652309a6324,2019-09-22T02:07:27.566Z,"{""description"":""Awesome."",""identifier"":""Dot_Aw...",ffeb0b1b,59,3021,67094,Cauldron Filler (Assessment),Assessment,MAGMAPEAK,1,2,0.333333,1
865445,b5053438,5448d652309a6324,2019-09-22T02:07:28.311Z,"{""description"":""Awesome."",""identifier"":""Dot_Aw...",ffeb0b1b,60,3121,67847,Cauldron Filler (Assessment),Assessment,MAGMAPEAK,1,2,0.333333,1


In [12]:
def extract_time_features(df):
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df['month'] = df['timestamp'].dt.month
    df['hour'] = df['timestamp'].dt.hour
    df['year'] = df['timestamp'].dt.year
    df['dayofweek'] = df['timestamp'].dt.dayofweek
    df['weekofyear'] = df['timestamp'].dt.weekofyear
    df['dayofyear'] = df['timestamp'].dt.dayofyear
    df['quarter'] = df['timestamp'].dt.quarter
    return df

merged_train_df = extract_time_features(merged_train_df)
merged_train_df.head()

Unnamed: 0,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title,type,...,num_incorrect,accuracy,accuracy_group,month,hour,year,dayofweek,weekofyear,dayofyear,quarter
0,3bfd1a65,901acc108f55a5a1,2019-08-06 05:22:01.344000+00:00,"{""version"":""1.0"",""event_count"":1,""game_time"":0...",0006a69f,1,2000,0,Mushroom Sorter (Assessment),Assessment,...,0,1.000000,3,8,5,2019,1,32,218,3
1,db02c830,901acc108f55a5a1,2019-08-06 05:22:01.400000+00:00,"{""event_count"":2,""game_time"":37,""event_code"":2...",0006a69f,2,2025,37,Mushroom Sorter (Assessment),Assessment,...,0,1.000000,3,8,5,2019,1,32,218,3
2,a1e4395d,901acc108f55a5a1,2019-08-06 05:22:01.403000+00:00,"{""description"":""Pull three mushrooms out of th...",0006a69f,3,3010,37,Mushroom Sorter (Assessment),Assessment,...,0,1.000000,3,8,5,2019,1,32,218,3
3,a52b92d5,901acc108f55a5a1,2019-08-06 05:22:05.242000+00:00,"{""description"":""Pull three mushrooms out of th...",0006a69f,4,3110,3901,Mushroom Sorter (Assessment),Assessment,...,0,1.000000,3,8,5,2019,1,32,218,3
4,a1e4395d,901acc108f55a5a1,2019-08-06 05:22:05.244000+00:00,"{""description"":""To pick a mushroom, pull it ou...",0006a69f,5,3010,3901,Mushroom Sorter (Assessment),Assessment,...,0,1.000000,3,8,5,2019,1,32,218,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
865442,28520915,5448d652309a6324,2019-09-22 02:07:27.562000+00:00,"{""misses"":1,""prompt"":""holds least"",""mode"":""sel...",ffeb0b1b,58,2030,67094,Cauldron Filler (Assessment),Assessment,...,2,0.333333,1,9,2,2019,6,38,265,3
865443,91561152,5448d652309a6324,2019-09-22 02:07:27.562000+00:00,"{""bucket"":1,""buckets_placed"":[3,1,2],""target_b...",ffeb0b1b,57,4025,67094,Cauldron Filler (Assessment),Assessment,...,2,0.333333,1,9,2,2019,6,38,265,3
865444,d3268efa,5448d652309a6324,2019-09-22 02:07:27.566000+00:00,"{""description"":""Awesome."",""identifier"":""Dot_Aw...",ffeb0b1b,59,3021,67094,Cauldron Filler (Assessment),Assessment,...,2,0.333333,1,9,2,2019,6,38,265,3
865445,b5053438,5448d652309a6324,2019-09-22 02:07:28.311000+00:00,"{""description"":""Awesome."",""identifier"":""Dot_Aw...",ffeb0b1b,60,3121,67847,Cauldron Filler (Assessment),Assessment,...,2,0.333333,1,9,2,2019,6,38,265,3


In [13]:
# Remove the timestamp column
merged_train_df = merged_train_df[[col for col in merged_train_df.columns if col != 'timestamp']]
merged_train_df

Unnamed: 0,event_id,game_session,event_data,installation_id,event_count,event_code,game_time,title,type,world,...,num_incorrect,accuracy,accuracy_group,month,hour,year,dayofweek,weekofyear,dayofyear,quarter
0,3bfd1a65,901acc108f55a5a1,"{""version"":""1.0"",""event_count"":1,""game_time"":0...",0006a69f,1,2000,0,Mushroom Sorter (Assessment),Assessment,TREETOPCITY,...,0,1.000000,3,8,5,2019,1,32,218,3
1,db02c830,901acc108f55a5a1,"{""event_count"":2,""game_time"":37,""event_code"":2...",0006a69f,2,2025,37,Mushroom Sorter (Assessment),Assessment,TREETOPCITY,...,0,1.000000,3,8,5,2019,1,32,218,3
2,a1e4395d,901acc108f55a5a1,"{""description"":""Pull three mushrooms out of th...",0006a69f,3,3010,37,Mushroom Sorter (Assessment),Assessment,TREETOPCITY,...,0,1.000000,3,8,5,2019,1,32,218,3
3,a52b92d5,901acc108f55a5a1,"{""description"":""Pull three mushrooms out of th...",0006a69f,4,3110,3901,Mushroom Sorter (Assessment),Assessment,TREETOPCITY,...,0,1.000000,3,8,5,2019,1,32,218,3
4,a1e4395d,901acc108f55a5a1,"{""description"":""To pick a mushroom, pull it ou...",0006a69f,5,3010,3901,Mushroom Sorter (Assessment),Assessment,TREETOPCITY,...,0,1.000000,3,8,5,2019,1,32,218,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
865442,28520915,5448d652309a6324,"{""misses"":1,""prompt"":""holds least"",""mode"":""sel...",ffeb0b1b,58,2030,67094,Cauldron Filler (Assessment),Assessment,MAGMAPEAK,...,2,0.333333,1,9,2,2019,6,38,265,3
865443,91561152,5448d652309a6324,"{""bucket"":1,""buckets_placed"":[3,1,2],""target_b...",ffeb0b1b,57,4025,67094,Cauldron Filler (Assessment),Assessment,MAGMAPEAK,...,2,0.333333,1,9,2,2019,6,38,265,3
865444,d3268efa,5448d652309a6324,"{""description"":""Awesome."",""identifier"":""Dot_Aw...",ffeb0b1b,59,3021,67094,Cauldron Filler (Assessment),Assessment,MAGMAPEAK,...,2,0.333333,1,9,2,2019,6,38,265,3
865445,b5053438,5448d652309a6324,"{""description"":""Awesome."",""identifier"":""Dot_Aw...",ffeb0b1b,60,3121,67847,Cauldron Filler (Assessment),Assessment,MAGMAPEAK,...,2,0.333333,1,9,2,2019,6,38,265,3


In [17]:
merged_train_df.columns

Index(['event_id', 'game_session', 'event_data', 'installation_id',
       'event_count', 'event_code', 'game_time', 'title', 'type', 'world',
       'num_correct', 'num_incorrect', 'accuracy', 'accuracy_group', 'month',
       'hour', 'year', 'dayofweek', 'weekofyear', 'dayofyear', 'quarter'],
      dtype='object')

In [18]:
def convert_categorical_to_num(df, categorical_list):
    for cat_name in categorical_list:
        merged_train_df[cat_name] = merged_train_df[cat_name].astype('category')

convert_categorical_to_num(merged_train_df, ['title', 'type', 'world', 'event_id'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [19]:
# Check
cat_columns = merged_train_df.select_dtypes(['category']).columns
cat_columns

Index(['title', 'type', 'world'], dtype='object')

In [20]:
merged_train_df[cat_columns] = merged_train_df[cat_columns].apply(lambda x: x.cat.codes)
merged_train_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


Unnamed: 0,event_id,game_session,event_data,installation_id,event_count,event_code,game_time,title,type,world,...,num_incorrect,accuracy,accuracy_group,month,hour,year,dayofweek,weekofyear,dayofyear,quarter
0,3bfd1a65,901acc108f55a5a1,"{""version"":""1.0"",""event_count"":1,""game_time"":0...",0006a69f,1,2000,0,4,0,2,...,0,1.000000,3,8,5,2019,1,32,218,3
1,db02c830,901acc108f55a5a1,"{""event_count"":2,""game_time"":37,""event_code"":2...",0006a69f,2,2025,37,4,0,2,...,0,1.000000,3,8,5,2019,1,32,218,3
2,a1e4395d,901acc108f55a5a1,"{""description"":""Pull three mushrooms out of th...",0006a69f,3,3010,37,4,0,2,...,0,1.000000,3,8,5,2019,1,32,218,3
3,a52b92d5,901acc108f55a5a1,"{""description"":""Pull three mushrooms out of th...",0006a69f,4,3110,3901,4,0,2,...,0,1.000000,3,8,5,2019,1,32,218,3
4,a1e4395d,901acc108f55a5a1,"{""description"":""To pick a mushroom, pull it ou...",0006a69f,5,3010,3901,4,0,2,...,0,1.000000,3,8,5,2019,1,32,218,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
865442,28520915,5448d652309a6324,"{""misses"":1,""prompt"":""holds least"",""mode"":""sel...",ffeb0b1b,58,2030,67094,2,0,1,...,2,0.333333,1,9,2,2019,6,38,265,3
865443,91561152,5448d652309a6324,"{""bucket"":1,""buckets_placed"":[3,1,2],""target_b...",ffeb0b1b,57,4025,67094,2,0,1,...,2,0.333333,1,9,2,2019,6,38,265,3
865444,d3268efa,5448d652309a6324,"{""description"":""Awesome."",""identifier"":""Dot_Aw...",ffeb0b1b,59,3021,67094,2,0,1,...,2,0.333333,1,9,2,2019,6,38,265,3
865445,b5053438,5448d652309a6324,"{""description"":""Awesome."",""identifier"":""Dot_Aw...",ffeb0b1b,60,3121,67847,2,0,1,...,2,0.333333,1,9,2,2019,6,38,265,3


## Training

In [85]:
# quadratic weighted kappa
def qwk3(a1, a2, max_rat=3):
    '''
    a1 - ground truth
    a2 - predicted values
    '''
    assert(len(a1) == len(a2))
    a1 = np.asarray(a1, dtype=int)
    a2 = np.asarray(a2, dtype=int)
    hist1 = np.zeros((max_rat + 1, ))
    hist2 = np.zeros((max_rat + 1, ))
    o = 0
    for k in range(a1.shape[0]):
        i, j = a1[k], a2[k]
        hist1[i] += 1
        hist2[j] += 1
        o +=  (i - j) * (i - j)
    e = 0
    for i in range(max_rat + 1):
        for j in range(max_rat + 1):
            e += hist1[i] * hist2[j] * (i - j) * (i - j)
    e = e / a1.shape[0]
    return 1 - o / e

In [100]:
features = [c for c in comp_train_df.columns if c not in ['accuracy_group', 'installation_id']]
target = 'accuracy_group'

class TrainParams():
    def __init__(self, num_splits=10, max_depth=10, early_stopping_rounds=100):
        self.num_splits = 10
        self.params = {
            'learning_rate': 0.007,
            'metric': 'multiclass',
            'objective': 'multiclass',
            'num_classes': 4,
            'feature_fraction': 0.75,
            "bagging_fraction": 0.8,
            "bagging_seed": 42,
            "max_depth": max_depth,
            'verbose': 1
            'num_leaves': 2**max_depth + 10
        }
        self.early_stopping_rounds = early_stopping_rounds
        
    def __repr__(self):
        return f'num_splits: {self.num_splits} max_depth: {self.params["max_depth"]} early_stopping_rounds: {self.early_stopping_rounds}'
    
class Recorder():
    def __init__(self):
        self.qwk3 = 0.0
        self.lowest_qwk3 = 0.0
        self.best_params = None
        self.best_index = 0
        
    def set_results(self, qwk3, train_params: TrainParams, index):
        if qwk3 > self.qwk3:
            self.qwk3 = qwk3
            self.best_params = train_params
            self.best_index = index
        if qwk3 < self.lowest_qwk3:
            self.lowest_qwk3 = qwk3
    
    def __repr__(self):
        return f'[best]: {self.qwk3} [worst:] {self.lowest_qwk3} {self.best_params} [best index]: {self.best_index}'
    
def partition_train_set(df, factor=0.9):
    rows = df.shape[0]
    train_records = int(rows * factor)
    return df[:train_records], df[train_records:]

def train_model(comp_train_df, train_params: TrainParams, i):
    
    kf = KFold(n_splits=train_params.num_splits, shuffle=False)
    
    oof_pred = np.zeros((len(comp_train_df), 4))
    models = []
    
    for fold, (tr_ind, val_ind) in enumerate(kf.split(comp_train_df)):
        print(f'Fold: {fold+1}')
        x_train, x_val = comp_train_df[features].iloc[tr_ind], comp_train_df[features].iloc[val_ind]
        y_train, y_val = comp_train_df[target][tr_ind], comp_train_df[target][val_ind]
        train_set = lgb.Dataset(x_train, y_train)
        val_set = lgb.Dataset(x_val, y_val)

        model = lgb.train(train_params.params, train_set, num_boost_round = 10000, early_stopping_rounds = train_params.early_stopping_rounds, 
                          valid_sets=[train_set, val_set], verbose_eval = train_params.early_stopping_rounds)
        oof_pred[val_ind] = model.predict(x_val)
        models.append(model)
        
        val_crt_fold = qwk3(y_val, oof_pred[val_ind].argmax(axis = 1))
        print(f'Fold: {fold+1} quadratic weighted kappa score: {np.round(val_crt_fold,4)}')
        
    return models, train_params

SyntaxError: invalid syntax (<ipython-input-100-798e03559256>, line 17)

In [101]:
partitioned_train_df, partitioned_valid_df = partition_train_set(comp_train_df, 0.95)
partitioned_train_df.shape, partitioned_valid_df.shape

((16805, 757), (885, 757))

In [102]:
all_models = []
train_params_list = []
num_splits = 10
for i, max_depth in enumerate(range(11, 12)):
    for j, early_stopping_round in enumerate([100]):
        train_params = TrainParams(num_splits=num_splits, max_depth=max_depth, early_stopping_rounds=early_stopping_round)
        print('###', train_params, '###')
        models, train_params = train_model(partitioned_train_df, train_params, j * i)
        all_models.append(models)
        train_params_list.append(train_params)

### num_splits: 10 max_depth: 11 early_stopping_rounds: 100 ###
Fold: 1
Training until validation scores don't improve for 100 rounds
[100]	training's multi_logloss: 0.95294	valid_1's multi_logloss: 0.988762
[200]	training's multi_logloss: 0.824751	valid_1's multi_logloss: 0.875826
[300]	training's multi_logloss: 0.757488	valid_1's multi_logloss: 0.821596
[400]	training's multi_logloss: 0.714829	valid_1's multi_logloss: 0.78996
[500]	training's multi_logloss: 0.685517	valid_1's multi_logloss: 0.770725
[600]	training's multi_logloss: 0.664674	valid_1's multi_logloss: 0.75842
[700]	training's multi_logloss: 0.64914	valid_1's multi_logloss: 0.75023
[800]	training's multi_logloss: 0.637187	valid_1's multi_logloss: 0.7449
[900]	training's multi_logloss: 0.627687	valid_1's multi_logloss: 0.741622
[1000]	training's multi_logloss: 0.619632	valid_1's multi_logloss: 0.739544
[1100]	training's multi_logloss: 0.613097	valid_1's multi_logloss: 0.738907
[1200]	training's multi_logloss: 0.607376	vali

Fold: 6 quadratic weighted kappa score: 0.6654
Fold: 7
Training until validation scores don't improve for 100 rounds
[100]	training's multi_logloss: 0.956095	valid_1's multi_logloss: 0.992213
[200]	training's multi_logloss: 0.826891	valid_1's multi_logloss: 0.881948
[300]	training's multi_logloss: 0.756888	valid_1's multi_logloss: 0.82561
[400]	training's multi_logloss: 0.714281	valid_1's multi_logloss: 0.794987
[500]	training's multi_logloss: 0.684615	valid_1's multi_logloss: 0.775763
[600]	training's multi_logloss: 0.663709	valid_1's multi_logloss: 0.764512
[700]	training's multi_logloss: 0.648253	valid_1's multi_logloss: 0.756248
[800]	training's multi_logloss: 0.636029	valid_1's multi_logloss: 0.749174
[900]	training's multi_logloss: 0.626347	valid_1's multi_logloss: 0.745143
[1000]	training's multi_logloss: 0.61851	valid_1's multi_logloss: 0.742079
[1100]	training's multi_logloss: 0.611858	valid_1's multi_logloss: 0.73934
[1200]	training's multi_logloss: 0.606325	valid_1's multi_l

In [89]:
def run_predictions(models, df):
    y_pred = np.zeros((len(df), 4))
    for model in models:
        y_pred += model.predict(df[features])
    return y_pred / num_splits

In [90]:
recorder = Recorder()
for i, models in enumerate(all_models):
    valid_df_preds = run_predictions(models, partitioned_valid_df)
    res = qwk3(partitioned_valid_df[target], valid_df_preds.argmax(axis = 1))
    recorder.set_results(res, train_params_list[i], i)

print(recorder)

[best]: 0.6563571095152401 [worst:] 0.0 num_splits: 10 max_depth: 11 early_stopping_rounds: 100 [best index]: 0


## Inference

In [91]:
def add_missing_columns(comp_train_df: pd.DataFrame, comp_test_df: pd.DataFrame):
    missing: set = set(comp_train_df.columns) - set(comp_test_df.columns)
    for col in missing:
        comp_test_df[col] = 0.
    print(f'Added missing colums: {missing}')

In [92]:
add_missing_columns(comp_train_df, comp_test_df)

Added missing colums: {'e4d32835', '119b5b02', '5dc079d8', '1b54d27f', '17ca3959', 'dcb1663e', '7fd1ac25', 'bfc77bd6', 'ecc6157f', '003cd2ee', 'ab4ec3a4', '01ca3a3c', '4074bac2', '2ec694de', '611485c5', '0ce40006', '29a42aea', '13f56524', 'a8cc6fec', 'accuracy_group'}


In [94]:
y_pred = run_predictions(all_models[recorder.best_index], comp_test_df)

In [95]:
assert comp_test_df.shape[0] == y_pred.shape[0]

In [96]:
np.unique(y_pred.argmax(-1), return_counts=True)

(array([0, 1, 2, 3]), array([653,  36,  26, 285]))

In [97]:
def prepare_submission(comp_test_df, y_pred):
    comp_test_df = comp_test_df.reset_index()
    comp_test_df = comp_test_df[['installation_id']]
    comp_test_df['accuracy_group'] = y_pred.argmax(axis = 1)
    sample_submission_df = pd.read_csv(path/'sample_submission.csv')
    sample_submission_df.drop('accuracy_group', inplace = True, axis = 1)
    sample_submission_df = sample_submission_df.merge(comp_test_df, on = 'installation_id')
    sample_submission_df.to_csv('submission.csv', index = False)

In [98]:
prepare_submission(comp_test_df, y_pred)

In [99]:
!head submission.csv

installation_id,accuracy_group
00abaee7,3
01242218,3
017c5718,0
01a44906,0
01bc6cb6,0
02256298,3
0267757a,0
027e7ce5,3
02a29f99,0


In [None]:
!cat submission.csv | wc -l