In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
import lightgbm as lgb

In [3]:
# Root directory that read_data() loads train.csv, test.csv and train_labels.csv from.
# NOTE(review): this is a hard-coded absolute path — adjust it when running elsewhere.
path=Path('/kaggle/data_science_bowl')
path

PosixPath('/kaggle/data_science_bowl')

## Read Data

In [4]:
def read_data():
    """Load the three competition CSV files found under the global `path`.

    Returns:
        A tuple ``(train_df, test_df, train_labels_df)`` of DataFrames read from
        ``train.csv``, ``test.csv`` and ``train_labels.csv`` respectively.
    """
    file_names = ('train.csv', 'test.csv', 'train_labels.csv')
    train_df, test_df, train_labels_df = (pd.read_csv(path/name) for name in file_names)
    return train_df, test_df, train_labels_df

In [5]:
train_df, test_df, train_labels_df = read_data()

In [6]:
train_df.head()

Unnamed: 0,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title,type,world
0,27253bdc,45bb1e1b6b50c07b,2019-09-06T17:53:46.937Z,"{""event_code"": 2000, ""event_count"": 1}",0001e90f,1,2000,0,Welcome to Lost Lagoon!,Clip,NONE
1,27253bdc,17eeb7f223665f53,2019-09-06T17:54:17.519Z,"{""event_code"": 2000, ""event_count"": 1}",0001e90f,1,2000,0,Magma Peak - Level 1,Clip,MAGMAPEAK
2,77261ab5,0848ef14a8dc6892,2019-09-06T17:54:56.302Z,"{""version"":""1.0"",""event_count"":1,""game_time"":0...",0001e90f,1,2000,0,Sandcastle Builder (Activity),Activity,MAGMAPEAK
3,b2dba42b,0848ef14a8dc6892,2019-09-06T17:54:56.387Z,"{""description"":""Let's build a sandcastle! Firs...",0001e90f,2,3010,53,Sandcastle Builder (Activity),Activity,MAGMAPEAK
4,1bb5fbdb,0848ef14a8dc6892,2019-09-06T17:55:03.253Z,"{""description"":""Let's build a sandcastle! Firs...",0001e90f,3,3110,6972,Sandcastle Builder (Activity),Activity,MAGMAPEAK


In [7]:
test_df.head()

Unnamed: 0,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title,type,world
0,27253bdc,0ea9ecc81a565215,2019-09-10T16:50:24.910Z,"{""event_code"": 2000, ""event_count"": 1}",00abaee7,1,2000,0,Welcome to Lost Lagoon!,Clip,NONE
1,27253bdc,c1ea43d8b8261d27,2019-09-10T16:50:55.503Z,"{""event_code"": 2000, ""event_count"": 1}",00abaee7,1,2000,0,Magma Peak - Level 1,Clip,MAGMAPEAK
2,27253bdc,7ed86c6b72e725e2,2019-09-10T16:51:51.805Z,"{""event_code"": 2000, ""event_count"": 1}",00abaee7,1,2000,0,Magma Peak - Level 2,Clip,MAGMAPEAK
3,27253bdc,7e516ace50e7fe67,2019-09-10T16:53:12.825Z,"{""event_code"": 2000, ""event_count"": 1}",00abaee7,1,2000,0,Crystal Caves - Level 1,Clip,CRYSTALCAVES
4,7d093bf9,a022c3f60ba547e7,2019-09-10T16:54:12.115Z,"{""version"":""1.0"",""round"":0,""event_count"":1,""ga...",00abaee7,1,2000,0,Chow Time,Game,CRYSTALCAVES


In [8]:
train_labels_df.head()

Unnamed: 0,game_session,installation_id,title,num_correct,num_incorrect,accuracy,accuracy_group
0,6bdf9623adc94d89,0006a69f,Mushroom Sorter (Assessment),1,0,1.0,3
1,77b8ee947eb84b4e,0006a69f,Bird Measurer (Assessment),0,11,0.0,0
2,901acc108f55a5a1,0006a69f,Mushroom Sorter (Assessment),1,0,1.0,3
3,9501794defd84e4d,0006a69f,Mushroom Sorter (Assessment),1,1,0.5,2
4,a9ef3ecb3d1acc6a,0006a69f,Bird Measurer (Assessment),1,0,1.0,3


## Feature Engineering

In [9]:
def remove_wrong_event_codes(df):
    """Drop rows whose title is 'Bird Measurer (Assessment)' and whose event_code is 4100.

    Args:
        df: DataFrame with at least the columns ``title`` and ``event_code``.

    Returns:
        A DataFrame holding every row of ``df`` except the excluded combination.
    """
    is_bird_measurer = df['title'] == 'Bird Measurer (Assessment)'
    is_code_4100 = df['event_code'] == 4100
    unwanted = is_bird_measurer & is_code_4100
    return df[~unwanted]

In [10]:
train_df = remove_wrong_event_codes(train_df)

In [11]:
test_df = remove_wrong_event_codes(test_df)

In [12]:
# Key column used by the feature helpers for grouping and for joining feature tables.
main_key = 'installation_id'
# Keyword arguments for DataFrame.merge: join both frames on their indexes.
merge_args = {'left_index':True, 'right_index':True}
# Aggregations applied to numeric columns. Every entry is a pandas aggregation name
# except kurtosis, which is passed as the callable pd.Series.kurt (position 7).
agg_stats = ['mean', 'sum', 'min', 'max', 'std', 'skew', 'median', pd.Series.kurt, 'count']

In [13]:
def get_event_id_count(df):
    """Count the non-null ``event_id`` values of each ``main_key`` group.

    Args:
        df: Event-level DataFrame containing ``main_key`` and ``event_id``.

    Returns:
        A DataFrame indexed by ``main_key`` with one column, ``event_id_count``.
    """
    per_key_counts = df.groupby([main_key])['event_id'].count()
    return per_key_counts.to_frame('event_id_count')

In [14]:
def get_object_columns(df, column):
    """Count events per ``main_key`` for every distinct value of a categorical column.

    Args:
        df: Event-level DataFrame containing ``main_key``, ``event_id`` and ``column``.
        column: Name of the categorical column to spread into one column per value.

    Returns:
        A DataFrame indexed by ``main_key`` with one column per value of ``column``;
        combinations that never occur are filled with 0.
    """
    # Long form: one row per (main_key, category) with its event count.
    long_counts = df.groupby([main_key, column])['event_id'].count().reset_index()
    # Wide form: categories become columns.
    wide_counts = long_counts.pivot_table(index = main_key, columns = [column], values = 'event_id')
    # Re-assign the labels as a plain list, which drops the columns' axis name.
    wide_counts.columns = wide_counts.columns.tolist()
    return wide_counts.fillna(0)

def get_numeric_columns(df, column):
    """Aggregate a numeric column per ``main_key`` with every statistic in ``agg_stats``.

    Args:
        df: Event-level DataFrame containing ``main_key`` and ``column``.
        column: Name of the numeric column to aggregate.

    Returns:
        A DataFrame indexed by ``main_key`` with one column per statistic, named
        ``f'{column}_{stat}'``. Missing statistics are replaced by the column mean,
        and by 0 when the whole column is missing.
    """
    stats = df.groupby(main_key).agg({column: agg_stats})

    # Flatten the (column, stat) labels. Callables such as pd.Series.kurt are named
    # through __name__ instead of being patched afterwards by a hard-coded position.
    stat_names = [
        stat if isinstance(stat, str) else getattr(stat, '__name__', str(stat))
        for stat in agg_stats
    ]
    stats.columns = [f'{column}_{name}' for name in stat_names]

    # Assign the filled frame back. The previous chained
    # `df[column].fillna(..., inplace=True)` acted on a temporary sub-frame, raised
    # SettingWithCopyWarning and is a no-op under copy-on-write pandas.
    stats = stats.fillna(stats.mean())
    # A column that is entirely NaN has a NaN mean; fall back to 0 for it.
    return stats.fillna(0)

def get_numeric_columns_add(df, agg_column, column):
    """Aggregate a numeric column per ``main_key`` and per value of a categorical column.

    Args:
        df: Event-level DataFrame containing ``main_key``, ``agg_column`` and ``column``.
        agg_column: Categorical column whose values become part of the output columns.
        column: Numeric column aggregated with every statistic in ``agg_stats``.

    Returns:
        A DataFrame indexed by ``main_key`` whose column labels are the tuples
        ``(column, stat, category)``. Cells for categories a key never had are
        filled with the mean of that output column.
    """
    grouped = df.groupby([main_key, agg_column]).agg({column: agg_stats}).reset_index()

    # After reset_index every column label is a tuple, e.g. (main_key, '') or
    # (column, 'mean'). Compare on the first level: a tuple never equals a plain
    # string, so the old `col not in [main_key, agg_column]` test excluded nothing.
    value_columns = [col for col in grouped.columns if col[0] not in (main_key, agg_column)]
    wide = grouped.pivot_table(index = main_key, columns = [agg_column], values = value_columns)

    # Every column lives under `column`, so filling the whole frame equals filling
    # wide[column]. Assigning the result avoids the chained in-place fill on a
    # temporary sub-frame, which silently does nothing under copy-on-write pandas.
    wide = wide.fillna(wide.mean())
    wide.columns = list(wide.columns)
    return wide

def feature_engineering(train_df, test_df, train_labels_df):
    """Build per-installation feature tables for train and test and attach the labels.

    Args:
        train_df: Event-level training data.
        test_df: Event-level test data.
        train_labels_df: Labels with at least ``main_key``, ``title`` and
            ``accuracy_group``. It is not modified.

    Returns:
        A tuple ``(comp_train_df, comp_test_df)``:
        * ``comp_train_df`` has one row per row of ``train_labels_df``, with the
          installation-level features joined on ``main_key``.
        * ``comp_test_df`` has one row per test installation.
        In both frames ``title`` holds the most frequent ``accuracy_group`` seen in
        ``train_labels_df`` for that title.
    """
    numerical_columns = ['game_time']
    categorical_columns = ['type', 'world']

    # Start from frames that only carry the installation ids as index; every
    # feature table below is joined onto them by index.
    comp_train_df = pd.DataFrame({main_key: train_df[main_key].unique()}).set_index(main_key)
    comp_test_df = pd.DataFrame({main_key: test_df[main_key].unique()}).set_index(main_key)

    for i in numerical_columns:
        comp_train_df = comp_train_df.merge(get_numeric_columns(train_df, i), **merge_args)
        comp_test_df = comp_test_df.merge(get_numeric_columns(test_df, i), **merge_args)

    for i in categorical_columns:
        comp_train_df = comp_train_df.merge(get_object_columns(train_df, i), **merge_args)
        comp_test_df = comp_test_df.merge(get_object_columns(test_df, i), **merge_args)

    for i in categorical_columns:
        for j in numerical_columns:
            comp_train_df = comp_train_df.merge(get_numeric_columns_add(train_df, i, j), **merge_args)
            comp_test_df = comp_test_df.merge(get_numeric_columns_add(test_df, i, j), **merge_args)

    comp_train_df = comp_train_df.reset_index()
    comp_test_df = comp_test_df.reset_index()

    print(f'Our training set has {comp_train_df.shape[0]} rows and {comp_train_df.shape[1]} columns')
    print(f'Our test set has {comp_test_df.shape[0]} rows and {comp_test_df.shape[1]} columns')

    # Most frequent accuracy_group for each assessment title.
    labels_map = dict(train_labels_df.groupby('title')['accuracy_group'].agg(lambda x: x.value_counts().index[0]))

    # Work on a copy so the caller's frame is untouched and no SettingWithCopyWarning
    # is raised. Plain column assignment also lets the column take the mapped dtype.
    labels = train_labels_df[[main_key, 'title', 'accuracy_group']].copy()
    labels['title'] = labels['title'].map(labels_map)

    # Title of the last event of every test installation, looked up by installation
    # id. The earlier positional assignment was only correct when comp_test_df
    # happened to be ordered exactly like the sorted groupby result.
    last_title = test_df.groupby(main_key)['title'].last()
    comp_test_df['title'] = comp_test_df[main_key].map(last_title).map(labels_map)

    # One output row per label row; installation-level features repeat across them.
    comp_train_df = labels.merge(comp_train_df, on = main_key, how = 'left')
    print(f'We have {comp_train_df.shape[0]} training rows')

    return comp_train_df, comp_test_df

In [15]:
comp_train_df, comp_test_df = feature_engineering(train_df, test_df, train_labels_df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
  new_axis = axis.drop(labels, errors=errors)


Our training set has 17000 rows and 90 columns
Our test set has 1000 rows and 90 columns


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


We have 17690 training rows


In [16]:
pd.options.display.max_columns = None
comp_test_df

Unnamed: 0,installation_id,game_time_mean,game_time_sum,game_time_min,game_time_max,game_time_std,game_time_skew,game_time_median,game_time_kurt,game_time_count,Activity,Assessment,Clip,Game,CRYSTALCAVES,MAGMAPEAK,NONE,TREETOPCITY,"(game_time, count, Activity)","(game_time, count, Assessment)","(game_time, count, Clip)","(game_time, count, Game)","(game_time, kurt, Activity)","(game_time, kurt, Assessment)","(game_time, kurt, Clip)","(game_time, kurt, Game)","(game_time, max, Activity)","(game_time, max, Assessment)","(game_time, max, Clip)","(game_time, max, Game)","(game_time, mean, Activity)","(game_time, mean, Assessment)","(game_time, mean, Clip)","(game_time, mean, Game)","(game_time, median, Activity)","(game_time, median, Assessment)","(game_time, median, Clip)","(game_time, median, Game)","(game_time, min, Activity)","(game_time, min, Assessment)","(game_time, min, Clip)","(game_time, min, Game)","(game_time, skew, Activity)","(game_time, skew, Assessment)","(game_time, skew, Clip)","(game_time, skew, Game)","(game_time, std, Activity)","(game_time, std, Assessment)","(game_time, std, Clip)","(game_time, std, Game)","(game_time, sum, Activity)","(game_time, sum, Assessment)","(game_time, sum, Clip)","(game_time, sum, Game)","(game_time, count, CRYSTALCAVES)","(game_time, count, MAGMAPEAK)","(game_time, count, NONE)","(game_time, count, TREETOPCITY)","(game_time, kurt, CRYSTALCAVES)","(game_time, kurt, MAGMAPEAK)","(game_time, kurt, NONE)","(game_time, kurt, TREETOPCITY)","(game_time, max, CRYSTALCAVES)","(game_time, max, MAGMAPEAK)","(game_time, max, NONE)","(game_time, max, TREETOPCITY)","(game_time, mean, CRYSTALCAVES)","(game_time, mean, MAGMAPEAK)","(game_time, mean, NONE)","(game_time, mean, TREETOPCITY)","(game_time, median, CRYSTALCAVES)","(game_time, median, MAGMAPEAK)","(game_time, median, NONE)","(game_time, median, TREETOPCITY)","(game_time, min, CRYSTALCAVES)","(game_time, min, MAGMAPEAK)","(game_time, min, NONE)","(game_time, min, 
TREETOPCITY)","(game_time, skew, CRYSTALCAVES)","(game_time, skew, MAGMAPEAK)","(game_time, skew, NONE)","(game_time, skew, TREETOPCITY)","(game_time, std, CRYSTALCAVES)","(game_time, std, MAGMAPEAK)","(game_time, std, NONE)","(game_time, std, TREETOPCITY)","(game_time, sum, CRYSTALCAVES)","(game_time, sum, MAGMAPEAK)","(game_time, sum, NONE)","(game_time, sum, TREETOPCITY)",title
0,00abaee7,63567.408986,55176511,0,1960630,149911.784066,11.578484,40657.0,143.621229,868,454.0,27.0,14.0,373.0,253.0,241.0,1.0,373.0,454.000000,27.0,14.0,373.000000,-0.633921,-0.809225,0.0,64.636491,105916.000000,30038.0,0.0,1.960630e+06,38077.154185,14008.074074,0.0,100566.394102,30520.000000,14303.0,0.0,70287.000000,0.0,0.0,0.0,0.0,0.768075,0.089729,0.0,7.965669,30100.806203,9279.184749,0.0,220845.119349,1.728703e+07,378218.0,0.0,3.751126e+07,253.000000,241.000000,1.000000,373.0,-0.962372,-1.061831,0.0,67.719968,135794.00000,188805.000000,0.0,1960630.0,54616.913043,69526.593361,0.0,65958.506702,52104.000000,56998.000000,0.0,33074.0,0.0,0.0,0.0,0.0,0.376034,0.510952,0.0,8.243170,37649.869386,53238.970991,0.0,222474.449033,1.381808e+07,1.675591e+07,0.0,24602523.0,3
1,01242218,75775.845786,205882973,0,317027,61810.824535,1.420597,62056.0,2.045609,2717,1356.0,243.0,29.0,1089.0,885.0,848.0,2.0,982.0,1356.000000,243.0,29.0,1089.000000,0.580727,-0.313895,0.0,-0.811927,317027.000000,82149.0,0.0,1.694920e+05,87635.177729,27373.646091,0.0,73827.250689,65968.500000,21570.0,0.0,70527.000000,0.0,0.0,0.0,0.0,1.211983,0.774335,0.0,0.263853,74745.148527,21270.907451,0.0,40292.737670,1.188333e+08,6651796.0,0.0,8.039788e+07,885.000000,848.000000,2.000000,982.0,2.412172,0.441943,0.0,-0.489293,317027.00000,283765.000000,0.0,178550.0,81169.943503,85886.292453,0.0,62338.082485,67509.000000,65141.000000,0.0,51563.5,0.0,0.0,0.0,0.0,1.640714,1.068934,0.0,0.680420,67990.867851,68507.373099,0.0,45307.191839,7.183540e+07,7.283158e+07,0.0,61215997.0,3
2,017c5718,33017.233333,4952585,0,60943,17140.293312,-0.421696,35616.5,-0.848399,150,143.0,1.0,6.0,0.0,0.0,0.0,4.0,146.0,143.000000,1.0,6.0,697.878049,-0.752415,0.726756,0.0,2.157466,60943.000000,0.0,0.0,6.529826e+05,34633.461538,0.000000,0.0,180209.144675,36862.000000,0.0,0.0,190872.320732,0.0,0.0,0.0,0.0,-0.418506,0.665192,0.0,0.689734,15871.435721,38695.777858,0.0,182855.771844,4.952585e+06,0.0,0.0,1.617186e+08,423.655782,723.183876,4.000000,146.0,0.913555,2.430651,0.0,-0.774285,316635.54966,665107.285714,0.0,60943.0,75932.838429,117047.319657,0.0,33921.815068,60235.606803,72952.560113,0.0,36318.0,0.0,0.0,0.0,0.0,0.578195,0.917524,0.0,-0.430958,75616.844935,143236.743283,0.0,16462.002358,5.277859e+07,1.391002e+08,0.0,4952585.0,3
3,01a44906,41162.901709,9632119,0,85983,28696.300044,0.231737,32967.0,-1.371421,234,145.0,1.0,10.0,78.0,0.0,0.0,3.0,231.0,145.000000,1.0,10.0,78.000000,-1.685530,0.726756,0.0,-0.947873,85983.000000,0.0,0.0,7.720400e+04,46314.958621,0.000000,0.0,37390.384615,39433.000000,0.0,0.0,36726.500000,0.0,0.0,0.0,0.0,0.051522,0.665192,0.0,0.094331,30760.619218,38695.777858,0.0,20391.645155,6.715669e+06,0.0,0.0,2.916450e+06,423.655782,723.183876,3.000000,231.0,0.913555,2.430651,0.0,-1.378278,316635.54966,665107.285714,0.0,85983.0,75932.838429,117047.319657,0.0,41697.484848,60235.606803,72952.560113,0.0,33817.0,0.0,0.0,0.0,0.0,0.578195,0.917524,0.0,0.222744,75616.844935,143236.743283,0.0,28492.647474,5.277859e+07,1.391002e+08,0.0,9632119.0,3
4,01bc6cb6,147664.880252,140576966,0,511237,128478.809653,1.250042,106076.5,0.679159,952,226.0,1.0,17.0,708.0,522.0,3.0,3.0,424.0,226.000000,1.0,17.0,708.000000,-0.923448,0.726756,0.0,-0.194195,221698.000000,0.0,0.0,5.112370e+05,110746.128319,0.000000,0.0,163203.871469,121294.000000,0.0,0.0,105111.000000,0.0,0.0,0.0,0.0,-0.074129,0.665192,0.0,1.015493,57606.576415,38695.777858,0.0,141120.117860,2.502862e+07,0.0,0.0,1.155483e+08,522.000000,3.000000,3.000000,424.0,-0.621550,2.430651,0.0,-1.219943,221698.00000,0.000000,0.0,511237.0,90758.699234,0.000000,0.0,219813.502358,84012.000000,0.000000,0.0,200473.5,0.0,0.0,0.0,0.0,0.291697,0.000000,0.0,0.298199,52164.580560,0.000000,0.0,155679.905961,4.737604e+07,0.000000e+00,0.0,93200925.0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,fee254cf,56395.488152,11899448,0,155008,43542.436389,0.449183,48287.0,-0.908832,211,0.0,115.0,4.0,92.0,0.0,1.0,1.0,209.0,557.077658,115.0,4.0,92.000000,3.207505,-0.796371,0.0,-1.170313,582662.403823,155008.0,0.0,1.149210e+05,108733.747862,52736.573913,0.0,63421.108696,80719.541219,32401.0,0.0,71305.000000,0.0,0.0,0.0,0.0,0.867264,0.770495,0.0,-0.280526,127624.865673,49038.400677,0.0,34001.061136,9.439616e+07,6064706.0,0.0,5.834742e+06,423.655782,1.000000,1.000000,209.0,0.913555,2.430651,0.0,-0.910677,316635.54966,0.000000,0.0,155008.0,75932.838429,0.000000,0.0,56935.157895,60235.606803,0.000000,0.0,49940.0,0.0,0.0,0.0,0.0,0.578195,0.917524,0.0,0.441755,75616.844935,143236.743283,0.0,43397.005555,5.277859e+07,0.000000e+00,0.0,11899448.0,3
996,ff57e602,44772.023102,13565923,0,139394,31785.997875,0.699889,39918.0,0.159045,303,127.0,29.0,11.0,136.0,246.0,0.0,1.0,56.0,127.000000,29.0,11.0,136.000000,-1.096482,-0.834165,0.0,-0.231436,69318.000000,36169.0,0.0,1.393940e+05,33302.031496,17072.896552,0.0,65010.669118,33137.000000,18600.0,0.0,65769.500000,0.0,0.0,0.0,0.0,0.026164,0.026070,0.0,0.160738,18880.279305,11068.777982,0.0,32210.700278,4.229358e+06,495114.0,0.0,8.841451e+06,246.000000,723.183876,1.000000,56.0,-0.158619,2.430651,0.0,-1.256116,139394.00000,665107.285714,0.0,57697.0,48402.142276,117047.319657,0.0,29624.928571,45304.000000,72952.560113,0.0,29701.5,0.0,0.0,0.0,0.0,0.576489,0.917524,0.0,-0.158853,33037.671890,143236.743283,0.0,18885.540724,1.190693e+07,1.391002e+08,0.0,1658996.0,0
997,ffc73fb2,61802.442966,32508085,0,199825,53869.630318,0.728098,46648.0,-0.572993,526,238.0,256.0,32.0,0.0,154.0,132.0,1.0,239.0,238.000000,256.0,32.0,697.878049,-0.617321,-0.650888,0.0,2.157466,199825.000000,149031.0,0.0,6.529826e+05,81037.907563,51644.777344,0.0,180209.144675,78483.000000,28885.0,0.0,190872.320732,0.0,0.0,0.0,0.0,0.519924,0.899523,0.0,0.689734,53386.528402,48963.311394,0.0,182855.771844,1.928702e+07,13221063.0,0.0,1.617186e+08,154.000000,132.000000,1.000000,239.0,-1.412452,-0.227526,0.0,-0.798319,149031.00000,64865.000000,0.0,199825.0,73178.383117,21596.681818,0.0,76936.619247,64435.000000,17101.000000,0.0,78483.0,0.0,0.0,0.0,0.0,0.101301,0.693637,0.0,0.433389,52554.796817,15886.543489,0.0,57100.791765,1.126947e+07,2.850762e+06,0.0,18387852.0,3
998,ffe00ca8,25213.420849,6530276,0,72242,20859.265204,0.705400,19590.0,-0.673306,259,123.0,110.0,11.0,15.0,5.0,139.0,1.0,114.0,123.000000,110.0,11.0,15.000000,-1.475113,-0.466640,0.0,-1.156150,72242.000000,43571.0,0.0,2.256400e+04,35926.439024,17185.645455,0.0,14726.866667,39648.000000,15751.5,0.0,18086.000000,0.0,0.0,0.0,0.0,-0.061215,0.511187,0.0,-0.720429,23350.701961,11718.642969,0.0,8108.364710,4.418952e+06,1890421.0,0.0,2.209030e+05,5.000000,139.000000,1.000000,114.0,0.000000,-0.301554,0.0,-1.358390,0.00000,43571.000000,0.0,72242.0,0.000000,15573.532374,0.0,38294.342105,0.000000,14588.000000,0.0,41087.5,0.0,0.0,0.0,0.0,0.000000,0.604975,0.0,-0.201029,0.000000,11511.482848,0.0,22603.416385,0.000000e+00,2.164721e+06,0.0,4365555.0,3


## Normalize Column Names

In [21]:
import re

# Flatten tuple column labels by joining their parts with '_', then replace every
# non-word character with '_' so that all feature names are plain identifiers.
comp_train_df.columns = [
    re.sub(r'\W', '_', '_'.join(name) if type(name) is tuple else name)
    for name in comp_train_df.columns
]
comp_test_df.columns = [
    re.sub(r'\W', '_', '_'.join(name) if type(name) is tuple else name)
    for name in comp_test_df.columns
]

In [22]:
list(comp_train_df.columns)

['installation_id',
 'title',
 'accuracy_group',
 'game_time_mean',
 'game_time_sum',
 'game_time_min',
 'game_time_max',
 'game_time_std',
 'game_time_skew',
 'game_time_median',
 'game_time_kurt',
 'game_time_count',
 'Activity',
 'Assessment',
 'Clip',
 'Game',
 'CRYSTALCAVES',
 'MAGMAPEAK',
 'NONE',
 'TREETOPCITY',
 'game_time_count_Activity',
 'game_time_count_Assessment',
 'game_time_count_Clip',
 'game_time_count_Game',
 'game_time_kurt_Activity',
 'game_time_kurt_Assessment',
 'game_time_kurt_Clip',
 'game_time_kurt_Game',
 'game_time_max_Activity',
 'game_time_max_Assessment',
 'game_time_max_Clip',
 'game_time_max_Game',
 'game_time_mean_Activity',
 'game_time_mean_Assessment',
 'game_time_mean_Clip',
 'game_time_mean_Game',
 'game_time_median_Activity',
 'game_time_median_Assessment',
 'game_time_median_Clip',
 'game_time_median_Game',
 'game_time_min_Activity',
 'game_time_min_Assessment',
 'game_time_min_Clip',
 'game_time_min_Game',
 'game_time_skew_Activity',
 'game_time

## Training

In [23]:
# quadratic weighted kappa
def qwk3(a1, a2, max_rat=3):
    '''
    Quadratic weighted kappa between two equally long rating sequences.

    a1 - ground truth
    a2 - predicted values
    max_rat - largest rating value; ratings index a histogram of max_rat + 1 bins

    Returns 1 - observed / expected, where both terms use squared rating
    differences as the disagreement penalty.
    '''
    assert(len(a1) == len(a2))
    truth = np.asarray(a1, dtype=int)
    preds = np.asarray(a2, dtype=int)

    # Rating histograms; np.add.at accumulates repeated indices one by one.
    n_bins = max_rat + 1
    truth_hist = np.zeros((n_bins, ))
    preds_hist = np.zeros((n_bins, ))
    np.add.at(truth_hist, truth, 1)
    np.add.at(preds_hist, preds, 1)

    # Observed disagreement: sum of squared differences over all pairs.
    gap = truth - preds
    observed = np.sum(gap * gap)

    # Expected disagreement if both ratings were drawn independently from their histograms.
    levels = np.arange(n_bins)
    penalty = (levels[:, None] - levels[None, :]) ** 2
    expected = (np.outer(truth_hist, preds_hist) * penalty).sum() / truth.shape[0]

    return 1 - observed / expected

In [24]:
# Model inputs: every column of comp_train_df except the target and the installation id.
features = [i for i in comp_train_df.columns if i not in ['accuracy_group', 'installation_id']]
# Column predicted by the model.
target = 'accuracy_group'
# Number of KFold splits in train_model; run_predictions also divides by this value.
num_splits = 10
# Parameters passed unchanged to lgb.train.
params = {
    'learning_rate': 0.007,
    'metric': 'multiclass',
    'objective': 'multiclass',
    'num_classes': 4,
    'feature_fraction': 0.75,
    # NOTE(review): LightGBM documents bagging_fraction as taking effect only when
    # bagging_freq is non-zero, and bagging_freq is not set here — confirm the intent.
    "bagging_fraction": 0.8,
    "bagging_seed": 42,
    'max_depth': 11
}

# Passed to lgb.train as early_stopping_rounds and reused as its verbose_eval period.
early_stopping_rounds = 100

def train_model(comp_train_df, random_state=42):
    """Train one LightGBM multiclass model per K-fold split and print QWK scores.

    Uses the module-level ``features``, ``target``, ``params``, ``num_splits`` and
    ``early_stopping_rounds``.

    Args:
        comp_train_df: Training frame containing the ``features`` and ``target`` columns.
        random_state: Seed for the shuffled KFold split so that folds, and therefore
            the reported scores, are reproducible between runs.

    Returns:
        The list of trained boosters, one per fold.
    """
    kf = KFold(n_splits=num_splits, shuffle=True, random_state=random_state)

    x_all = comp_train_df[features]
    y_all = comp_train_df[target]

    # Out-of-fold class probabilities, one column per class.
    oof_pred = np.zeros((len(comp_train_df), params['num_classes']))
    models = []

    # NOTE(review): feature_engineering can emit several rows per installation_id
    # with identical features; a plain KFold may place them on both sides of a
    # split. Consider GroupKFold if that leakage matters — confirm with the author.
    for fold, (tr_ind, val_ind) in enumerate(kf.split(comp_train_df)):
        print(f'Fold: {fold+1}')
        # kf.split yields positions, so both features and target are sliced with
        # .iloc. Label-based Series[...] was only right for a 0..n-1 index.
        x_train, x_val = x_all.iloc[tr_ind], x_all.iloc[val_ind]
        y_train, y_val = y_all.iloc[tr_ind], y_all.iloc[val_ind]
        train_set = lgb.Dataset(x_train, y_train)
        val_set = lgb.Dataset(x_val, y_val)

        model = lgb.train(params, train_set, num_boost_round = 10000, early_stopping_rounds = early_stopping_rounds,
                          valid_sets=[train_set, val_set], verbose_eval = early_stopping_rounds)
        oof_pred[val_ind] = model.predict(x_val)
        models.append(model)

        val_crt_fold = qwk3(y_val, oof_pred[val_ind].argmax(axis = 1))
        print(f'Fold: {fold+1} quadratic weighted kappa score: {np.round(val_crt_fold,4)}')

    res = qwk3(y_all, oof_pred.argmax(axis = 1))
    print(f'Quadratic weighted score: {np.round(res,4)}')

    return models

In [25]:
models = train_model(comp_train_df)

Fold: 1
Training until validation scores don't improve for 100 rounds
[100]	training's multi_logloss: 1.08909	valid_1's multi_logloss: 1.11838
[200]	training's multi_logloss: 1.01978	valid_1's multi_logloss: 1.07115
[300]	training's multi_logloss: 0.975149	valid_1's multi_logloss: 1.04599
[400]	training's multi_logloss: 0.943596	valid_1's multi_logloss: 1.03285
[500]	training's multi_logloss: 0.918461	valid_1's multi_logloss: 1.02437
[600]	training's multi_logloss: 0.897764	valid_1's multi_logloss: 1.0189
[700]	training's multi_logloss: 0.879708	valid_1's multi_logloss: 1.01541
[800]	training's multi_logloss: 0.864031	valid_1's multi_logloss: 1.01294
[900]	training's multi_logloss: 0.84989	valid_1's multi_logloss: 1.01116
[1000]	training's multi_logloss: 0.836708	valid_1's multi_logloss: 1.01008
[1100]	training's multi_logloss: 0.824469	valid_1's multi_logloss: 1.0095
[1200]	training's multi_logloss: 0.813004	valid_1's multi_logloss: 1.0092
[1300]	training's multi_logloss: 0.802449	val

KeyboardInterrupt: 

## Inference

In [None]:
def add_missing_columns(comp_train_df: pd.DataFrame, comp_test_df: pd.DataFrame):
    """Add to the test frame, in place, every train column it lacks, filled with 0.

    Args:
        comp_train_df: Frame whose columns define what the test frame must contain.
        comp_test_df: Frame that is modified in place.

    Returns:
        None. The added column names are printed.
    """
    present = set(comp_test_df.columns)
    # Follow the train column order. Iterating a set of strings gives an order that
    # can change between interpreter runs, which made the resulting layout unstable.
    missing = [col for col in comp_train_df.columns if col not in present]
    for col in missing:
        comp_test_df[col] = 0.
    print(f'Added missing colums: {set(missing)}')

In [None]:
add_missing_columns(comp_train_df, comp_test_df)

In [None]:
def run_predictions(models):
    """Average the class probabilities predicted by ``models`` on the test features.

    Reads the module-level ``comp_test_df`` and ``features``.

    Args:
        models: Iterable of trained models exposing ``predict``.

    Returns:
        An array with one row per test row and four columns holding the mean
        predicted probability per class.

    Raises:
        ValueError: If ``models`` is empty.
    """
    test_features = comp_test_df[features]
    y_pred = np.zeros((len(comp_test_df), 4))
    n_models = 0
    for model in models:
        y_pred += model.predict(test_features)
        n_models += 1
    if n_models == 0:
        raise ValueError('run_predictions needs at least one trained model')
    # Divide by the number of models actually used. Dividing by num_splits shrank
    # the probabilities whenever fewer models than folds were supplied, e.g. after
    # an interrupted training run.
    return y_pred / n_models

In [None]:
y_pred = run_predictions(models)

In [None]:
assert comp_test_df.shape[0] == y_pred.shape[0]

In [None]:
np.unique(y_pred.argmax(-1), return_counts=True)

In [None]:
def prepare_submission(comp_test_df, sample_submission_df, y_pred):
    """Write ``submission.csv`` with the predicted accuracy group per installation.

    Args:
        comp_test_df: Test feature frame; ``installation_id`` may be a column or the index.
        sample_submission_df: Submission template with the columns ``installation_id``
            and ``accuracy_group``. It is not modified, so the call can be repeated.
        y_pred: Array of per-class scores, one row per row of ``comp_test_df``.

    Returns:
        None. The result is written to ``submission.csv`` in the working directory,
        in the row order of ``sample_submission_df``.
    """
    # Build the prediction table from scratch instead of assigning into a column
    # slice, which triggered SettingWithCopyWarning.
    installation_ids = comp_test_df.reset_index()['installation_id']
    predictions = pd.DataFrame({
        'installation_id': installation_ids.values,
        'accuracy_group': y_pred.argmax(axis = 1),
    })

    # Drop the placeholder column on a new frame. The in-place drop altered the
    # caller's template, so a second call failed with KeyError.
    template = sample_submission_df.drop(columns = 'accuracy_group')
    submission = template.merge(predictions, on = 'installation_id')
    submission.to_csv('submission.csv', index = False)

In [None]:
prepare_submission(comp_test_df, sample_submission_df, y_pred)

In [None]:
!head submission.csv

In [None]:
!cat submission.csv | wc -l