In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
import lightgbm as lgb

In [3]:
path=Path('/kaggle/data_science_bowl')
path

PosixPath('/kaggle/data_science_bowl')

### Read Data

In [4]:
def read_data():
    train_df = pd.read_csv(path/'train.csv')
    test_df = pd.read_csv(path/'test.csv')
    train_labels_df = pd.read_csv(path/'train_labels.csv')
    return train_df, test_df, train_labels_df

In [5]:
train_df, test_df, train_labels_df = read_data()

In [6]:
train_df.head()

Unnamed: 0,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title,type,world
0,27253bdc,45bb1e1b6b50c07b,2019-09-06T17:53:46.937Z,"{""event_code"": 2000, ""event_count"": 1}",0001e90f,1,2000,0,Welcome to Lost Lagoon!,Clip,NONE
1,27253bdc,17eeb7f223665f53,2019-09-06T17:54:17.519Z,"{""event_code"": 2000, ""event_count"": 1}",0001e90f,1,2000,0,Magma Peak - Level 1,Clip,MAGMAPEAK
2,77261ab5,0848ef14a8dc6892,2019-09-06T17:54:56.302Z,"{""version"":""1.0"",""event_count"":1,""game_time"":0...",0001e90f,1,2000,0,Sandcastle Builder (Activity),Activity,MAGMAPEAK
3,b2dba42b,0848ef14a8dc6892,2019-09-06T17:54:56.387Z,"{""description"":""Let's build a sandcastle! Firs...",0001e90f,2,3010,53,Sandcastle Builder (Activity),Activity,MAGMAPEAK
4,1bb5fbdb,0848ef14a8dc6892,2019-09-06T17:55:03.253Z,"{""description"":""Let's build a sandcastle! Firs...",0001e90f,3,3110,6972,Sandcastle Builder (Activity),Activity,MAGMAPEAK


In [7]:
test_df.head()

Unnamed: 0,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title,type,world
0,27253bdc,0ea9ecc81a565215,2019-09-10T16:50:24.910Z,"{""event_code"": 2000, ""event_count"": 1}",00abaee7,1,2000,0,Welcome to Lost Lagoon!,Clip,NONE
1,27253bdc,c1ea43d8b8261d27,2019-09-10T16:50:55.503Z,"{""event_code"": 2000, ""event_count"": 1}",00abaee7,1,2000,0,Magma Peak - Level 1,Clip,MAGMAPEAK
2,27253bdc,7ed86c6b72e725e2,2019-09-10T16:51:51.805Z,"{""event_code"": 2000, ""event_count"": 1}",00abaee7,1,2000,0,Magma Peak - Level 2,Clip,MAGMAPEAK
3,27253bdc,7e516ace50e7fe67,2019-09-10T16:53:12.825Z,"{""event_code"": 2000, ""event_count"": 1}",00abaee7,1,2000,0,Crystal Caves - Level 1,Clip,CRYSTALCAVES
4,7d093bf9,a022c3f60ba547e7,2019-09-10T16:54:12.115Z,"{""version"":""1.0"",""round"":0,""event_count"":1,""ga...",00abaee7,1,2000,0,Chow Time,Game,CRYSTALCAVES


In [8]:
train_labels_df.head()

Unnamed: 0,game_session,installation_id,title,num_correct,num_incorrect,accuracy,accuracy_group
0,6bdf9623adc94d89,0006a69f,Mushroom Sorter (Assessment),1,0,1.0,3
1,77b8ee947eb84b4e,0006a69f,Bird Measurer (Assessment),0,11,0.0,0
2,901acc108f55a5a1,0006a69f,Mushroom Sorter (Assessment),1,0,1.0,3
3,9501794defd84e4d,0006a69f,Mushroom Sorter (Assessment),1,1,0.5,2
4,a9ef3ecb3d1acc6a,0006a69f,Bird Measurer (Assessment),1,0,1.0,3


## Feature Engineering

In [9]:
main_key = 'installation_id'
merge_args = {'left_index':True, 'right_index':True}
agg_stats = ['mean', 'sum', 'min', 'max', 'std', 'skew', 'median', pd.Series.kurt, 'count']

In [10]:
def get_event_id_count(df):
    df = df.groupby([main_key]).agg({'event_id': ['count']})
    df.columns = ['event_id_count']
    return df

In [11]:
import json

def process_event_code(x, is_correct):
    count = 0
    for xi in x:
        fields = json.loads(xi)
        if 'event_code' in fields and fields['event_code'] in [4100, 4110] and 'correct' in fields and fields['correct'] == is_correct:
            count += 1
    return count

def process_correct_event_code(x):
    return process_event_code(x, True)

def process_incorrect_event_code(x):
    return process_event_code(x, False)

def extract_correct_incorrect(df, field_name, func):
    key = ['installation_id']
    event_code_count = df[:].groupby(key)['event_data'].agg(func)
    event_code_count = event_code_count.reset_index()
    event_code_count.columns = [*key, field_name]
    return event_code_count

def extract_correct(df):
    return extract_correct_incorrect(df, 'num_correct', process_correct_event_code)

def extract_incorrect(df):
    return extract_correct_incorrect(df, 'num_incorrect', process_incorrect_event_code)

In [12]:
def get_object_columns(df, column):
    df = df.groupby([main_key, column])['event_id'].count().reset_index()
    df = df.pivot_table(index = main_key, columns = [column], values = 'event_id')
    df.columns = list(df.columns)
    df.fillna(0, inplace = True)
    return df

def get_numeric_columns(df, column):
    df = df.groupby('installation_id').agg({f'{column}': agg_stats})
    df[column].fillna(df[column].mean(), inplace = True)
    df.columns = [f'{column}_{stat}' for stat in agg_stats]
    df.rename(columns={ df.columns[7]: f'{column}_kurt' }, inplace = True)
    df.fillna(0, inplace = True)
    return df

def get_numeric_columns_add(df, agg_column, column):
    df = df.groupby([main_key, agg_column]).agg({f'{column}': agg_stats}).reset_index()
    df = df.pivot_table(index = main_key, columns = [agg_column], values = [col for col in df.columns if col not in [main_key, agg_column]])
    df[column].fillna(df[column].mean(), inplace = True)
    df.columns = list(df.columns)
    return df

def process_correct_incorrect(comp_train_df, comp_test_df, func, field):
    comp_train_df = comp_train_df.merge(func(train_df), on=main_key, how='left')
    comp_test_df = comp_test_df.merge(func(test_df), on=main_key, how='left')
    comp_train_df[field].fillna(0.0, inplace=True)
    comp_test_df[field].fillna(0.0, inplace=True)
    return comp_train_df, comp_test_df

def feature_engineering(train_df, test_df, train_labels_df):
    
    numerical_columns = ['game_time']
    categorical_columns = ['type', 'world']

    comp_train_df = pd.DataFrame({main_key: train_df[main_key].unique()})
    comp_train_df.set_index(main_key, inplace = True)
    comp_test_df = pd.DataFrame({main_key: test_df[main_key].unique()})
    comp_test_df.set_index(main_key, inplace = True)

    for i in numerical_columns:
        comp_train_df = comp_train_df.merge(get_numeric_columns(train_df, i), **merge_args)
        comp_test_df = comp_test_df.merge(get_numeric_columns(test_df, i), **merge_args)
    
    for i in categorical_columns:
        comp_train_df = comp_train_df.merge(get_object_columns(train_df, i), **merge_args)
        comp_test_df = comp_test_df.merge(get_object_columns(test_df, i), **merge_args)
    
    for i in categorical_columns:
        for j in numerical_columns:
            comp_train_df = comp_train_df.merge(get_numeric_columns_add(train_df, i, j), **merge_args)
            comp_test_df = comp_test_df.merge(get_numeric_columns_add(test_df, i, j), **merge_args)
            
    comp_train_df.reset_index(inplace = True)
    comp_test_df.reset_index(inplace = True)
    
    comp_train_df, comp_test_df = process_correct_incorrect(comp_train_df, comp_test_df, extract_correct, 'num_correct')
    comp_train_df, comp_test_df = process_correct_incorrect(comp_train_df, comp_test_df, extract_incorrect, 'num_incorrect')
    
    print(f'Our training set has {comp_train_df.shape[0]} rows and {comp_train_df.shape[1]} columns')
    print(f'Our test set has {comp_test_df.shape[0]} rows and {comp_test_df.shape[1]} columns')

    # get the mode of the title
    labels_map = dict(train_labels_df.groupby('title')['accuracy_group'].agg(lambda x:x.value_counts().index[0]))
    # merge target
    labels = train_labels_df[[main_key, 'title', 'accuracy_group']]
    # replace title with the mode
    labels.loc[:,'title'] = labels['title'].map(labels_map)
    # get title from the test set
    comp_test_df.loc[:,'title'] = test_df.groupby(main_key).last()['title'].map(labels_map).reset_index(drop = True)
    # join train with labels
    comp_train_df = labels.merge(comp_train_df, on = main_key, how = 'left')
    print(f'We have {comp_train_df.shape[0]} training rows')
    
    return comp_train_df, comp_test_df

In [13]:
comp_train_df, comp_test_df = feature_engineering(train_df, test_df, train_labels_df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
  new_axis = axis.drop(labels, errors=errors)


Our training set has 17000 rows and 92 columns
Our test set has 1000 rows and 92 columns


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


We have 17690 training rows


In [14]:
pd.options.display.max_columns = None
comp_train_df

Unnamed: 0,installation_id,title,accuracy_group,game_time_mean,game_time_sum,game_time_min,game_time_max,game_time_std,game_time_skew,game_time_median,game_time_kurt,game_time_count,Activity,Assessment,Clip,Game,CRYSTALCAVES,MAGMAPEAK,NONE,TREETOPCITY,"(game_time, count, Activity)","(game_time, count, Assessment)","(game_time, count, Clip)","(game_time, count, Game)","(game_time, kurt, Activity)","(game_time, kurt, Assessment)","(game_time, kurt, Clip)","(game_time, kurt, Game)","(game_time, max, Activity)","(game_time, max, Assessment)","(game_time, max, Clip)","(game_time, max, Game)","(game_time, mean, Activity)","(game_time, mean, Assessment)","(game_time, mean, Clip)","(game_time, mean, Game)","(game_time, median, Activity)","(game_time, median, Assessment)","(game_time, median, Clip)","(game_time, median, Game)","(game_time, min, Activity)","(game_time, min, Assessment)","(game_time, min, Clip)","(game_time, min, Game)","(game_time, skew, Activity)","(game_time, skew, Assessment)","(game_time, skew, Clip)","(game_time, skew, Game)","(game_time, std, Activity)","(game_time, std, Assessment)","(game_time, std, Clip)","(game_time, std, Game)","(game_time, sum, Activity)","(game_time, sum, Assessment)","(game_time, sum, Clip)","(game_time, sum, Game)","(game_time, count, CRYSTALCAVES)","(game_time, count, MAGMAPEAK)","(game_time, count, NONE)","(game_time, count, TREETOPCITY)","(game_time, kurt, CRYSTALCAVES)","(game_time, kurt, MAGMAPEAK)","(game_time, kurt, NONE)","(game_time, kurt, TREETOPCITY)","(game_time, max, CRYSTALCAVES)","(game_time, max, MAGMAPEAK)","(game_time, max, NONE)","(game_time, max, TREETOPCITY)","(game_time, mean, CRYSTALCAVES)","(game_time, mean, MAGMAPEAK)","(game_time, mean, NONE)","(game_time, mean, TREETOPCITY)","(game_time, median, CRYSTALCAVES)","(game_time, median, MAGMAPEAK)","(game_time, median, NONE)","(game_time, median, TREETOPCITY)","(game_time, min, CRYSTALCAVES)","(game_time, min, MAGMAPEAK)","(game_time, min, NONE)","(game_time, min, TREETOPCITY)","(game_time, skew, CRYSTALCAVES)","(game_time, skew, MAGMAPEAK)","(game_time, skew, NONE)","(game_time, skew, TREETOPCITY)","(game_time, std, CRYSTALCAVES)","(game_time, std, MAGMAPEAK)","(game_time, std, NONE)","(game_time, std, TREETOPCITY)","(game_time, sum, CRYSTALCAVES)","(game_time, sum, MAGMAPEAK)","(game_time, sum, NONE)","(game_time, sum, TREETOPCITY)",num_correct,num_incorrect
0,0006a69f,3,3,82491.007366,313548319,0,1520600,159789.192183,7.504204,50155.0,62.680316,3801,1771.0,261.0,37.0,1732.0,0.0,1910.0,4.0,1887.0,1771.0,261.0,37.0,1732.0,30.534946,0.515647,0.0,0.057029,1520600.0,92799.0,0.0,232249.0,104714.852626,24886.245211,0.0,70209.587182,52767.0,16826.0,0.0,58727.0,0.0,0.0,0.0,0.0,5.459150,1.210442,0.0,0.864883,225262.351712,23754.712897,0.0,52702.709458,185450004.0,6495310.0,0.0,121603005.0,316.823091,1910.0,4.0,1887.0,2.087356,3.880306,0.0,33.384820,3.879622e+05,320531.0,0.0,1520600.0,101178.549937,63648.531937,0.0,101738.009009,89433.546653,46808.0,0.0,54010.0,0.0,0.0,0.0,0.0,0.527844,1.736113,0.0,5.704184,115847.423107,57216.164087,0.0,217683.232366,6.031041e+07,121568696.0,0.0,191979623.0,8,17
1,0006a69f,0,0,82491.007366,313548319,0,1520600,159789.192183,7.504204,50155.0,62.680316,3801,1771.0,261.0,37.0,1732.0,0.0,1910.0,4.0,1887.0,1771.0,261.0,37.0,1732.0,30.534946,0.515647,0.0,0.057029,1520600.0,92799.0,0.0,232249.0,104714.852626,24886.245211,0.0,70209.587182,52767.0,16826.0,0.0,58727.0,0.0,0.0,0.0,0.0,5.459150,1.210442,0.0,0.864883,225262.351712,23754.712897,0.0,52702.709458,185450004.0,6495310.0,0.0,121603005.0,316.823091,1910.0,4.0,1887.0,2.087356,3.880306,0.0,33.384820,3.879622e+05,320531.0,0.0,1520600.0,101178.549937,63648.531937,0.0,101738.009009,89433.546653,46808.0,0.0,54010.0,0.0,0.0,0.0,0.0,0.527844,1.736113,0.0,5.704184,115847.423107,57216.164087,0.0,217683.232366,6.031041e+07,121568696.0,0.0,191979623.0,8,17
2,0006a69f,3,3,82491.007366,313548319,0,1520600,159789.192183,7.504204,50155.0,62.680316,3801,1771.0,261.0,37.0,1732.0,0.0,1910.0,4.0,1887.0,1771.0,261.0,37.0,1732.0,30.534946,0.515647,0.0,0.057029,1520600.0,92799.0,0.0,232249.0,104714.852626,24886.245211,0.0,70209.587182,52767.0,16826.0,0.0,58727.0,0.0,0.0,0.0,0.0,5.459150,1.210442,0.0,0.864883,225262.351712,23754.712897,0.0,52702.709458,185450004.0,6495310.0,0.0,121603005.0,316.823091,1910.0,4.0,1887.0,2.087356,3.880306,0.0,33.384820,3.879622e+05,320531.0,0.0,1520600.0,101178.549937,63648.531937,0.0,101738.009009,89433.546653,46808.0,0.0,54010.0,0.0,0.0,0.0,0.0,0.527844,1.736113,0.0,5.704184,115847.423107,57216.164087,0.0,217683.232366,6.031041e+07,121568696.0,0.0,191979623.0,8,17
3,0006a69f,3,2,82491.007366,313548319,0,1520600,159789.192183,7.504204,50155.0,62.680316,3801,1771.0,261.0,37.0,1732.0,0.0,1910.0,4.0,1887.0,1771.0,261.0,37.0,1732.0,30.534946,0.515647,0.0,0.057029,1520600.0,92799.0,0.0,232249.0,104714.852626,24886.245211,0.0,70209.587182,52767.0,16826.0,0.0,58727.0,0.0,0.0,0.0,0.0,5.459150,1.210442,0.0,0.864883,225262.351712,23754.712897,0.0,52702.709458,185450004.0,6495310.0,0.0,121603005.0,316.823091,1910.0,4.0,1887.0,2.087356,3.880306,0.0,33.384820,3.879622e+05,320531.0,0.0,1520600.0,101178.549937,63648.531937,0.0,101738.009009,89433.546653,46808.0,0.0,54010.0,0.0,0.0,0.0,0.0,0.527844,1.736113,0.0,5.704184,115847.423107,57216.164087,0.0,217683.232366,6.031041e+07,121568696.0,0.0,191979623.0,8,17
4,0006a69f,0,3,82491.007366,313548319,0,1520600,159789.192183,7.504204,50155.0,62.680316,3801,1771.0,261.0,37.0,1732.0,0.0,1910.0,4.0,1887.0,1771.0,261.0,37.0,1732.0,30.534946,0.515647,0.0,0.057029,1520600.0,92799.0,0.0,232249.0,104714.852626,24886.245211,0.0,70209.587182,52767.0,16826.0,0.0,58727.0,0.0,0.0,0.0,0.0,5.459150,1.210442,0.0,0.864883,225262.351712,23754.712897,0.0,52702.709458,185450004.0,6495310.0,0.0,121603005.0,316.823091,1910.0,4.0,1887.0,2.087356,3.880306,0.0,33.384820,3.879622e+05,320531.0,0.0,1520600.0,101178.549937,63648.531937,0.0,101738.009009,89433.546653,46808.0,0.0,54010.0,0.0,0.0,0.0,0.0,0.527844,1.736113,0.0,5.704184,115847.423107,57216.164087,0.0,217683.232366,6.031041e+07,121568696.0,0.0,191979623.0,8,17
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17685,ffc90c32,0,3,79080.932136,158478188,0,827160,109443.413015,4.028416,51465.0,19.372269,2004,479.0,242.0,29.0,1254.0,489.0,827.0,2.0,686.0,479.0,242.0,29.0,1254.0,0.841704,-0.380753,0.0,12.462043,141895.0,54896.0,0.0,827160.0,43527.747390,18079.165289,0.0,106262.551037,35295.0,14614.5,0.0,75421.0,0.0,0.0,0.0,0.0,1.131711,0.762105,0.0,3.347029,32266.955154,13959.228740,0.0,128944.239368,20849791.0,4375158.0,0.0,133253239.0,489.000000,827.0,2.0,686.0,-0.534263,7.578588,0.0,0.739966,1.428870e+05,827160.0,0.0,193000.0,48566.196319,118910.175333,0.0,53047.526239,39940.000000,76483.0,0.0,40434.0,0.0,0.0,0.0,0.0,0.666591,2.745263,0.0,1.250166,35463.030938,154053.818961,0.0,47262.707236,2.374887e+07,98338715.0,0.0,36390603.0,18,2
17686,ffd2871d,3,3,173255.870612,424476883,0,1204349,193862.300262,2.677175,113377.5,9.092686,2450,1382.0,40.0,13.0,1015.0,760.0,1687.0,1.0,2.0,1382.0,40.0,13.0,1015.0,-0.116483,-1.174048,0.0,20.388951,601961.0,54533.0,0.0,1204349.0,211510.620839,27651.950000,0.0,129126.233498,143792.5,28553.0,0.0,87654.0,0.0,0.0,0.0,0.0,1.077566,-0.042056,0.0,4.537166,177803.267710,16658.486692,0.0,206245.356968,292307678.0,1106078.0,0.0,131063127.0,760.000000,1687.0,1.0,2.0,14.515865,0.538622,0.0,3.231196,1.204349e+06,601961.0,0.0,0.0,148066.993421,184911.658566,0.0,0.000000,94014.500000,131863.0,0.0,0.0,0.0,0.0,0.0,0.0,3.911667,1.310109,0.0,0.644670,234179.431178,171544.434637,0.0,0.000000,1.125309e+08,311945968.0,0.0,0.0,1,1
17687,ffeb0b1b,3,1,214236.207784,341278279,0,800116,214417.731703,1.202195,137577.0,0.321285,1593,381.0,247.0,24.0,941.0,944.0,534.0,3.0,112.0,381.0,247.0,24.0,941.0,-0.042522,-0.617790,0.0,-0.915399,348276.0,126295.0,0.0,800116.0,116547.490814,49247.672065,0.0,302560.584485,97256.0,48900.0,0.0,220430.0,0.0,0.0,0.0,0.0,0.960644,0.324022,0.0,0.625319,97130.762988,30324.623861,0.0,231963.968379,44404594.0,12164175.0,0.0,284709510.0,944.000000,534.0,3.0,112.0,-0.911740,1.106512,0.0,-1.096650,8.001160e+05,348276.0,0.0,126295.0,301599.057203,95825.322097,0.0,48196.848214,217865.500000,64607.0,0.0,49750.0,0.0,0.0,0.0,0.0,0.626658,1.374220,0.0,0.292296,232220.599471,89430.451610,0.0,37863.533434,2.847095e+08,51170722.0,0.0,5398047.0,2,5
17688,ffeb0b1b,3,0,214236.207784,341278279,0,800116,214417.731703,1.202195,137577.0,0.321285,1593,381.0,247.0,24.0,941.0,944.0,534.0,3.0,112.0,381.0,247.0,24.0,941.0,-0.042522,-0.617790,0.0,-0.915399,348276.0,126295.0,0.0,800116.0,116547.490814,49247.672065,0.0,302560.584485,97256.0,48900.0,0.0,220430.0,0.0,0.0,0.0,0.0,0.960644,0.324022,0.0,0.625319,97130.762988,30324.623861,0.0,231963.968379,44404594.0,12164175.0,0.0,284709510.0,944.000000,534.0,3.0,112.0,-0.911740,1.106512,0.0,-1.096650,8.001160e+05,348276.0,0.0,126295.0,301599.057203,95825.322097,0.0,48196.848214,217865.500000,64607.0,0.0,49750.0,0.0,0.0,0.0,0.0,0.626658,1.374220,0.0,0.292296,232220.599471,89430.451610,0.0,37863.533434,2.847095e+08,51170722.0,0.0,5398047.0,2,5


In [15]:
list(comp_test_df.columns)

['installation_id',
 'game_time_mean',
 'game_time_sum',
 'game_time_min',
 'game_time_max',
 'game_time_std',
 'game_time_skew',
 'game_time_median',
 'game_time_kurt',
 'game_time_count',
 'Activity',
 'Assessment',
 'Clip',
 'Game',
 'CRYSTALCAVES',
 'MAGMAPEAK',
 'NONE',
 'TREETOPCITY',
 ('game_time', 'count', 'Activity'),
 ('game_time', 'count', 'Assessment'),
 ('game_time', 'count', 'Clip'),
 ('game_time', 'count', 'Game'),
 ('game_time', 'kurt', 'Activity'),
 ('game_time', 'kurt', 'Assessment'),
 ('game_time', 'kurt', 'Clip'),
 ('game_time', 'kurt', 'Game'),
 ('game_time', 'max', 'Activity'),
 ('game_time', 'max', 'Assessment'),
 ('game_time', 'max', 'Clip'),
 ('game_time', 'max', 'Game'),
 ('game_time', 'mean', 'Activity'),
 ('game_time', 'mean', 'Assessment'),
 ('game_time', 'mean', 'Clip'),
 ('game_time', 'mean', 'Game'),
 ('game_time', 'median', 'Activity'),
 ('game_time', 'median', 'Assessment'),
 ('game_time', 'median', 'Clip'),
 ('game_time', 'median', 'Game'),
 ('game_ti

## Normalize

In [16]:
import re

In [17]:
comp_train_df.columns = [c if type(c) != tuple else '_'.join(c) for c in comp_train_df.columns]

In [18]:
comp_test_df.columns = [c if type(c) != tuple else '_'.join(c) for c in comp_test_df.columns]

In [19]:
comp_train_df.columns = [re.sub(r'\W', '_', s) for s in comp_train_df.columns]

In [20]:
comp_test_df.columns = [re.sub(r'\W', '_', s) for s in comp_test_df.columns]

In [21]:
list(comp_train_df.columns)

['installation_id',
 'title',
 'accuracy_group',
 'game_time_mean',
 'game_time_sum',
 'game_time_min',
 'game_time_max',
 'game_time_std',
 'game_time_skew',
 'game_time_median',
 'game_time_kurt',
 'game_time_count',
 'Activity',
 'Assessment',
 'Clip',
 'Game',
 'CRYSTALCAVES',
 'MAGMAPEAK',
 'NONE',
 'TREETOPCITY',
 'game_time_count_Activity',
 'game_time_count_Assessment',
 'game_time_count_Clip',
 'game_time_count_Game',
 'game_time_kurt_Activity',
 'game_time_kurt_Assessment',
 'game_time_kurt_Clip',
 'game_time_kurt_Game',
 'game_time_max_Activity',
 'game_time_max_Assessment',
 'game_time_max_Clip',
 'game_time_max_Game',
 'game_time_mean_Activity',
 'game_time_mean_Assessment',
 'game_time_mean_Clip',
 'game_time_mean_Game',
 'game_time_median_Activity',
 'game_time_median_Assessment',
 'game_time_median_Clip',
 'game_time_median_Game',
 'game_time_min_Activity',
 'game_time_min_Assessment',
 'game_time_min_Clip',
 'game_time_min_Game',
 'game_time_skew_Activity',
 'game_time

## Training

In [22]:
# quadratic weighted kappa
def qwk3(a1, a2, max_rat=3):
    '''
    a1 - ground truth
    a2 - predicted values
    '''
    assert(len(a1) == len(a2))
    a1 = np.asarray(a1, dtype=int)
    a2 = np.asarray(a2, dtype=int)
    hist1 = np.zeros((max_rat + 1, ))
    hist2 = np.zeros((max_rat + 1, ))
    o = 0
    for k in range(a1.shape[0]):
        i, j = a1[k], a2[k]
        hist1[i] += 1
        hist2[j] += 1
        o +=  (i - j) * (i - j)
    e = 0
    for i in range(max_rat + 1):
        for j in range(max_rat + 1):
            e += hist1[i] * hist2[j] * (i - j) * (i - j)
    e = e / a1.shape[0]
    return 1 - o / e

In [23]:
features = [i for i in comp_train_df.columns if i not in ['accuracy_group', 'installation_id']]
target = 'accuracy_group'
num_splits = 10
params = {
    'learning_rate': 0.007,
    'metric': 'multiclass',
    'objective': 'multiclass',
    'num_classes': 4,
    'feature_fraction': 0.75,
    "bagging_fraction": 0.8,
    "bagging_seed": 42,
    'max_depth': 11
}

early_stopping_rounds = 100

def train_model(comp_train_df, comp_test_df):
    
    kf = KFold(n_splits=num_splits, shuffle=True)
    
    oof_pred = np.zeros((len(comp_train_df), 4))
    models = []
    
    for fold, (tr_ind, val_ind) in enumerate(kf.split(comp_train_df)):
        print(f'Fold: {fold+1}')
        x_train, x_val = comp_train_df[features].iloc[tr_ind], comp_train_df[features].iloc[val_ind]
        y_train, y_val = comp_train_df[target][tr_ind], comp_train_df[target][val_ind]
        train_set = lgb.Dataset(x_train, y_train)
        val_set = lgb.Dataset(x_val, y_val)

        model = lgb.train(params, train_set, num_boost_round = 10000, early_stopping_rounds = early_stopping_rounds, 
                          valid_sets=[train_set, val_set], verbose_eval = early_stopping_rounds)
        oof_pred[val_ind] = model.predict(x_val)
        models.append(model)
        
        val_crt_fold = qwk3(y_val, oof_pred[val_ind].argmax(axis = 1))
        print(f'Fold: {fold+1} quadratic weighted kappa score: {np.round(val_crt_fold,4)}')
        
    res = qwk3(comp_train_df['accuracy_group'], oof_pred.argmax(axis = 1))
    print(f'Quadratic weighted score: {np.round(res,4)}')
        
    return models

In [24]:
models = train_model(comp_train_df, comp_test_df)

Fold: 1
Training until validation scores don't improve for 100 rounds
[100]	training's multi_logloss: 1.03489	valid_1's multi_logloss: 1.06054
[200]	training's multi_logloss: 0.943148	valid_1's multi_logloss: 0.983132
[300]	training's multi_logloss: 0.886634	valid_1's multi_logloss: 0.941571
[400]	training's multi_logloss: 0.849629	valid_1's multi_logloss: 0.918768
[500]	training's multi_logloss: 0.822702	valid_1's multi_logloss: 0.906012
[600]	training's multi_logloss: 0.80212	valid_1's multi_logloss: 0.899056
[700]	training's multi_logloss: 0.785433	valid_1's multi_logloss: 0.895113
[800]	training's multi_logloss: 0.771192	valid_1's multi_logloss: 0.892859
[900]	training's multi_logloss: 0.758808	valid_1's multi_logloss: 0.891959
[1000]	training's multi_logloss: 0.747794	valid_1's multi_logloss: 0.89248
Early stopping, best iteration is:
[933]	training's multi_logloss: 0.755053	valid_1's multi_logloss: 0.891843
Fold: 1 quadratic weighted kappa score: 0.6124
Fold: 2
Training until val

[200]	training's multi_logloss: 0.944519	valid_1's multi_logloss: 0.977096
[300]	training's multi_logloss: 0.888151	valid_1's multi_logloss: 0.933638
[400]	training's multi_logloss: 0.850879	valid_1's multi_logloss: 0.909286
[500]	training's multi_logloss: 0.82408	valid_1's multi_logloss: 0.895777
[600]	training's multi_logloss: 0.803586	valid_1's multi_logloss: 0.888495
[700]	training's multi_logloss: 0.786981	valid_1's multi_logloss: 0.884565
[800]	training's multi_logloss: 0.772681	valid_1's multi_logloss: 0.883316
[900]	training's multi_logloss: 0.760448	valid_1's multi_logloss: 0.883646
Early stopping, best iteration is:
[834]	training's multi_logloss: 0.768333	valid_1's multi_logloss: 0.883248
Fold: 10 quadratic weighted kappa score: 0.6102
Quadratic weighted score: 0.611


## Inference

In [25]:
def add_missing_columns(comp_train_df: pd.DataFrame, comp_test_df: pd.DataFrame):
    missing: set = set(comp_train_df.columns) - set(comp_test_df.columns)
    for col in missing:
        comp_test_df[col] = 0.
    print(f'Added missing colums: {missing}')

In [26]:
add_missing_columns(comp_train_df, comp_test_df)

Added missing colums: {'accuracy_group'}


In [27]:
def run_predictions(models):
    y_pred = np.zeros((len(comp_test_df), 4))
    for model in models:
        y_pred += model.predict(comp_test_df[features])
    return y_pred / num_splits

In [28]:
y_pred = run_predictions(models)

In [29]:
np.unique(y_pred.argmax(-1), return_counts=True)

(array([0, 1, 2, 3]), array([510,  19,  26, 445]))

In [30]:
assert comp_test_df.shape[0] == y_pred.shape[0]

In [31]:
def prepare_submission(comp_test_df, y_pred):
    comp_test_df = comp_test_df.reset_index()
    comp_test_df = comp_test_df[['installation_id']]
    comp_test_df['accuracy_group'] = y_pred.argmax(axis = 1)
    sample_submission_df = pd.read_csv(path/'sample_submission.csv')
    sample_submission_df.drop('accuracy_group', inplace = True, axis = 1)
    sample_submission_df = sample_submission_df.merge(comp_test_df, on = 'installation_id')
    sample_submission_df.to_csv('submission.csv', index = False)

In [32]:
prepare_submission(comp_test_df, y_pred)

In [33]:
!head submission.csv

installation_id,accuracy_group
00abaee7,3
01242218,3
017c5718,0
01a44906,0
01bc6cb6,0
02256298,3
0267757a,0
027e7ce5,3
02a29f99,0


In [34]:
!cat submission.csv | wc -l

1001
