In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
import lightgbm as lgb

In [3]:
path=Path('/kaggle/data_science_bowl')
path

PosixPath('/kaggle/data_science_bowl')

### Read Data

In [4]:
def read_data():
    train_df = pd.read_csv(path/'train.csv')
    test_df = pd.read_csv(path/'test.csv')
    train_labels_df = pd.read_csv(path/'train_labels.csv')
    sample_submission_df = pd.read_csv(path/'sample_submission.csv')
    return train_df, test_df, train_labels_df, sample_submission_df

In [5]:
train_df, test_df, train_labels_df, sample_submission_df = read_data()

In [6]:
train_df.head()

Unnamed: 0,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title,type,world
0,27253bdc,45bb1e1b6b50c07b,2019-09-06T17:53:46.937Z,"{""event_code"": 2000, ""event_count"": 1}",0001e90f,1,2000,0,Welcome to Lost Lagoon!,Clip,NONE
1,27253bdc,17eeb7f223665f53,2019-09-06T17:54:17.519Z,"{""event_code"": 2000, ""event_count"": 1}",0001e90f,1,2000,0,Magma Peak - Level 1,Clip,MAGMAPEAK
2,77261ab5,0848ef14a8dc6892,2019-09-06T17:54:56.302Z,"{""version"":""1.0"",""event_count"":1,""game_time"":0...",0001e90f,1,2000,0,Sandcastle Builder (Activity),Activity,MAGMAPEAK
3,b2dba42b,0848ef14a8dc6892,2019-09-06T17:54:56.387Z,"{""description"":""Let's build a sandcastle! Firs...",0001e90f,2,3010,53,Sandcastle Builder (Activity),Activity,MAGMAPEAK
4,1bb5fbdb,0848ef14a8dc6892,2019-09-06T17:55:03.253Z,"{""description"":""Let's build a sandcastle! Firs...",0001e90f,3,3110,6972,Sandcastle Builder (Activity),Activity,MAGMAPEAK


In [7]:
test_df.head()

Unnamed: 0,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title,type,world
0,27253bdc,0ea9ecc81a565215,2019-09-10T16:50:24.910Z,"{""event_code"": 2000, ""event_count"": 1}",00abaee7,1,2000,0,Welcome to Lost Lagoon!,Clip,NONE
1,27253bdc,c1ea43d8b8261d27,2019-09-10T16:50:55.503Z,"{""event_code"": 2000, ""event_count"": 1}",00abaee7,1,2000,0,Magma Peak - Level 1,Clip,MAGMAPEAK
2,27253bdc,7ed86c6b72e725e2,2019-09-10T16:51:51.805Z,"{""event_code"": 2000, ""event_count"": 1}",00abaee7,1,2000,0,Magma Peak - Level 2,Clip,MAGMAPEAK
3,27253bdc,7e516ace50e7fe67,2019-09-10T16:53:12.825Z,"{""event_code"": 2000, ""event_count"": 1}",00abaee7,1,2000,0,Crystal Caves - Level 1,Clip,CRYSTALCAVES
4,7d093bf9,a022c3f60ba547e7,2019-09-10T16:54:12.115Z,"{""version"":""1.0"",""round"":0,""event_count"":1,""ga...",00abaee7,1,2000,0,Chow Time,Game,CRYSTALCAVES


In [8]:
train_labels_df.head()

Unnamed: 0,game_session,installation_id,title,num_correct,num_incorrect,accuracy,accuracy_group
0,6bdf9623adc94d89,0006a69f,Mushroom Sorter (Assessment),1,0,1.0,3
1,77b8ee947eb84b4e,0006a69f,Bird Measurer (Assessment),0,11,0.0,0
2,901acc108f55a5a1,0006a69f,Mushroom Sorter (Assessment),1,0,1.0,3
3,9501794defd84e4d,0006a69f,Mushroom Sorter (Assessment),1,1,0.5,2
4,a9ef3ecb3d1acc6a,0006a69f,Bird Measurer (Assessment),1,0,1.0,3


## Feature Engineering

In [9]:
def get_object_columns(df, columns):
    df = df.groupby(['installation_id', columns])['event_id'].count().reset_index()
    df = df.pivot_table(index = 'installation_id', columns = [columns], values = 'event_id')
    df.columns = list(df.columns)
    df.fillna(0, inplace = True)
    return df

agg_stats = ['mean', 'sum', 'min', 'max', 'std', 'skew', 'median']

def get_numeric_columns(df, column):
    df = df.groupby('installation_id').agg({f'{column}': agg_stats})
    df[column].fillna(df[column].mean(), inplace = True)
    df.columns = [f'{column}_{stat}' for stat in agg_stats]
    return df

def get_numeric_columns_add(df, agg_column, column):
    df = df.groupby(['installation_id', agg_column]).agg({f'{column}': agg_stats}).reset_index()
    df = df.pivot_table(index = 'installation_id', columns = [agg_column], values = [col for col in df.columns if col not in ['installation_id', agg_column]])
    df[column].fillna(df[column].mean(), inplace = True)
    df.columns = list(df.columns)
    return df

def get_event_counts(df):
    df_event_count = df.groupby(['installation_id', 'event_id']).agg({'event_id': ['count']})
    df_event_count.columns = ['count']
    df_event_count_pivot = df_event_count.pivot_table(index = 'installation_id', columns = ['event_id'], values = 'count')
    return df_event_count_pivot.fillna(0)

def add_missing_columns(comp_train_df, comp_test_df):
    pass

def perform_features_engineering(train_df, test_df, train_labels_df):
    
    numerical_columns = ['game_time']
    categorical_columns = ['type', 'world']

    comp_train_df = pd.DataFrame({'installation_id': train_df['installation_id'].unique()})
    comp_train_df.set_index('installation_id', inplace = True)
    comp_test_df = pd.DataFrame({'installation_id': test_df['installation_id'].unique()})
    comp_test_df.set_index('installation_id', inplace = True)

    for i in numerical_columns:
        comp_train_df = comp_train_df.merge(get_numeric_columns(train_df, i), left_index = True, right_index = True)
        comp_test_df = comp_test_df.merge(get_numeric_columns(test_df, i), left_index = True, right_index = True)
        
    print(comp_train_df.columns)
    
    for i in categorical_columns:
        comp_train_df = comp_train_df.merge(get_object_columns(train_df, i), left_index = True, right_index = True)
        comp_test_df = comp_test_df.merge(get_object_columns(test_df, i), left_index = True, right_index = True)
    
    for i in categorical_columns:
        for j in numerical_columns:
            comp_train_df = comp_train_df.merge(get_numeric_columns_add(train_df, i, j), left_index = True, right_index = True)
            comp_test_df = comp_test_df.merge(get_numeric_columns_add(test_df, i, j), left_index = True, right_index = True)
            
    comp_train_df = comp_train_df.merge(get_event_counts(train_df), left_index = True, right_index = True)
    comp_test_df = comp_test_df.merge(get_event_counts(test_df), left_index = True, right_index = True)
    
    comp_train_df.reset_index(inplace = True)
    comp_test_df.reset_index(inplace = True)
    
    print(f'Our training set has {comp_train_df.shape[0]} rows and {comp_train_df.shape[1]} columns')

    # get the mode of the title
    labels_map = dict(train_labels_df.groupby('title')['accuracy_group'].agg(lambda x:x.value_counts().index[0]))
    # merge target
    labels = train_labels_df[['installation_id', 'title', 'accuracy_group']]
    # replace title with the mode
    labels['title'] = labels['title'].map(labels_map)
    # get title from the test set
    comp_test_df['title'] = test_df.groupby('installation_id').last()['title'].map(labels_map).reset_index(drop = True)
    # join train with labels
    comp_train_df = labels.merge(comp_train_df, on = 'installation_id', how = 'left')
    print(f'We have {comp_train_df.shape[0]} training rows')
    
    return comp_train_df, comp_test_df

In [10]:
comp_train_df, comp_test_df = perform_features_engineering(train_df, test_df, train_labels_df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)


Index(['game_time_mean', 'game_time_sum', 'game_time_min', 'game_time_max',
       'game_time_std', 'game_time_skew', 'game_time_median'],
      dtype='object')


  new_axis = axis.drop(labels, errors=errors)


Our training set has 17000 rows and 456 columns


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


We have 17690 training rows


In [11]:
comp_test_df

Unnamed: 0,installation_id,game_time_mean,game_time_sum,game_time_min,game_time_max,game_time_std,game_time_skew,game_time_median,Activity,Assessment,...,f5b8c21a,f6947f54,f71c4741,f7e47413,f806dc10,f93fc684,fbaf3456,fcfdffb6,fd20ea40,title
0,00abaee7,63567.408986,55176511,0,1960630,149911.784066,11.578484,40657.0,454.0,27.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,3
1,01242218,75770.044869,206018752,0,317027,61789.322138,1.421303,62056.0,1356.0,245.0,...,3.0,2.0,17.0,17.0,3.0,1.0,6.0,32.0,1.0,3
2,017c5718,33017.233333,4952585,0,60943,17140.293312,-0.421696,35616.5,143.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.0,0.0,3
3,01a44906,41162.901709,9632119,0,85983,28696.300044,0.231737,32967.0,145.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,29.0,0.0,3
4,01bc6cb6,147664.880252,140576966,0,511237,128478.809653,1.250042,106076.5,226.0,1.0,...,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,fee254cf,56850.495283,12052305,0,155008,43941.426092,0.454220,49113.5,0.0,116.0,...,0.0,2.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,3
996,ff57e602,44772.023102,13565923,0,139394,31785.997875,0.699889,39918.0,127.0,29.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0
997,ffc73fb2,61802.442966,32508085,0,199825,53869.630318,0.728098,46648.0,238.0,256.0,...,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,3
998,ffe00ca8,25213.420849,6530276,0,72242,20859.265204,0.705400,19590.0,123.0,110.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,3


## Normalize

In [12]:
comp_train_df.columns = [c if type(c) != tuple else '_'.join(c) for c in comp_train_df.columns]

In [13]:
comp_test_df.columns = [c if type(c) != tuple else '_'.join(c) for c in comp_test_df.columns]

In [14]:
list(comp_train_df.columns)

['installation_id',
 'title',
 'accuracy_group',
 'game_time_mean',
 'game_time_sum',
 'game_time_min',
 'game_time_max',
 'game_time_std',
 'game_time_skew',
 'game_time_median',
 'Activity',
 'Assessment',
 'Clip',
 'Game',
 'CRYSTALCAVES',
 'MAGMAPEAK',
 'NONE',
 'TREETOPCITY',
 'game_time_max_Activity',
 'game_time_max_Assessment',
 'game_time_max_Clip',
 'game_time_max_Game',
 'game_time_mean_Activity',
 'game_time_mean_Assessment',
 'game_time_mean_Clip',
 'game_time_mean_Game',
 'game_time_median_Activity',
 'game_time_median_Assessment',
 'game_time_median_Clip',
 'game_time_median_Game',
 'game_time_min_Activity',
 'game_time_min_Assessment',
 'game_time_min_Clip',
 'game_time_min_Game',
 'game_time_skew_Activity',
 'game_time_skew_Assessment',
 'game_time_skew_Clip',
 'game_time_skew_Game',
 'game_time_std_Activity',
 'game_time_std_Assessment',
 'game_time_std_Clip',
 'game_time_std_Game',
 'game_time_sum_Activity',
 'game_time_sum_Assessment',
 'game_time_sum_Clip',
 'game_

In [15]:
def normalize(df: pd.DataFrame, cont_names):
    "Compute the means and stds of `self.cont_names` columns to normalize them."
    means, stds = {},{}
    for n in cont_names:
        means[n], stds[n] = df[n].mean(), df[n].std()
        df[n] = (df[n]-means[n]) / (1e-7 + stds[n])

In [16]:
normalize(comp_train_df, [c for c in comp_train_df.columns if c not in ['installation_id', 'title', 'accuracy_group']])
normalize(comp_test_df, [c for c in comp_test_df.columns if c not in ['installation_id', 'title', 'accuracy_group']])

## Training

In [17]:
# quadratic weighted kappa
def qwk3(a1, a2, max_rat=3):
    '''
    a1 - ground truth
    a2 - predicted values
    '''
    assert(len(a1) == len(a2))
    a1 = np.asarray(a1, dtype=int)
    a2 = np.asarray(a2, dtype=int)
    hist1 = np.zeros((max_rat + 1, ))
    hist2 = np.zeros((max_rat + 1, ))
    o = 0
    for k in range(a1.shape[0]):
        i, j = a1[k], a2[k]
        hist1[i] += 1
        hist2[j] += 1
        o +=  (i - j) * (i - j)
    e = 0
    for i in range(max_rat + 1):
        for j in range(max_rat + 1):
            e += hist1[i] * hist2[j] * (i - j) * (i - j)
    e = e / a1.shape[0]
    return 1 - o / e

In [18]:
features = [i for i in comp_train_df.columns if i not in ['accuracy_group', 'installation_id']]
target = 'accuracy_group'
num_splits = 10
params = {
    'learning_rate': 0.007,
    'metric': 'multiclass',
    'objective': 'multiclass',
    'num_classes': 4,
    'feature_fraction': 0.75,
    "bagging_fraction": 0.8,
    "bagging_seed": 42,
}

def train_model(comp_train_df, comp_test_df):
    
    kf = KFold(n_splits=num_splits, shuffle=True)
    
    oof_pred = np.zeros((len(comp_train_df), 4))
    models = []
    
    for fold, (tr_ind, val_ind) in enumerate(kf.split(comp_train_df)):
        print(f'Fold: {fold+1}')
        x_train, x_val = comp_train_df[features].iloc[tr_ind], comp_train_df[features].iloc[val_ind]
        y_train, y_val = comp_train_df[target][tr_ind], comp_train_df[target][val_ind]
        train_set = lgb.Dataset(x_train, y_train)
        val_set = lgb.Dataset(x_val, y_val)

        model = lgb.train(params, train_set, num_boost_round = 10000, early_stopping_rounds = 100, 
                          valid_sets=[train_set, val_set], verbose_eval = 100)
        oof_pred[val_ind] = model.predict(x_val)
        models.append(model)
        
        val_crt_fold = qwk3(y_val, oof_pred[val_ind].argmax(axis = 1))
        print(f'Fold: {fold+1} quadratic weighted kappa score: {np.round(val_crt_fold,4)}')
        
    res = qwk3(comp_train_df['accuracy_group'], oof_pred.argmax(axis = 1))
    print(f'Quadratic weighted score: {np.round(res,4)}')
        
    return models

In [19]:
models = train_model(comp_train_df, comp_test_df)

Fold: 1
Training until validation scores don't improve for 100 rounds
[100]	training's multi_logloss: 0.954551	valid_1's multi_logloss: 0.994869
[200]	training's multi_logloss: 0.829172	valid_1's multi_logloss: 0.882416
[300]	training's multi_logloss: 0.759873	valid_1's multi_logloss: 0.825151
[400]	training's multi_logloss: 0.716836	valid_1's multi_logloss: 0.792388
[500]	training's multi_logloss: 0.687768	valid_1's multi_logloss: 0.774018
[600]	training's multi_logloss: 0.666132	valid_1's multi_logloss: 0.762873
[700]	training's multi_logloss: 0.650282	valid_1's multi_logloss: 0.757642
[800]	training's multi_logloss: 0.638087	valid_1's multi_logloss: 0.756819
[900]	training's multi_logloss: 0.627749	valid_1's multi_logloss: 0.757631
Early stopping, best iteration is:
[824]	training's multi_logloss: 0.635209	valid_1's multi_logloss: 0.756391
Fold: 1 quadratic weighted kappa score: 0.6583
Fold: 2
Training until validation scores don't improve for 100 rounds
[100]	training's multi_loglo

[800]	training's multi_logloss: 0.637392	valid_1's multi_logloss: 0.745745
Early stopping, best iteration is:
[718]	training's multi_logloss: 0.647648	valid_1's multi_logloss: 0.744627
Fold: 10 quadratic weighted kappa score: 0.634
Quadratic weighted score: 0.6539


## Inference

In [39]:
def add_missing_columns(comp_train_df: pd.DataFrame, comp_test_df: pd.DataFrame):
    missing: set = set(comp_train_df.columns) - set(comp_test_df.columns)
    for col in missing:
        comp_test_df[col] = 0.
    print(f'Added missing colums: {missing}')

In [41]:
add_missing_columns(comp_train_df, comp_test_df)

Added set()


In [42]:
def run_predictions(models):
    y_pred = np.zeros((len(comp_test_df), 4))
    for model in models:
        y_pred += model.predict(comp_test_df[features])
    return y_pred / num_splits

In [43]:
y_pred = run_predictions(models)

In [44]:
assert comp_test_df.shape[0] == y_pred.shape[0]

In [45]:
np.unique(y_pred.argmax(-1), return_counts=True)

(array([0, 1, 3]), array([ 42, 210, 748]))

In [46]:
def prepare_submission(comp_test_df, sample_submission_df, y_pred):
    comp_test_df = comp_test_df.reset_index()
    comp_test_df = comp_test_df[['installation_id']]
    comp_test_df['accuracy_group'] = y_pred.argmax(axis = 1)
    sample_submission_df.drop('accuracy_group', inplace = True, axis = 1)
    sample_submission_df = sample_submission_df.merge(comp_test_df, on = 'installation_id')
    sample_submission_df.to_csv('submission.csv', index = False)

In [47]:
prepare_submission(comp_test_df, sample_submission_df, y_pred)

In [49]:
!head submission.csv

installation_id,accuracy_group
00abaee7,3
01242218,3
017c5718,3
01a44906,3
01bc6cb6,3
02256298,3
0267757a,3
027e7ce5,3
02a29f99,1


In [50]:
!cat submission.csv | wc -l

1001
