In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.
from time import time
from tqdm import tqdm_notebook as tqdm
from collections import Counter
from scipy import stats
import lightgbm as lgb
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import KFold, StratifiedKFold
import gc
import json
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

from pathlib import Path
import sys

In [2]:
path=Path('/kaggle/data_science_bowl')
path

PosixPath('/kaggle/data_science_bowl')

In [3]:
def read_data():
    train_df = pd.read_csv(path/'train.csv')
    test_df = pd.read_csv(path/'test.csv')
    train_labels_df = pd.read_csv(path/'train_labels.csv')
    specs_df = pd.read_csv(path/'specs.csv')
    return train_df, test_df, train_labels_df, specs_df

In [4]:
train_df, test_df, train_labels_df, specs_df = read_data()

In [5]:
train_df.columns

Index(['event_id', 'game_session', 'timestamp', 'event_data', 'installation_id', 'event_count', 'event_code', 'game_time', 'title', 'type', 'world'], dtype='object')

In [6]:
train_df.shape

(11341042, 11)

## Feature Engineering

### Cleanup

In [7]:
def remove_wrong_event_codes(df):
    return df[((df['title'] == 'Bird Measurer (Assessment)') & (df['event_code'] == 4100)) == False]

train_df = remove_wrong_event_codes(train_df)
test_df = remove_wrong_event_codes(test_df)

In [8]:
train_df.shape

(11338690, 11)

In [9]:
list_of_user_activities = list(set(train_df['title'].unique()) | (set(test_df['title'].unique())))
activities_labels = dict(zip(np.arange(len(list_of_user_activities)), list_of_user_activities))
activities_map = dict(zip(list_of_user_activities, np.arange(len(list_of_user_activities))))
win_code = dict(zip(activities_map.values(), (4100*np.ones(len(activities_map))).astype('int')))
assess_titles = list(set(train_df[train_df['type'] == 'Assessment']['title'].value_counts().index) | (set(test_df[test_df['type'] == 'Assessment']['title'].value_counts().index)))
# then, it set one element, the 'Bird Measurer (Assessment)' as 4110, 10 more than the rest
win_code[activities_map['Bird Measurer (Assessment)']] = 4110
list_of_event_code = list(set(train_df['event_code'].unique()) | set(test_df['event_code'].unique()))

In [10]:
len(train_df['event_code'].unique())

42

In [11]:
len(set(train_df['event_code'].unique()))

42

In [12]:
train_samples = [(installation_id, user_sample) for (installation_id, user_sample) in train_df.groupby('installation_id')]

In [13]:
test_samples = [(installation_id, user_sample) for (installation_id, user_sample) in test_df.groupby('installation_id')]

In [14]:
def safe_div(dividend, divisor):
    return dividend / divisor if divisor != 0 else 0

def update_counters(counter, col, session):
    increment_counter = Counter(session[col])
    for x in increment_counter.keys():
        counter[x] += increment_counter[x]
    return counter    

def feature_generation(samples, is_test=False, sample_slice=slice(0, sys.maxsize)):

    all_features = [] # 21239
    last_accuracy_title = {'acc_' + title: -1 for title in assess_titles}

    for (installation_id, user_sample) in tqdm(samples[sample_slice], total = len(samples[sample_slice])):
        accumulated_accuracy = 0
        counter = 0
        accumulated_correct_attempts = 0
        accumulated_incorrect_attempts = 0
        user_activities_count = {'Clip':0, 'Activity': 0, 'Assessment': 0, 'Game':0}
        user_activities_time = user_activities_count.copy()
        user_activities_session_len = user_activities_count.copy()
        event_code_count = Counter({ev: 0 for ev in list_of_event_code})
        last_session_type = ''
        train_features = None
        compiled_train = []
        for i, session in user_sample.groupby('game_session', sort=False):
            session_type = session['type'].iloc[0]
            session_title = session['title'].iloc[0]
            session_length = len(session)
            if (session_type == 'Assessment') and (session_length > 1 or is_test):
                all_attempts = session[session['event_code'] == win_code[activities_map[session_title]]]
                all_attempts_str = all_attempts['event_data'].str
                true_attempts = all_attempts_str.contains('true').sum()
                false_attempts = all_attempts_str.contains('false').sum()

                train_features = user_activities_count.copy()
                train_features = \
{**train_features, **{f'{k}_time_mean': safe_div(v, user_activities_session_len[k]) for k,v in user_activities_time.items()}}
                train_features.update(event_code_count.copy())

                train_features.update(last_accuracy_title.copy())
                train_features['accumulated_accuracy'] = accumulated_accuracy / counter if counter > 0 else 0
                train_features['session_title'] = activities_map[session['title'].iloc[0]]

                train_features['accumulated_correct_attempts'] = accumulated_correct_attempts
                train_features['accumulated_incorrect_attempts'] = accumulated_incorrect_attempts
                accumulated_correct_attempts += true_attempts 
                accumulated_incorrect_attempts += false_attempts

                train_features['installation_id'] = session['installation_id'].iloc[-1]

                accuracy = true_attempts/(true_attempts+false_attempts) if (true_attempts+false_attempts) != 0 else 0
                accumulated_accuracy += accuracy

                if accuracy == 0:
                    train_features['accuracy_group'] = 0
                elif accuracy == 1:
                    train_features['accuracy_group'] = 3
                elif accuracy == 0.5:
                    train_features['accuracy_group'] = 2
                else:
                    train_features['accuracy_group'] = 1
                    
                if is_test:
                    compiled_train.append(train_features)
                elif true_attempts + false_attempts > 0:
                    compiled_train.append(train_features)
                counter += 1

                last_accuracy_title['acc_' + session_title] = accuracy

            user_activities_count[session_type] += 1
            last_session_type = session_type
            user_activities_time[session_type] += session['game_time'].sum()
            user_activities_session_len[session_type] += session_length
            event_code_count = update_counters(event_code_count, 'event_code', session)
            
        if not is_test:
            all_features += compiled_train 
        else:
            all_features.append(compiled_train[-1]) # just append the last one

    return pd.DataFrame(all_features)

In [15]:
comp_train_df = feature_generation(train_samples, False)

HBox(children=(IntProgress(value=0, max=17000), HTML(value='')))




In [16]:
comp_train_df

Unnamed: 0,Clip,Activity,Assessment,Game,Clip_time_mean,Activity_time_mean,Assessment_time_mean,Game_time_mean,2050,4100,2060,4110,2070,2075,2080,2081,2083,3110,3120,3121,4220,4230,5000,4235,5010,4010,4020,4021,4022,4025,4030,4031,3010,4035,4040,3020,3021,4045,2000,4050,2010,2020,4070,2025,2030,4080,2035,2040,4090,4095,acc_Cauldron Filler (Assessment),acc_Chest Sorter (Assessment),acc_Cart Balancer (Assessment),acc_Bird Measurer (Assessment),acc_Mushroom Sorter (Assessment),accumulated_accuracy,session_title,accumulated_correct_attempts,accumulated_incorrect_attempts,installation_id,accuracy_group
0,11,3,0,4,0.0,62342.039773,0.000000,48983.785211,6,0,0,0,0,0,4,1,2,77,7,9,0,0,0,0,0,4,92,14,31,19,121,0,79,1,0,7,9,0,18,0,0,20,94,4,18,0,0,6,4,0,-1.000000,-1.0,-1.0,-1.0,-1.0,0.000000,6,0,0,0006a69f,3
1,14,4,1,6,0.0,59979.374101,17534.645833,70380.123810,6,5,1,2,1,0,4,1,2,223,11,16,0,0,0,0,0,6,127,14,31,37,149,0,226,6,2,11,16,0,25,0,1,26,156,5,22,0,1,6,4,0,-1.000000,-1.0,-1.0,-1.0,1.0,1.000000,30,1,0,0006a69f,0
2,14,4,2,6,0.0,59979.374101,37020.718519,70380.123810,6,5,1,13,1,0,4,1,2,225,22,16,0,0,0,0,0,6,127,14,31,59,171,0,228,6,2,22,16,0,26,0,1,27,160,5,22,0,1,6,4,0,-1.000000,-1.0,-1.0,0.0,1.0,0.500000,6,1,11,0006a69f,3
3,24,9,4,10,0.0,124341.192203,28933.117647,63704.850888,9,6,2,13,2,0,8,2,5,336,25,40,9,0,5,0,5,10,243,29,45,93,314,6,341,14,9,25,40,2,47,0,2,52,348,9,43,0,5,10,4,1,-1.000000,-1.0,-1.0,0.0,0.0,0.500000,6,2,11,0006a69f,2
4,28,10,5,13,0.0,116572.430213,25943.733624,74238.844284,9,12,3,13,2,1,8,2,5,457,30,53,9,0,5,0,5,12,277,29,45,105,331,6,463,15,10,30,53,2,56,0,3,64,387,10,53,0,6,10,4,1,-1.000000,-1.0,-1.0,0.0,0.5,0.500000,30,3,12,0006a69f,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17685,26,7,5,11,0.0,43527.747390,17893.305419,106262.551037,3,17,4,1,1,3,6,2,4,262,16,64,5,2,3,2,2,12,192,13,11,34,208,11,265,20,19,16,66,2,49,0,4,62,512,5,55,0,3,4,0,2,1.000000,0.5,1.0,1.0,1.0,0.866667,8,5,2,ffc90c32,3
17686,3,2,0,3,0.0,204121.834225,0.000000,98737.580247,0,0,2,0,2,0,1,0,1,102,7,23,0,7,0,7,0,3,95,15,0,0,117,12,102,22,7,7,24,2,8,0,0,34,250,2,31,0,1,0,3,1,1.000000,1.0,1.0,1.0,1.0,0.000000,15,0,0,ffd2871d,3
17687,11,3,0,4,0.0,116495.367213,0.000000,390519.784091,0,0,0,0,0,0,0,0,0,100,98,20,0,0,0,0,0,6,83,25,0,0,162,0,101,54,0,98,20,0,18,0,0,16,112,0,15,0,0,0,0,4,0.000000,1.0,1.0,1.0,1.0,0.000000,15,0,0,ffeb0b1b,1
17688,23,3,1,6,0.0,116495.367213,50403.022472,302560.584485,0,3,0,0,0,0,0,0,0,125,151,33,0,0,0,0,0,8,127,25,0,2,242,0,127,83,6,152,33,0,33,0,1,28,148,0,25,0,0,0,0,6,0.333333,1.0,1.0,1.0,1.0,0.333333,6,1,2,ffeb0b1b,0


In [17]:
comp_test_df = feature_generation(test_samples, True)

HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))




In [18]:
comp_test_df

Unnamed: 0,Clip,Activity,Assessment,Game,Clip_time_mean,Activity_time_mean,Assessment_time_mean,Game_time_mean,2050,4100,2060,4110,2070,2075,2080,2081,2083,3110,3120,3121,4220,4230,5000,4235,5010,4010,4020,4021,4022,4025,4030,4031,3010,4035,4040,3020,3021,4045,2000,4050,2010,2020,4070,2025,2030,4080,2035,2040,4090,4095,acc_Cauldron Filler (Assessment),acc_Chest Sorter (Assessment),acc_Cart Balancer (Assessment),acc_Bird Measurer (Assessment),acc_Mushroom Sorter (Assessment),accumulated_accuracy,session_title,accumulated_correct_attempts,accumulated_incorrect_attempts,installation_id,accuracy_group
0,14,7,1,3,0.0,38077.154185,14546.846154,100566.394102,0,1,1,0,1,0,0,0,0,108,33,9,0,0,0,0,0,3,120,12,0,10,148,0,111,33,0,33,9,0,25,0,1,11,190,1,7,0,0,0,0,0,-1.0,-1.0,1.0,-1.000000,-1.000000,1.000000,15,1,0,00abaee7,0
1,29,11,5,12,0.0,87635.177729,27486.760331,73827.250689,6,21,4,3,1,3,10,2,7,416,25,72,9,0,4,0,4,12,325,26,36,94,407,11,418,49,25,25,72,6,57,0,4,85,385,2,80,0,2,6,2,0,0.5,0.0,1.0,0.333333,0.500000,0.466667,22,4,7,01242218,0
2,6,2,0,0,0.0,34633.461538,0.000000,0.000000,0,0,0,0,0,0,0,0,0,24,0,0,0,0,0,0,0,0,15,0,20,4,40,0,24,0,0,0,0,0,8,0,0,0,14,0,0,0,0,0,0,0,0.5,0.0,0.0,0.333333,0.500000,0.000000,6,0,0,017c5718,0
3,10,2,0,1,0.0,46314.958621,0.000000,37390.384615,0,0,0,0,0,0,0,0,0,21,2,3,0,0,0,0,0,1,31,0,29,9,61,0,21,0,0,2,3,0,13,0,0,3,28,2,3,0,0,0,1,0,0.5,0.0,0.0,0.333333,0.000000,0.000000,6,0,0,01a44906,0
4,17,1,0,6,0.0,110746.128319,0.000000,163203.871469,0,0,0,0,0,0,1,0,1,73,27,27,0,0,0,0,0,6,146,0,3,0,231,0,75,62,16,27,27,4,24,0,0,28,134,9,25,0,0,0,0,5,0.5,0.0,0.0,0.333333,0.000000,0.000000,22,0,0,01bc6cb6,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,4,0,3,1,0.0,0.000000,53199.175439,63421.108696,0,1,0,1,0,0,0,0,0,19,1,8,0,0,0,0,0,1,22,0,0,19,39,0,20,0,1,1,8,0,8,0,2,8,41,2,6,0,1,0,0,1,0.5,0.0,0.0,1.000000,1.000000,0.666667,15,2,0,fee254cf,0
996,11,2,1,2,0.0,33302.031496,17682.642857,65010.669118,0,1,2,0,0,1,0,0,0,21,8,8,0,0,0,0,0,2,65,0,1,0,54,0,23,11,0,8,8,0,16,0,1,7,56,0,7,0,0,0,0,2,0.0,0.0,1.0,1.000000,1.000000,1.000000,8,1,0,ff57e602,0
997,32,2,4,0,0.0,81037.907563,51847.305882,0.000000,0,9,0,0,0,0,0,0,0,48,6,10,0,0,0,0,0,0,53,4,0,23,97,0,48,15,6,6,10,0,38,0,3,6,136,1,5,0,1,0,0,0,1.0,0.0,1.0,1.000000,0.333333,0.583333,22,3,6,ffc73fb2,0
998,11,3,3,1,0.0,35926.439024,17343.311927,14726.866667,0,6,1,0,0,1,0,0,0,42,4,4,0,0,0,0,0,1,12,0,0,6,37,0,44,13,6,4,4,0,18,0,0,6,45,0,2,0,0,0,2,0,0.0,0.0,0.0,1.000000,0.333333,0.250000,22,2,4,ffe00ca8,0


## Training

In [19]:
# quadratic weighted kappa
def qwk3(a1, a2, max_rat=3):
    '''
    a1 - ground truth
    a2 - predicted values
    '''
    assert(len(a1) == len(a2))
    a1 = np.asarray(a1, dtype=int)
    a2 = np.asarray(a2, dtype=int)
    hist1 = np.zeros((max_rat + 1, ))
    hist2 = np.zeros((max_rat + 1, ))
    o = 0
    for k in range(a1.shape[0]):
        i, j = a1[k], a2[k]
        hist1[i] += 1
        hist2[j] += 1
        o +=  (i - j) * (i - j)
    e = 0
    for i in range(max_rat + 1):
        for j in range(max_rat + 1):
            e += hist1[i] * hist2[j] * (i - j) * (i - j)
    e = e / a1.shape[0]
    return 1 - o / e

In [20]:
features = [i for i in comp_train_df.columns if i not in ['accuracy_group', 'installation_id', 'game_session']]
target = 'accuracy_group'
num_splits = 10
params = {
    'learning_rate': 0.007,
    'metric': 'multiclass',
    'objective': 'multiclass',
    'num_classes': 4,
    'feature_fraction': 0.75,
    "bagging_fraction": 0.8,
    "bagging_seed": 42
}

early_stopping_rounds = 100
num_boost_round = 2000

def train_model(comp_train_df):
    
    kf = KFold(n_splits=num_splits, shuffle=True)
    
    oof_pred = np.zeros((len(comp_train_df), 4))
    models = []
    
    for fold, (tr_ind, val_ind) in enumerate(kf.split(comp_train_df)):
        print(f'Fold: {fold+1}')
        x_train, x_val = comp_train_df[features].iloc[tr_ind], comp_train_df[features].iloc[val_ind]
        y_train, y_val = comp_train_df[target][tr_ind], comp_train_df[target][val_ind]
        train_set = lgb.Dataset(x_train, y_train)
        val_set = lgb.Dataset(x_val, y_val)

        model = lgb.train(params, train_set, num_boost_round = num_boost_round, early_stopping_rounds = early_stopping_rounds, 
                          valid_sets=[train_set, val_set], verbose_eval = early_stopping_rounds)
        oof_pred[val_ind] = model.predict(x_val)
        models.append(model)
        
        val_crt_fold = qwk3(y_val, oof_pred[val_ind].argmax(axis = 1))
        print(f'Fold: {fold+1} quadratic weighted kappa score: {np.round(val_crt_fold,4)}')
        
    res = qwk3(comp_train_df['accuracy_group'], oof_pred.argmax(axis = 1))
    print(f'Quadratic weighted score: {np.round(res,4)}')
        
    return models

In [21]:
%%time
models = train_model(comp_train_df)

Fold: 1
Training until validation scores don't improve for 100 rounds
[100]	training's multi_logloss: 1.08378	valid_1's multi_logloss: 1.11452
[200]	training's multi_logloss: 1.0138	valid_1's multi_logloss: 1.06959
[300]	training's multi_logloss: 0.970696	valid_1's multi_logloss: 1.04866
[400]	training's multi_logloss: 0.938456	valid_1's multi_logloss: 1.03744
[500]	training's multi_logloss: 0.912522	valid_1's multi_logloss: 1.03097
[600]	training's multi_logloss: 0.889725	valid_1's multi_logloss: 1.02677
[700]	training's multi_logloss: 0.869692	valid_1's multi_logloss: 1.02408
[800]	training's multi_logloss: 0.85111	valid_1's multi_logloss: 1.02235
[900]	training's multi_logloss: 0.833976	valid_1's multi_logloss: 1.02085
[1000]	training's multi_logloss: 0.817739	valid_1's multi_logloss: 1.0196
[1100]	training's multi_logloss: 0.802195	valid_1's multi_logloss: 1.01908
[1200]	training's multi_logloss: 0.787201	valid_1's multi_logloss: 1.01907
Early stopping, best iteration is:
[1178]	tr

[500]	training's multi_logloss: 0.915317	valid_1's multi_logloss: 1.01107
[600]	training's multi_logloss: 0.892862	valid_1's multi_logloss: 1.00476
[700]	training's multi_logloss: 0.872655	valid_1's multi_logloss: 1.00066
[800]	training's multi_logloss: 0.854007	valid_1's multi_logloss: 0.997813
[900]	training's multi_logloss: 0.836594	valid_1's multi_logloss: 0.995857
[1000]	training's multi_logloss: 0.819819	valid_1's multi_logloss: 0.994651
[1100]	training's multi_logloss: 0.80428	valid_1's multi_logloss: 0.993713
[1200]	training's multi_logloss: 0.789399	valid_1's multi_logloss: 0.992727
[1300]	training's multi_logloss: 0.774991	valid_1's multi_logloss: 0.991955
[1400]	training's multi_logloss: 0.761443	valid_1's multi_logloss: 0.991475
[1500]	training's multi_logloss: 0.748513	valid_1's multi_logloss: 0.991274
[1600]	training's multi_logloss: 0.735677	valid_1's multi_logloss: 0.990829
[1700]	training's multi_logloss: 0.723459	valid_1's multi_logloss: 0.990715
[1800]	training's mul

## Inference

In [22]:
def run_predictions(models, df):
    y_pred = np.zeros((len(df), 4))
    for i, model in enumerate(models):
        y_pred += model.predict(df[features])
        print(f'Ran {i}th model ')
    return y_pred / num_splits

In [23]:
y_pred = run_predictions(models[:], comp_test_df)

Ran 0th model 
Ran 1th model 
Ran 2th model 
Ran 3th model 
Ran 4th model 
Ran 5th model 
Ran 6th model 
Ran 7th model 
Ran 8th model 
Ran 9th model 


In [24]:
np.unique(y_pred.argmax(-1), return_counts=True)

(array([0, 1, 3]), array([214,  42, 744]))

In [25]:
comp_test_df['accuracy_group'] = y_pred.argmax(-1)

## Submission

In [26]:
def prepare_submission(submission_df):
    sample_submission_df = pd.read_csv(path/'sample_submission.csv')
    sample_submission_df.drop('accuracy_group', inplace = True, axis = 1)
    sample_submission_df = sample_submission_df.merge(submission_df, on = 'installation_id')
    sample_submission_df = sample_submission_df[['installation_id', 'accuracy_group']]
    sample_submission_df.to_csv('submission.csv', index = False)

In [27]:
prepare_submission(comp_test_df)

In [28]:
!head submission.csv

installation_id,accuracy_group
00abaee7,3
01242218,3
017c5718,3
01a44906,3
01bc6cb6,3
02256298,3
0267757a,3
027e7ce5,3
02a29f99,0


In [29]:
!cat submission.csv | wc -l

1001
