In [1]:
# Reload imported modules automatically so edits to local .py files are picked up live.
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input folder and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# Any results you write to the current directory are saved as output.
from time import time
from tqdm import tqdm_notebook as tqdm
from collections import Counter
from scipy import stats
import lightgbm as lgb
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import KFold, StratifiedKFold
from scipy.stats import kurtosis, skew
import gc
import json
# Notebook display limits. The earlier max_columns=1000 call was immediately
# overwritten by the 500 setting, so only the effective value is kept.
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

from pathlib import Path
import sys
import re

In [3]:
# Folder that read_data() loads the CSV files from.
path = Path('/kaggle/data_science_bowl')
path

PosixPath('/kaggle/data_science_bowl')

In [4]:
def read_data(data_dir=None):
    """Load the four CSV files used by this notebook as DataFrames.

    Parameters
    ----------
    data_dir : str or pathlib.Path, optional
        Folder containing ``train.csv``, ``test.csv``, ``train_labels.csv``
        and ``specs.csv``. When omitted, the notebook-level ``path`` is used,
        which matches the previous behaviour.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_df, test_df, train_labels_df, specs_df)``, in that order.
    """
    # Resolve the folder lazily so the global `path` is only needed for the default case.
    base_dir = path if data_dir is None else Path(data_dir)
    file_names = ('train.csv', 'test.csv', 'train_labels.csv', 'specs.csv')
    return tuple(pd.read_csv(base_dir / file_name) for file_name in file_names)

In [5]:
%%time
# Load the train, test, train-label and specs CSV files into DataFrames.
train_df, test_df, train_labels_df, specs_df = read_data()

KeyboardInterrupt: 

In [6]:
train_df.columns

NameError: name 'train_df' is not defined

## Feature Engineering

### Cleanup

In [None]:
def remove_wrong_event_codes(df):
    """Drop rows titled 'Bird Measurer (Assessment)' whose event_code is 4100.

    All other rows are returned unchanged; the index is not reset.
    """
    is_bird_measurer = df['title'] == 'Bird Measurer (Assessment)'
    is_code_4100 = df['event_code'] == 4100
    return df[~(is_bird_measurer & is_code_4100)]

# Apply the same event-code cleanup to both the train and the test frame.
train_df = remove_wrong_event_codes(train_df)
test_df = remove_wrong_event_codes(test_df)

In [None]:
def remove_ids_with_no_assessment(df):
    """Keep only rows whose `installation_id` has at least one "Assessment" row.

    The result comes from an inner merge on `installation_id`, so it carries
    a fresh 0..n-1 index.
    """
    # Distinct installation ids that appear on at least one Assessment row.
    assessment_rows = df.loc[df.type == "Assessment", ['installation_id']]
    assessed_ids = assessment_rows.drop_duplicates()
    return df.merge(assessed_ids, how="inner", on="installation_id")

In [None]:
# Only the train frame is filtered here; test_df is not passed through this step.
train_df = remove_ids_with_no_assessment(train_df)

In [None]:
# Convert the 'timestamp' column of both frames with pd.to_datetime.
for frame in (train_df, test_df):
    frame['timestamp'] = pd.to_datetime(frame['timestamp'])

In [None]:
from scripts import feature_engineering_simple1

# Column names handed to prepare() as its second argument.
# NOTE(review): presumably these are counted per distinct value — confirm in scripts/feature_engineering_simple1.
one_hot_counters=["title", "type", "event_code", "event_id"]
comp_train_df = feature_engineering_simple1.prepare(train_df, one_hot_counters)

In [None]:
comp_train_df['title']

In [None]:
# Build the test features with the same counter columns; test=True is forwarded to prepare()
# (its exact effect is defined in scripts/feature_engineering_simple1 — verify there).
comp_test_df = feature_engineering_simple1.prepare(test_df, one_hot_counters, test=True)

In [None]:
comp_test_df

## Drop near-constant columns and filter long-duration rows

In [None]:
# Start from the columns that are always removed, then add every column in which
# a single value accounts for at least 99% of the training rows.
del_cols = ["timestamp_Second", 'timestamp', 'title']
n_rows = comp_train_df.shape[0]
for col in comp_train_df.columns.values:
    # value_counts() is sorted by frequency, so the first entry is the most common value's count.
    counts = comp_train_df[col].value_counts().iloc[0]
    if counts / n_rows >= 0.99:
        del_cols.append(col)
# errors="ignore" skips any listed column that a frame does not have.
for frame in (comp_train_df, comp_test_df):
    frame.drop(columns=del_cols, inplace=True, errors="ignore")
display(f"Dropped {del_cols}")

In [None]:
# Keep only training rows whose summed "duration_*" columns stay below 10000,
# then renumber the index from 0.
duration_cols = comp_train_df.columns[comp_train_df.columns.str.startswith("duration_", na=False)].to_list()
total_duration = comp_train_df[duration_cols].apply(sum, axis=1)
comp_train_df = comp_train_df[total_duration < 10000].reset_index(drop=True)

In [None]:
comp_train_df

## Training

In [None]:
# Label and identifier columns are excluded; every other column is a model input.
non_feature_cols = ['accuracy_group', 'installation_id', 'game_session']
features = [col for col in comp_train_df.columns if col not in non_feature_cols]
target = 'accuracy_group'
num_splits = 10

# LightGBM settings for the 4-class multiclass objective.
# NOTE(review): LightGBM applies bagging_fraction only when bagging_freq > 0, and
# bagging_freq is not set here — confirm whether bagging is meant to be active.
params = {
    'learning_rate': 0.007,
    'metric': 'multiclass',
    'objective': 'multiclass',
    'num_classes': 4,
    'feature_fraction': 0.75,
    'bagging_fraction': 0.8,
    'bagging_seed': 42,
}

# train_model also reuses early_stopping_rounds as its logging interval (verbose_eval).
early_stopping_rounds = 100
num_boost_round = 4000

def train_model(comp_train_df, random_state=42):
    """Train one LightGBM multiclass model per K-fold split and print QWK scores.

    Reads the notebook-level ``features``, ``target``, ``num_splits``,
    ``params``, ``num_boost_round`` and ``early_stopping_rounds`` settings.

    Parameters
    ----------
    comp_train_df : pandas.DataFrame
        Training frame containing the ``features`` columns and the ``target`` column.
    random_state : int, default 42
        Seed for the shuffled K-fold split, so fold membership and the printed
        scores are repeatable across runs.

    Returns
    -------
    list
        The fitted models, one per fold, in fold order.
    """
    kf = KFold(n_splits=num_splits, shuffle=True, random_state=random_state)

    x_all = comp_train_df[features]
    y_all = comp_train_df[target]
    # One probability column per class; the width follows the model configuration.
    oof_pred = np.zeros((len(comp_train_df), params['num_classes']))
    models = []

    for fold, (tr_ind, val_ind) in enumerate(kf.split(comp_train_df)):
        print(f'Fold: {fold+1}')
        # KFold yields positional indices, so select with .iloc; label-based
        # lookup is only correct when the index happens to be 0..n-1.
        x_train, x_val = x_all.iloc[tr_ind], x_all.iloc[val_ind]
        y_train, y_val = y_all.iloc[tr_ind], y_all.iloc[val_ind]
        train_set = lgb.Dataset(x_train, y_train)
        val_set = lgb.Dataset(x_val, y_val)

        # NOTE(review): the early_stopping_rounds / verbose_eval keyword arguments were
        # removed from lgb.train in LightGBM 4.0 — switch to callbacks if upgrading.
        model = lgb.train(params, train_set, num_boost_round=num_boost_round,
                          early_stopping_rounds=early_stopping_rounds,
                          valid_sets=[train_set, val_set], verbose_eval=early_stopping_rounds)
        oof_pred[val_ind] = model.predict(x_val)
        models.append(model)

        # Quadratic weighted kappa via the already-imported scikit-learn metric
        # (no `qwk3` helper is defined in this notebook).
        val_crt_fold = cohen_kappa_score(y_val, oof_pred[val_ind].argmax(axis=1), weights='quadratic')
        print(f'Fold: {fold+1} quadratic weighted kappa score: {np.round(val_crt_fold,4)}')

    res = cohen_kappa_score(y_all, oof_pred.argmax(axis=1), weights='quadratic')
    print(f'Quadratic weighted score: {np.round(res,4)}')

    return models

In [None]:
%%time
# Run cross-validated training; `models` holds the list returned by train_model.
models = train_model(comp_train_df)

## Inference