In [None]:
import numpy as np
import pandas as pd
import os
import gc
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
import lightgbm as lgb

## credits: https://www.kaggle.com/ragnar123/simple-exploratory-data-analysis-and-model

def read_data():
    """Load the five competition CSV files.

    Returns:
        A 5-tuple of DataFrames in this order: train events, test events,
        train labels, specs, sample submission.
    """
    print('Read data')
    data_dir = '../input/data-science-bowl-2019'
    # The order of the stems fixes the order of the returned tuple.
    stems = ('train', 'test', 'train_labels', 'specs', 'sample_submission')
    return tuple(pd.read_csv(f'{data_dir}/{stem}.csv') for stem in stems)


def get_time(df):
    """Parse ``df['timestamp']`` and add calendar columns derived from it.

    The frame is modified in place: ``timestamp`` is replaced by its
    datetime-parsed version and the columns ``date``, ``month``, ``hour``
    and ``dayofweek`` are added, in that order.

    Returns:
        The same DataFrame object that was passed in.
    """
    stamps = pd.to_datetime(df['timestamp'])
    df['timestamp'] = stamps
    # Each new column is the same-named attribute of the ``.dt`` accessor.
    for part in ('date', 'month', 'hour', 'dayofweek'):
        df[part] = getattr(stamps.dt, part)
    return df
    
def get_object_columns(df, columns):
    """Count events per installation for every value of one categorical column.

    Args:
        df: Event-level frame containing ``installation_id``, ``event_id``
            and the column named by ``columns``.
        columns: Name of the (single) categorical column to spread out.

    Returns:
        DataFrame indexed by ``installation_id`` with one column per distinct
        value of ``columns``; each cell is the number of non-null ``event_id``
        entries for that pair, and 0 where the pair never occurs.
    """
    group_keys = ['installation_id', columns]
    event_counts = df.groupby(group_keys)['event_id'].count().reset_index()
    # Each (installation, value) pair occurs once here, so the pivot only
    # reshapes the counts from long to wide.
    wide = event_counts.pivot_table(
        values='event_id', index='installation_id', columns=[columns]
    )
    # Rebuild the column index from a plain list of its labels.
    wide.columns = wide.columns.tolist()
    return wide.fillna(0)

def get_numeric_columns(df, column):
    """Summarise one numeric column per installation.

    Args:
        df: Event-level frame containing ``installation_id`` and ``column``.
        column: Name of the numeric column to aggregate.

    Returns:
        DataFrame indexed by ``installation_id`` with the columns
        ``<column>_mean``, ``<column>_sum`` and ``<column>_std``; missing
        results (e.g. the std of a single observation) are replaced by 0.
    """
    stats = ['mean', 'sum', 'std']
    summary = df.groupby('installation_id')[column].agg(stats)
    summary = summary.fillna(0)
    summary.columns = [f'{column}_{stat}' for stat in stats]
    return summary

def get_numeric_columns_2(df, agg_column, column):
    """Summarise a numeric column per installation and per category value.

    For every ``(installation_id, agg_column value)`` pair the mean, sum and
    standard deviation of ``column`` are computed, then spread out so that
    each installation is a single row.

    Args:
        df: Event-level frame containing ``installation_id``, ``agg_column``
            and ``column``.
        agg_column: Name of the categorical column whose values become part
            of the output column names.
        column: Name of the numeric column to aggregate.

    Returns:
        DataFrame indexed by ``installation_id`` with flat, single-level
        string columns named ``<column>_<stat>_<category>`` (stat is one of
        ``mean``, ``sum``, ``std``). Pairs that never occur, and undefined
        statistics such as the std of a single observation, are 0.
    """
    stats = ['mean', 'sum', 'std']
    per_pair = df.groupby(['installation_id', agg_column])[column].agg(stats)
    # Move the category level of the row index into the columns, giving
    # (stat, category) column pairs and one row per installation.
    wide = per_pair.unstack(level=-1).fillna(0)
    # Flatten to plain strings: assigning a list of tuples would be turned
    # back into a MultiIndex by pandas, which does not merge cleanly with
    # the single-level feature frames produced by the sibling helpers.
    wide.columns = [f'{column}_{stat}_{category}' for stat, category in wide.columns]
    return wide

def _merge_event_features(reduced, events, numerical_columns, categorical_columns):
    """Merge every per-installation feature table for ``events`` onto ``reduced``."""
    # Same order as before: plain numeric summaries, categorical counts,
    # then numeric summaries split by category.
    feature_tables = [get_numeric_columns(events, num) for num in numerical_columns]
    feature_tables += [get_object_columns(events, cat) for cat in categorical_columns]
    feature_tables += [
        get_numeric_columns_2(events, cat, num)
        for cat in categorical_columns
        for num in numerical_columns
    ]
    for table in feature_tables:
        # Default (inner) join on the installation_id index.
        reduced = reduced.merge(table, left_index=True, right_index=True)
    return reduced


def perform_features_engineering(train_df, test_df, train_labels_df):
    """Build per-installation features for train and test and attach the labels.

    Args:
        train_df: Training events; gains the time columns added by
            ``get_time`` (modified in place).
        test_df: Test events; modified in place in the same way.
        train_labels_df: Labels with at least ``installation_id``, ``title``
            and ``accuracy_group``. Not modified.

    Returns:
        ``(reduce_train, reduce_test)``:

        * ``reduce_train`` has one row per row of ``train_labels_df``:
          ``installation_id``, ``title`` (replaced by the most frequent
          ``accuracy_group`` of that title), ``accuracy_group`` and the
          installation-level features.
        * ``reduce_test`` has one row per test installation with the same
          features plus ``title``, encoded the same way from the title of the
          installation's last event row (NaN if that title has no label).
    """
    numerical_columns = ['game_time']
    categorical_columns = ['type', 'world']

    # One empty row per installation, indexed by installation_id.
    reduce_train = pd.DataFrame({'installation_id': train_df['installation_id'].unique()})
    reduce_train.set_index('installation_id', inplace=True)
    reduce_test = pd.DataFrame({'installation_id': test_df['installation_id'].unique()})
    reduce_test.set_index('installation_id', inplace=True)

    test_df = get_time(test_df)
    train_df = get_time(train_df)

    reduce_train = _merge_event_features(
        reduce_train, train_df, numerical_columns, categorical_columns
    )
    reduce_test = _merge_event_features(
        reduce_test, test_df, numerical_columns, categorical_columns
    )

    reduce_train.reset_index(inplace=True)
    reduce_test.reset_index(inplace=True)

    print('Our training set have {} rows and {} columns'.format(reduce_train.shape[0], reduce_train.shape[1]))

    # Encode each title as its most frequent accuracy_group in the labels.
    labels_map = dict(
        train_labels_df.groupby('title')['accuracy_group'].agg(lambda x: x.value_counts().index[0])
    )
    # Work on a copy so the caller's label frame is not written through a slice.
    labels = train_labels_df[['installation_id', 'title', 'accuracy_group']].copy()
    labels['title'] = labels['title'].map(labels_map)

    # Look the last title up by installation_id instead of relying on the
    # row order of reduce_test matching the sorted groupby output.
    last_title = test_df.groupby('installation_id')['title'].last()
    reduce_test['title'] = reduce_test['installation_id'].map(last_title).map(labels_map)

    # Join the features onto the labels (one row per labelled assessment).
    reduce_train = labels.merge(reduce_train, on='installation_id', how='left')
    print('We have {} training rows'.format(reduce_train.shape[0]))

    return reduce_train, reduce_test

def run_lgb(reduce_train, reduce_test, n_splits=5):
    """Train a LightGBM multiclass model with K-fold CV and predict the test set.

    Every column of ``reduce_train`` except ``accuracy_group`` and
    ``installation_id`` is used as a feature; ``reduce_test`` must contain
    the same feature columns.

    Args:
        reduce_train: Training frame including the ``accuracy_group`` target.
        reduce_test: Test frame with the feature columns.
        n_splits: Number of (unshuffled) K-fold splits. Defaults to 5.

    Returns:
        Array of shape ``(len(reduce_test), 4)``: the per-class predictions
        averaged over the ``n_splits`` fold models.
    """
    num_classes = 4
    target = 'accuracy_group'
    features = [col for col in reduce_train.columns if col not in (target, 'installation_id')]

    # Fold-independent, so built once rather than on every iteration.
    params = {
        'learning_rate': 0.01,
        'metric': 'multiclass',
        'objective': 'multiclass',
        'num_classes': num_classes,
        'feature_fraction': 0.75,
        'subsample': 0.75
    }

    x_all = reduce_train[features]
    y_all = reduce_train[target]
    x_test = reduce_test[features]

    kf = KFold(n_splits=n_splits)
    y_pred = np.zeros((len(reduce_test), num_classes))
    for fold, (tr_ind, val_ind) in enumerate(kf.split(reduce_train)):
        print('Fold {}'.format(fold + 1))
        # KFold yields positional indices, so select features AND target by
        # position; label-based lookup only works with a default RangeIndex.
        train_set = lgb.Dataset(x_all.iloc[tr_ind], y_all.iloc[tr_ind])
        val_set = lgb.Dataset(x_all.iloc[val_ind], y_all.iloc[val_ind])

        model = lgb.train(params, train_set, num_boost_round = 100000, early_stopping_rounds = 100,
                          valid_sets=[train_set, val_set], verbose_eval = 100)
        # Each fold model contributes an equal share of the final prediction.
        y_pred += model.predict(x_test) / n_splits
    return y_pred


# Pipeline: load the data, build features, train, and write the submission.
train_df, test_df, train_labels_df, specs_df, sample_submission_df = read_data()
reduce_train, reduce_test = perform_features_engineering(train_df, test_df, train_labels_df)
y_pred = run_lgb(reduce_train, reduce_test)

# Build the prediction frame explicitly instead of adding a column to a
# column-slice of reduce_test; the predicted class is the arg-max over the
# per-class scores.
reduce_test = pd.DataFrame({
    'installation_id': reduce_test['installation_id'].values,
    'accuracy_group': y_pred.argmax(axis = 1),
})

# Swap the placeholder accuracy_group of the sample submission for ours.
sample_submission_df = sample_submission_df.drop(columns = 'accuracy_group')
sample_submission_df = sample_submission_df.merge(reduce_test, on = 'installation_id')
sample_submission_df.to_csv('submission.csv', index = False)