# Overview
- nb008の改良　
- nb009で見つけたハイパラを使用する

# Const

In [17]:
NB = '010'
N_SPLITS = 5
SHOW_LOG = True
# VERBOSE = 10
VERBOSE = None
EARLY_STOPPING_ROUNDS = 100


PATH_TRAIN = './../data/official/train.csv'
PATH_TEST = './../data/official/test.csv'
PATH_SAMPLE_SUBMITTION = './../data/official/atmaCup8_sample-submission.csv'
SAVE_DIR = f'../data/output_nb/nb{NB}/'

feat_train_only = ['JP_Sales', 'Global_Sales', 'NA_Sales', 'Other_Sales', 'EU_Sales']
feat_common = ['Name', 'Platform', 'Year_of_Release', 'Genre', 'Publisher',
           'Critic_Score', 'Critic_Count', 'User_Score', 'User_Count', 'Developer',
           'Rating']
feat_string = ['Platform', 'Genre', 'Publisher', 'Developer', 'Rating']

# Import everything I need :)

In [3]:
import os
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from lightgbm import LGBMRegressor 

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import train_test_split, KFold

import warnings
warnings.filterwarnings('ignore')

# My function

In [4]:
def metric(y_true, y_pred):
    return mean_squared_log_error(y_true, y_pred) ** .5

def preprocess_User_Score(df):
    '''
    - tbdをnanにする
    - stringをfloatにする
    '''
    mask = df.User_Score.values == 'tbd'
    df.User_Score[mask] = np.nan
    df.User_Score = df.User_Score.values.astype(float)
    return df

def string_encode(df, cols):
    '''
    - np.nanがあれば、'nan'に置き換える
    - label encodingする
    '''
    df[cols] = df[cols].replace(np.nan, 'nan')
    for col in cols:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
    return df

def df_preprocessing(df, string_cols):
    df = preprocess_User_Score(df)
    df = string_encode(df, string_cols)
    return df

In [5]:
def run_fold_lgbm(_X_trn, _y_trn, _X_val, _y_val, _X_te, model_config, show_log=True):
    FEAT_STRING = ['Platform', 'Genre', 'Publisher', 'Developer', 'Rating']
    
    # train
    model = LGBMRegressor(objective='regression', **model_config)
    model.fit(_X_trn, _y_trn.values[:, 0],
              categorical_feature=FEAT_STRING,
              eval_set=[(_X_trn, _y_trn), (_X_val, _y_val)],
              verbose=VERBOSE,
              early_stopping_rounds=EARLY_STOPPING_ROUNDS,
             )
    
    # predict
    y_trn_pred = model.predict(_X_trn)
    y_val_pred = model.predict(_X_val)
    _y_test_pred = model.predict(_X_te)
    
    # postprocessiing
    y_trn_pred[y_trn_pred <= 1] = 1
    y_val_pred[y_val_pred <= 1] = 1
    
    if show_log:
        print(show_log)
        print(f'score train: {metric(_y_trn, y_trn_pred):.5f}')
        print(f'score valid: {metric(_y_val, y_val_pred):.5f}')
        print('')
    
    return y_trn_pred, y_val_pred, _y_test_pred

def run(X, y, X_te, splitter, use_col, model_config, show_log=True):
    print(f'use_col: {use_col}\n') if show_log else None

    oof = np.zeros(len(X))
    y_test_pred = np.zeros(len(X_te))
    for fold_i, (idx_trn, idx_val) in enumerate(splitter.split(X)):
        if show_log:
            print(f'::Fold {fold_i+1}/{N_SPLITS} start at {time.ctime()}::')
        X_trn, X_val = X.iloc[idx_trn, :], X.iloc[idx_val, :]
        y_trn, y_val = y.iloc[idx_trn], y.iloc[idx_val]
        X_trn = pd.DataFrame(X_trn, columns=X.columns)
        X_val = pd.DataFrame(X_val, columns=X.columns)

        # train
        y_trn_pred, y_val_pred, _y_test_pred = run_fold_lgbm(X_trn, y_trn, X_val, y_val, 
                                                             X_te, model_config, show_log=show_log)

        # result
        oof[idx_val] = y_val_pred
        y_test_pred += _y_test_pred / N_SPLITS


    y_test_pred[y_test_pred <= 1] = 1
    return oof, y_test_pred

# Preparation

set

In [6]:
if not os.path.exists(SAVE_DIR):
    os.makedirs(SAVE_DIR)

<br>

load dataset

In [7]:
train = pd.read_csv(PATH_TRAIN)
test = pd.read_csv(PATH_TEST)
ss = pd.read_csv(PATH_SAMPLE_SUBMITTION)

<br>

preprocess

In [8]:
train = df_preprocessing(train, feat_string)
test = df_preprocessing(test, feat_string)

In [9]:
mask = test.columns != 'Name'
use_col = test.columns[mask].tolist()
use_col

['Platform',
 'Year_of_Release',
 'Genre',
 'Publisher',
 'Critic_Score',
 'Critic_Count',
 'User_Score',
 'User_Count',
 'Developer',
 'Rating']

In [10]:
X = train[use_col].copy()
y = train[['Global_Sales']].copy()
X_te = test[use_col].copy()

# Create Model

In [22]:
splitter = KFold(n_splits=N_SPLITS, shuffle=True, random_state=2020)

In [28]:
model_config = {
    'n_estimators': 800,
    'max_depth': 55,
    'subsample': 0.5,
    'colsample_bytree': 0.9, 
#     'learning_rate': 0.006437110612661787,
    'learning_rate': 0.01,
    'reg_alpha': 5.0,
    'reg_lambda': 5.0,
    'min_child_samples': 57
    }

In [29]:
%%time
oof, y_test_pred = run(X, y, X_te, splitter, use_col, model_config, show_log=SHOW_LOG)
print(f'oof score: {metric(y, oof):.5f}')

use_col: ['Platform', 'Year_of_Release', 'Genre', 'Publisher', 'Critic_Score', 'Critic_Count', 'User_Score', 'User_Count', 'Developer', 'Rating']

::Fold 1/5 start at Sat Dec  5 10:40:27 2020::
True
score train: 1.05393
score valid: 1.11073

::Fold 2/5 start at Sat Dec  5 10:40:28 2020::
True
score train: 1.03543
score valid: 1.08560

::Fold 3/5 start at Sat Dec  5 10:40:29 2020::
True
score train: 0.99053
score valid: 1.12504

::Fold 4/5 start at Sat Dec  5 10:40:30 2020::
True
score train: 1.01715
score valid: 1.10029

::Fold 5/5 start at Sat Dec  5 10:40:31 2020::
True
score train: 1.01704
score valid: 1.13990

oof score: 1.11247
CPU times: user 53.4 s, sys: 272 ms, total: 53.6 s
Wall time: 4.49 s


# create sub

In [30]:
ss['Global_Sales'] = y_test_pred
save_path = f'{SAVE_DIR}submission.csv'
ss.to_csv(save_path, index=False)

print(f'save: {save_path}')

save: ../data/output_nb/nb010/submission.csv
