# Overview
- nb006の改良
- train_test_splitからkfoldにする

# Const

In [7]:
# Notebook id (embedded in the output directory name) and number of CV folds.
NB = '007'
N_SPLITS = 5

# Input files and the directory this notebook writes its outputs to.
PATH_TRAIN = './../data/official/train.csv'
PATH_TEST = './../data/official/test.csv'
PATH_SAMPLE_SUBMITTION = './../data/official/atmaCup8_sample-submission.csv'
SAVE_DIR = f'../data/output_nb/nb{NB}/'

# Column groups referenced by the preprocessing / modelling cells.
feat_train_only = [
    'JP_Sales',
    'Global_Sales',
    'NA_Sales',
    'Other_Sales',
    'EU_Sales',
]
feat_common = [
    'Name', 'Platform', 'Year_of_Release', 'Genre', 'Publisher',
    'Critic_Score', 'Critic_Count', 'User_Score', 'User_Count',
    'Developer', 'Rating',
]
# Columns that get label-encoded and passed to the model as categorical.
feat_string = ['Platform', 'Genre', 'Publisher', 'Developer', 'Rating']

# Import everything I need :)

In [39]:
import os
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from lightgbm import LGBMRegressor 

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import train_test_split, KFold

import warnings
warnings.filterwarnings('ignore')

# My function

In [13]:
def metric(y_true, y_pred):
    '''
    Root mean squared logarithmic error (RMSLE) between targets and predictions.
    '''
    msle = mean_squared_log_error(y_true, y_pred)
    return msle ** .5

def preprocess_User_Score(df):
    '''
    Convert the ``User_Score`` column of ``df`` to float, in place.

    - the placeholder string 'tbd' becomes NaN
    - every other value is cast with ``astype(float)`` (so an unexpected
      non-numeric string still raises ``ValueError``)

    Returns the same ``df`` object that was passed in.
    '''
    scores = df['User_Score']
    # Build the cleaned column first and assign it back in one step.
    # The previous `df.User_Score[mask] = np.nan` was chained assignment,
    # which does not write through to `df` under pandas copy-on-write.
    cleaned = scores.mask(scores == 'tbd', np.nan)
    df['User_Score'] = cleaned.astype(float)
    return df

def string_encode(df, cols):
    '''
    Label-encode the columns ``cols`` of ``df`` in place and return ``df``.

    - np.nan is first replaced by the string 'nan'
    - each column is then encoded with its own newly fitted LabelEncoder,
      so the integer codes are only meaningful within this one call
    '''
    filled = df[cols].replace(np.nan, 'nan')
    df[cols] = filled
    for name in cols:
        df[name] = LabelEncoder().fit_transform(df[name])
    return df

def df_preprocessing(df, string_cols):
    '''
    Run the full preprocessing on ``df``: clean ``User_Score`` first, then
    label-encode ``string_cols``. Returns the processed frame.
    '''
    return string_encode(preprocess_User_Score(df), string_cols)

# Preparation

Create the output directory

In [10]:
# Create the output directory if needed; exist_ok avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(SAVE_DIR, exist_ok=True)

<br>

load dataset

In [18]:
# Read the train table, the test table and the sample-submission template.
train, test, ss = (
    pd.read_csv(csv_path)
    for csv_path in (PATH_TRAIN, PATH_TEST, PATH_SAMPLE_SUBMITTION)
)

<br>

preprocess

In [19]:
# Clean User_Score on each frame, then label-encode the string columns on the
# stacked train+test rows. Encoding each frame separately fits a different
# LabelEncoder per frame, so the same category could receive different
# integer codes in train and in test.
train = preprocess_User_Score(train)
test = preprocess_User_Score(test)

n_train = len(train)
stacked = pd.concat([train[feat_string], test[feat_string]], ignore_index=True)
stacked = string_encode(stacked, feat_string)
train[feat_string] = stacked.iloc[:n_train].to_numpy()
test[feat_string] = stacked.iloc[n_train:].to_numpy()

In [20]:
# Feature list: every column of the test frame other than 'Name'.
mask = ~test.columns.isin(['Name'])
use_col = list(test.columns[mask])
use_col

['Platform',
 'Year_of_Release',
 'Genre',
 'Publisher',
 'Critic_Score',
 'Critic_Count',
 'User_Score',
 'User_Count',
 'Developer',
 'Rating']

In [25]:
# Independent copies of the feature matrices (train / test) and of the
# target, which is kept as a one-column DataFrame.
X, X_te = train[use_col].copy(), test[use_col].copy()
y = train[['Global_Sales']].copy()

# Create Model

In [23]:
# Shuffled K-fold splitter with a fixed seed so the folds are reproducible.
splitter = KFold(
    n_splits=N_SPLITS,
    shuffle=True,
    random_state=2020,
)

In [43]:
%%time
print(f'use_col: {use_col}\n')

oof = np.zeros(len(X))
y_test_pred = np.zeros(len(X_te))
for fold_i, (idx_trn, idx_val) in enumerate(splitter.split(X)):
    print(f'::Fold {fold_i+1}/{N_SPLITS} start at {time.ctime()}::')
    X_trn, X_val = X.iloc[idx_trn, :], X.iloc[idx_val, :]
    y_trn, y_val = y.iloc[idx_trn], y.iloc[idx_val]
    X_trn = pd.DataFrame(X_trn, columns=X.columns)
    X_val = pd.DataFrame(X_val, columns=X.columns)
    
    # train
    model = LGBMRegressor(random_state=2020, colsample_bytree=0.7)
    model.fit(X_trn, y_trn.values[:, 0], categorical_feature=feat_string)
    
    # predict
    y_trn_pred = model.predict(X_trn)
    y_val_pred = model.predict(X_val)
    _y_test_pred = model.predict(X_te)
    
    # postprocessiing
    y_trn_pred[y_trn_pred <= 1] = 1
    y_val_pred[y_val_pred <= 1] = 1
    
    # result
    oof[idx_val] = y_val_pred
    y_test_pred += _y_test_pred / N_SPLITS
    
    print(f'score train: {metric(y_trn, y_trn_pred):.5f}')
    print(f'score valid: {metric(y_val, y_val_pred):.5f}')
    print('')

y_test_pred[y_test_pred <= 1] = 1
print(f'oof score: {metric(y, oof):.5f}')

use_col: ['Platform', 'Year_of_Release', 'Genre', 'Publisher', 'Critic_Score', 'Critic_Count', 'User_Score', 'User_Count', 'Developer', 'Rating']

::Fold 1/5 start at Fri Dec  4 21:35:29 2020::
score train: 1.04486
score valid: 1.18130

::Fold 2/5 start at Fri Dec  4 21:35:29 2020::
score train: 1.04830
score valid: 1.11969

::Fold 3/5 start at Fri Dec  4 21:35:29 2020::
score train: 1.01304
score valid: 1.14669

::Fold 4/5 start at Fri Dec  4 21:35:29 2020::
score train: 1.05580
score valid: 1.14030

::Fold 5/5 start at Fri Dec  4 21:35:29 2020::
score train: 1.03097
score valid: 1.13289

oof score: 1.14436
CPU times: user 5.63 s, sys: 39.1 ms, total: 5.67 s
Wall time: 483 ms


# Create submission

In [44]:
# Put the fold-averaged test predictions into the sample-submission frame
# and write it to this notebook's output directory.
save_path = f'{SAVE_DIR}submission.csv'
ss['Global_Sales'] = y_test_pred
ss.to_csv(save_path, index=False)

print(f'save: {save_path}')

save: ../data/output_nb/nb007/submission.csv
