# Overview
- nb004の改良
- nb005で見たstringの特徴量を入れる

# Const

In [1]:
# Notebook id; it is embedded in the output directory name below.
NB = '006'

# Input CSV files.
PATH_TRAIN = './../data/official/train.csv'
PATH_TEST = './../data/official/test.csv'
PATH_SAMPLE_SUBMITTION = './../data/official/atmaCup8_sample-submission.csv'

# Directory this notebook writes its submission file into.
SAVE_DIR = '../data/output_nb/nb{}/'.format(NB)

# Sales columns, named as train-only; `Global_Sales` is the prediction target.
feat_train_only = [
    'JP_Sales',
    'Global_Sales',
    'NA_Sales',
    'Other_Sales',
    'EU_Sales',
]

# Columns named as common to the train and test frames.
feat_common = [
    'Name',
    'Platform',
    'Year_of_Release',
    'Genre',
    'Publisher',
    'Critic_Score',
    'Critic_Count',
    'User_Score',
    'User_Count',
    'Developer',
    'Rating',
]

# Columns that are label-encoded and handed to the model as categorical features.
feat_string = [
    'Platform',
    'Genre',
    'Publisher',
    'Developer',
    'Rating',
]

# Import everything I need :)

In [2]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from lightgbm import LGBMRegressor 

from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import warnings
warnings.filterwarnings('ignore')

# My functions

In [3]:
def metric(y_true, y_pred):
    '''
    Root mean squared logarithmic error.

    Computed as the square root of sklearn's `mean_squared_log_error`
    for the given true values and predictions.
    '''
    msle = mean_squared_log_error(y_true, y_pred)
    return msle ** .5

def preprocess_User_Score(df):
    '''
    Convert the `User_Score` column of `df` to float, in place.

    - the placeholder string 'tbd' becomes NaN
    - the remaining (string) values are cast to float

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a `User_Score` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same `df` object, for call chaining.

    Raises
    ------
    ValueError
        If a value other than 'tbd' cannot be parsed as a float.
    '''
    # Comparing through the Series (not `.values`) keeps this safe when the
    # column is already numeric: the mask is then simply all False, so the
    # function can be re-run on an already-processed frame.
    is_tbd = df['User_Score'] == 'tbd'

    # A single `.loc` write replaces the chained `df.User_Score[mask] = ...`,
    # which may assign into a temporary copy and leave `df` untouched.
    df.loc[is_tbd, 'User_Score'] = np.nan

    df['User_Score'] = df['User_Score'].astype(float)
    return df

# Preparation

set up the output directory

In [4]:
# Create the output directory (including parents). `exist_ok=True` makes this
# a no-op when the directory is already there and avoids the gap between a
# separate existence check and the creation call.
os.makedirs(SAVE_DIR, exist_ok=True)

<br>

load dataset

In [5]:
# Read the train set, the test set and the sample submission, in that order.
train, test, ss = (
    pd.read_csv(csv_path)
    for csv_path in (PATH_TRAIN, PATH_TEST, PATH_SAMPLE_SUBMITTION)
)

<br>

preprocess

In [6]:
def string_encode(df, cols):
    '''
    Label-encode the columns `cols` of `df` in place and return `df`.

    Missing values are first replaced by the literal string 'nan', so they
    end up as a category of their own. A fresh LabelEncoder is fitted per
    column, so the integer codes depend only on the values present in `df`.
    '''
    filled = df[cols].replace(np.nan, 'nan')
    df[cols] = filled
    for name in cols:
        df[name] = LabelEncoder().fit_transform(df[name])
    return df

In [7]:
# Turn User_Score into a float column ('tbd' -> NaN) in both frames.
train = preprocess_User_Score(train)
test = preprocess_User_Score(test)

# Label-encode the categorical columns of train and test in ONE pass.
# `string_encode` fits a new encoder on whatever frame it receives, so calling
# it on train and test separately would give the same category different
# integer codes in the two frames, and the model would see mismatched
# categories at prediction time.
n_train = len(train)
stacked = pd.concat([train[feat_string], test[feat_string]], ignore_index=True)
stacked = string_encode(stacked, feat_string)

# Write the shared codes back column by column; `.to_numpy()` sidesteps index
# alignment (the stacked frame has a fresh 0..n-1 index).
for col in feat_string:
    train[col] = stacked[col].iloc[:n_train].to_numpy()
    test[col] = stacked[col].iloc[n_train:].to_numpy()

In [8]:
# Feature list: every column of the test frame except 'Name'.
mask = ~test.columns.isin(['Name'])
use_col = list(test.columns[mask])
use_col

['Platform',
 'Year_of_Release',
 'Genre',
 'Publisher',
 'Critic_Score',
 'Critic_Count',
 'User_Score',
 'User_Count',
 'Developer',
 'Rating']

In [9]:
# Independent copies, so later edits to these frames never touch train/test.
X = train.loc[:, use_col].copy()
# Double brackets keep the target as a one-column DataFrame.
y = train.loc[:, ['Global_Sales']].copy()
X_te = test.loc[:, use_col].copy()

# Create Model

In [10]:
# Alternative configuration kept for reference:
#   LGBMRegressor(random_state=2020, n_estimators=500, n_jobs=-1, num_leaves=100, colsample_bytree=0.7)
model = LGBMRegressor(
    random_state=2020,
    colsample_bytree=0.7,
)

In [11]:
%%time
print(f'use_col: {use_col}')

# Hold out 33% of the rows for validation, with a fixed seed.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.33, random_state=42
)
# Re-wrap both splits as DataFrames carrying X's column names.
X_train, X_valid = (
    pd.DataFrame(part, columns=X.columns) for part in (X_train, X_valid)
)

# `y_train` is a one-column frame; the model is fitted on its single column
# as a flat 1-D array.
model.fit(X_train, y_train.to_numpy()[:, 0], categorical_feature=feat_string)

y_train_pred, y_valid_pred = (
    model.predict(part) for part in (X_train, X_valid)
)

use_col: ['Platform', 'Year_of_Release', 'Genre', 'Publisher', 'Critic_Score', 'Critic_Count', 'User_Score', 'User_Count', 'Developer', 'Rating']
CPU times: user 1.14 s, sys: 37.1 ms, total: 1.18 s
Wall time: 108 ms


<br>

post processing

In [12]:
# Floor the predictions at 1: anything at or below 1 becomes exactly 1.
y_train_pred = np.maximum(y_train_pred, 1)
y_valid_pred = np.maximum(y_valid_pred, 1)

In [13]:
# Score of the floored predictions on the training split ...
score_train = metric(y_train, y_train_pred)
print(f'score train: {score_train:.5f}')
# ... and on the held-out validation split.
score_valid = metric(y_valid, y_valid_pred)
print(f'score valid: {score_valid:.5f}')

score train: 1.06132
score valid: 1.19139


# Create submission

In [14]:
# Predict on the test features and apply the same floor of 1 that was used
# for the train/valid predictions.
y_test_pred = np.maximum(model.predict(X_te), 1)

# Put the predictions into the sample-submission frame.
ss['Global_Sales'] = y_test_pred

In [15]:
# SAVE_DIR already ends with '/', so plain concatenation yields the file path.
save_path = SAVE_DIR + 'submission.csv'
ss.to_csv(save_path, index=False)

print('save: ' + save_path)

save: ../data/output_nb/nb006/submission.csv
