# Overview
- nb016の改良
- trainだけでなくtestデータも考慮にいれて解析する

# Const

In [1]:
# Notebook id; it is embedded in the output directory name below.
NB = '021'

# Input files and output directory.
PATH_TRAIN = './../data/official/train.csv'
PATH_TEST = './../data/official/test.csv'
PATH_SAMPLE_SUBMITTION = './../data/official/atmaCup8_sample-submission.csv'
SAVE_DIR = '../data/output_nb/nb{}/'.format(NB)

# Column groups. Going by the name, `feat_train_only` columns are absent from the
# test file (TODO confirm against the data); `Global_Sales` is used as the target.
feat_train_only = [
    'JP_Sales',
    'Global_Sales',
    'NA_Sales',
    'Other_Sales',
    'EU_Sales',
]
feat_common = [
    'Name',
    'Platform',
    'Year_of_Release',
    'Genre',
    'Publisher',
    'Critic_Score',
    'Critic_Count',
    'User_Score',
    'User_Count',
    'Developer',
    'Rating',
]
# Columns that get label-encoded during preprocessing.
feat_string = ['Platform', 'Genre', 'Publisher', 'Rating']
feat_cat = ['Platform', 'Genre', 'Rating']
feat_num = [
    'Year_of_Release',
    'Critic_Score',
    'Critic_Count',
    'User_Score',
    'User_Count',
]
# Model input columns (same list as `feature.use_col` in the YAML config).
use_col = [
    'Platform', 'Year_of_Release', 'Genre',
    'Critic_Score', 'Critic_Count',
    'User_Score', 'User_Count',
    'Rating',
]

In [2]:
# Experiment configuration, kept as YAML text and parsed later with
# `yaml.safe_load(config_str)` into the `config` dict.
#   globals : seed and the `show_log` flag read by `run_lgbm`
#   feature : `use_col` repeats the module-level `use_col` list (the visible
#             cells build X from the module-level list, not from this entry)
#   split   : splitter name and its keyword arguments
#   model   : `model_params` are passed to `LGBMRegressor(...)`,
#             `train_params` are passed to `LGBMRegressor.fit(...)`
config_str = """
globals:
  seed: 2020
  show_log: True

feature:
    use_col: [
        'Platform',
        'Year_of_Release',
        'Genre',
        'Critic_Score',
        'Critic_Count',
        'User_Score',
        'User_Count',
        'Rating'
        ]

split:
  name: KFold
  params:
    n_splits: 5
    random_state: 2020
    shuffle: True

model:
    model_params:
        objective: 'regression'
        metric: 'rmse'
        n_estimators: 800
        max_depth: 131
        subsample: 0.6
        colsample_bytree: 0.9
        learning_rate: 0.022862256818781214
        reg_alpha: 1.0
        reg_lambda: 6.0
        min_child_samples: 10
    
    train_params:
        categorical_feature: ['Platform', 'Genre', 'Rating']
        verbose: -1
        early_stopping_rounds: 100
"""



# Import everything I need :)

In [3]:
import os
import time
import yaml
import types
import builtins
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from fastprogress import progress_bar

from lightgbm import LGBMRegressor 

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import train_test_split, KFold

import warnings
warnings.filterwarnings('ignore')

examples.directory is deprecated; in the future, examples will be found relative to the 'datapath' directory.
  "found relative to the 'datapath' directory.".format(key))


# My function

In [4]:
# noglobal

def imports():
    """Yield ``(name, value)`` for each module or callable in this module's globals.

    Plain data (DataFrames, lists, strings, numbers, ...) is skipped, so the
    resulting mapping exposes imported modules, functions and classes only.
    """
    # Iterate over a snapshot so a change to globals cannot break the iteration.
    for name, val in list(globals().items()):
        if isinstance(val, types.ModuleType) or callable(val):
            yield name, val


def noglobal(f):
    '''
    Decorator that rebuilds ``f`` so that it cannot read global *data*.

    The returned function runs the same code object as ``f`` but with a globals
    dict containing only what ``imports()`` yields at decoration time (modules
    and callables). Reading any other global inside the function raises
    ``NameError``; names defined after decoration are not visible either.

    Positional defaults, keyword-only defaults, the closure, the qualified
    name, module, docstring and custom attributes of ``f`` are carried over.

    ref: https://gist.github.com/raven38/4e4c3c7a179283c441f575d6e375510c
    '''
    isolated = types.FunctionType(f.__code__,
                                  dict(imports()),
                                  f.__name__,
                                  f.__defaults__,
                                  f.__closure__
                                  )
    # types.FunctionType() has no parameter for keyword-only defaults; without
    # this, `def g(a, *, b=1)` would require `b` on every call once decorated.
    isolated.__kwdefaults__ = dict(f.__kwdefaults__) if f.__kwdefaults__ else None
    # Metadata that is otherwise reset by constructing a fresh function object.
    isolated.__qualname__ = f.__qualname__
    isolated.__module__ = f.__module__
    isolated.__doc__ = f.__doc__
    isolated.__dict__.update(f.__dict__)
    return isolated

In [5]:
@noglobal
def metric(y_true, y_pred):
    """Return the root of sklearn's mean squared logarithmic error (RMSLE)."""
    msle = mean_squared_log_error(y_true, y_pred)
    return msle ** .5

@noglobal
def preprocess_User_Score(df):
    '''
    Make the ``User_Score`` column numeric.

    - entries equal to the string 'tbd' become NaN
    - the column is then cast to float

    The frame is modified in place and also returned.
    '''
    # Series.eq always yields a boolean Series, even if the column is already
    # numeric (a NumPy `==` against a str can collapse to a single scalar).
    tbd_mask = df['User_Score'].eq('tbd')
    # Single .loc assignment instead of chained `df.User_Score[mask] = ...`,
    # which may write to a temporary copy and leave `df` untouched.
    df.loc[tbd_mask, 'User_Score'] = np.nan
    df['User_Score'] = df['User_Score'].astype(float)
    return df

@noglobal
def string_encode(df_trn, df_te, cols):
    '''
    Label-encode the columns ``cols`` of both frames with a shared encoding.

    - NaN in these columns is replaced by the string 'nan'
    - one LabelEncoder per column is fitted on the train and test values
      together, then applied to each frame

    Both frames are modified in place and returned as ``(df_trn, df_te)``.
    '''
    df_trn[cols] = df_trn[cols].replace(np.nan, 'nan')
    df_te[cols] = df_te[cols].replace(np.nan, 'nan')

    for name in cols:
        # Fit on the union of values so both frames share the same codes.
        stacked = pd.concat([df_trn[name], df_te[name]], axis=0)
        encoder = LabelEncoder()
        encoder.fit(stacked)
        df_trn[name] = encoder.transform(df_trn[name])
        df_te[name] = encoder.transform(df_te[name])

    return df_trn, df_te

@noglobal
def df_preprocessing(df_trn, df_te, string_cols):
    """Run the preprocessing steps on the train and test frames.

    ``User_Score`` is made numeric in each frame, then the columns listed in
    ``string_cols`` are label-encoded jointly. Returns ``(df_trn, df_te)``.
    """
    df_trn, df_te = (preprocess_User_Score(frame) for frame in (df_trn, df_te))
    return string_encode(df_trn, df_te, string_cols)

In [6]:
@noglobal
def run_fold_lgbm(_X_trn, _y_trn, _X_val, _y_val, _X_te, model_config, show_log=True):
    """Train one LGBMRegressor fold and return predictions on the original scale.

    The targets are expected on the log1p scale (predictions are mapped back
    with ``expm1`` and the scores compare against ``expm1`` of the targets).

    Parameters
    ----------
    _X_trn, _X_val, _X_te : feature tables for train / validation / test.
    _y_trn, _y_val : two-dimensional single-column targets (``.values[:, 0]``
        is taken for fitting) -- presumably one-column DataFrames.
    model_config : dict with 'model_params' (LGBMRegressor constructor
        arguments) and 'train_params' (extra ``fit`` arguments).
    show_log : print the train / validation score when truthy.

    Returns
    -------
    (y_trn_pred, y_val_pred, _y_test_pred) on the original target scale.
    Train and validation predictions are floored at 1; the test prediction is
    returned unfloored (``run_lgbm`` floors the fold average).
    """
    model_params = model_config['model_params']
    train_params = model_config['train_params']

    # train
    model = LGBMRegressor(**model_params)
    model.fit(_X_trn, _y_trn.values[:, 0],
              eval_set=[(_X_trn, _y_trn), (_X_val, _y_val)],
              **train_params
             )

    # predict on the log1p scale, then invert the transform (exp(x) - 1)
    y_trn_pred = np.expm1(model.predict(_X_trn))
    y_val_pred = np.expm1(model.predict(_X_val))
    _y_test_pred = np.expm1(model.predict(_X_te))

    # postprocessing: floor at 1 on the ORIGINAL scale, the same rule run_lgbm
    # applies to test predictions. Flooring before expm1 would put the
    # effective minimum at e - 1 (~1.718) and make OOF and test inconsistent.
    # It also keeps the values non-negative, as the log-based metric requires.
    y_trn_pred[y_trn_pred <= 1] = 1
    y_val_pred[y_val_pred <= 1] = 1

    if show_log:
        print(f'score train: {metric(np.expm1(_y_trn), y_trn_pred):.5f}')
        print(f'score valid: {metric(np.expm1(_y_val), y_val_pred):.5f}')
        print('')

    return y_trn_pred, y_val_pred, _y_test_pred

@noglobal
def run_lgbm(X, y, X_te, splitter, config):
    """Cross-validated LightGBM training.

    For every ``(train, validation)`` index pair produced by
    ``splitter.split(X)`` one fold is trained with ``run_fold_lgbm``. The
    validation predictions are written into the out-of-fold vector and the
    test predictions are averaged with equal weight over ``splitter.n_splits``.

    Returns ``(oof, y_test_pred)``; averaged test predictions are floored at 1.
    """
    model_config = config['model']
    show_log = config['globals']['show_log']

    oof = np.zeros(len(X))
    y_test_pred = np.zeros(len(X_te))

    for fold_idx, (trn_rows, val_rows) in enumerate(splitter.split(X)):
        if show_log:
            print(f'::Fold {fold_idx+1}/{splitter.n_splits} start at {time.ctime()}::')

        # Row subsets for this fold, re-wrapped with the original column labels.
        X_trn = pd.DataFrame(X.iloc[trn_rows, :], columns=X.columns)
        X_val = pd.DataFrame(X.iloc[val_rows, :], columns=X.columns)
        y_trn = y.iloc[trn_rows]
        y_val = y.iloc[val_rows]

        # train
        _, fold_val_pred, fold_test_pred = run_fold_lgbm(
            X_trn, y_trn, X_val, y_val, X_te, model_config, show_log=show_log
        )

        # result
        oof[val_rows] = fold_val_pred
        y_test_pred += fold_test_pred / splitter.n_splits

    # Floor the averaged test prediction at 1 (in place).
    np.maximum(y_test_pred, 1, out=y_test_pred)
    return oof, y_test_pred

# Preparation

set

In [7]:
# Create the output directory if needed. `exist_ok=True` avoids the
# check-then-create race and fails loudly if the path exists but is not a directory.
os.makedirs(SAVE_DIR, exist_ok=True)

# Parse the YAML configuration text into a plain dict.
config = yaml.safe_load(config_str)

<br>

load dataset

In [8]:
# Read the train table, the test table and the sample submission, in that order.
train, test, ss = (
    pd.read_csv(csv_path)
    for csv_path in (PATH_TRAIN, PATH_TEST, PATH_SAMPLE_SUBMITTION)
)

<br>

preprocess

In [9]:
# Numeric User_Score + joint label encoding of the string columns.
train, test = df_preprocessing(df_trn=train, df_te=test, string_cols=feat_string)

In [10]:
# Model inputs for train / test, and the target as a one-column frame.
X, X_te = train[use_col].copy(), test[use_col].copy()
y = train[['Global_Sales']].copy()

In [11]:
# Solve RMSLE as RMSE: train on log1p(target), i.e. log(1 + Global_Sales).
# The value is derived from `train` rather than from `y` itself, so re-running
# this cell cannot apply the transform twice.
y = np.log1p(train[['Global_Sales']])

# EDA

<br>

textを小文字にして1行に

In [12]:
# Missing titles become the literal token 'nan' so every entry is a string.
train['Name'] = train['Name'].replace(np.nan, 'nan')
test['Name'] = test['Name'].replace(np.nan, 'nan')

# '(' , ')' and ':' are each replaced by a single space.
_punct_to_space = str.maketrans({'(': ' ', ')': ' ', ':': ' '})

# All titles of a dataset, lowercased and joined into one long line.
train_word_line = ' '.join(train.Name.str.lower().values).translate(_punct_to_space)
test_word_line = ' '.join(test.Name.str.lower().values).translate(_punct_to_space)

# Whitespace-separated tokens of each line.
train_word_list = train_word_line.split()
test_word_list = test_word_line.split()

In [13]:
# First 600 characters of each line, separated by an empty line.
print(train_word_line[:600], '', test_word_line[:600], sep='\n')

lego batman  the videogame lego indiana jones  the original adventures lego batman  the videogame combat lego harry potter  years 5-7 lego harry potter  years 5-7 yakuza 4 lego harry potter  years 5-7 the lord of the rings  war in the north the lord of the rings  war in the north the chronicles of narnia  the lion, the witch and the wardrobe lego harry potter  years 5-7 the golden compass circus atari maze craze  a game of cops 'n robbers robert ludlum's the bourne conspiracy lego harry potter  years 5-7 robert ludlum's the bourne conspiracy the golden compass tomb raider  2013  slot machine t

hitman 2  silent assassin legacy of kain  soul reaver metal gear solid 2  substance silent hill  homecoming silent hill  homecoming cubix robots for everyone  clash 'n' bash dragon ball z  budokai tenkaichi 2  jp sales  shrek the third nicktoons  battle for volcano island the legend of zelda  the minish cap weekly jp sales  ghostbusters ii dead space 3 twisted metal  small brawl teen titans jame

<br>

trainとtest共通のワードを取得

In [14]:
# Vocabulary of each dataset, then the words that occur in both.
train_vocab = set(train_word_list)
test_vocab = set(test_word_list)
common_word_unique = list(train_vocab & test_vocab)
common_word_unique[:20]

['wolfenstein',
 'popolocrois',
 'blade',
 'shellshock',
 'kuro',
 'can',
 'army',
 'village',
 'disc',
 'vampire',
 'continuum',
 'rift',
 'blazblue',
 'backyard',
 'travel',
 'craze',
 'deadly',
 'cocoon',
 'rebellion',
 'featuring']

<br>

testで数える

In [15]:
%%time
common_words_in_test = []
for word in progress_bar(common_word_unique):
    bools = []
    for val in test.Name.values:
        words = np.array(str(val).split(None))
        bool_ = any(words == word)
        bools.append(bool_)
        
        if bool_:
            common_words_in_test.append(word)

CPU times: user 1min 41s, sys: 251 ms, total: 1min 41s
Wall time: 1min 41s


In [16]:
%%time
common_words_in_train = []
for word in progress_bar(common_word_unique):
    bools = []
    for val in train.Name.values:
        words = np.array(str(val).split(None))
        bool_ = any(words == word)
        bools.append(bool_)
        
        if bool_:
            common_words_in_train.append(word)

CPU times: user 1min 42s, sys: 92 ms, total: 1min 42s
Wall time: 1min 42s


<br>

trainの中の数を見る

In [17]:
# Number of train titles per shared word, most frequent first; show the top 50.
value_counts = (
    pd.Series(common_words_in_train)
    .value_counts(ascending=False)
)
value_counts.iloc[:50]

of           821
the          506
2            428
no           325
&            208
3            206
-            134
and          133
to           121
4             89
vs.           86
in            81
5             44
ni            41
de            39
/             35
or            33
1             33
for           32
6             28
7             26
o             26
64            25
at            23
10            22
2009          22
on            22
2010          22
2001          22
*             20
2000          19
a             19
vs            18
2008          18
2004          17
2002          17
2006          16
2003          14
with          13
8             13
e             13
2nd           13
2011          12
ga            12
wa            12
is            11
featuring     11
2007          10
from           9
2016           9
dtype: int64

<br>

testの中の数を見る

In [18]:
# Number of test titles per shared word, most frequent first; show the top 50.
value_counts = (
    pd.Series(common_words_in_test)
    .value_counts(ascending=False)
)
value_counts.iloc[:50]

of      825
2       433
the     428
no      290
3       192
&       141
to      138
for     131
and     126
-       110
in       87
4        70
2002     58
2000     45
2005     43
2004     41
09       41
08       39
vs.      38
2010     37
07       37
5        36
/        36
10       36
2003     36
06       35
de       35
1        31
2011     31
14       29
2014     28
13       27
2012     27
15       27
a        27
2001     25
11       25
12       24
99       22
2016     22
64       22
2015     22
16       20
with     19
2008     18
2009     18
98       18
ni       17
or       17
on       15
dtype: int64

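---> 全然使えないじゃん。  
---> ボツ  
---> name_has は微妙だけど、2010とか2008をカテゴリカルにするとよさそう(name_age 的な)  
---> あと、1,2,3もいいな(name_ver)  
---> 注: 上に表示されている集計結果は、小文字化・記号除去済みの共通ワードを未加工の Name と比較して得られたものなので、大文字を含む単語(例: LEGO)は数えられていない。ボツと判断する前に、正規化を揃えて再集計すること。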