# Overview
- Name特徴量から情報を抽出して特徴量としたい
- nb015の結果(cv: 1.00127)

# Const

In [1]:
# Notebook id; only used to build this notebook's output directory.
NB = '016'

# Input files and output location (relative paths).
PATH_TRAIN = './../data/official/train.csv'
PATH_TEST = './../data/official/test.csv'
PATH_SAMPLE_SUBMITTION = './../data/official/atmaCup8_sample-submission.csv'
SAVE_DIR = f'../data/output_nb/nb{NB}/'

# Column groups.
# presumably the sales columns exist only in train (they include the target) - confirm
feat_train_only = [
    'JP_Sales',
    'Global_Sales',
    'NA_Sales',
    'Other_Sales',
    'EU_Sales',
]
feat_common = [
    'Name',
    'Platform',
    'Year_of_Release',
    'Genre',
    'Publisher',
    'Critic_Score',
    'Critic_Count',
    'User_Score',
    'User_Count',
    'Developer',
    'Rating',
]
# String columns that get label-encoded during preprocessing.
feat_string = ['Platform', 'Genre', 'Publisher', 'Developer', 'Rating']
feat_cat = ['Platform', 'Genre', 'Rating']
feat_num = [
    'Year_of_Release',
    'Critic_Score',
    'Critic_Count',
    'User_Score',
    'User_Count',
]

# Base feature set fed to the model (X / X_te are built from these columns).
use_col = [
    'Platform',
    'Year_of_Release',
    'Genre',
    'Critic_Score',
    'Critic_Count',
    'User_Score',
    'User_Count',
    'Rating',
]

In [2]:
# YAML experiment configuration; parsed later with yaml.safe_load into `config`.
#   globals.show_log                        -> per-fold logging switch read by run_lgbm
#   split.params                            -> keyword arguments for sklearn KFold
#   model.model_params / model.train_params -> LGBMRegressor(**...) / model.fit(**...)
# NOTE(review): `globals.seed`, `feature.use_col` and `split.name` are not read by any code
#   visible in this notebook (the module-level `use_col` list is what builds X).
# NOTE(review): `max_depth: 131` is unusually large for a tree depth - possibly a tuned
#   `num_leaves` value that ended up under the wrong key; confirm.
# NOTE(review): `verbose` / `early_stopping_rounds` are forwarded to fit(); newer LightGBM
#   releases (4.x) appear to expect callbacks instead - confirm against the installed version.
config_str = """
globals:
  seed: 2020
  show_log: True

feature:
    use_col: [
        'Platform',
        'Year_of_Release',
        'Genre',
        'Critic_Score',
        'Critic_Count',
        'User_Score',
        'User_Count',
        'Rating'
        ]

split:
  name: KFold
  params:
    n_splits: 5
    random_state: 2020
    shuffle: True

model:
    model_params:
        objective: 'regression'
        metric: 'rmse'
        n_estimators: 800
        max_depth: 131
        subsample: 0.6
        colsample_bytree: 0.9
        learning_rate: 0.022862256818781214
        reg_alpha: 1.0
        reg_lambda: 6.0
        min_child_samples: 10
    
    train_params:
        categorical_feature: ['Platform', 'Genre', 'Rating']
        verbose: -1
        early_stopping_rounds: 100
"""



# Import everything I need :)

In [3]:
import os
import time
import yaml
import types
import builtins
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from fastprogress import progress_bar

from lightgbm import LGBMRegressor 

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import train_test_split, KFold

import warnings
warnings.filterwarnings('ignore')

examples.directory is deprecated; in the future, examples will be found relative to the 'datapath' directory.
  "found relative to the 'datapath' directory.".format(key))


# My function

In [4]:
# noglobal

def imports():
    '''
    Yield ``(name, value)`` pairs for the modules and callables found in
    this module's global namespace.

    A value that is both a module and a callable is yielded twice.
    '''
    namespace = globals()
    for key in namespace:
        obj = namespace[key]
        # imported modules
        if isinstance(obj, types.ModuleType):
            yield key, obj
        # functions, classes and any other callables
        if hasattr(obj, '__call__'):
            yield key, obj


def noglobal(f):
    '''
    Decorator: rebuild ``f`` so that its global namespace only contains the
    modules and callables reported by ``imports()`` at decoration time.
    Plain global variables (DataFrames, constants, ...) are therefore not
    reachable from inside the function and must be passed as arguments.

    ref: https://gist.github.com/raven38/4e4c3c7a179283c441f575d6e375510c
    '''
    isolated = types.FunctionType(f.__code__,
                                  dict(imports()),
                                  f.__name__,
                                  f.__defaults__,
                                  f.__closure__
                                  )
    # The call above only carries positional defaults; copy the keyword-only
    # ones too, otherwise `def g(x, *, k=1)` would lose its default for `k`.
    isolated.__kwdefaults__ = f.__kwdefaults__
    return isolated

In [5]:
@noglobal
def metric(y_true, y_pred):
    '''
    Root mean squared logarithmic error (RMSLE) of ``y_pred`` against ``y_true``.
    '''
    msle = mean_squared_log_error(y_true, y_pred)
    return msle ** .5

@noglobal
def preprocess_User_Score(df):
    '''
    Convert the ``User_Score`` column to float.

    - the placeholder string 'tbd' becomes NaN
    - the remaining string values are cast to float

    The column is replaced on ``df`` itself and ``df`` is returned.
    '''
    score = df['User_Score']
    # One whole-column assignment instead of the chained `df.User_Score[mask] = ...`,
    # which may write to a temporary copy (SettingWithCopyWarning / Copy-on-Write).
    df['User_Score'] = score.mask(score == 'tbd', np.nan).astype(float)
    return df

@noglobal
def string_encode(df_trn, df_te, cols):
    '''
    Label-encode the given columns consistently across train and test.

    - np.nan is first replaced by the literal string 'nan'
    - every column gets its own LabelEncoder, fitted on the train and test
      values together

    Both input frames are modified in place and also returned.
    '''
    for frame in (df_trn, df_te):
        frame[cols] = frame[cols].replace(np.nan, 'nan')

    # Fit on the union of both frames so each label maps to the same code everywhere.
    combined = pd.concat([df_trn[cols], df_te[cols]], axis=0)
    for col in cols:
        encoder = LabelEncoder().fit(combined[col])
        df_trn[col] = encoder.transform(df_trn[col])
        df_te[col] = encoder.transform(df_te[col])
    return df_trn, df_te

@noglobal
def df_preprocessing(df_trn, df_te, string_cols):
    '''
    Run the full preprocessing on the train and test frames:
    User_Score clean-up first, then label encoding of ``string_cols``.

    Returns the processed ``(df_trn, df_te)`` pair.
    '''
    df_trn, df_te = (preprocess_User_Score(frame) for frame in (df_trn, df_te))
    return string_encode(df_trn, df_te, string_cols)

In [6]:
@noglobal
def run_fold_lgbm(_X_trn, _y_trn, _X_val, _y_val, _X_te, model_config, show_log=True):
    '''
    Train LightGBM on one fold and return predictions on the original scale.

    - _X_trn, _X_val, _X_te: feature frames for train / validation / test
    - _y_trn, _y_val: one-column DataFrames holding the log1p-transformed target
      (the train target is read via ``.values[:, 0]``)
    - model_config: dict with 'model_params' (LGBMRegressor kwargs) and
      'train_params' (extra ``fit`` kwargs)
    - show_log: print the train / validation RMSLE of this fold

    Returns ``(y_trn_pred, y_val_pred, _y_test_pred)``, each passed through expm1.
    '''
    model_params = model_config['model_params']
    train_params = model_config['train_params']

    # train
    model = LGBMRegressor(**model_params)
    model.fit(_X_trn, _y_trn.values[:, 0],
              eval_set=[(_X_trn, _y_trn), (_X_val, _y_val)],
              **train_params
             )

    # predict (still on the log1p scale)
    y_trn_pred = model.predict(_X_trn)
    y_val_pred = model.predict(_X_val)
    _y_test_pred = model.predict(_X_te)

    # postprocessing
    # NOTE(review): train/valid predictions are floored at 1 on the log1p scale
    # (i.e. e - 1 ~= 1.72 after expm1), while the test prediction is not floored here
    # and run_lgbm floors it at 1 on the original scale. Left unchanged so CV scores
    # stay comparable with earlier notebooks - confirm which floor is intended.
    y_trn_pred[y_trn_pred <= 1] = 1
    y_val_pred[y_val_pred <= 1] = 1
    y_trn_pred = np.expm1(y_trn_pred)  # apply exp, then subtract 1
    y_val_pred = np.expm1(y_val_pred)  # apply exp, then subtract 1
    _y_test_pred = np.expm1(_y_test_pred)  # apply exp, then subtract 1

    if show_log:
        # (a leftover debug `print(show_log)` that printed "True" on every fold was removed)
        print(f'score train: {metric(np.expm1(_y_trn), y_trn_pred):.5f}')
        print(f'score valid: {metric(np.expm1(_y_val), y_val_pred):.5f}')
        print('')

    return y_trn_pred, y_val_pred, _y_test_pred

@noglobal
def run_lgbm(X, y, X_te, splitter, config):
    '''
    Cross-validated LightGBM training.

    For every fold produced by ``splitter.split(X)`` a model is trained with
    run_fold_lgbm; its validation predictions fill the out-of-fold vector and
    its test predictions are averaged over the folds.

    Returns ``(oof, y_test_pred)``; test predictions are floored at 1.
    '''
    verbose = config['globals']['show_log']
    model_cfg = config['model']

    oof = np.zeros(len(X))
    y_test_pred = np.zeros(len(X_te))

    for fold_i, (trn_rows, val_rows) in enumerate(splitter.split(X)):
        if verbose:
            print(f'::Fold {fold_i+1}/{splitter.n_splits} start at {time.ctime()}::')

        fold_X_trn = pd.DataFrame(X.iloc[trn_rows, :], columns=X.columns)
        fold_X_val = pd.DataFrame(X.iloc[val_rows, :], columns=X.columns)
        fold_y_trn = y.iloc[trn_rows]
        fold_y_val = y.iloc[val_rows]

        # train on this fold; the train-side predictions are not needed here
        _, fold_val_pred, fold_test_pred = run_fold_lgbm(
            fold_X_trn, fold_y_trn, fold_X_val, fold_y_val,
            X_te, model_cfg,
            show_log=verbose)

        # collect results
        oof[val_rows] = fold_val_pred
        y_test_pred += fold_test_pred / splitter.n_splits

    # floor the averaged test prediction at 1 (original scale)
    return oof, np.maximum(y_test_pred, 1)

# Preparation

setup: 出力ディレクトリの作成と config の読み込み

In [7]:
# Create this notebook's output directory; exist_ok avoids the separate
# exists()/makedirs() check and the race between the two calls.
os.makedirs(SAVE_DIR, exist_ok=True)

# Parse the YAML experiment settings into a plain dict.
config = yaml.safe_load(config_str)

<br>

load dataset

In [8]:
# Read the competition files: train set, test set and the sample submission.
train, test, ss = (
    pd.read_csv(csv_path)
    for csv_path in (PATH_TRAIN, PATH_TEST, PATH_SAMPLE_SUBMITTION)
)

<br>

preprocess

In [9]:
# Clean User_Score and label-encode the string columns.
# Note: the helpers assign into the frames they receive, so `train` / `test`
# are modified in place as well as rebound here.
train, test = df_preprocessing(train, test, feat_string)

In [10]:
# Base feature matrices for train and test (independent copies of the selected columns).
X, X_te = (frame[use_col].copy() for frame in (train, test))
# The target stays a one-column DataFrame: run_fold_lgbm reads it via `.values[:, 0]`.
y = train[['Global_Sales']].copy()

In [11]:
# Display the assembled training feature matrix as a quick sanity check.
X

Unnamed: 0,Platform,Year_of_Release,Genre,Critic_Score,Critic_Count,User_Score,User_Count,Rating
0,26,,0,74.0,17.0,7.9,22.0,2
1,26,,0,78.0,22.0,6.6,28.0,2
2,19,,0,73.0,5.0,7.4,10.0,2
3,0,,0,,,,,8
4,26,,0,76.0,8.0,7.8,13.0,2
...,...,...,...,...,...,...,...,...
8354,13,2016.0,11,78.0,57.0,8.0,569.0,8
8355,13,2016.0,11,,,,,8
8356,20,2017.0,0,,,,,8
8357,18,2017.0,7,,,,,8


In [12]:
# Solve RMSLE as RMSE: train on log1p(target) and invert with expm1 at prediction time.
# Derived from `train` instead of from `y` itself, so re-running this cell
# does not apply the log a second time.
y = np.log1p(train[['Global_Sales']])  # add 1, then take the log

# EDA

<br>

textを小文字にして1行に

In [13]:
# Lowercase every training title and join them into one line for token counting.
# dropna() after lowercasing removes missing / non-string titles, which would
# otherwise make str.join raise a TypeError.
text_line = ' '.join(train.Name.str.lower().dropna().values)
text_line[:600]

"lego batman: the videogame lego indiana jones: the original adventures lego batman: the videogame combat lego harry potter: years 5-7 lego harry potter: years 5-7 yakuza 4 lego harry potter: years 5-7 the lord of the rings: war in the north the lord of the rings: war in the north the chronicles of narnia: the lion, the witch and the wardrobe lego harry potter: years 5-7 the golden compass circus atari maze craze: a game of cops 'n robbers robert ludlum's the bourne conspiracy lego harry potter: years 5-7 robert ludlum's the bourne conspiracy the golden compass tomb raider (2013) slot machine t"

<br>

上位100個を見て、使えそうなワードを抜き出す

In [14]:
# Token frequencies over all lowercased training titles, most frequent first.
value_counts = pd.Series(text_line.split(None)).value_counts()
# Iterate (token, count) pairs directly: `value_counts[idx]` used an integer key on a
# string-indexed Series (deprecated positional fallback), and head(100) also copes
# with fewer than 100 distinct tokens.
for word, count in value_counts.head(100).items():
    print(f'{word} \t {count}')

the 	 1505
of 	 867
2 	 428
no 	 400
& 	 210
3 	 207
super 	 193
world 	 172
star 	 159
2: 	 157
lego 	 139
- 	 138
to 	 137
and 	 136
game 	 124
dragon 	 122
ds 	 115
mario 	 110
final 	 105
ii 	 103
nba 	 94
in 	 93
fantasy 	 92
warriors 	 89
4 	 89
3: 	 87
vs. 	 86
collection 	 86
a 	 81
man 	 77
battle 	 76
sonic 	 76
wars 	 72
sales) 	 72
heroes 	 72
quest 	 70
portable 	 69
shin 	 67
challenge 	 67
force 	 67
mega 	 67
wwe 	 67
legend 	 67
party 	 66
edition 	 64
disney 	 64
ultimate 	 64
king 	 63
wars: 	 61
baseball 	 61
pro 	 60
rock 	 60
racing 	 60
series 	 58
championship 	 58
samurai 	 57
disney's 	 56
games 	 56
war 	 56
monster 	 55
resident 	 55
band 	 51
ni 	 51
mlb 	 50
dead 	 50
x 	 49
adventure 	 49
dynasty 	 49
grand 	 49
pack 	 48
(jp 	 48
league 	 48
dark 	 48
soccer 	 47
evil 	 47
de 	 46
fighter 	 45
new 	 45
5 	 44
ninja 	 43
/ 	 43
iii 	 43
ball 	 42
club 	 42
football 	 42
video 	 41
tour 	 40
shadow 	 40
marvel 	 39
legends 	 39
vol. 	 39
tennis 	 39
ii: 	 

In [15]:
# Hand-picked candidate tokens, taken from the top-100 frequency listing printed above.
texts = (
    'super world star lego game dragon ds mario '
    'final fantasy vs. collection man sonic battle '
    'quest portable shin disney baseball soccer'
).split()

In [16]:
@noglobal
def make_name_has_word(names, words):
    '''
    Build boolean "title contains this token" features.

    - names: pd.Series of game titles
    - words: iterable of lowercase tokens to look for

    A title matches a word only when the word equals one of its
    whitespace-separated, lowercased tokens (exact token match, not substring).

    Returns a DataFrame with one bool column ``Name_has_<word>`` per word and
    a default RangeIndex, one row per title.
    '''
    # Tokenise every title once instead of once per word.
    # str() keeps a missing title (NaN) from crashing; it becomes the token 'nan'.
    token_sets = [set(str(name).split(None)) for name in names.str.lower().values]
    return pd.DataFrame({
        f'Name_has_{word}': [word in tokens for tokens in token_sets]
        for word in progress_bar(words)
    })


# Same feature construction for both sets, so train and test cannot drift apart.
train_has_word = make_name_has_word(train.Name, texts)
test_has_word = make_name_has_word(test.Name, texts)

In [17]:
# Peek at the first rows of the new boolean Name_has_* features.
train_has_word.head()

Unnamed: 0,Name_has_super,Name_has_world,Name_has_star,Name_has_lego,Name_has_game,Name_has_dragon,Name_has_ds,Name_has_mario,Name_has_final,Name_has_fantasy,...,Name_has_collection,Name_has_man,Name_has_sonic,Name_has_battle,Name_has_quest,Name_has_portable,Name_has_shin,Name_has_disney,Name_has_baseball,Name_has_soccer
0,False,False,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


---> 新しい特徴量作った

# Experiments
- 新しい特徴量を1つずつ単独でベース特徴量に追加し（累積ではない）、スコアの変動を見る
- nb015の結果(base_cv = 1.00127)と比較する

In [18]:
# Baseline CV score taken from nb015; every experiment below is compared against it.
base_cv = 1.00127

In [53]:
%%time

new_scores = []
splitter = KFold(**config['split']['params'])
for col in train_has_word.columns:
    X_new = X.copy()
    X_te_new = X_te.copy()
    X_new[col] = train_has_word[col].values
    X_te_new[col] = test_has_word[col].values
    
    oof, y_test_pred = run_lgbm(X_new, y, X_te_new, splitter, config)
    new_cv = metric(np.expm1(y), oof)
    new_scores.append(new_cv)

::Fold 1/5 start at Mon Dec  7 23:12:00 2020::
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[405]	valid_0's rmse: 0.888219	valid_1's rmse: 0.991295
True
score train: 0.88822
score valid: 0.99129

::Fold 2/5 start at Mon Dec  7 23:12:00 2020::
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[660]	valid_0's rmse: 0.84741	valid_1's rmse: 1.01674
True
score train: 0.84742
score valid: 1.01674

::Fold 3/5 start at Mon Dec  7 23:12:01 2020::
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[675]	valid_0's rmse: 0.852182	valid_1's rmse: 0.997567
True
score train: 0.85220
score valid: 0.99756

::Fold 4/5 start at Mon Dec  7 23:12:02 2020::
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[467]	valid_0's rmse: 0.884161	valid_1's rmse: 0.969457
True
score train: 0.88416
score valid: 0.96946

::Fold 5/5

In [61]:
# One row per candidate feature: its CV score and the gain over the baseline
# (a positive 'base_cv - new_cv' means the feature lowered the error).
df_result = pd.DataFrame({
    'Name_has': train_has_word.columns,
    'new_cv': new_scores,
    'base_cv - new_cv': base_cv - np.array(new_scores),
})

In [64]:
# Largest improvement over the baseline first.
df_result.sort_values('base_cv - new_cv', ascending=False)

Unnamed: 0,Name_has,new_cv,base_cv - new_cv
7,Name_has_mario,0.990816,0.010454
13,Name_has_sonic,0.996938,0.004332
0,Name_has_super,0.997559,0.003711
3,Name_has_lego,0.998969,0.002301
10,Name_has_vs.,1.000925,0.000345
2,Name_has_star,1.001063,0.000207
18,Name_has_disney,1.0012,7e-05
6,Name_has_ds,1.001579,-0.000309
1,Name_has_world,1.001814,-0.000544
16,Name_has_portable,1.001914,-0.000644


# 全部入れたら？

In [65]:
# Base features plus every Name_has_* flag.
# (The former `X.copy()` / `X_te.copy()` assignments were dead stores: both names
# were overwritten immediately by the concat results.)
# concat(axis=1) aligns on the row index, so both sides must share the same index.
X_new = pd.concat([X, train_has_word], axis=1)
X_te_new = pd.concat([X_te, test_has_word], axis=1)

In [68]:
# Cross-validate with all Name_has_* flags and compare against the baseline.
splitter = KFold(**config['split']['params'])
oof, y_test_pred = run_lgbm(X_new, y, X_te_new, splitter, config)
print(
    f'base_cv: {base_cv}',
    f'oof score: {metric(np.expm1(y), oof):.5f}',
    sep='\n',
)

::Fold 1/5 start at Mon Dec  7 23:20:28 2020::
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[660]	valid_0's rmse: 0.828361	valid_1's rmse: 0.965618
True
score train: 0.82838
score valid: 0.96553

::Fold 2/5 start at Mon Dec  7 23:20:29 2020::
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[800]	valid_0's rmse: 0.808563	valid_1's rmse: 0.985166
True
score train: 0.80858
score valid: 0.98518

::Fold 3/5 start at Mon Dec  7 23:20:30 2020::
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[800]	valid_0's rmse: 0.811791	valid_1's rmse: 0.973774
True
score train: 0.81181
score valid: 0.97377

::Fold 4/5 start at Mon Dec  7 23:20:31 2020::
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[422]	valid_0's rmse: 0.869204	valid_1's rmse: 0.945342
True
score train: 0.86920
scor

---> めちゃんこ上がってるやん

# よさそうなやつだけ入れてみる

In [72]:
# Flags whose individual CV score beat the baseline in the result table above.
good_words = [
    f'Name_has_{word}'
    for word in ('mario', 'sonic', 'super', 'lego', 'vs.', 'star', 'disney')
]

In [73]:
# Base features plus only the individually helpful Name_has_* flags.
# (The former `X.copy()` / `X_te.copy()` assignments were dead stores: both names
# were overwritten immediately by the concat results.)
# concat(axis=1) aligns on the row index, so both sides must share the same index.
X_new = pd.concat([X, train_has_word[good_words]], axis=1)
X_te_new = pd.concat([X_te, test_has_word[good_words]], axis=1)

In [74]:
# Cross-validate with the selected Name_has_* flags and compare against the baseline.
splitter = KFold(**config['split']['params'])
oof, y_test_pred = run_lgbm(X_new, y, X_te_new, splitter, config)
print(
    f'base_cv: {base_cv}',
    f'oof score: {metric(np.expm1(y), oof):.5f}',
    sep='\n',
)

::Fold 1/5 start at Mon Dec  7 23:25:15 2020::
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[800]	valid_0's rmse: 0.820402	valid_1's rmse: 0.976033
True
score train: 0.82043
score valid: 0.97595

::Fold 2/5 start at Mon Dec  7 23:25:16 2020::
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[800]	valid_0's rmse: 0.814985	valid_1's rmse: 0.99292
True
score train: 0.81501
score valid: 0.99292

::Fold 3/5 start at Mon Dec  7 23:25:17 2020::
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[564]	valid_0's rmse: 0.845801	valid_1's rmse: 0.97976
True
score train: 0.84581
score valid: 0.97976

::Fold 4/5 start at Mon Dec  7 23:25:18 2020::
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[800]	valid_0's rmse: 0.826569	valid_1's rmse: 0.948647
True
score train: 0

# Save features

In [83]:
# Write the train-side Name_has_* flags to this notebook's output directory
# (no index column); the bare `path` echoes the destination as the cell output.
path = f'{SAVE_DIR}train_feature_name_has.csv'
train_has_word.to_csv(path, index=False, encoding='utf-8')
path

'../data/output_nb/nb016/train_feature_name_has.csv'

In [84]:
# Write the test-side Name_has_* flags to this notebook's output directory
# (no index column); the bare `path` echoes the destination as the cell output.
path = f'{SAVE_DIR}test_feature_name_has.csv'
test_has_word.to_csv(path, index=False, encoding='utf-8')
path

'../data/output_nb/nb016/test_feature_name_has.csv'

# 結論
全部入れたほうが良い