# Overview
- feature_developer_has 特徴量を作成する

# Const

In [1]:
NB = '020'

PATH_TRAIN = './../data/official/train.csv'
PATH_TEST = './../data/official/test.csv'
PATH_SAMPLE_SUBMITTION = './../data/official/atmaCup8_sample-submission.csv'
SAVE_DIR = f'../data/output_nb/nb{NB}/'

feat_train_only = ['JP_Sales', 'Global_Sales', 'NA_Sales', 'Other_Sales', 'EU_Sales']
feat_common = ['Name', 'Platform', 'Year_of_Release', 'Genre', 'Publisher',
           'Critic_Score', 'Critic_Count', 'User_Score', 'User_Count', 'Developer',
           'Rating']
feat_string = ['Platform', 'Genre', 'Publisher', 'Rating']
feat_cat = ['Platform', 'Genre', 'Rating']
feat_num = ['Year_of_Release', 'Critic_Score', 'Critic_Count', 'User_Score', 'User_Count']
use_col = [
    'Platform',
    'Year_of_Release',
    'Genre',
    'Critic_Score',
    'Critic_Count',
    'User_Score',
    'User_Count',
    'Rating'
    ]

In [2]:
config_str = """
globals:
  seed: 2020
  show_log: True

feature:
    use_col: [
        'Platform',
        'Year_of_Release',
        'Genre',
        'Critic_Score',
        'Critic_Count',
        'User_Score',
        'User_Count',
        'Rating'
        ]

split:
  name: KFold
  params:
    n_splits: 5
    random_state: 2020
    shuffle: True

model:
    model_params:
        objective: 'regression'
        metric: 'rmse'
        n_estimators: 800
        max_depth: 131
        subsample: 0.6
        colsample_bytree: 0.9
        learning_rate: 0.022862256818781214
        reg_alpha: 1.0
        reg_lambda: 6.0
        min_child_samples: 10
    
    train_params:
        categorical_feature: ['Platform', 'Genre', 'Rating']
        verbose: -1
        early_stopping_rounds: 100
"""



# Import everything I need :)

In [3]:
import os
import time
import yaml
import types
import builtins
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from fastprogress import progress_bar

from lightgbm import LGBMRegressor 

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import train_test_split, KFold

import warnings
warnings.filterwarnings('ignore')

examples.directory is deprecated; in the future, examples will be found relative to the 'datapath' directory.
  "found relative to the 'datapath' directory.".format(key))


# My function

In [4]:
# noglobal

def imports():
    for name, val in globals().items():
        # module imports
        if isinstance(val, types.ModuleType):
            yield name, val

            # functions / callables
        if hasattr(val, '__call__'):
            yield name, val


def noglobal(f):
    '''
    ref: https://gist.github.com/raven38/4e4c3c7a179283c441f575d6e375510c
    '''
    return types.FunctionType(f.__code__,
                              dict(imports()),
                              f.__name__,
                              f.__defaults__,
                              f.__closure__
                              )

In [5]:
@noglobal
def metric(y_true, y_pred):
    return mean_squared_log_error(y_true, y_pred) ** .5

@noglobal
def preprocess_User_Score(df):
    '''
    - tbdをnanにする
    - stringをfloatにする
    '''
    mask = df.User_Score.values == 'tbd'
    df.User_Score[mask] = np.nan
    df.User_Score = df.User_Score.values.astype(float)
    return df

@noglobal
def string_encode(df_trn, df_te, cols):
    '''
    - np.nanがあれば、'nan'に置き換える
    - label encodingする
    '''
    df = pd.concat([df_trn, df_te], axis=0).copy()
    df[cols] = df[cols].replace(np.nan, 'nan')
    df_trn[cols] = df_trn[cols].replace(np.nan, 'nan')
    df_te[cols] = df_te[cols].replace(np.nan, 'nan')
    for col in cols:
        le = LabelEncoder()
        le.fit(df[col])
        df_trn[col] = le.transform(df_trn[col])
        df_te[col] = le.transform(df_te[col])
    return df_trn, df_te

@noglobal
def df_preprocessing(df_trn, df_te, string_cols):
    df_trn = preprocess_User_Score(df_trn)
    df_te = preprocess_User_Score(df_te)
    df_trn, df_te = string_encode(df_trn, df_te, string_cols)
    return df_trn, df_te

In [6]:
@noglobal
def run_fold_lgbm(_X_trn, _y_trn, _X_val, _y_val, _X_te, model_config, show_log=True):
    model_params = model_config['model_params']
    train_params = model_config['train_params']
    
    # train
    model = LGBMRegressor(**model_params)
    model.fit(_X_trn, _y_trn.values[:, 0],
              eval_set=[(_X_trn, _y_trn), (_X_val, _y_val)],
              **train_params
             )
    
    # predict
    y_trn_pred = model.predict(_X_trn)
    y_val_pred = model.predict(_X_val)
    _y_test_pred = model.predict(_X_te)
    
    # postprocessiing
    y_trn_pred[y_trn_pred <= 1] = 1
    y_val_pred[y_val_pred <= 1] = 1
    y_trn_pred = np.expm1(y_trn_pred)  # exp を適用して 1 を引く
    y_val_pred = np.expm1(y_val_pred)  # exp を適用して 1 を引く
    _y_test_pred = np.expm1(_y_test_pred)  # exp を適用して 1 を引く
    
    if show_log:
        print(show_log)
        print(f'score train: {metric(np.expm1(_y_trn), y_trn_pred):.5f}')
        print(f'score valid: {metric(np.expm1(_y_val), y_val_pred):.5f}')
        print('')
    
    return y_trn_pred, y_val_pred, _y_test_pred

@noglobal
def run_lgbm(X, y, X_te, splitter, config):
    show_log = config['globals']['show_log']
    model_config = config['model']

    oof = np.zeros(len(X))
    y_test_pred = np.zeros(len(X_te))
    for fold_i, (idx_trn, idx_val) in enumerate(splitter.split(X)):
        if show_log:
            print(f'::Fold {fold_i+1}/{splitter.n_splits} start at {time.ctime()}::')
        X_trn, X_val = X.iloc[idx_trn, :], X.iloc[idx_val, :]
        y_trn, y_val = y.iloc[idx_trn], y.iloc[idx_val]
        X_trn = pd.DataFrame(X_trn, columns=X.columns)
        X_val = pd.DataFrame(X_val, columns=X.columns)

        # train
        y_trn_pred, y_val_pred, _y_test_pred = run_fold_lgbm(X_trn, y_trn, X_val, y_val, 
                                                             X_te, model_config, 
                                                             show_log=show_log)

        # result
        oof[idx_val] = y_val_pred
        y_test_pred += _y_test_pred / splitter.n_splits


    y_test_pred[y_test_pred <= 1] = 1
    return oof, y_test_pred

# Preparation

set

In [7]:
if not os.path.exists(SAVE_DIR):
    os.makedirs(SAVE_DIR)

config = yaml.safe_load(config_str)

<br>

load dataset

In [8]:
train = pd.read_csv(PATH_TRAIN)
test = pd.read_csv(PATH_TEST)
ss = pd.read_csv(PATH_SAMPLE_SUBMITTION)

<br>

preprocess

In [9]:
train, test = df_preprocessing(train, test, feat_string)

In [10]:
X = train[use_col].copy()
y = train[['Global_Sales']].copy()
X_te = test[use_col].copy()

In [11]:
# RMSLE を RMSEとしいて解く
y = np.log1p(y)  # 1 を足してlog を適用

# EDA

<br>

textを小文字にして1行に

In [19]:
train['Developer'] = train['Developer'].replace(np.nan, 'nan')
test['Developer'] = test['Developer'].replace(np.nan, 'nan')

train_word_line = ' '.join(train.Developer.str.lower().values)
test_word_line = ' '.join(test.Developer.str.lower().values)

train_word_list = train_word_line.split(None)
test_word_list = test_word_line.split(None)

In [20]:
print(train_word_line[:600])
print('')
print(test_word_line[:600])

traveller's tales traveller's tales traveller's tales nan traveller's tales traveller's tales ryu ga gotoku studios traveller's tales snowblind studios snowblind studios amaze entertainment traveller's tales shiny entertainment nan nan high moon studios traveller's tales high moon studios shiny entertainment crystal dynamics, nixxes software nan traveller's tales nan traveller's tales traveller's tales nan wayforward nan grasshopper manufacture nan skonec altron high voltage software nan nan pendulo studios capcom snk playmore harmonix music systems nan culture brain rare ltd. nan nan atomic p

io interactive crystal dynamics kcej double helix games double helix games nan nan vicarious visions blue tongue entertainment nan nan visceral games incognito inc. artificial mind and movement radical entertainment namco bandai games america, namco bandai games telltale games nan nan nan techland konami nan nan nan handheld games vicious cycle lucky chicken konami computer entertainment hawaii 

<br>

trainとtest共通のワードを取得

In [27]:
common_word_unique = list(set(train_word_list) & set(test_word_list))
common_word_unique[:20]

['&',
 '7',
 'america,',
 'bluepoint',
 'furyu',
 'saffire',
 'arcade',
 'cyberconnect2',
 'and',
 'games',
 'eclipse',
 's.r.l',
 '4a',
 'remedy',
 'assembly',
 'magic',
 'torus',
 'conspiracy',
 'irem',
 'crytek']

<br>

testで数える

In [43]:
%%time
common_words_in_test = []
for word in progress_bar(common_word_unique):
    bools = []
    for dev in test.Developer.values:
        words = np.array(str(dev).split(None))
        bool_ = any(words == word)
        bools.append(bool_)
        
        if bool_:
            common_words_in_test.append(word)
            

# test_has_word = pd.DataFrame()
# for text in progress_bar(texts):
#     bools = []
#     for name in test.Name.str.lower().values:
#         words = np.array(str(name).split(None))
#         bools.append(any(words == text))
#     test_has_word[f'Name_has_{text}'] = bools

CPU times: user 21.5 s, sys: 23.8 ms, total: 21.5 s
Wall time: 21.5 s


<br>

testの中の数を見る

In [44]:
value_counts = pd.Series(common_words_in_test).value_counts(ascending=False)
value_counts[:30]

nan         3134
of            42
and           27
7             10
17            10
44             7
1st            6
id             5
&              4
5pb            4
h.a.n.d.       3
13             3
3              2
989            1
505            1
dtp            1
5              1
1              1
10tacle        1
dtype: int64

---> 全然使えないじゃん。  
---> ボツ