In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import lightgbm as lgb
from sklearn.datasets import load_boston
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

In [None]:
# Load the Boston housing data and assemble a single frame:
# one column per feature plus the target stored under 'Price'.
boston = load_boston()
X_array, y_array = boston.data, boston.target

df = pd.DataFrame(data=X_array, columns=boston.feature_names)
df['Price'] = np.array(y_array)

print(df.shape)
df.head()

In [None]:
# Hold out 30% of the rows for testing, then take 20% of the remainder for validation.
df_train, df_test = train_test_split(df, test_size=0.3, random_state=0)
df_train, df_valid = train_test_split(df_train, test_size=0.2, random_state=0)

# Separate the feature columns from the 'Price' target in every partition.
X_train, y_train = df_train.drop(columns='Price'), df_train['Price']
X_valid, y_valid = df_valid.drop(columns='Price'), df_valid['Price']
X_test, y_test = df_test.drop(columns='Price'), df_test['Price']

# Wrap each partition as a LightGBM Dataset.
lgb_train = lgb.Dataset(data=X_train, label=y_train)
lgb_valid = lgb.Dataset(data=X_valid, label=y_valid)
lgb_test = lgb.Dataset(data=X_test, label=y_test)

In [None]:
# Parameters for the baseline LightGBM model.
# If an Optuna-tuned booster (`opt_lgb_model`) already exists in the namespace, its parameters
# are reused; otherwise the hand-written defaults below apply.
# NOTE: `opt_lgb_model` is only created further down the notebook, so on a fresh top-to-bottom
# run the `else` branch is taken; the tuned parameters are picked up only when this cell is
# re-run after the Optuna section.
if 'opt_lgb_model' in locals():
    params = opt_lgb_model.params
else:
    params = {'task': 'train',
              'objective': 'regression',
              'boosting': 'gbdt',
              'metric': 'rmse',
              'verbosity': -1,
              # 'randomseed' is not a recognised LightGBM parameter name, so the seed was
              # never applied; 'seed' is the documented key.
              'seed': 0}

In [None]:
# Train the baseline booster on `lgb_train`, evaluating on both the training and the
# validation Dataset under the names 'train' and 'valid'.
# `evals_result` is passed in empty so the per-round metric history can be read from it
# afterwards (the next cell indexes it as evals_result[<name>]['rmse']).
# NOTE(review): `early_stopping_rounds`, `evals_result` and `verbose_eval` are keyword
# arguments of older LightGBM releases; newer releases expect the equivalent callbacks
# (`lgb.early_stopping`, `lgb.record_evaluation`, `lgb.log_evaluation`) instead.
# Confirm against the installed version before upgrading LightGBM.
evals_result = {}
lgb_model = lgb.train(params,
                      lgb_train,
                      valid_sets=[lgb_train, lgb_valid],
                      valid_names=['train', 'valid'],
                      num_boost_round=1000,
                      early_stopping_rounds=100,
                      evals_result=evals_result,
                      verbose_eval=100)

In [None]:
# Learning curves: RMSE per boosting round for the training and validation sets.
fig, ax = plt.subplots()
ax.plot(evals_result['train']['rmse'], label='train')
ax.plot(evals_result['valid']['rmse'], label='valid')
ax.set_ylabel('RMSE')  # the plotted series is the 'rmse' metric, not a log loss
ax.set_xlabel('Boosting round')
ax.set_title('Training performance')
ax.legend()
plt.show()

In [None]:
# Predict on the held-out test features and pair every prediction with its true target
# in a two-column frame ('true', 'pred') indexed 0..n-1.
y_pred = lgb_model.predict(X_test)
df_pred = pd.DataFrame({'true': y_test.values, 'pred': y_pred})

df_pred.head()

In [None]:
def calc_score(df, decimals=4):
    """Compute regression metrics for a frame of true and predicted values.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'true' and 'pred'.
    decimals : int, default 4
        Number of decimal places every metric is rounded to.

    Returns
    -------
    pandas.DataFrame
        A single row labelled 'MLDEL' with the columns 'MSE', 'RMSE', 'MAE' and 'R2'.
    """
    MSE = mean_squared_error(df['true'], df['pred'])
    RMSE = np.sqrt(MSE)  # reuse the MSE instead of computing it a second time
    MAE = mean_absolute_error(df['true'], df['pred'])
    R2 = r2_score(df['true'], df['pred'])

    # Round all four metrics to the same precision. Previously MSE/RMSE/MAE were rounded
    # to whole numbers (np.round without `decimals`) while only R2 kept four decimals.
    scores = np.round([MSE, RMSE, MAE, R2], decimals=decimals)
    return pd.DataFrame([scores], columns=['MSE', 'RMSE', 'MAE', 'R2'], index=['MLDEL'])

eval_result = calc_score(df_pred)
eval_result

In [None]:
def yy_plot(df, idx='R2', scores=None):
    """Scatter predicted against true values together with the x = y reference line.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'true' and 'pred'.
    idx : str, default 'R2'
        Column of the score table whose first-row value is written onto the plot.
    scores : pandas.DataFrame, optional
        Score table to read `idx` from. When omitted, the notebook-level `eval_result`
        is used.
    """
    if scores is None:
        scores = eval_result

    fig, ax = plt.subplots(figsize=(10, 10))
    ax.plot(df['true'], df['true'], color='red', label='x=y')
    ax.scatter('true', 'pred', data=df)
    # The x axis carries the true values and the y axis the predictions
    # (the two labels were previously swapped).
    ax.set_xlabel('True-Y', fontsize=20)
    ax.set_ylabel('Pred-Y', fontsize=20)
    # .iloc[0] reads the first row by position, independent of the row label.
    ax.text(5, 50, '{} = {}'.format(idx, scores[idx].iloc[0]), fontsize=15)
    ax.legend(loc='lower right')  # makes the 'x=y' label of the reference line visible

yy_plot(df_pred, 'R2')

In [None]:
# Plot the baseline model's feature importances using the 'split' importance type.
lgb.plot_importance(lgb_model, importance_type='split')

In [None]:
# Plot the baseline model's feature importances using the 'gain' importance type.
lgb.plot_importance(lgb_model, importance_type='gain')

# Optuna

In [None]:
import optuna.integration.lightgbm as opt_lgb

In [None]:
# Rebuild the train / validation / test partitions and fresh LightGBM Datasets for the
# Optuna run, using the same split sizes and random_state as the baseline section.
df_train, df_test = train_test_split(df, test_size=0.3, random_state=0)
df_train, df_valid = train_test_split(df_train, test_size=0.2, random_state=0)

# Features are every column except 'Price'; 'Price' is the target.
X_train, y_train = df_train.drop(columns='Price'), df_train['Price']
X_valid, y_valid = df_valid.drop(columns='Price'), df_valid['Price']
X_test, y_test = df_test.drop(columns='Price'), df_test['Price']

lgb_train = lgb.Dataset(data=X_train, label=y_train)
lgb_valid = lgb.Dataset(data=X_valid, label=y_valid)
lgb_test = lgb.Dataset(data=X_test, label=y_test)

In [None]:
# Starting parameters handed to Optuna's LightGBM integration.
# This rebinds the notebook-level `params` that the baseline section also uses.
params = {'objective': 'regression',
          'metric': 'rmse',
          'verbosity': -1,
          'random_seed': 1234}

# Train through the Optuna integration with the same train/valid Datasets and names as
# the baseline model; the returned booster is kept in `opt_lgb_model`.
# NOTE(review): `early_stopping_rounds` and `verbose_eval` are keyword arguments of older
# LightGBM / Optuna releases; newer releases expect callbacks instead. Confirm against
# the installed versions before upgrading.
opt_lgb_model = opt_lgb.train(params=params,
                              train_set=lgb_train,
                              valid_sets=[lgb_train, lgb_valid],
                              valid_names=['train', 'valid'],
                              num_boost_round=1000,
                              early_stopping_rounds=50,
                              verbose_eval=0)

In [None]:
# Display the parameters of the booster returned by the Optuna run.
opt_lgb_model.params

# SHAP
- [リファレンス](https://shap.readthedocs.io/en/latest/index.html)
- [参考](https://blog.amedama.jp/entry/shap-lightgbm)

In [None]:
import shap

In [None]:
# Initialise SHAP's JavaScript support for the notebook.
shap.initjs()

# Build a tree explainer for the baseline booster (`lgb_model`, not the Optuna-tuned
# `opt_lgb_model`) and compute SHAP values for the training features.
explainer = shap.TreeExplainer(lgb_model)
shap_values = explainer.shap_values(X_train)

In [None]:
# Summary plot of the SHAP values for the training set (default plot type).
shap.summary_plot(shap_values=shap_values,
                  features=X_train,
                  feature_names=X_train.columns)

In [None]:
# Same summary over the training set, drawn with plot_type='bar'.
shap.summary_plot(shap_values=shap_values,
                  features=X_train,
                  feature_names=X_train.columns,
                  plot_type='bar')

In [None]:
# Dependence plot for the 'LSTAT' feature.
# Note that `interaction_index` is set to the same feature as `ind` ('LSTAT').
shap.dependence_plot(ind='LSTAT',
                     interaction_index='LSTAT',
                     shap_values=shap_values,
                     features=X_train,
                     feature_names=X_train.columns)

In [None]:
# Waterfall plot for the first training row (SHAP values and features at position 0).
# NOTE(review): this keyword signature (expected_value / shap_values / features /
# feature_names) appears to belong to older shap releases — confirm against the
# installed version.
shap.waterfall_plot(expected_value=explainer.expected_value,
                    shap_values=shap_values[0],
                    features=X_train.iloc[0],
                    feature_names=X_train.columns)

In [None]:
# Force plot over all training rows, using the explainer's expected value as the base value.
shap.force_plot(base_value=explainer.expected_value,
                shap_values=shap_values,
                features=X_train,
                feature_names=X_train.columns)

In [None]:
# Decision plot for the first 10 training rows.
# `features` is sliced to the same 10 rows as `shap_values` so that both arguments
# describe the same observations (previously the full X_train was passed alongside
# only 10 rows of SHAP values).
shap.decision_plot(base_value=explainer.expected_value,
                   shap_values=shap_values[:10],
                   features=X_train.iloc[:10],
                   feature_names=list(X_train.columns))