In [None]:
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PowerTransformer 
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.feature_selection import SelectPercentile
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from sklearn.metrics import mean_squared_error
from sklearn import set_config
from sklearn.linear_model import LinearRegression, Ridge, Lasso

In [None]:
# Read the categorical / numeric / binary feature files for the train and test
# splits.  Every file is decoded as cp949; the read order matches the order of
# the names on the left-hand side.
(
    X_train_cat,
    X_test_cat,
    X_train_num,
    X_test_num,
    X_train_bin,
    X_test_bin,
) = (
    pd.read_csv(csv_path, encoding='cp949')
    for csv_path in (
        'X_train_cat.csv',
        'X_test_cat.csv',
        'X_train_num.csv',
        'X_test_num.csv',
        'X_train_bin.csv',
        'X_test_bin.csv',
    )
)

In [None]:
# For each numeric split, report how many columns are non-object ("numerical")
# and how many are object-typed ("categorical").  Both lists are rebound on
# every pass, so after the loop they describe X_test_num, the last frame visited.
for df in (X_train_num, X_test_num):
    column_dtypes = df.dtypes
    is_object = column_dtypes == "object"

    numeric_features = column_dtypes[~is_object].index.tolist()
    print("Number of Numerical features: ", len(numeric_features))

    categorical_features = column_dtypes[is_object].index.tolist()
    print("Number of Categorical features: ", len(categorical_features))

In [None]:
# Keep only the non-object columns of the test split.
num_test = X_test_num.loc[:, numeric_features]

In [None]:
# Keep only the non-object columns of the training split.
num_train = X_train_num.loc[:, numeric_features]

In [None]:
# Regression target: the Salary column of y_train.csv.
y_train = pd.read_csv('y_train.csv', encoding='cp949')['Salary']

In [None]:
from lightgbm import LGBMClassifier                     

In [None]:
# Method: Using SHAP values 
import shap

# DF, based on which importance is checked
X_importance = num_test

# Explain model predictions using shap library:
model = LGBMClassifier(random_state=2020).fit(num_train, y_train)
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_importance)

# Plot summary_plot as barplot:
shap.summary_plot(shap_values, X_importance, plot_type='bar')

In [None]:
shap_sum = np.abs(shap_values).mean(axis=1)[1,:]
importance_df = pd.DataFrame([X_importance.columns.tolist(), shap_sum.tolist()]).T
importance_df.columns = ['column_name', 'shap_importance']
importance_df = importance_df.sort_values('shap_importance', ascending=False)
importance_df

In [None]:
# Keep the features whose SHAP importance is strictly greater than the threshold.
SHAP_THRESHOLD = 0
is_important = importance_df['shap_importance'] > SHAP_THRESHOLD
features_selected = importance_df.loc[is_important, 'column_name'].tolist()

In [None]:
# Restrict both splits to the selected features, then show the resulting shapes.
num_train = num_train.loc[:, features_selected]
num_test = num_test.loc[:, features_selected]
print(num_train.shape, num_test.shape)

In [None]:
from sklearn.model_selection import train_test_split, KFold


In [None]:
from catboost import CatBoostRegressor


In [None]:
scores = []  # per-fold validation RMSE
# Despite its name, oof_pred accumulates the fold-averaged prediction for the
# TEST rows (one slot per row of num_test).  The name is kept because the
# submission cell reads it.
oof_pred = np.zeros(num_test.shape[0])
kfold = KFold(n_splits=10, shuffle=True, random_state=0)  # K-Folds cross-validator
n_folds = kfold.get_n_splits()  # loop-invariant, so computed once

# categorical_features was collected from the unfiltered frame, while num_train
# only keeps the selected non-object columns.  Pass CatBoost only the names
# that are still present so it is never handed a column it cannot find.
cat_features_used = [col for col in categorical_features if col in num_train.columns]

for train_index, valid_index in kfold.split(num_train, y_train):
    # Split into training and validation folds.
    train_x, valid_x = num_train.iloc[train_index], num_train.iloc[valid_index]
    train_y, valid_y = y_train.iloc[train_index], y_train.iloc[valid_index]

    # CatBoost handles categorical columns natively; training stops early once
    # the validation metric has not improved for 100 rounds.
    model = CatBoostRegressor(cat_features=cat_features_used, verbose=False, random_state=0)
    model.fit(train_x, train_y,
              eval_set=[(valid_x, valid_y)],
              early_stopping_rounds=100,
             )

    # Score this fold (RMSE on the validation rows) and record it.
    rmse = np.sqrt(mean_squared_error(valid_y, model.predict(valid_x)))
    scores.append(rmse)

    # Add this fold's share of the averaged test prediction.
    oof_pred += model.predict(num_test) / n_folds

In [None]:
# Summarise the cross-validation run: per-fold scores, then mean and std.
scores = np.array(scores)
cv_mean = scores.mean()
cv_std = scores.std()
print("CV scores: ", scores)
print("CV mean = %.2f" % cv_mean, "with std = %.2f" % cv_std)

In [None]:
# Create the submission file.
# CATBOOST_VERSION and test_id are otherwise only assigned in cells that come
# after this one, so a top-to-bottom run on a fresh kernel would stop here with
# a NameError.  Define both right where they are needed.
CATBOOST_VERSION = 9.0
test_id = pd.read_csv('X_test.csv', encoding='cp949').ID

# File name carries the version tag and the mean CV score (two decimals).
filename = f'catboost_{CATBOOST_VERSION}_{scores.mean():.2f}.csv'
pd.DataFrame({'ID':test_id, 'Salary':oof_pred}).to_csv(filename, index=False)

In [None]:
# Version tag embedded in the submission file name (catboost_<version>_<CV mean>.csv).
CATBOOST_VERSION = 9.0

In [None]:
# Reload the raw, unsplit data.  The ID column is removed from both feature
# frames; the test IDs are kept separately.
X_train = pd.read_csv('X_train.csv', encoding='cp949').drop(columns=['ID'])
y_train = pd.read_csv('y_train.csv', encoding='cp949')['Salary']

X_test = pd.read_csv('X_test.csv', encoding='cp949')
# Both right-hand sides are evaluated against the freshly read frame before
# either name is rebound.
test_id, X_test = X_test['ID'], X_test.drop(columns=['ID'])