#CatBoost + Optuna + StratifiedKFold

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install optuna
!pip install category_encoders
!pip install catboost

Collecting optuna
  Downloading optuna-3.6.1-py3-none-any.whl (380 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m380.1/380.1 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.13.1-py3-none-any.whl (233 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.4/233.4 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting colorlog (from optuna)
  Downloading colorlog-6.8.2-py3-none-any.whl (11 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.2-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.7/78.7 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Mako, colorlog, alembic, optuna
Successfully installed Mako-1.3.2 alembic-1.13.1 colorlog-6.8.2 optuna-3.6.1
Collecting category_encoders
  Downloading category_encoders-2.6.3-py2.py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━

In [3]:
#라이브러리 임포트
import os
import random
import numpy as np
import pandas as pd
import optuna
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error
from category_encoders import TargetEncoder
from catboost import CatBoostRegressor
# warning 제거
import warnings
warnings.filterwarnings("ignore")

# Reproducibility setup
RANDOM_SEED = 42
os.environ['PYTHONHASHSEED'] = str(RANDOM_SEED)
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

pd.set_option('display.max_columns', None)

In [4]:
train_data = pd.read_csv('/content/drive/MyDrive/소득 예측/train.csv')
test_data = pd.read_csv('/content/drive/MyDrive/소득 예측/test.csv')

In [5]:
# id 제거
train_data.drop('ID', axis=1, inplace=True)
test_data.drop('ID', axis=1, inplace=True)

In [6]:
# x,y 분리
X_train = train_data.drop('Income', axis=1)
Y_train = train_data['Income']

In [7]:
# 1. industry가 상위범주
X_train['Occupation_Industry'] = X_train['Occupation_Status'] + "_" + X_train['Industry_Status']
test_data['Occupation_Industry'] = test_data['Occupation_Status'] + "_" + test_data['Industry_Status']

# 2. 근무 기간이 52주인지, 0인지, 그 사이인지 판단
def complete_work(weeks):
    if weeks == 52:
        return 'compelete'
    elif weeks == 0:
        return 'inoccupation'
    else:
        return 'incomplete'

X_train['Work_Type'] = X_train['Working_Week (Yearly)'].apply(complete_work)
test_data['Work_Type'] = test_data['Working_Week (Yearly)'].apply(complete_work)

# 3. binary로 변환
def binary(value):
    if value == 0:
        return 'none'
    else:
        return 'some'

X_train['Dividends'] = X_train['Dividends'].apply(binary)
test_data['Dividends'] = test_data['Dividends'].apply(binary)

X_train['Gains'] = X_train['Gains'].apply(binary)
test_data['Gains'] = test_data['Gains'].apply(binary)

# columns 제거
# household_status는 다중공선성 때문에 제거
# losses는 feature importance가 가장 낮아 제거
X_train.drop(['Household_Status', 'Losses'], axis=1, inplace=True)
test_data.drop(['Household_Status', 'Losses'], axis=1, inplace=True)

In [8]:
# 숫자형(Integer), 범주형(Categorical) 변수 분할
numerical_list=[]
categorical_list=[]

for i in X_train.columns :
  if X_train[i].dtypes == 'O' :
    categorical_list.append(i)
  else :
    numerical_list.append(i)

print("categorical_list :", categorical_list)
print("numerical_list :", numerical_list)

categorical_list : ['Gender', 'Education_Status', 'Employment_Status', 'Industry_Status', 'Occupation_Status', 'Race', 'Hispanic_Origin', 'Martial_Status', 'Household_Summary', 'Citizenship', 'Birth_Country', 'Birth_Country (Father)', 'Birth_Country (Mother)', 'Tax_Status', 'Gains', 'Dividends', 'Income_Status', 'Occupation_Industry', 'Work_Type']
numerical_list : ['Age', 'Working_Week (Yearly)']


In [9]:
# 타겟 인코딩
encoder = TargetEncoder(cols=categorical_list)
X_train = encoder.fit_transform(X_train, Y_train)
test_data = encoder.transform(test_data)

In [14]:
def objective(trial):
    # hyperparameter space 정의
    params = {
        'depth': trial.suggest_int('depth', 4, 10),
        'learning_rate': trial.suggest_uniform('learning_rate', 0.01, 0.3),
        'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-1, 10.0),
        'loss_function': 'RMSE',
        'eval_metric': 'RMSE',
        'bootstrap_type': 'Bernoulli',
        'subsample': trial.suggest_uniform('subsample', 0.4, 1.0),
        'random_strength': trial.suggest_uniform('random_strength', 1e-1, 10),
        'leaf_estimation_iterations': 1,
        'logging_level': 'Silent',
        'random_seed': RANDOM_SEED
    }

    # K-Fold 교차 검증
    n_splits = 5
    cv_scores = np.zeros(n_splits)
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_SEED)

    for idx, (train_index, valid_index) in enumerate(kf.split(X_train, Y_train)):
        X_train_fold, X_valid_fold = X_train.iloc[train_index], X_train.iloc[valid_index]
        Y_train_fold, Y_valid_fold = Y_train.iloc[train_index], Y_train.iloc[valid_index]

        model = CatBoostRegressor(**params)
        model.fit(X_train_fold, Y_train_fold, eval_set=[(X_valid_fold, Y_valid_fold)], verbose=0)

        preds = model.predict(X_valid_fold)

        # Replace negative predictions with zero before calculating RMSE
        preds_adjusted = np.maximum(0, preds)

        cv_scores[idx] = np.sqrt(mean_squared_error(Y_valid_fold, preds_adjusted))

    return np.mean(cv_scores)

study = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=RANDOM_SEED))
study.optimize(objective, n_trials=100)

print(f"Best trial: {study.best_trial.params}")


[I 2024-04-07 01:36:09,163] A new study created in memory with name: no-name-d8aaae71-92d2-4cb2-8408-8c4b977819cb
[I 2024-04-07 01:36:33,569] Trial 0 finished with value: 588.4409859527145 and parameters: {'depth': 6, 'learning_rate': 0.28570714885887566, 'l2_leaf_reg': 2.9106359131330697, 'subsample': 0.759195090518222, 'random_strength': 1.6445845403801216}. Best is trial 0 with value: 588.4409859527145.
[I 2024-04-07 01:36:51,056] Trial 1 finished with value: 586.5647748743615 and parameters: {'depth': 5, 'learning_rate': 0.026844247528777843, 'l2_leaf_reg': 5.3994844097874335, 'subsample': 0.7606690070459252, 'random_strength': 7.10991852018085}. Best is trial 1 with value: 586.5647748743615.
[I 2024-04-07 01:37:03,940] Trial 2 finished with value: 588.5042829907763 and parameters: {'depth': 4, 'learning_rate': 0.29127385712697834, 'l2_leaf_reg': 4.622589001020832, 'subsample': 0.5274034664069657, 'random_strength': 1.9000671753502962}. Best is trial 1 with value: 586.5647748743615

Best trial: {'depth': 9, 'learning_rate': 0.05645358158645032, 'l2_leaf_reg': 3.7836223867572794, 'subsample': 0.8023666348684122, 'random_strength': 0.8593414916531122}


In [17]:
# 최종모델
final_model = CatBoostRegressor(**study.best_trial.params)
final_model.fit(X_train, Y_train, verbose=100)

0:	learn: 690.9114912	total: 12ms	remaining: 12s
100:	learn: 554.9983107	total: 1.95s	remaining: 17.4s
200:	learn: 531.6823742	total: 4.49s	remaining: 17.8s
300:	learn: 512.3918979	total: 6.4s	remaining: 14.9s
400:	learn: 496.0247425	total: 7.35s	remaining: 11s
500:	learn: 480.6678889	total: 8.37s	remaining: 8.34s
600:	learn: 467.7911434	total: 9.39s	remaining: 6.24s
700:	learn: 455.6016118	total: 10.4s	remaining: 4.43s
800:	learn: 443.1912879	total: 11.3s	remaining: 2.82s
900:	learn: 432.0600263	total: 12.4s	remaining: 1.36s
999:	learn: 421.0993301	total: 13.3s	remaining: 0us


<catboost.core.CatBoostRegressor at 0x7cd7dff21e70>

In [16]:
# test 데이터에 대해 예측
predictions = final_model.predict(test_data)

# 음수 예측값을 0으로 대체
predictions = np.maximum(0, predictions)

# submission
submission = pd.read_csv('/content/drive/MyDrive/소득 예측/sample_submission.csv')
submission['Income'] = predictions
submission.to_csv('submission_소득예측b.csv', index=False)

from google.colab import files
files.download('submission_소득예측b.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>