# CatBoost + Optuna + StratifiedKFold

In [None]:
# Make Google Drive visible inside the Colab runtime; the data files read
# later in this notebook are addressed under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
!pip install optuna
!pip install category_encoders
!pip install catboost

In [None]:
#라이브러리 임포트
import os
import random
import numpy as np
import pandas as pd
import optuna
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import mean_squared_error
from category_encoders import TargetEncoder
from catboost import CatBoostRegressor
# warning 제거
import warnings
warnings.filterwarnings("ignore")

# Reproducibility: one shared seed for every global random number generator.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)
# NOTE(review): PYTHONHASHSEED is read when the interpreter starts, so setting
# it here presumably only influences processes launched afterwards — confirm
# that this is the intent.
os.environ.update(PYTHONHASHSEED=str(RANDOM_SEED))


In [None]:
# Read the training and test CSV files from the mounted Drive folder.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/소득 예측/train.csv',
        '/content/drive/MyDrive/소득 예측/test.csv',
    )
)

In [None]:
# Remove the 'ID' column from both splits so it is not used as a feature.
train_data = train_data.drop(columns=['ID'])
test_data = test_data.drop(columns=['ID'])

In [None]:
# Separate the target column ('Income') from the feature columns.
TARGET_COLUMN = 'Income'
Y_train = train_data[TARGET_COLUMN]
X_train = train_data.drop(columns=[TARGET_COLUMN])

In [None]:
# Helper that maps a yearly working-week count to a work-type label
def define_work_type(weeks):
    """Return the work-type label for a yearly working-week count.

    Args:
        weeks: Number of weeks worked in the year.

    Returns:
        '비정규직' when ``weeks`` is below 52, otherwise '정규직'. A value for
        which the ``< 52`` comparison is false (NaN included) therefore
        yields '정규직'.
    """
    return '비정규직' if weeks < 52 else '정규직'

# Feature engineering, applied identically to the training features and the
# test set through a single helper.
def _add_engineered_features(frame):
    """Add the derived columns to ``frame`` in place, then drop replaced ones.

    Args:
        frame: DataFrame holding the raw feature columns referenced below;
            it is modified in place.
    """
    # Work-type label derived from the yearly working weeks.
    frame['Work_Type'] = frame['Working_Week (Yearly)'].apply(define_work_type)

    # Work type combined with occupation status.
    frame['Work_Type_Occupation'] = frame['Work_Type'] + '_' + frame['Occupation_Status']

    # Combination of the two features the original author reports as having
    # the highest feature importance.
    frame['Occupation_Industry'] = frame['Occupation_Status'] + "_" + frame['Industry_Status']

    # Log transform; log1p adds 1 first so that zero values are handled.
    log_columns = (
        ('Dividends', 'Log_Dividends'),
        ('Gains', 'Log_Gains'),
        ('Losses', 'Log_Losses'),
    )
    for raw_col, log_col in log_columns:
        frame[log_col] = np.log1p(frame[raw_col])

    # Drop duplicated / no longer needed columns. Per the original author's
    # note, Household_Summary is removed because of multicollinearity.
    frame.drop(
        columns=['Dividends', 'Household_Summary', 'Gains', 'Losses', 'Work_Type', 'Working_Week (Yearly)'],
        inplace=True,
    )


_add_engineered_features(X_train)
_add_engineered_features(test_data)

In [None]:
# Partition the feature columns by dtype: object-dtype columns are treated as
# categorical, every other column as numerical.
categorical_list = [col for col in X_train.columns if X_train[col].dtypes == 'O']
numerical_list = [col for col in X_train.columns if X_train[col].dtypes != 'O']

print("categorical_list :", categorical_list)
print("numerical_list :", numerical_list)

categorical_list : ['Gender', 'Education_Status', 'Employment_Status', 'Industry_Status', 'Occupation_Status', 'Race', 'Hispanic_Origin', 'Martial_Status', 'Household_Status', 'Citizenship', 'Birth_Country', 'Birth_Country (Father)', 'Birth_Country (Mother)', 'Tax_Status', 'Income_Status', 'Work_Type_Occupation', 'Occupation_Industry']
numerical_list : ['Age', 'Log_Dividends', 'Log_Gains', 'Log_Losses']


In [None]:
# Target encoding of the columns in categorical_list.
# The encoder is fitted on the training features together with Y_train and is
# then applied, already fitted, to the test set.
# NOTE(review): the encoder is fitted on the full training set before any
# cross-validation split, so rows that later serve as validation folds have
# already contributed their targets to the encoding — CV scores may be
# optimistic. Consider fitting the encoder inside each fold.
# NOTE(review): X_train and test_data are overwritten here, so this cell is
# presumably not safe to re-run without re-running the cells above — confirm.
encoder = TargetEncoder(cols=categorical_list)
X_train = encoder.fit_transform(X_train, Y_train)
test_data = encoder.transform(test_data)

In [None]:
def objective(trial, n_splits=5):
    """Optuna objective: mean cross-validated RMSE of a CatBoost regressor.

    Hyperparameters are sampled from ``trial``; a model is trained on each
    training fold of the module-level ``X_train`` / ``Y_train`` and scored on
    the held-out fold.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        n_splits: Number of folds for StratifiedKFold (defaults to 5).

    Returns:
        Mean over the folds of the RMSE, computed after negative predictions
        have been replaced with zero.
    """
    # Hyperparameter search space. suggest_float replaces the deprecated
    # suggest_uniform / suggest_loguniform; ranges and sampling order are
    # unchanged.
    params = {
        'iterations': trial.suggest_int('iterations', 100, 1000),
        'depth': trial.suggest_int('depth', 4, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-8, 10.0, log=True),
        'loss_function': 'RMSE',
        'eval_metric': 'RMSE',
        'bootstrap_type': 'Bernoulli',
        'subsample': trial.suggest_float('subsample', 0.4, 1.0),
        'random_strength': trial.suggest_float('random_strength', 1e-9, 10),
        'leaf_estimation_iterations': trial.suggest_int('leaf_estimation_iterations', 1, 10),
        'logging_level': 'Silent',
        'random_seed': RANDOM_SEED
    }

    # Stratified K-fold cross-validation.
    # NOTE(review): StratifiedKFold stratifies on the distinct values of
    # Y_train; for a target with many distinct values this presumably gives
    # little real stratification — consider stratifying on a binned target.
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_SEED)
    cv_scores = np.zeros(n_splits)

    for idx, (train_index, valid_index) in enumerate(kf.split(X_train, Y_train)):
        X_train_fold, X_valid_fold = X_train.iloc[train_index], X_train.iloc[valid_index]
        Y_train_fold, Y_valid_fold = Y_train.iloc[train_index], Y_train.iloc[valid_index]

        model = CatBoostRegressor(**params)
        # The held-out fold is deliberately NOT passed as eval_set: with an
        # eval_set CatBoost defaults to use_best_model=True and would keep the
        # number of trees that is best on the very fold being scored, which
        # biases the score and makes the sampled 'iterations' differ from what
        # a model fitted without an eval_set actually uses.
        model.fit(X_train_fold, Y_train_fold, verbose=0)

        preds = model.predict(X_valid_fold)

        # Replace negative predictions with zero before calculating RMSE
        preds_adjusted = np.maximum(0, preds)

        cv_scores[idx] = np.sqrt(mean_squared_error(Y_valid_fold, preds_adjusted))

    return np.mean(cv_scores)

# Run the hyperparameter search: minimise the objective with a seeded TPE
# sampler, then report the parameters of the best trial.
N_TRIALS = 100
tpe_sampler = optuna.samplers.TPESampler(seed=RANDOM_SEED)
study = optuna.create_study(sampler=tpe_sampler, direction='minimize')
study.optimize(objective, n_trials=N_TRIALS)

print(f"Best trial: {study.best_trial.params}")


[I 2024-03-11 13:54:36,276] A new study created in memory with name: no-name-fd34f52c-43d9-4947-bff0-4c5300265d67
[I 2024-03-11 13:55:30,384] Trial 0 finished with value: 590.5191194019231 and parameters: {'iterations': 437, 'depth': 10, 'learning_rate': 0.22227824312530747, 'l2_leaf_reg': 0.0024430162614261413, 'subsample': 0.4936111842654619, 'random_strength': 1.559945204206032, 'leaf_estimation_iterations': 1}. Best is trial 0 with value: 590.5191194019231.
[I 2024-03-11 13:56:19,167] Trial 1 finished with value: 589.4299469101275 and parameters: {'iterations': 880, 'depth': 8, 'learning_rate': 0.21534104756085318, 'l2_leaf_reg': 1.5320059381854043e-08, 'subsample': 0.9819459112971965, 'random_strength': 8.324426408171774, 'leaf_estimation_iterations': 3}. Best is trial 1 with value: 589.4299469101275.
[I 2024-03-11 13:56:27,325] Trial 2 finished with value: 588.0829975316351 and parameters: {'iterations': 263, 'depth': 5, 'learning_rate': 0.09823025045826593, 'l2_leaf_reg': 0.0005

Best trial: {'iterations': 724, 'depth': 9, 'learning_rate': 0.020176940667001514, 'l2_leaf_reg': 9.741938710014464, 'subsample': 0.8459663325320136, 'random_strength': 6.657495381980411, 'leaf_estimation_iterations': 1}


In [None]:
# Final model: refit on the full training set with the best hyperparameters.
# study.best_trial.params only contains the values sampled through
# trial.suggest_*, so the fixed settings used inside objective() are re-applied
# here; otherwise the final model would be trained with CatBoost's default
# bootstrap type and without a seed, i.e. not the configuration that was tuned.
# Keep these values in sync with the fixed entries of `params` in objective().
final_params = {
    **study.best_trial.params,
    'loss_function': 'RMSE',
    'eval_metric': 'RMSE',
    'bootstrap_type': 'Bernoulli',
    'random_seed': RANDOM_SEED,
}
final_model = CatBoostRegressor(**final_params)
final_model.fit(X_train, Y_train, verbose=100)

0:	learn: 697.7983898	total: 18.4ms	remaining: 13.3s
100:	learn: 590.6908409	total: 1.24s	remaining: 7.66s
200:	learn: 579.2219007	total: 2.41s	remaining: 6.28s
300:	learn: 572.9614159	total: 4.85s	remaining: 6.81s
400:	learn: 567.7768364	total: 6.63s	remaining: 5.34s
500:	learn: 562.4764843	total: 7.82s	remaining: 3.48s
600:	learn: 556.5447637	total: 9s	remaining: 1.84s
700:	learn: 551.3050198	total: 10.1s	remaining: 333ms
723:	learn: 550.4724727	total: 10.4s	remaining: 0us


<catboost.core.CatBoostRegressor at 0x79e1275b3c70>

In [None]:
# Predict on the test data and replace negative predictions with 0.
predictions = np.maximum(0, final_model.predict(test_data))

# Submission: put the predictions into the sample submission's 'Income'
# column and write the result to a CSV file without the index.
SUBMISSION_FILE = 'submission_소득예측.csv'
submission = pd.read_csv('/content/drive/MyDrive/소득 예측/sample_submission.csv')
submission = submission.assign(Income=predictions)
submission.to_csv(SUBMISSION_FILE, index=False)

# Download the written file through the Colab helper.
from google.colab import files
files.download(SUBMISSION_FILE)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>