# CatBoost + Optuna + StratifiedKFold

In [21]:
# Mount Google Drive at /content/drive (Colab-only helper) so files stored
# there can be opened by path.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [22]:
!pip install optuna
!pip install category_encoders
!pip install catboost



In [23]:
#라이브러리 임포트
import os
import random
import numpy as np
import pandas as pd
import optuna
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error
from category_encoders import TargetEncoder
from catboost import CatBoostRegressor
# warning 제거
import warnings
warnings.filterwarnings("ignore")

# Reproducibility: one seed shared by Python's `random`, NumPy's legacy global
# RNG, and the PYTHONHASHSEED environment variable.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
os.environ['PYTHONHASHSEED'] = f'{RANDOM_SEED}'

# Show every column when a DataFrame is displayed.
pd.options.display.max_columns = None

In [24]:
# Load the raw train/test tables from the mounted Drive folder.
# DATA_DIR is the single place to edit if the dataset lives somewhere else;
# the resulting file paths are identical to the previously hard-coded ones.
DATA_DIR = '/content/drive/MyDrive/소득 예측'
train_data = pd.read_csv(f'{DATA_DIR}/train.csv')
test_data = pd.read_csv(f'{DATA_DIR}/test.csv')

In [25]:
# Remove the ID column from both tables (in place, on the same objects).
for frame in (train_data, test_data):
    frame.drop(columns='ID', inplace=True)

In [26]:
# Separate the target column ('Income') from the feature columns.
Y_train = train_data['Income']
X_train = train_data.drop(columns='Income')

In [27]:
# Tag the birth country with "(US)" when the mother or the father was born in the US.
def add_us_if_parent_is_us(row):
    """Return the row's birth country, suffixed with '(US)' if a parent's is 'US'.

    Args:
        row: Mapping-like record (for example a DataFrame row) providing the
            keys 'Birth_Country', 'Birth_Country (Father)' and
            'Birth_Country (Mother)'.

    Returns:
        ``row['Birth_Country'] + '(US)'`` when the father's or the mother's
        birth country equals 'US'; otherwise ``row['Birth_Country']`` unchanged.
    """
    parent_countries = (row['Birth_Country (Father)'], row['Birth_Country (Mother)'])
    own_country = row['Birth_Country']
    if 'US' not in parent_countries:
        return own_country
    return own_country + '(US)'

# Source columns that are removed once the derived features exist.
# Household_Status is dropped because of multicollinearity (per the original note).
columns_to_drop = ['Household_Status', 'Birth_Country (Father)', 'Citizenship', 'Gains', 'Dividends', 'Losses',
                   'Birth_Country', 'Birth_Country (Mother)']

# Apply the identical feature engineering to the training features and the test set.
for frame in (X_train, test_data):
    # Append "(US)" to Birth_Country when either parent's birth country is 'US'.
    frame['Birth_Country'] = frame.apply(add_us_if_parent_is_us, axis=1)

    # Combine the (tagged) birth country with the citizenship into one feature.
    frame['Birth_Citizen'] = frame['Birth_Country'] + '_' + frame['Citizenship']

    # Income other than earned income: gains plus dividends minus losses.
    frame['total_stock'] = frame['Gains'] + frame['Dividends'] - frame['Losses']

    # Remove the columns consumed above (and Household_Status), in place.
    frame.drop(columns=columns_to_drop, inplace=True)

In [29]:
# Split the feature columns by dtype: object-dtype columns are treated as
# categorical, every other column as numerical.
categorical_list = [col for col in X_train.columns if X_train[col].dtypes == 'O']
numerical_list = [col for col in X_train.columns if not (X_train[col].dtypes == 'O')]

print("categorical_list :", categorical_list)
print("numerical_list :", numerical_list)

categorical_list : ['Gender', 'Education_Status', 'Employment_Status', 'Industry_Status', 'Occupation_Status', 'Race', 'Hispanic_Origin', 'Martial_Status', 'Household_Summary', 'Tax_Status', 'Income_Status', 'Birth_Citizen']
numerical_list : ['Age', 'Working_Week (Yearly)', 'total_stock']


In [9]:
# Target encoding (currently disabled; the lines below are kept commented out for reference)
#encoder = TargetEncoder(cols=categorical_list)
#X_train = encoder.fit_transform(X_train, Y_train)
#test_data = encoder.transform(test_data)

In [38]:
# Train a CatBoost regressor on the full training set. The object-dtype columns
# are passed as cat_features, and progress is logged every 100 iterations.
# NOTE(review): RANDOM_SEED is not passed to the model here — confirm that
# relying on CatBoost's own default seed is intended.
model = CatBoostRegressor(learning_rate=0.046)
model.fit(
    X_train,
    Y_train,
    cat_features=categorical_list,
    verbose=100,
)

0:	learn: 693.5329510	total: 41.7ms	remaining: 41.6s
100:	learn: 588.7721325	total: 3.6s	remaining: 32.1s
200:	learn: 582.0795722	total: 7.19s	remaining: 28.6s
300:	learn: 576.9429865	total: 12.1s	remaining: 28.1s
400:	learn: 572.5799919	total: 16.7s	remaining: 25s
500:	learn: 568.6284759	total: 20.8s	remaining: 20.7s
600:	learn: 564.6652131	total: 26.1s	remaining: 17.3s
700:	learn: 560.8909420	total: 30.6s	remaining: 13s
800:	learn: 557.3673823	total: 34.8s	remaining: 8.63s
900:	learn: 553.8805155	total: 40.6s	remaining: 4.46s
999:	learn: 550.9443734	total: 44.7s	remaining: 0us


<catboost.core.CatBoostRegressor at 0x7ca74f6a0700>

In [39]:
# Predict on the test set and replace negative predictions with 0.
predictions = np.clip(model.predict(test_data), 0, None)

# Fill the sample submission's Income column with the predictions and save it.
submission_file = 'submission_소득예측a.csv'
submission = pd.read_csv('/content/drive/MyDrive/소득 예측/sample_submission.csv')
submission['Income'] = predictions
submission.to_csv(submission_file, index=False)

# Trigger a browser download of the saved file (Colab-only helper).
from google.colab import files
files.download(submission_file)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>