# CatBoost + Optuna + StratifiedKFold

In [1]:
# Mount Google Drive into the Colab runtime at /content/drive; later cells
# read their CSV files from paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install optuna
!pip install category_encoders
!pip install catboost

Collecting optuna
  Downloading optuna-3.6.1-py3-none-any.whl (380 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m380.1/380.1 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.13.1-py3-none-any.whl (233 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.4/233.4 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting colorlog (from optuna)
  Downloading colorlog-6.8.2-py3-none-any.whl (11 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.2-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.7/78.7 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Mako, colorlog, alembic, optuna
Successfully installed Mako-1.3.2 alembic-1.13.1 colorlog-6.8.2 optuna-3.6.1
Collecting category_encoders
  Downloading category_encoders-2.6.3-py2.py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━

In [3]:
#라이브러리 임포트
import os
import random
import numpy as np
import pandas as pd
import optuna
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error
from category_encoders import TargetEncoder
from catboost import CatBoostRegressor
# warning 제거
import warnings
warnings.filterwarnings("ignore")

# Show every column when a DataFrame is rendered.
pd.set_option('display.max_columns', None)

# Reproducibility: a single seed shared by the random number generators below.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
# NOTE(review): Python reads PYTHONHASHSEED at interpreter start-up, so this
# assignment presumably only affects processes launched from here — confirm.
os.environ['PYTHONHASHSEED'] = f'{RANDOM_SEED}'

# 전처리

In [14]:
# Load the training and test CSV files from the Google Drive folder.
# These absolute paths only resolve after Drive has been mounted at /content/drive.
train_data = pd.read_csv('/content/drive/MyDrive/소득 예측/train.csv')
test_data = pd.read_csv('/content/drive/MyDrive/소득 예측/test.csv')

In [15]:
# Remove the ID column from both frames, in place.
# The drop is in place, so re-running this cell raises KeyError once ID is gone.
for frame in (train_data, test_data):
    frame.drop(columns='ID', inplace=True)

In [16]:
# Separate the target (Income) from the feature columns.
# train_data itself keeps every column; X_train is a new frame without Income.
Y_train = train_data['Income']
X_train = train_data.drop(columns='Income')

In [17]:
def add_us_if_parent_is_us(row):
    """Tag a row's birth country with '(US)' when either parent was born in the US.

    Parameters
    ----------
    row : mapping
        Must provide the keys 'Birth_Country', 'Birth_Country (Father)' and
        'Birth_Country (Mother)' (e.g. a DataFrame row passed by
        ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    The value of ``row['Birth_Country']`` with '(US)' appended if the father's
    or the mother's birth country equals 'US'; otherwise the value unchanged.
    """
    parent_countries = (row['Birth_Country (Father)'], row['Birth_Country (Mother)'])
    birth_country = row['Birth_Country']
    # Guard clause: neither parent is US-born, so nothing to append.
    if 'US' not in parent_countries:
        return birth_country
    return birth_country + '(US)'

# Columns removed once the engineered features exist.
# Author's rationale: Household_Status is dropped because of multicollinearity;
# Losses is dropped because its feature importance is low and it is mostly zero.
columns_to_drop = ['Household_Status', 'Birth_Country (Father)', 'Citizenship', 'Gains', 'Dividends', 'Losses',
                   'Birth_Country', 'Birth_Country (Mother)']

# Apply the same feature engineering, in place, to the training features and
# the test set so that both frames end up with identical columns.
for frame in (X_train, test_data):
    # Append '(US)' to the birth country when either parent is US-born.
    frame['Birth_Country'] = frame.apply(add_us_if_parent_is_us, axis=1)
    # Combine the (tagged) birth country with citizenship into one feature.
    frame['Birth_Citizen'] = frame['Birth_Country'] + '_' + frame['Citizenship']
    # Income that is not earned income: gains plus dividends minus losses.
    frame['total_stock'] = frame['Gains'] + frame['Dividends'] - frame['Losses']
    frame.drop(columns=columns_to_drop, inplace=True)

In [18]:
# Partition the feature columns by dtype: object-dtype columns are treated as
# categorical, every other column as numerical. Column order is preserved.
categorical_list = [name for name in X_train.columns if X_train[name].dtypes == 'O']
numerical_list = [name for name in X_train.columns if name not in categorical_list]

print("categorical_list :", categorical_list)
print("numerical_list :", numerical_list)

categorical_list : ['Gender', 'Education_Status', 'Employment_Status', 'Industry_Status', 'Occupation_Status', 'Race', 'Hispanic_Origin', 'Martial_Status', 'Household_Summary', 'Tax_Status', 'Income_Status', 'Birth_Citizen']
numerical_list : ['Age', 'Working_Week (Yearly)', 'total_stock']


In [19]:
# Target encoding of the categorical columns.
# The encoder is fitted on the training features together with Y_train, and the
# same fitted encoder is then applied to the test set (which has no target).
# NOTE(review): the encoder is fitted on the very rows the model is later trained
# on; consider fitting it inside cross-validation folds to limit target
# leakage — confirm whether this is intended.
encoder = TargetEncoder(cols=categorical_list)
X_train = encoder.fit_transform(X_train, Y_train)
test_data = encoder.transform(test_data)

# 모델

In [20]:
# Hyperparameters for the CatBoost regressor.
catboost_params = {
    'iterations': 1000,
    'learning_rate': 0.02,
}
model = CatBoostRegressor(**catboost_params)
# Train on the encoded training set, logging the learn metric every 100 iterations.
model.fit(X_train, Y_train, verbose=100)

0:	learn: 698.0829310	total: 52.1ms	remaining: 52.1s
100:	learn: 593.5645495	total: 463ms	remaining: 4.12s
200:	learn: 581.6743207	total: 862ms	remaining: 3.42s
300:	learn: 575.3317313	total: 1.22s	remaining: 2.82s
400:	learn: 570.0274475	total: 1.6s	remaining: 2.39s
500:	learn: 565.5266772	total: 1.96s	remaining: 1.95s
600:	learn: 561.3710741	total: 2.32s	remaining: 1.54s
700:	learn: 557.7263720	total: 2.71s	remaining: 1.16s
800:	learn: 554.5271854	total: 3.06s	remaining: 761ms
900:	learn: 551.5073662	total: 3.41s	remaining: 375ms
999:	learn: 548.3040762	total: 3.84s	remaining: 0us


<catboost.core.CatBoostRegressor at 0x789033186920>

In [21]:
from google.colab import files

# Predict on the encoded test set and replace negative predictions with 0.
predictions = np.maximum(0, model.predict(test_data))

# Put the predictions into the sample submission's Income column and save it.
submission = pd.read_csv('/content/drive/MyDrive/소득 예측/sample_submission.csv').assign(Income=predictions)
submission.to_csv('submission_소득예측.csv', index=False)

# Hand the saved file to the Colab download helper.
files.download('submission_소득예측.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [22]:
# Reload the saved submission file and show summary statistics of its
# numeric columns as a sanity check on the predicted Income values.
result = pd.read_csv('submission_소득예측.csv')
result.describe()

Unnamed: 0,Income
count,10000.0
mean,561.260509
std,382.463084
min,0.0
25%,328.264134
50%,577.924167
75%,824.337992
max,3051.797805


In [25]:
# Select the training rows whose Income is at least 3051 and show how their
# Gains values are distributed.
# NOTE(review): 3051 looks like the truncated maximum predicted income reported
# by the describe() output of the submission — confirm.
is_high_income = train_data['Income'] >= 3051
over = train_data[is_high_income]
over['Gains'].value_counts()

Gains
0        96
15024     1
7298      1
7688      1
4386      1
20051     1
4650      1
1797      1
Name: count, dtype: int64

In [27]:
# Distribution of Losses among the rows selected into `over`.
over['Losses'].value_counts()

Losses
0       98
1564     1
1977     1
1887     1
2001     1
2472     1
Name: count, dtype: int64

In [28]:
# Distribution of Dividends among the rows selected into `over`.
over['Dividends'].value_counts()

Dividends
0       78
30       2
125      2
600      2
200      2
10       2
500      2
750      1
400      1
2000     1
25       1
50       1
3600     1
3000     1
7        1
2        1
4000     1
1000     1
1920     1
80       1
Name: count, dtype: int64