# Voting ensemble (LightGBM, XGBoost, CatBoost) tuned with Optuna

In [None]:
# Mount Google Drive at /content/drive (Colab only); the CSV paths used later in
# this notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install xgboost catboost  # XGBoost와 CatBoost 설치
!pip install optuna

Collecting catboost
  Downloading catboost-1.2.2-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.2
Collecting optuna
  Downloading optuna-3.5.0-py3-none-any.whl (413 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m413.4/413.4 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.13.1-py3-none-any.whl (233 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.4/233.4 kB[0m [31m20.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting colorlog (from optuna)
  Downloading colorlog-6.8.2-py3-none-any.whl (11 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.2-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.7/78.7 kB[0m [31m11.4 MB/s[0m

In [None]:
import os
import gc
import re
import pickle
import joblib
import pandas as pd
import numpy as np
import random

import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings("ignore", category=ConvergenceWarning)

from sklearn.model_selection import train_test_split
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor  # XGBoost
from catboost import CatBoostRegressor  # CatBoost
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import PolynomialFeatures

In [None]:
def seed_everything(seed):
    """Apply one seed to the random sources this notebook touches.

    Seeds Python's ``random`` module and NumPy's global generator, and writes
    the seed into the ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: Seed value handed to ``random.seed`` and ``np.random.seed``.
    """
    random.seed(seed)
    # Environment variables are text, hence the str() conversion.
    os.environ.update(PYTHONHASHSEED=str(seed))
    np.random.seed(seed)


seed_everything(42)  # fix the seed

In [None]:
# Locations of the train / test CSV files on the mounted Drive
train_data_path = '/content/drive/My Drive/웹 로그/train.csv'
test_data_path = '/content/drive/My Drive/웹 로그/test.csv'

# Read both files, train first and test second
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in (train_data_path, test_data_path)
)

In [None]:
# Count the missing values in every column of the training set
train_missing_values = train_df.isna().sum()
print("Train 데이터셋의 결측값:")
# Report only the columns that actually contain missing values
print(train_missing_values.loc[train_missing_values.gt(0)])

Train 데이터셋의 결측값:
keyword          137675
referral_path    161107
dtype: int64


In [None]:
# Impute missing values in both frames with the TRAIN mode of each column, so the
# test set is filled using statistics taken from the training data only.
# The filled column is assigned back to the frame on purpose: the chained form
# `df[col].fillna(..., inplace=True)` acts on a temporary Series under pandas
# Copy-on-Write and would leave the DataFrame itself unfilled.
for col in ['keyword', 'referral_path']:
    # mode() skips NaN by default, so this is the most frequent observed value.
    train_mode = train_df[col].mode()[0]
    train_df[col] = train_df[col].fillna(train_mode)
    test_df[col] = test_df[col].fillna(train_mode)

In [None]:
# Drop the identifier columns from both frames before building the feature matrix.
# The test sessionIDs are set aside first because the submission file is keyed on them.
id_columns = ['sessionID', 'userID']
train_data = train_df.drop(columns=id_columns)
test_ids = test_df['sessionID']
test_data = test_df.drop(columns=id_columns)

In [None]:
# Separate the feature columns (X) from the TARGET column (Y)
X = train_data.drop('TARGET', axis=1)
Y = train_data.loc[:, 'TARGET']

In [None]:
# Partition the feature columns by dtype: object columns are treated as
# categorical, every other dtype as numerical.
categorical_list = []
numerical_list = []

for column_name, column_dtype in X.dtypes.items():
    bucket = categorical_list if column_dtype == 'O' else numerical_list
    bucket.append(column_name)

print("categorical_list :", categorical_list)
print("numerical_list :", numerical_list)

categorical_list : ['browser', 'OS', 'device', 'continent', 'subcontinent', 'country', 'traffic_source', 'traffic_medium', 'keyword', 'referral_path']
numerical_list : ['new', 'quality', 'duration', 'bounced', 'transaction', 'transaction_revenue']


In [None]:
# Fit one LabelEncoder per categorical column on the training features and keep
# it under the column name so the same fitted encoder can be reused later.
# LabelEncoder.fit returns the encoder itself, so it can be stored directly.
encoders = {col: LabelEncoder().fit(X[col]) for col in categorical_list}

In [None]:
def safe_transform(col, encoder, default_val=-1):
    """Encode a column with a fitted encoder, sending unseen labels to a default code.

    Args:
        col: pandas Series of raw labels to encode.
        encoder: Fitted encoder exposing ``classes_``. A label's code is taken to be
            its position in ``classes_``, which is what ``LabelEncoder.transform``
            returns for that label.
        default_val: Code assigned to any label that is not in ``encoder.classes_``.

    Returns:
        Series with the same index as ``col`` holding the codes.
    """
    # Build the label -> code table once. Scanning a list and calling
    # encoder.transform for every single value is far too slow on large columns;
    # a dict lookup gives the same codes in constant time per value.
    label_to_code = {label: code for code, label in enumerate(encoder.classes_)}
    return col.apply(lambda value: label_to_code.get(value, default_val))

# Encode the categorical columns of both frames with the encoders fitted on the training data.
for col in categorical_list:
    fitted_encoder = encoders[col]
    # Every training label was seen during fit, so a plain transform is enough here.
    X[col] = fitted_encoder.transform(X[col])
    # The test data goes through safe_transform, which substitutes its default
    # code for labels the encoder does not know.
    test_data[col] = safe_transform(test_data[col], fitted_encoder)

In [None]:
import optuna
from sklearn.ensemble import VotingRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.model_selection import cross_val_score

# Objective function that Optuna minimises
def objective(trial, weights=(2, 1, 2)):
    """Score one hyperparameter sample of the LightGBM/XGBoost/CatBoost voting ensemble.

    Args:
        trial: Optuna trial used to sample the hyperparameters of all three models.
        weights: Voting weights in the order (lgbm, xgb, catboost), or None for an
            equal-weight ensemble. The default matches the ``weights=[2, 1, 2]``
            used when this notebook fits the final ensemble, so the score belongs
            to the ensemble that is actually trained rather than to an
            equal-weight variant of it.

    Returns:
        The mean of the per-fold RMSE over a 3-fold cross-validation on the
        notebook-level ``X`` and ``Y``.
    """
    # LightGBM hyperparameters
    lgbm_params = {
        'n_estimators': trial.suggest_int('lgbm_n_estimators', 100, 1000),
        'learning_rate': trial.suggest_float('lgbm_learning_rate', 0.01, 0.1),
        'max_depth': trial.suggest_int('lgbm_max_depth', 3, 20),
        'num_leaves': trial.suggest_int('lgbm_num_leaves', 20, 100),
    }

    # XGBoost hyperparameters
    xgb_params = {
        'n_estimators': trial.suggest_int('xgb_n_estimators', 100, 1000),
        'learning_rate': trial.suggest_float('xgb_learning_rate', 0.01, 0.1),
        'max_depth': trial.suggest_int('xgb_max_depth', 3, 20),
    }

    # CatBoost hyperparameters
    cat_params = {
        'iterations': trial.suggest_int('cat_iterations', 100, 1000),
        'learning_rate': trial.suggest_float('cat_learning_rate', 0.01, 0.1),
        'depth': trial.suggest_int('cat_depth', 3, 10),
    }

    # Instantiate the three base models
    lgbm_model = LGBMRegressor(**lgbm_params)
    xgb_model = XGBRegressor(**xgb_params)
    catboost_model = CatBoostRegressor(**cat_params, verbose=False)

    # Voting regressor, weighted the same way as the final model
    voting_regressor = VotingRegressor(
        estimators=[
            ('lgbm', lgbm_model),
            ('xgb', xgb_model),
            ('catboost', catboost_model)
        ],
        weights=None if weights is None else list(weights),
    )

    # cross_val_score returns negated MSE per fold; flip the sign, take the
    # square root per fold, then average to get the RMSE that is minimised.
    scores = cross_val_score(voting_regressor, X, Y, cv=3, scoring='neg_mean_squared_error', error_score='raise')
    rmse_score = np.mean(np.sqrt(-scores))

    return rmse_score

# Seed the sampler explicitly: Optuna's TPE sampler draws from its own random
# state, so the global seeds set by seed_everything(42) alone do not make the
# sequence of sampled hyperparameters repeatable.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=10)  # run 10 trials

# Summarise the search: number of trials, best score and the parameters behind it
print("Number of finished trials: ", len(study.trials))
print("Best trial:")
trial = study.best_trial

print("  Value: ", trial.value)
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")



In [None]:
from sklearn.ensemble import VotingRegressor

# Rebuild the three regressors from the best hyperparameters found by the study
tuned = study.best_params

best_lgbm_params = dict(
    n_estimators=tuned['lgbm_n_estimators'],
    learning_rate=tuned['lgbm_learning_rate'],
    max_depth=tuned['lgbm_max_depth'],
    num_leaves=tuned['lgbm_num_leaves'],
    n_jobs=-1,
)

best_xgb_params = dict(
    n_estimators=tuned['xgb_n_estimators'],
    learning_rate=tuned['xgb_learning_rate'],
    max_depth=tuned['xgb_max_depth'],
    n_jobs=-1,
)

best_cat_params = dict(
    iterations=tuned['cat_iterations'],
    learning_rate=tuned['cat_learning_rate'],
    depth=tuned['cat_depth'],
    verbose=0,
)

# One instance of each base model
lgbm_model = LGBMRegressor(**best_lgbm_params)
xgb_model = XGBRegressor(**best_xgb_params)
catboost_model = CatBoostRegressor(**best_cat_params)

# Weighted voting ensemble: lgbm and catboost count twice as much as xgb
named_models = [
    ('lgbm', lgbm_model),
    ('xgb', xgb_model),
    ('catboost', catboost_model),
]
voting_regressor = VotingRegressor(estimators=named_models, weights=[2, 1, 2])

# Fit on the full training data, then predict the test set
voting_regressor.fit(X, Y)
predictions = voting_regressor.predict(test_data)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.175124 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1361
[LightGBM] [Info] Number of data points in the train set: 252289, number of used features: 16
[LightGBM] [Info] Start training from score 3.592626


In [None]:
# Pair every test sessionID with its predicted TARGET value
submission = pd.DataFrame({'sessionID': test_ids}).assign(TARGET=predictions)

# Write the submission file without the index column
submission.to_csv('submission_voting_optuna.csv', index=False)


In [None]:
# Download the submission CSV written by the previous cell through the Colab
# file helper (Colab only).
from google.colab import files
files.download('submission_voting_optuna.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>