# 라이브러리 및 seed 고정

In [1]:
# Mount Google Drive at /content/drive so the files stored there can be read by path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install catboost category_encoders optuna

Collecting catboost
  Downloading catboost-1.2.5-cp310-cp310-manylinux2014_x86_64.whl (98.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 MB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting category_encoders
  Downloading category_encoders-2.6.3-py2.py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.9/81.9 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting optuna
  Downloading optuna-3.6.1-py3-none-any.whl (380 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m380.1/380.1 kB[0m [31m27.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting graphviz (from catboost)
  Downloading graphviz-0.20.3-py3-none-any.whl (47 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.1/47.1 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
Collecting statsmodels>=0.9.0 (from category_encoders)
  Downloading statsmodels-0.14.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.wh

In [3]:
import os
import random
import warnings

warnings.filterwarnings('ignore')

import category_encoders as ce
import numpy as np
import optuna
import pandas as pd
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.decomposition import PCA
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

In [4]:
def seed_everything(seed):
    """Seed Python's and NumPy's global random generators with ``seed``.

    Also stores ``str(seed)`` in the ``PYTHONHASHSEED`` environment variable.
    NOTE(review): an interpreter normally reads PYTHONHASHSEED at start-up, so
    this presumably only matters for child processes - confirm.

    Args:
        seed: Value handed to ``random.seed`` and ``np.random.seed``.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for reseed in (random.seed, np.random.seed):
        reseed(seed)


seed_everything(42)  # fix the seed for the whole notebook

# CatBoost

In [5]:
# Read the train and test tables (in that order) from the mounted Drive folder.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/웹 광고 클릭률/train.csv',
        '/content/drive/MyDrive/웹 광고 클릭률/test.csv',
    )
)

In [6]:
# Remove the ID column from both frames; the test IDs are kept in their own Series.
test_ids = test_df['ID']
train_df, test_df = (
    frame.drop(columns=['ID'])
    for frame in (train_df, test_df)
)

In [7]:
def process_train(df, order=2):
    """Fill the missing values of the training frame and return the result.

    Every float64 / int64 column is interpolated along the row index with
    ``Series.interpolate(method='polynomial', order=order)``. Any cell that is
    still missing afterwards, in any column, is replaced by the string 'NAN'.

    The work is done on a copy, so the frame passed in is left unchanged and
    the calling cell can be re-run against the same raw data.

    Args:
        df: Training DataFrame.
        order: Polynomial order forwarded to ``Series.interpolate``.

    Returns:
        A new DataFrame without missing values.
    """
    # Copy first: assigning the interpolated columns below would otherwise
    # overwrite the caller's columns.
    df = df.copy()

    numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
    for col in numeric_cols:
        df[col] = df[col].interpolate(method='polynomial', order=order)

    # Whatever is still missing after interpolation becomes the literal 'NAN'.
    return df.fillna('NAN')

In [8]:
def process_test(df, train_df, order=2):
    """Fill the missing values of the test frame and align its dtypes with train.

    Missing values are filled from the test frame itself (no statistic of the
    training data is used): float64 / int64 columns are interpolated along the
    row index with ``Series.interpolate(method='polynomial', order=order)``,
    and every cell still missing afterwards is replaced by the string 'NAN'.

    ``train_df`` is only read for its dtypes: each column present in both
    frames whose training dtype is float64, int64 or object is cast to that
    dtype. Columns with any other training dtype are left as they are.

    The work is done on a copy, so the frame passed in is left unchanged.

    Args:
        df: Test DataFrame.
        train_df: Already processed training DataFrame that provides the
            reference dtypes.
        order: Polynomial order forwarded to ``Series.interpolate``.

    Returns:
        A new DataFrame without missing values.

    Raises:
        ValueError: Propagated from ``astype`` when a column cannot be cast to
            the training dtype.
    """
    # Copy first: assigning the interpolated columns below would otherwise
    # overwrite the caller's columns.
    df = df.copy()

    numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
    for col in numeric_cols:
        df[col] = df[col].interpolate(method='polynomial', order=order)

    # Whatever is still missing after interpolation becomes the literal 'NAN'.
    df = df.fillna('NAN')

    # Give the shared columns the same dtype as in the training frame.
    alignable_dtypes = ('float64', 'int64', 'object')
    for col in train_df.dtypes.index:
        if col not in df.columns:
            continue
        train_dtype = train_df[col].dtype
        for dtype_name in alignable_dtypes:
            if train_dtype == dtype_name:
                df[col] = df[col].astype(dtype_name)
                break

    return df

In [9]:
# Clean the training frame first: process_test reads the dtypes of the
# already processed train_df when it aligns the test columns.
train_df = process_train(train_df)
test_df = process_test(test_df, train_df)

In [10]:
# Encode the string (object) columns with a CatBoost target encoder.
object_cols = train_df.select_dtypes(include='object').columns

# The encoder is fitted on the training columns with Click as the target;
# the fitted encoder is then applied, without a target, to the test columns.
cat_boost_encoder = ce.CatBoostEncoder(cols=object_cols)
train_df[object_cols] = cat_boost_encoder.fit_transform(
    train_df[object_cols],
    y=train_df['Click'],
)
test_df[object_cols] = cat_boost_encoder.transform(test_df[object_cols])


In [11]:
# Split the training frame into the target (Y) and the feature matrix (X).
Y = train_df['Click']
X = train_df.drop('Click', axis=1)


In [None]:
# Apply PCA: the projection is fitted on the training features only and the
# fitted projection is then reused for the test frame.
# n_components=0.95 is a fractional setting - presumably "keep enough
# components for 95% of the variance"; confirm against the scikit-learn docs.
# NOTE(review): X_pca / test_pca are not used by the later cells visible here,
# which fit and predict on X / test_df - confirm whether PCA is still wanted.
pca = PCA(n_components=0.95, random_state=42)
X_pca = pca.fit_transform(X)
test_pca = pca.transform(test_df)

In [None]:
# Build the three classifiers that are blended later; all use the same seed.
lgb_model = LGBMClassifier(random_state=42)
cat_model = CatBoostClassifier(verbose=0, random_state=42)
# use_label_encoder is no longer passed: recent XGBoost releases deprecated and
# then removed it, so supplying it only triggers an "unused parameter" warning.
xgb_model = XGBClassifier(eval_metric='logloss', random_state=42)


In [None]:
# Fit each model on the full training set (LightGBM, CatBoost, XGBoost order).
for estimator in (lgb_model, cat_model, xgb_model):
    estimator.fit(X, Y)

# Predict: keep column 1 of predict_proba for every test row, same model order.
lgb_pred, cat_pred, xgb_pred = (
    fitted.predict_proba(test_df)[:, 1]
    for fitted in (lgb_model, cat_model, xgb_model)
)



In [None]:
# Ensemble prediction: weighted sum of the three probability vectors.
weights = [0.1, 0.8, 0.1]  # weight per model, in (LightGBM, CatBoost, XGBoost) order
ensemble_pred = sum(
    weight * pred
    for weight, pred in zip(weights, (lgb_pred, cat_pred, xgb_pred))
)

In [None]:
# Load the sample submission, set its Click column to the blended predictions,
# write the result to the working directory and display it.
submission = pd.read_csv(
    '/content/drive/MyDrive/웹 광고 클릭률/sample_submission.csv'
).assign(Click=ensemble_pred)

submission.to_csv('click_submission_ensemble.csv', index=False)
submission

In [None]:
# Download the submission file written to the working directory by the previous cell.
from google.colab import files
files.download('click_submission_ensemble.csv')