In [75]:
!pip install optuna
!pip install catboost

In [76]:
pip install -U imbalanced-learn




[notice] A new release of pip is available: 23.2.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip





In [77]:
pip install -U torch





[notice] A new release of pip is available: 23.2.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [78]:
import pandas as pd
import numpy as np
import seaborn as sns
import optuna
import matplotlib.pyplot as plt
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from collections import Counter
import catboost as cb
from catboost import CatBoostClassifier, Pool, cv
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

In [79]:
# 레이블 인코딩할 칼럼들
cat_columns = [
    "customer_country",
    "business_subarea",
    "business_area",
    "business_unit",
    "customer_type",
    "customer_idx",
    "enterprise",
    "customer_job",
    "inquiry_type",
    "product_category",
    "product_subcategory",
    "product_modelname",
    "customer_position",
    "response_corporate",
    "expected_timeline",
    "category",
    "product_count",
    "timeline_count",
    "idit_all",
    "lead_owner",
    "bant_submit_count",
    "com_reg_count",
    "idx_count",
    "lead_count",
    "enterprise_count",
    "enterprise_weight"
]

def index_processing(context_df, train, test, column_name):
    idx = {v:k for k,v in enumerate(context_df[column_name].unique())}
    train.loc[:, column_name] = train[column_name].map(idx)
    test.loc[:, column_name] = test[column_name].map(idx)
    return idx

def process_context_data(train_df, test_df):
    context_df = pd.concat([train_df[cat_columns], test_df[cat_columns]]).reset_index(drop=True)
    idx = {}
    for col in cat_columns:
        idx_name = index_processing(context_df, train_df, test_df, col)
        idx[col+'2idx'] = idx_name
    return idx, train_df, test_df

def context_data_load():
    ######################## DATA LOAD
    train = pd.read_csv('train_final.csv', low_memory=False)
    test = pd.read_csv('submission_final.csv')

    idx, context_train, context_test = process_context_data(train, test)
    field_dims = np.array([len(toidx) for toidx in idx], dtype=np.int32)

    data = {
            'train':context_train.fillna(0),
            'test':context_test.fillna(0),
            'field_dims':field_dims,
            'cat_columns' : cat_columns,
            }


    return data

def context_data_split(data):
    # SMOTE를 사용하여 데이터 오버샘플링
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(data['train'].drop(['is_converted'], axis=1), data['train']['is_converted'])

    # 샘플링된 데이터를 다시 훈련 데이터와 테스트 데이터로 분할
    X_train, X_valid, y_train, y_valid = train_test_split(X_resampled, 
                                                      y_resampled, 
                                                      test_size=0.2, 
                                                      random_state=42, 
                                                      stratify=y_resampled)

    y_train = y_train.astype(np.int32) ; y_valid = y_valid.astype(np.int32)
    data['X_train'], data['X_valid'], data['y_train'], data['y_valid'] = X_train, X_valid, y_train, y_valid
    
    return data


In [85]:
data = context_data_load()
data = context_data_split(data)

  'train':context_train.fillna(0),
  'test':context_test.fillna(0),


In [87]:
# CatBoost 모델 훈련
train_pool = Pool(data['X_train'], label=data['y_train'])
valid_pool = Pool(data['X_valid'], label=data['y_valid'])

In [88]:
def objective_catboost(trial):
    param = {
        "random_state": 42,
        'early_stopping_rounds': 20,
        'loss_function': 'Logloss',
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
        'bagging_temperature': trial.suggest_loguniform('bagging_temperature', 0.01, 100.00),
        "n_estimators": trial.suggest_int("n_estimators", 500, 3000),
        "max_depth": trial.suggest_int("max_depth", 1, 8),
        'random_strength': trial.suggest_int('random_strength', 0, 50),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-8, 3e-5),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 50),
        "max_bin": trial.suggest_int("max_bin", 150, 300),
    }

    model = cb.CatBoostClassifier(**param, verbose=0)
    model.fit(train_pool, eval_set=valid_pool, use_best_model=True)

    # 검증 데이터셋에 대한 예측 및 정확도 계산
    pred = model.predict(data['X_valid'])
    # pred = np.vectorize(lambda x: x.lower())(model.predict(x_val))
    pred = [val == 1 for val in pred]
    F1 = f1_score(data['y_valid'], pred, labels=[True, False])
    return F1

In [89]:
# Optuna 최적화
study = optuna.create_study(direction='maximize')
study.optimize(objective_catboost, n_trials=100)

# 최적의 파라미터와 그때의 정확도 출력
print(f"Best trial: {study.best_trial.params}")

[I 2024-02-20 18:00:09,916] A new study created in memory with name: no-name-1ace51dc-5738-47d2-b979-41d756df2a5c
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'bagging_temperature': trial.suggest_loguniform('bagging_temperature', 0.01, 100.00),
[I 2024-02-20 18:00:53,628] Trial 0 finished with value: 0.9710066982838703 and parameters: {'learning_rate': 0.07762312957102822, 'bagging_temperature': 0.4514295330868797, 'n_estimators': 1694, 'max_depth': 2, 'random_strength': 11, 'l2_leaf_reg': 2.6565538705911684e-05, 'min_child_samples': 34, 'max_bin': 257}. Best is trial 0 with value: 0.9710066982838703.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
  'bagging_temperature': trial.suggest_loguniform('bagging_temperature', 0.01, 100.00),
[I 2024-02-20 18:01:50,448] Trial 1 finished with value: 0.9703232125367287 and parameters: {'learning_rate': 0.02564420640306185, 'bagging_temperature': 0.1213048080692085, 'n_estimators': 2153, 'max

Best trial: {'learning_rate': 0.040075823012502135, 'bagging_temperature': 1.388074362792074, 'n_estimators': 2679, 'max_depth': 8, 'random_strength': 43, 'l2_leaf_reg': 1.814211208460745e-05, 'min_child_samples': 44, 'max_bin': 288}


In [93]:
param = {
    "random_state": 42,
    'early_stopping_rounds': 20,
    'loss_function': 'Logloss',
    'learning_rate': 0.040075823012502135, 
    'bagging_temperature': 1.388074362792074, 
    'n_estimators': 2679, 
    'max_depth': 8, 
    'random_strength': 43, 
    'l2_leaf_reg': 1.814211208460745e-05, 
    'min_child_samples': 44, 
    'max_bin': 288
}
model = cb.CatBoostClassifier(**param)

model.fit(Pool(data['X_train'], label=data['y_train']))

0:	learn: 0.6634309	total: 106ms	remaining: 4m 44s
1:	learn: 0.6422457	total: 189ms	remaining: 4m 12s
2:	learn: 0.6212268	total: 405ms	remaining: 6m
3:	learn: 0.6038759	total: 587ms	remaining: 6m 32s
4:	learn: 0.5801994	total: 703ms	remaining: 6m 16s
5:	learn: 0.5519085	total: 803ms	remaining: 5m 57s
6:	learn: 0.5408572	total: 881ms	remaining: 5m 36s
7:	learn: 0.5142132	total: 996ms	remaining: 5m 32s
8:	learn: 0.5013682	total: 1.07s	remaining: 5m 18s
9:	learn: 0.4897126	total: 1.14s	remaining: 5m 3s
10:	learn: 0.4807653	total: 1.2s	remaining: 4m 50s
11:	learn: 0.4703158	total: 1.29s	remaining: 4m 48s
12:	learn: 0.4614261	total: 1.46s	remaining: 4m 59s
13:	learn: 0.4446748	total: 1.59s	remaining: 5m 2s
14:	learn: 0.4323109	total: 1.68s	remaining: 4m 59s
15:	learn: 0.4230853	total: 1.76s	remaining: 4m 53s
16:	learn: 0.4149646	total: 1.86s	remaining: 4m 50s
17:	learn: 0.4048475	total: 1.98s	remaining: 4m 52s
18:	learn: 0.4012575	total: 2.11s	remaining: 4m 54s
19:	learn: 0.3977366	total: 2

<catboost.core.CatBoostClassifier at 0x225b9dcf390>

In [94]:
def get_clf_eval(y_test, y_pred=None):
    confusion = confusion_matrix(y_test, y_pred, labels=[True, False])
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, labels=[True, False])
    recall = recall_score(y_test, y_pred)
    F1 = f1_score(y_test, y_pred, labels=[True, False])

    print("오차행렬:\n", confusion)
    print("\n정확도: {:.4f}".format(accuracy))
    print("정밀도: {:.4f}".format(precision))
    print("재현율: {:.4f}".format(recall))
    print("F1: {:.4f}".format(F1))

In [95]:
# 테스트 데이터로 예측
pred = model.predict(data['X_valid'])
pred = [val == 1 for val in pred]

In [96]:
get_clf_eval(data['y_valid'], pred)

오차행렬:
 [[10134    98]
 [   92 10140]]

정확도: 0.9907
정밀도: 0.9910
재현율: 0.9904
F1: 0.9907


### 테스트 데이터 예측

In [97]:
# 예측에 필요한 데이터 분리
test_pred = model.predict(data['test'].drop(["is_converted", "id"], axis=1))

In [98]:
test_pred = [val == 1 for val in test_pred]

In [99]:
sum(test_pred) # True로 예측된 개수

547

### 제출 파일 작성

In [100]:
# 제출 데이터 읽어오기 (df_test는 전처리된 데이터가 저장됨)
df_sub = pd.read_csv("submission_fe.csv")
df_sub["is_converted"] = test_pred

# 제출 파일 저장
df_sub.to_csv("submission.csv", index=False)