# 영업 성공 여부 분류 경진대회

## 1. 데이터 확인

### 필수 라이브러리

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)

from context_data import (
    context_data_load,
    context_data_split
)

import lightgbm as lgb
import optuna

### 데이터 셋 읽어오기 / 학습, 검증 데이터 분리

In [None]:
# Load the competition data and hand it straight to the train/validation
# splitter; both helpers come from the project-local `context_data` module.
data = context_data_split(context_data_load())

In [None]:
data["train"]  # display the training data for a quick look

In [None]:
# Pull the feature/label splits out of the prepared data dictionary.
x_train, x_val, y_train, y_val = (
    data[key] for key in ("X_train", "X_valid", "y_train", "y_valid")
)

## 3. 모델 학습

### 모델 정의/학습

In [None]:
# Wrap both splits as LightGBM datasets, declaring the categorical columns.
train_data = lgb.Dataset(
    data=x_train,
    label=y_train,
    categorical_feature=data["cat_columns"],
)
validation_data = lgb.Dataset(
    data=x_val,
    label=y_val,
    categorical_feature=data["cat_columns"],
)

In [None]:
# Baseline LightGBM configuration used for the first, untuned model.
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'binary_error',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0,
    # The key must be spelled exactly (no trailing space); otherwise LightGBM
    # does not recognise it as the boosting-round setting.
    'num_boost_round': 100,
    # 'device': 'gpu',  # optional: enable to train on GPU
}

In [None]:
# Fit the baseline booster, passing the validation set for evaluation.
model = lgb.train(
    params=params,
    train_set=train_data,
    valid_sets=[validation_data],
)

### 하이퍼파라미터 튜닝

In [None]:
def objective(trial):
    """Optuna objective: train LightGBM with sampled parameters and score it.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        F1 score of the thresholded predictions on the validation split
        (``x_val`` / ``y_val``).
    """
    param = {
        # Fixed settings: these are not sampled through the trial, so they do
        # not appear in ``study.best_params``.
        "objective": "binary",
        "metric": "binary_logloss",
        "verbosity": -1,
        "boosting_type": "gbdt",
        # Single-choice categorical: always False, but recorded with the
        # trial's parameters.
        'feature_pre_filter': trial.suggest_categorical('feature_pre_filter', [False]),
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
    }

    # Declare the categorical columns exactly as the other training cells do,
    # so the tuned parameters are evaluated on the same data representation
    # the final model is trained on.
    train_set = lgb.Dataset(
        x_train, label=y_train, categorical_feature=data['cat_columns']
    )

    booster = lgb.train(param, train_set)

    # Use the same ">= 0.5" cut-off as the evaluation cells (np.rint would
    # round an exact 0.5 down to 0).
    pred_labels = (booster.predict(x_val) >= 0.5).astype(int)
    return f1_score(y_val, pred_labels)


In [None]:
# Run 100 Optuna trials, looking for the parameters that maximise the value
# returned by `objective`.
study = optuna.create_study(direction="maximize")
study.optimize(func=objective, n_trials=100)

In [None]:
# Report the outcome of the hyperparameter search.
print("Best parameters:", study.best_params)  # sampled values of the best trial
print("Best value:", study.best_value)  # objective value reached by that trial
print("Best trial:", study.best_trial)  # full record of the best trial

In [None]:
# Rebuild the LightGBM datasets with the categorical columns declared.
train_data = lgb.Dataset(x_train, label=y_train, categorical_feature = data['cat_columns'])
validation_data = lgb.Dataset(x_val, label=y_val, categorical_feature = data['cat_columns'])

# `study.best_params` holds only the values sampled via `trial.suggest_*`.
# The fixed settings used inside `objective` must be added back; without
# them LightGBM would fall back to its default (regression) objective instead
# of binary classification.
best_params = {
    "objective": "binary",
    "metric": "binary_logloss",
    "verbosity": -1,
    "boosting_type": "gbdt",
    **study.best_params,
}

model = lgb.train(best_params, train_data)

### 모델 성능 보기

In [None]:
def get_clf_eval(y_test, y_pred=None):
    """Print the confusion matrix, accuracy, precision, recall and F1.

    Args:
        y_test: Ground-truth binary labels.
        y_pred: Predicted binary labels. The ``None`` default is kept for
            backward compatibility only; a value must be supplied.

    Raises:
        ValueError: If ``y_pred`` is not supplied.
    """
    if y_pred is None:
        raise ValueError("y_pred is required to compute classification metrics")

    # labels=[True, False] orders the matrix with the positive class first.
    confusion = confusion_matrix(y_test, y_pred, labels=[True, False])
    accuracy = accuracy_score(y_test, y_pred)
    # With the default binary averaging these scorers report on the positive
    # class and ignore `labels`, so all three are called the same way.
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    F1 = f1_score(y_test, y_pred)

    print("오차행렬:\n", confusion)
    print("\n정확도: {:.4f}".format(accuracy))
    print("정밀도: {:.4f}".format(precision))
    print("재현율: {:.4f}".format(recall))
    print("F1: {:.4f}".format(F1))

In [None]:
# Score the validation split, binarise the predicted scores at 0.5 and
# print the evaluation metrics.
pred = [int(score >= 0.5) for score in model.predict(x_val)]
get_clf_eval(y_val, pred)

## 4. 제출하기

### 테스트 데이터 예측

In [None]:
# Score the test set, leaving out the target and id columns before predicting.
test_pred = model.predict(
    data["test"].drop(columns=["is_converted", "id"])
)

In [None]:
# Binarise the test scores with the same 0.5 cut-off.
test_pred = [int(score >= 0.5) for score in test_pred]
sum(test_pred)  # number of rows predicted as True

### 제출 파일 작성

In [None]:
# Load the submission file and fill its target column with the boolean
# test predictions, then display the frame.
df_sub = pd.read_csv("submission.csv")
df_sub["is_converted"] = np.asarray(test_pred, dtype=bool)
df_sub

In [None]:
# Write the filled-in submission back to disk without the index column.
df_sub.to_csv(path_or_buf="submission.csv", index=False)

**우측 상단의 제출 버튼을 클릭해 결과를 확인하세요**