# 영업 성공 여부 분류 경진대회

## 1. 데이터 확인

### 필수 라이브러리

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)

from context_data import (
    context_data_load,
    context_data_split,
    context_data_loader
)

### 데이터 셋 읽어오기 / 학습, 검증 데이터 분리

In [3]:
# DATA LOAD
data = context_data_load()

# Train/Valid Split
data = context_data_split(data)

In [4]:
data.keys()

dict_keys(['train', 'test', 'field_dims', 'cat_columns', 'X_train', 'X_valid', 'y_train', 'y_valid'])

In [5]:
data['train'] # 학습용 데이터 살펴보기

Unnamed: 0,bant_submit,customer_country,business_unit,com_reg_ver_win_rate,customer_idx,customer_type,enterprise,historical_existing_cnt,customer_job,lead_desc_length,...,category,product_count,timeline_count,idit_all,bant_submit_count,com_reg_count,idx_count,lead_log,lead_count,enterprise_count
0,1.00,0,0,0.066667,0,0,0,0.0,0,62,...,0,0,0,0,0,0,0,4.127134,0,0
1,1.00,0,0,0.066667,1,0,0,12.0,1,96,...,0,0,0,0,0,0,0,4.564348,0,1
2,1.00,1,0,0.088889,2,0,0,144.0,2,56,...,0,0,0,0,0,0,0,4.025352,0,1
3,1.00,1,0,0.088889,3,0,0,0.0,3,44,...,0,0,0,0,0,0,1,3.784190,0,0
4,1.00,1,0,0.088889,4,1,0,0.0,4,97,...,0,0,0,0,0,0,1,4.574711,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59294,1.00,42,0,0.000000,35107,0,1,0.0,2,200,...,0,0,0,0,0,1,0,5.298317,0,0
59295,0.75,9,0,0.040000,35108,1,0,0.0,3,70,...,1,0,0,0,0,1,1,4.248495,0,0
59296,0.75,38,0,0.040000,35109,1,0,0.0,2,34,...,0,0,0,0,0,1,1,3.526361,0,0
59297,1.00,38,0,0.040000,35110,4,0,0.0,3,377,...,1,0,0,0,0,1,1,5.932245,0,0


In [6]:
x_train, x_val, y_train, y_val = data['X_train'], data['X_valid'], data['y_train'], data['y_valid']

## 3. 모델 학습

### 모델 정의/학습

In [7]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

In [8]:
train_data = lgb.Dataset(x_train, label=y_train, categorical_feature = data['cat_columns'])
validation_data = lgb.Dataset(x_val, label=y_val, categorical_feature = data['cat_columns'])

In [10]:
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'binary_error',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0,
    # 'device' : 'gpu'
}

In [None]:
model = lgb.train(params, train_data, valid_sets=[validation_data])

### 하이퍼파라미터 튜닝

In [100]:
import optuna

In [101]:
def objective(trial):
    param = {
        "objective": "binary",
        "metric": "binary_logloss",
        "verbosity": -1,
        "boosting_type": "gbdt",
        'feature_pre_filter': trial.suggest_categorical('feature_pre_filter', [False]),
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        
    }

    train_data = lgb.Dataset(x_train, label=y_train)
    validation_data = lgb.Dataset(x_val, label=y_val)

    model = lgb.train(param, train_data)
    preds = model.predict(x_val)
    pred_labels = np.rint(preds)
    F1 = f1_score(y_val, pred_labels)
    return F1


In [102]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=1000)

[I 2024-02-19 02:35:30,011] A new study created in memory with name: no-name-57ab0de2-8987-4bd1-b744-7c69cb194b32
[I 2024-02-19 02:35:30,971] Trial 0 finished with value: 0.8169511477339612 and parameters: {'feature_pre_filter': False, 'lambda_l1': 0.41185130378682994, 'lambda_l2': 3.798058279275567, 'num_leaves': 176, 'feature_fraction': 0.44832243640842084, 'bagging_fraction': 0.9507386255191733, 'bagging_freq': 3, 'min_child_samples': 79}. Best is trial 0 with value: 0.8169511477339612.
[I 2024-02-19 02:35:32,243] Trial 1 finished with value: 0.8301449275362319 and parameters: {'feature_pre_filter': False, 'lambda_l1': 0.017632978703315098, 'lambda_l2': 0.00023590720424089502, 'num_leaves': 253, 'feature_fraction': 0.7290219604646175, 'bagging_fraction': 0.85672531298811, 'bagging_freq': 1, 'min_child_samples': 91}. Best is trial 1 with value: 0.8301449275362319.
[I 2024-02-19 02:35:32,935] Trial 2 finished with value: 0.8185507246376812 and parameters: {'feature_pre_filter': False,

In [103]:
print('Best parameters:', study.best_params)
print('Best value:', study.best_value)
print('Best trial:', study.best_trial)

Best parameters: {'feature_pre_filter': False, 'lambda_l1': 8.574435600918194e-08, 'lambda_l2': 6.179813365573579e-07, 'num_leaves': 231, 'feature_fraction': 0.7246549332042913, 'bagging_fraction': 0.8640911398204151, 'bagging_freq': 6, 'min_child_samples': 9}
Best value: 0.8477508650519031
Best trial: FrozenTrial(number=660, state=1, values=[0.8477508650519031], datetime_start=datetime.datetime(2024, 2, 19, 2, 53, 24, 382893), datetime_complete=datetime.datetime(2024, 2, 19, 2, 53, 25, 893962), params={'feature_pre_filter': False, 'lambda_l1': 8.574435600918194e-08, 'lambda_l2': 6.179813365573579e-07, 'num_leaves': 231, 'feature_fraction': 0.7246549332042913, 'bagging_fraction': 0.8640911398204151, 'bagging_freq': 6, 'min_child_samples': 9}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'feature_pre_filter': CategoricalDistribution(choices=(False,)), 'lambda_l1': FloatDistribution(high=10.0, log=True, low=1e-08, step=None), 'lambda_l2': FloatDistribution(high=

In [104]:
train_data = lgb.Dataset(x_train, label=y_train)
validation_data = lgb.Dataset(x_val, label=y_val)

model = lgb.train(study.best_params, train_data)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.015379 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1993
[LightGBM] [Info] Number of data points in the train set: 47439, number of used features: 28
[LightGBM] [Info] Start training from score 0.082274


### 모델 성능 보기

In [110]:
def get_clf_eval(y_test, y_pred=None):
    confusion = confusion_matrix(y_test, y_pred, labels=[True, False])
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, labels=[True, False])
    recall = recall_score(y_test, y_pred)
    F1 = f1_score(y_test, y_pred, labels=[True, False])

    print("오차행렬:\n", confusion)
    print("\n정확도: {:.4f}".format(accuracy))
    print("정밀도: {:.4f}".format(precision))
    print("재현율: {:.4f}".format(recall))
    print("F1: {:.4f}".format(F1))

In [111]:
pred = model.predict(x_val)
pred = [1 if x >= 0.5 else 0 for x in pred]
get_clf_eval(y_val, pred)

오차행렬:
 [[  722   225]
 [   73 10840]]

정확도: 0.9749
정밀도: 0.9082
재현율: 0.7624
F1: 0.8289


## 4. 제출하기

### 테스트 데이터 예측

In [112]:
# 예측에 필요한 데이터 분리
x_test = df_test.drop(["is_converted", "id"], axis=1)

In [113]:
test_pred = model.predict(x_val)
sum(test_pred) # True로 예측된 개수

958.0946318392415

### 제출 파일 작성

In [114]:
# 제출 데이터 읽어오기 (df_test는 전처리된 데이터가 저장됨)
df_sub = pd.read_csv("submission.csv")
df_sub = pd.DataFrame(columns=["is_converted"])
df_sub["is_converted"] = test_pred

# 제출 파일 저장
df_sub.to_csv("submission.csv", index=False)

**우측 상단의 제출 버튼을 클릭해 결과를 확인하세요**