# 영업 성공 여부 분류 경진대회

## 1. 데이터 확인

### 필수 라이브러리

In [None]:
import pandas as pd
import numpy as np
# 모델링
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
import optuna
from optuna.samplers import TPESampler

from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    mean_squared_error
)
from context_data import (
    context_data_load,
    context_data_split
)

### 데이터 셋 읽어오기 / 학습, 검증 데이터 분리

In [None]:
# Load the competition data and split it into train/validation partitions in one step.
# NOTE(review): both helpers come from the project-local `context_data` module; the keys
# of the returned mapping are whatever that module produces — confirm there.
data = context_data_split(context_data_load())

In [None]:
data['train']  # Inspect the training data

In [None]:
# Pull the feature/target partitions out of the split result.
x_train = data['X_train']
x_val = data['X_valid']
y_train = data['y_train']
y_val = data['y_valid']

In [None]:
# Feature columns used for training, validation and prediction.
selected_features = [
    'customer_idx',
    'customer_type',
    'enterprise_count',
    'lead_owner',
    'response_corporate',
    'com_reg_ver_win_rate',
    'com_reg_count',
    'bant_submit',
    'customer_country',
    'inquiry_type',
    'product_subcategory',
    'product_category',
    'business_unit',
    'enterprise_weight',
    'business_subarea',
    'lead_log',
    'idx_count',
    'historical_existing_cnt',
    'lead_desc_length',
    'category',
]

In [None]:
# Restrict both partitions to the selected feature columns.
x_train, x_val = (frame[selected_features] for frame in (x_train, x_val))

## 3. 모델

### 3-1. RandomForestRegressor 학습

#### 모델 정의 

In [None]:
# Baseline regressor with default hyperparameters; seeded for reproducibility.
model = RandomForestRegressor(random_state=42)

#### 모델 학습

In [None]:
# Fit the baseline regressor on the training data; missing values are replaced with 0 first.
model.fit(x_train.fillna(0), y_train)

### 3-2. RandomForestRegressor 하이퍼파라미터 튜닝

In [None]:
def objective(trial):
    """Optuna objective: validation MSE of a RandomForestRegressor.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean squared error of the fitted model's predictions on ``x_val``
        against ``y_val`` (lower is better).
    """
    # Hyperparameters are sampled in this fixed order.
    sampled = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 500),
        "max_depth": trial.suggest_int("max_depth", 3, 6),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 10),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
        "bootstrap": trial.suggest_categorical("bootstrap", [True, False]),
    }

    # random_state is fixed (not searched) so every trial is reproducible.
    regressor = RandomForestRegressor(**sampled, random_state=42)

    # Fit on the training partition and score on the validation partition.
    regressor.fit(x_train, y_train)
    return mean_squared_error(y_val, regressor.predict(x_val))

# Only show Optuna messages at WARNING level and above.
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Seeded TPE sampler; the study minimises the value returned by `objective`.
study = optuna.create_study(
    study_name="random_forest_regressor",
    direction="minimize",
    sampler=TPESampler(seed=1),
)
study.optimize(objective, n_trials=1000)

# Report the outcome; `trial` stays a notebook-level name because later cells read it.
print("Number of finished trials: ", len(study.trials))
print("Best trial:")
trial = study.best_trial
print("  Value: ", trial.value)
print("  Params: ")
for param_name, param_value in trial.params.items():
    print(f"    {param_name}: {param_value}")

In [None]:
# Rebuild the regressor from the best trial's hyperparameters.
# random_state is fixed inside `objective` rather than sampled, so it is not part of
# trial.params and has to be passed again here; without it the refit model is unseeded
# and does not reproduce the configuration that was scored during tuning.
model = RandomForestRegressor(**trial.params, random_state=42, verbose=False)

In [None]:
# Fit the tuned model on the training data (no NaN replacement is applied here).
model.fit(x_train, y_train)

### 3-3. RandomForestClassifier 학습

#### 모델 정의 

In [None]:
# Baseline classifier with default hyperparameters; seeded for reproducibility.
model = RandomForestClassifier(random_state=42)

#### 모델 학습

In [None]:
# Fit the baseline classifier on the training data; missing values are replaced with 0 first.
model.fit(x_train.fillna(0), y_train)

### 3-4. RandomForestClassifier 하이퍼파라미터 튜닝

In [None]:
def objective(trial):
    """Optuna objective: validation F1 score of a RandomForestClassifier.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        F1 score of the fitted model's predictions on ``x_val`` against
        ``y_val`` (higher is better).
    """
    # Hyperparameters are sampled in this fixed order.
    sampled = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 500),
        "max_depth": trial.suggest_int("max_depth", 3, 6),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 10),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
        "bootstrap": trial.suggest_categorical("bootstrap", [True, False]),
    }

    # random_state is fixed (not searched) so every trial is reproducible.
    classifier = RandomForestClassifier(**sampled, random_state=42)
    classifier.fit(x_train, y_train)

    predicted = classifier.predict(x_val)
    # NOTE(review): with f1_score's default binary averaging, `labels` is probably
    # ignored — confirm against the scikit-learn documentation.
    return f1_score(y_val, predicted, labels=[True, False])

# Only show Optuna messages at WARNING level and above.
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Seeded TPE sampler; the study maximises the value returned by `objective`.
study = optuna.create_study(
    study_name="random_forest_classifier",
    direction="maximize",
    sampler=TPESampler(seed=1),
)
study.optimize(objective, n_trials=500)

# Report the outcome; `trial` stays a notebook-level name because later cells read it.
print("Number of finished trials: ", len(study.trials))
print("Best trial:")
trial = study.best_trial
print("  Value: ", trial.value)
print("  Params: ")
for param_name, param_value in trial.params.items():
    print(f"    {param_name}: {param_value}")

In [None]:
# Rebuild the classifier from the best trial's hyperparameters.
# random_state is fixed inside `objective` rather than sampled, so it is not part of
# trial.params and has to be passed again here; without it the refit model is unseeded
# and does not reproduce the configuration that was scored during tuning.
model = RandomForestClassifier(**trial.params, random_state=42, verbose=False)

In [None]:
# Fit the tuned model on the training data (no NaN replacement is applied here).
model.fit(x_train, y_train)

### 모델 성능 보기

In [None]:
def get_clf_eval(y_test, y_pred=None):
    """Print the confusion matrix and binary classification metrics.

    The positive class is ``True`` for precision, recall and F1; the
    confusion matrix is laid out in ``[True, False]`` label order.

    Args:
        y_test: Ground-truth labels.
        y_pred: Predicted labels aligned with ``y_test``. Required; the
            ``None`` default is kept only so the signature stays unchanged.

    Raises:
        ValueError: If ``y_pred`` is not supplied.
    """
    # Fail early with a clear message instead of an opaque error from scikit-learn.
    if y_pred is None:
        raise ValueError("y_pred is required to evaluate predictions")

    confusion = confusion_matrix(y_test, y_pred, labels=[True, False])
    accuracy = accuracy_score(y_test, y_pred)
    # `labels` only applies to non-binary averaging, so all three binary metrics
    # name the positive class through `pos_label` instead, consistently.
    precision = precision_score(y_test, y_pred, pos_label=True)
    recall = recall_score(y_test, y_pred, pos_label=True)
    f1 = f1_score(y_test, y_pred, pos_label=True)

    print("오차행렬:\n", confusion)
    print("\n정확도: {:.4f}".format(accuracy))
    print("정밀도: {:.4f}".format(precision))
    print("재현율: {:.4f}".format(recall))
    print("F1: {:.4f}".format(f1))

In [None]:
# Classifier: predict on the validation set (missing values replaced with 0) and print metrics.
pred = model.predict(x_val.fillna(0))
# Disabled: optional cast of the predictions to bool.
#pred  = pred.astype(bool)
get_clf_eval(y_val, pred)

In [None]:
# Regressor: threshold the continuous predictions at 0.5 to get 0/1 labels, then print metrics.
pred = [int(score >= 0.5) for score in model.predict(x_val)]
get_clf_eval(y_val, pred)

## 4. 제출하기

### 테스트 데이터 예측

In [None]:
# Predict on the test set using exactly the feature columns the model was fitted on.
# Dropping only "is_converted" and "id" keeps every other test column, which no longer
# matches x_train after it has been reduced to `selected_features`.
test_pred = model.predict(data['test'][selected_features])

In [None]:
# Classifier
sum(test_pred)  # Number of rows predicted True

In [None]:
# Regressor: threshold the continuous predictions at 0.5 to get booleans.
test_pred = [bool(score >= 0.5) for score in test_pred]
sum(test_pred)  # Number of rows predicted True

In [None]:
# Importances of the fitted model and the matching feature names
# (both names are read again by a later cell).
importances = model.feature_importances_
feature_names = x_train.columns

# Print one "name: importance" line per feature.
for column, weight in zip(feature_names, importances):
    print(f"{column}: {weight}")

In [None]:
# Sanity check: the number of feature names and of importances, shown side by side.
len(feature_names), len(importances)

In [None]:
import matplotlib.pyplot as plt

# Feature importances as a DataFrame, sorted by ascending importance.
importances_df = pd.DataFrame(
    {'Feature': x_train.columns, 'Importance': model.feature_importances_}
).sort_values(by='Importance', ascending=True)

# Horizontal bar chart of the importances.
plt.figure(figsize=(10, 6))
plt.barh(
    importances_df['Feature'],
    importances_df['Importance'],
    color='purple',
)
plt.title('Feature Importance')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.show()

### 제출 파일 작성

In [None]:
# Read the submission file and store the test predictions as a boolean column.
df_sub = pd.read_csv("submission.csv").assign(is_converted=test_pred)
df_sub = df_sub.astype({"is_converted": bool})
df_sub

In [None]:
# Save the submission file (this overwrites the submission.csv that was read above).
df_sub.to_csv("submission.csv", index=False)

In [None]:
# Save the submission file (classifier predictions).
df_sub.to_csv("submission_rfc.csv", index=False)

In [None]:
# Save the submission file (regressor predictions).
df_sub.to_csv("submission_rfr.csv", index=False)

**우측 상단의 제출 버튼을 클릭해 결과를 확인하세요**