# 영업 성공 여부 분류 경진대회

## 1. 데이터 확인

### 필수 라이브러리

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


### 데이터 셋 읽어오기

In [2]:
df_train = pd.read_csv("train_fe.csv") # 학습용 데이터
df_test = pd.read_csv("submission_fe.csv") # 테스트 데이터(제출파일의 데이터)

  df_train = pd.read_csv("train_fe.csv") # 학습용 데이터


In [3]:
df_train.head() # 학습용 데이터 살펴보기

Unnamed: 0,bant_submit,customer_country,business_unit,com_reg_ver_win_rate,customer_idx,customer_type,enterprise,historical_existing_cnt,customer_job,lead_desc_length,...,category,product_count,timeline_count,idit_all,bant_submit_count,com_reg_count,idx_count,lead_log,lead_count,enterprise_count
0,1.0,philippines,AS,0.066667,32160,End Customer,Enterprise,0.0,purchasing,62,...,HVAC/ESS,1,0,0,1,1,1,4.127134,1,0
1,1.0,philippines,AS,0.066667,23122,End Customer,Enterprise,12.0,media and communication,96,...,HVAC/ESS,1,0,0,1,1,1,4.564348,1,1
2,1.0,india,AS,0.088889,1755,End Customer,Enterprise,144.0,engineering,56,...,HVAC/ESS,1,0,0,1,1,1,4.025352,1,1
3,1.0,india,AS,0.088889,4919,End Customer,Enterprise,0.0,entrepreneurship,44,...,HVAC/ESS,1,0,0,1,1,0,3.78419,1,0
4,1.0,india,AS,0.088889,17126,"Specifier, Influencer",Enterprise,0.0,consulting,97,...,HVAC/ESS,1,0,0,1,1,0,4.574711,1,0


## 2. 데이터 전처리

### 레이블 인코딩

In [4]:
def label_encoding(series: pd.Series) -> pd.Series:
    """범주형 데이터를 시리즈 형태로 받아 숫자형 데이터로 변환합니다."""

    my_dict = {}

    # 모든 요소를 문자열로 변환
    series = series.astype(str)

    for idx, value in enumerate(sorted(series.unique())):
        my_dict[value] = idx
    series = series.map(my_dict)

    return series

In [5]:
# 레이블 인코딩할 칼럼들
label_columns = [
    "customer_country",
    "business_subarea",
    "business_area",
    "business_unit",
    "customer_type",
    "enterprise",
    "customer_job",
    "inquiry_type",
    "product_category",
    "product_subcategory",
    "product_modelname",
    "customer_position",
    "response_corporate",
    "expected_timeline",
    "category",
    "product_count",
    "timeline_count",
    "idit_all"
]


df_all = pd.concat([df_train[label_columns], df_test[label_columns]])

for col in label_columns:
    df_all[col] = label_encoding(df_all[col])

다시 학습 데이터와 제출 데이터를 분리합니다.

In [6]:
for col in label_columns:  
    df_train[col] = df_all.iloc[: len(df_train)][col]
    df_test[col] = df_all.iloc[len(df_train) :][col]

### 2-2. 학습, 검증 데이터 분리

In [7]:
df_train["is_converted"] = df_train["is_converted"].astype(int)

In [8]:
df_train

Unnamed: 0,bant_submit,customer_country,business_unit,com_reg_ver_win_rate,customer_idx,customer_type,enterprise,historical_existing_cnt,customer_job,lead_desc_length,...,category,product_count,timeline_count,idit_all,bant_submit_count,com_reg_count,idx_count,lead_log,lead_count,enterprise_count
0,1.00,56,0,0.066667,32160,4,0,0.0,22,62,...,0,1,13,0,1,1,1,4.127134,1,0
1,1.00,56,0,0.066667,23122,4,0,12.0,16,96,...,0,1,13,0,1,1,1,4.564348,1,1
2,1.00,30,0,0.088889,1755,4,0,144.0,8,56,...,0,1,13,0,1,1,1,4.025352,1,1
3,1.00,30,0,0.088889,4919,4,0,0.0,9,44,...,0,1,13,0,1,1,0,3.784190,1,0
4,1.00,30,0,0.088889,17126,9,0,0.0,5,97,...,0,1,13,0,1,1,0,4.574711,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59294,1.00,57,0,,33747,4,1,0.0,8,200,...,0,1,13,0,1,0,1,5.298317,1,0
59295,0.75,15,0,0.040000,35420,9,0,0.0,9,70,...,5,1,13,0,1,0,0,4.248495,1,0
59296,0.75,55,0,0.040000,19249,9,0,0.0,8,34,...,0,1,13,0,1,0,0,3.526361,1,0
59297,1.00,55,0,0.040000,40327,0,0,0.0,9,377,...,5,1,13,0,1,0,0,5.932245,1,0


In [9]:
x_train, x_val, y_train, y_val = train_test_split(
    df_train.drop("is_converted", axis=1),
    df_train["is_converted"],
    test_size=0.2,
    shuffle=True,
    random_state=400,
)

## 3-1. 모델 학습

### 모델 정의 

In [17]:
# Import the model we are using
from sklearn.ensemble import RandomForestRegressor

# Instantiate model 
model = RandomForestRegressor(random_state=42)

### 모델 학습

In [18]:
# Train the model on training data
model.fit(x_train.fillna(0), y_train)

## 3-2. 하이퍼파라미터 튜닝

In [11]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import optuna
from optuna.samplers import TPESampler

def objective(trial):
    model = RandomForestRegressor(
        n_estimators=trial.suggest_int("n_estimators", 100, 500),
        max_depth=trial.suggest_int("max_depth", 4, 10),
        min_samples_split=trial.suggest_int("min_samples_split", 2, 10),
        min_samples_leaf=trial.suggest_int("min_samples_leaf", 1, 10),
        bootstrap=trial.suggest_categorical("bootstrap", [True, False]),
        random_state=42  # Setting random_state for reproducibility
    )
    model.fit(x_train, y_train)
    y_pred = model.predict(x_val)
    return mean_squared_error(y_val, y_pred)

optuna.logging.set_verbosity(optuna.logging.WARNING)

sampler = TPESampler(seed=1)
study = optuna.create_study(study_name="random_forest_regressor", direction="minimize", sampler=sampler)
study.optimize(objective, n_trials=100)

print("Number of finished trials: ", len(study.trials))
print("Best trial:")
trial = study.best_trial
print("  Value: ", trial.value)
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

Number of finished trials:  100
Best trial:
  Value:  0.06145980360605766
  Params: 
    n_estimators: 140
    max_depth: 7
    min_samples_split: 4
    min_samples_leaf: 2
    bootstrap: False


In [12]:
model = RandomForestRegressor(**trial.params, verbose=False)

In [28]:
model.fit(x_train, y_train)

### 모델 성능 보기

In [19]:
def get_clf_eval(y_test, y_pred=None):
    confusion = confusion_matrix(y_test, y_pred, labels=[True, False])
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, labels=[True, False])
    recall = recall_score(y_test, y_pred)
    F1 = f1_score(y_test, y_pred, labels=[True, False])

    print("오차행렬:\n", confusion)
    print("\n정확도: {:.4f}".format(accuracy))
    print("정밀도: {:.4f}".format(precision))
    print("재현율: {:.4f}".format(recall))
    print("F1: {:.4f}".format(F1))

In [29]:
pred = model.predict(x_val.fillna(0))
pred  = pred.astype(bool)
get_clf_eval(y_val, pred)

오차행렬:
 [[  947     0]
 [10913     0]]

정확도: 0.0798
정밀도: 0.0798
재현율: 1.0000
F1: 0.1479


## 4. 제출하기

### 테스트 데이터 예측

In [30]:
# 예측에 필요한 데이터 분리
x_test = df_test.drop(["is_converted", "id"], axis=1)

In [32]:
test_pred = model.predict(x_test.fillna(0))
# sum(test_pred) # True로 예측된 개수

In [46]:
# 0~1 값 -> 0, 1 값 기준 0.5 -> T, F True 개수 1000 -> 임계값 찾기
df_sub = pd.read_csv("submission.csv")
def adjust_is_converted(value):
    if value >= 0.085:
        return 1
    else:
        return 0
test_pred = model.predict(x_test.fillna(0))
df_sub["is_converted"] = test_pred
df_sub['is_converted'] = df_sub['is_converted'].apply(adjust_is_converted)
df_sub["is_converted"] = df_sub["is_converted"].astype(bool)
test_pred = df_sub["is_converted"]

In [47]:
sum(df_sub['is_converted'])

1077

### 제출 파일 작성

In [25]:
# 제출 데이터 읽어오기 (df_test는 전처리된 데이터가 저장됨)
df_sub = pd.read_csv("submission.csv")
df_sub["is_converted"] = test_pred
df_sub

Unnamed: 0,id,bant_submit,customer_country,business_unit,com_reg_ver_win_rate,customer_idx,customer_type,enterprise,historical_existing_cnt,id_strategic_ver,...,response_corporate,expected_timeline,ver_cus,ver_pro,ver_win_rate_x,ver_win_ratio_per_bu,business_area,business_subarea,lead_owner,is_converted
0,19844,0.00,/ / Brazil,ID,0.073248,47466,End Customer,Enterprise,53.0,,...,LGESP,,1,0,0.001183,0.049840,retail,Electronics & Telco,278,True
1,9738,0.25,400 N State Of Franklin Rd Cloud IT / Johnson...,IT,,5405,End Customer,SMB,,,...,LGEUS,,0,0,0.000013,,transportation,Others,437,True
2,8491,1.00,/ / U.A.E,ID,,13597,Specifier/ Influencer,SMB,,,...,LGEGF,less than 3 months,0,0,0.000060,0.131148,hospital & health care,General Hospital,874,False
3,19895,0.50,/ Madison / United States,ID,0.118644,17204,,Enterprise,,,...,LGEUS,more than a year,0,0,0.001183,0.049840,retail,,194,False
4,10465,1.00,/ Sao Paulo / Brazil,ID,0.074949,2329,End Customer,Enterprise,2.0,1.0,...,LGESP,less than 3 months,1,1,0.003079,0.064566,corporate / office,Engineering,167,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5266,13855,0.50,/São Paulo/Brazil,AS,,40292,,Enterprise,10.0,,...,LGESP,,0,0,,,,,97,False
5267,7979,0.25,General / / United States,IT,,47466,,Enterprise,0.0,,...,LGEUS,,0,0,,,,,438,True
5268,12887,0.75,/ OURO BRANCO / Brazil,AS,,46227,Specifier/ Influencer,Enterprise,,,...,LGESP,less than 3 months,0,0,,,,,97,False
5269,17530,0.00,/ / Germany,IT,,45667,End Customer,SMB,,,...,LGEDG,,0,0,,,,,429,False


In [48]:
# 제출 파일 저장
df_sub.to_csv("submission.csv", index=False)

**우측 상단의 제출 버튼을 클릭해 결과를 확인하세요**