# 영업 성공 여부 분류 경진대회

## 1. 데이터 확인

### 필수 라이브러리

In [1]:
import pandas as pd
import numpy as np
# 모델링
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
import optuna
from optuna.samplers import TPESampler

from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    mean_squared_error
)
from context_data import (
    context_data_load,
    context_data_split
)

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
  from .autonotebook import tqdm as notebook_tqdm


### 데이터 셋 읽어오기 / 학습, 검증 데이터 분리

In [2]:
# DATA LOAD
data = context_data_load()

# Train/Valid Split
data = context_data_split(data)

  'train':context_train.fillna(0),
  'test':context_test.fillna(0),


In [3]:
data['train'] # 학습용 데이터 살펴보기

Unnamed: 0,bant_submit,customer_country,business_unit,com_reg_ver_win_rate,customer_idx,customer_type,enterprise,historical_existing_cnt,customer_job,lead_desc_length,...,product_count,timeline_count,idit_all,bant_submit_count,com_reg_count,idx_count,lead_log,lead_count,enterprise_count,enterprise_weight
0,1.00,0,0,0.066667,0,0,0,0.0,0,62,...,0,0,0,0,0,0,4.127134,0,0,0.0
1,1.00,0,0,0.066667,1,0,0,12.0,1,96,...,0,0,0,0,0,0,4.564348,0,1,0.0
2,1.00,1,0,0.088889,2,0,0,144.0,2,56,...,0,0,0,0,0,0,4.025352,0,1,0.0
3,1.00,1,0,0.088889,3,0,0,0.0,3,44,...,0,0,0,0,0,1,3.784190,0,0,0.0
4,1.00,1,0,0.088889,4,1,0,0.0,4,97,...,0,0,0,0,0,1,4.574711,0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55753,1.00,42,0,0.000000,35096,0,1,0.0,2,200,...,0,0,0,0,1,1,5.298317,0,0,0.0
55754,0.75,9,0,0.040000,35097,1,0,0.0,3,70,...,0,0,0,0,1,1,4.248495,0,0,0.0
55755,0.75,38,0,0.040000,35098,1,0,0.0,2,34,...,0,0,0,0,1,1,3.526361,0,0,0.0
55756,1.00,38,0,0.040000,35099,4,0,0.0,3,377,...,0,0,0,0,1,1,5.932245,0,0,0.0


In [29]:
x_train, x_val, y_train, y_val = data['X_train'], data['X_valid'], data['y_train'], data['y_valid']

In [30]:
selected_features = ['customer_idx', 'customer_type', 'enterprise_count', 'lead_owner', 'response_corporate', 'com_reg_ver_win_rate', 'com_reg_count', 'bant_submit', 'customer_country', 'inquiry_type', 'product_subcategory', 'product_category', 'business_unit', 'enterprise_weight', 'business_subarea', 'lead_log', 'idx_count', 'historical_existing_cnt', 'lead_desc_length', 'category']

In [31]:
# 선택된 변수로만 데이터셋 구성
x_train = x_train[selected_features]
x_val = x_val[selected_features]

## 3. 모델

### 3-1. RandomForestRegressor 학습

#### 모델 정의 

In [7]:
# Instantiate model 
model = RandomForestRegressor(random_state=42)

#### 모델 학습

In [10]:
# Train the model on training data
model.fit(x_train.fillna(0), y_train)

### 3-2. RandomForestRegressor 하이퍼파라미터 튜닝

In [6]:
def objective(trial):
    model = RandomForestRegressor(
        n_estimators=trial.suggest_int("n_estimators", 100, 500),
        max_depth=trial.suggest_int("max_depth", 3, 6),
        min_samples_split=trial.suggest_int("min_samples_split", 2, 10),
        min_samples_leaf=trial.suggest_int("min_samples_leaf", 1, 10),
        bootstrap=trial.suggest_categorical("bootstrap", [True, False]),
        random_state=42  # Setting random_state for reproducibility
    )

    # 새로운 데이터셋으로 모델 재학습  
    model.fit(x_train, y_train)
    y_pred = model.predict(x_val)
    return mean_squared_error(y_val, y_pred)

optuna.logging.set_verbosity(optuna.logging.WARNING)

sampler = TPESampler(seed=1)
study = optuna.create_study(study_name="random_forest_regressor", direction="minimize", sampler=sampler)
study.optimize(objective, n_trials=1000)

print("Number of finished trials: ", len(study.trials))
print("Best trial:")
trial = study.best_trial
print("  Value: ", trial.value)
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

Number of finished trials:  50
Best trial:
  Value:  0.060331708105276965
  Params: 
    n_estimators: 260
    max_depth: 6
    min_samples_split: 6
    min_samples_leaf: 2
    bootstrap: True


In [7]:
model = RandomForestRegressor(**trial.params, verbose=False)

In [8]:
model.fit(x_train, y_train)

### 3-3. Randomclassifier 학습

#### 모델 정의 

In [None]:
# Instantiate model 
model = RandomForestClassifier(random_state=42)

#### 모델 학습

In [None]:
# Train the model on training data
model.fit(x_train.fillna(0), y_train)

### 3-4. Randomclassifier 하이퍼파라미터 튜닝

In [5]:
def objective(trial):
    model = RandomForestClassifier(
        n_estimators=trial.suggest_int("n_estimators", 100, 500),
        max_depth=trial.suggest_int("max_depth", 3, 6),
        min_samples_split=trial.suggest_int("min_samples_split", 2, 10),
        min_samples_leaf=trial.suggest_int("min_samples_leaf", 1, 10),
        bootstrap=trial.suggest_categorical("bootstrap", [True, False]),
        random_state=42  # Setting random_state for reproducibility
    )
    model.fit(x_train, y_train)
    y_pred = model.predict(x_val)
    return f1_score(y_val, y_pred, labels=[True, False])

optuna.logging.set_verbosity(optuna.logging.WARNING)

sampler = TPESampler(seed=1)
study = optuna.create_study(study_name="random_forest_classifier", direction="maximize", sampler=sampler)
study.optimize(objective, n_trials=500)

print("Number of finished trials: ", len(study.trials))
print("Best trial:")
trial = study.best_trial
print("  Value: ", trial.value)
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

Number of finished trials:  500
Best trial:
  Value:  0.9118266626542281
  Params: 
    n_estimators: 500
    max_depth: 6
    min_samples_split: 9
    min_samples_leaf: 9
    bootstrap: False


In [6]:
model = RandomForestClassifier(**trial.params, verbose=False)

In [7]:
model.fit(x_train, y_train)

### 모델 성능 보기

In [9]:
def get_clf_eval(y_test, y_pred=None):
    confusion = confusion_matrix(y_test, y_pred, labels=[True, False])
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, labels=[True, False])
    recall = recall_score(y_test, y_pred)
    F1 = f1_score(y_test, y_pred, labels=[True, False])

    print("오차행렬:\n", confusion)
    print("\n정확도: {:.4f}".format(accuracy))
    print("정밀도: {:.4f}".format(precision))
    print("재현율: {:.4f}".format(recall))
    print("F1: {:.4f}".format(F1))

In [9]:
# classifier
pred = model.predict(x_val.fillna(0))
#pred  = pred.astype(bool)
get_clf_eval(y_val, pred)

오차행렬:
 [[9108 1124]
 [ 626 9606]]

정확도: 0.9145
정밀도: 0.9357
재현율: 0.8901
F1: 0.9124


In [10]:
#regressor
pred = model.predict(x_val)
pred = [1 if x >= 0.5 else 0 for x in pred]
get_clf_eval(y_val, pred)

오차행렬:
 [[9589  639]
 [ 991 9237]]

정확도: 0.9203
정밀도: 0.9063
재현율: 0.9375
F1: 0.9217


## 4. 제출하기

### 테스트 데이터 예측

In [18]:
# 예측에 필요한 데이터 분리
test_pred = model.predict(data['test'].drop(["is_converted", "id"], axis=1))

In [None]:
#classifier
sum(test_pred) # True로 예측된 개수

1391

In [12]:
#Regressor
test_pred = [True if x >= 0.5 else False for x in test_pred]
sum(test_pred) # True로 예측된 개수

2370

In [25]:
# 변수명을 가져오기
feature_names = x_train.columns

# 변수 중요도 출력
importances = model.feature_importances_

# 변수명과 중요도
for feature, imp in zip(feature_names, importances):
    print(f"{feature}: {imp}")

customer_idx: 0.010287215137597512
customer_type: 0.004820149328195799
enterprise_count: 0.002972480875195519
lead_owner: 0.018579840918996716
response_corporate: 0.3039427262918603
com_reg_ver_win_rate: 0.26429894692852884
com_reg_count: 3.348078788017439e-05
bant_submit: 0.0012059637410076977
customer_country: 5.366722707594618e-05
inquiry_type: 0.0010667270649794012
product_subcategory: 0.004759156633706606
product_category: 0.0032401647307066213
business_unit: 0.0043142482454008235
enterprise_weight: 3.2370322408075264e-05
business_subarea: 0.0001855605595239173
lead_log: 0.0292957054920657
idx_count: 0.00014171545359054573
historical_existing_cnt: 1.674132730147775e-05
lead_desc_length: 3.3005181397894426e-06
category: 5.628071316334751e-05


In [28]:
len(feature_names), len(importances)

(20, 35)

In [27]:
import matplotlib.pyplot as plt

# 변수 중요도를 가져와 DataFrame으로 변환
importances_df = pd.DataFrame({'Feature': x_train.columns, 'Importance': model.feature_importances_})

# 변수 중요도를 Importance 기준으로 내림차순 정렬
importances_df = importances_df.sort_values(by='Importance', ascending=True)

# 변수 중요도를 막대 그래프로 시각화
plt.figure(figsize=(10, 6))
plt.barh(importances_df['Feature'], importances_df['Importance'], color='purple')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.title('Feature Importance')
plt.show()

ValueError: All arrays must be of the same length

### 제출 파일 작성

In [23]:
# 제출 데이터 읽어오기 (df_test는 전처리된 데이터가 저장됨)
df_sub = pd.read_csv("submission.csv")
df_sub["is_converted"] = test_pred
df_sub["is_converted"] = df_sub["is_converted"].astype(bool)
df_sub

Unnamed: 0,id,bant_submit,customer_country,business_unit,com_reg_ver_win_rate,customer_idx,customer_type,enterprise,historical_existing_cnt,id_strategic_ver,...,response_corporate,expected_timeline,ver_cus,ver_pro,ver_win_rate_x,ver_win_ratio_per_bu,business_area,business_subarea,lead_owner,is_converted
0,19844,0.00,/ / Brazil,ID,0.073248,47466,End Customer,Enterprise,53.0,,...,LGESP,,1,0,0.001183,0.049840,retail,Electronics & Telco,278,0.703718
1,9738,0.25,400 N State Of Franklin Rd Cloud IT / Johnson...,IT,,5405,End Customer,SMB,,,...,LGEUS,,0,0,0.000013,,transportation,Others,437,0.813999
2,8491,1.00,/ / U.A.E,ID,,13597,Specifier/ Influencer,SMB,,,...,LGEGF,less than 3 months,0,0,0.000060,0.131148,hospital & health care,General Hospital,874,0.654730
3,19895,0.50,/ Madison / United States,ID,0.118644,17204,,Enterprise,,,...,LGEUS,more than a year,0,0,0.001183,0.049840,retail,,194,0.264559
4,10465,1.00,/ Sao Paulo / Brazil,ID,0.074949,2329,End Customer,Enterprise,2.0,1.0,...,LGESP,less than 3 months,1,1,0.003079,0.064566,corporate / office,Engineering,167,0.731868
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5266,13855,0.50,/São Paulo/Brazil,AS,,40292,,Enterprise,10.0,,...,LGESP,,0,0,,,,,97,0.054560
5267,7979,0.25,General / / United States,IT,,47466,,Enterprise,0.0,,...,LGEUS,,0,0,,,,,438,0.856750
5268,12887,0.75,/ OURO BRANCO / Brazil,AS,,46227,Specifier/ Influencer,Enterprise,,,...,LGESP,less than 3 months,0,0,,,,,97,0.103148
5269,17530,0.00,/ / Germany,IT,,45667,End Customer,SMB,,,...,LGEDG,,0,0,,,,,429,0.663030


In [17]:
# 제출 파일 저장
df_sub.to_csv("submission.csv", index=False)

In [15]:
# 제출 파일 저장 (classifier)
df_sub.to_csv("submission_rfc.csv", index=False)

In [24]:
# 제출 파일 저장 (regressor)
df_sub.to_csv("submission_rfr.csv", index=False)

**우측 상단의 제출 버튼을 클릭해 결과를 확인하세요**