# 영업 성공 여부 분류 경진대회

## 1. 데이터 확인

### 필수 라이브러리

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

### 데이터 셋 읽어오기

In [2]:
df_train = pd.read_csv("train.csv") # training data
df_test = pd.read_csv("submission.csv") # test data (the rows of the submission file)

In [3]:
# Display the raw training frame as the cell output.
df_train

Unnamed: 0,bant_submit,customer_country,business_unit,com_reg_ver_win_rate,customer_idx,customer_type,enterprise,historical_existing_cnt,id_strategic_ver,it_strategic_ver,...,response_corporate,expected_timeline,ver_cus,ver_pro,ver_win_rate_x,ver_win_ratio_per_bu,business_area,business_subarea,lead_owner,is_converted
0,1.00,/Quezon City/Philippines,AS,0.066667,32160,End-Customer,Enterprise,,,,...,LGEPH,less than 3 months,1,0,0.003079,0.026846,corporate / office,Engineering,0,True
1,1.00,/PH-00/Philippines,AS,0.066667,23122,End-Customer,Enterprise,12.0,,,...,LGEPH,less than 3 months,1,0,0.003079,0.026846,corporate / office,Advertising,1,True
2,1.00,/Kolkata /India,AS,0.088889,1755,End-Customer,Enterprise,144.0,,,...,LGEIL,less than 3 months,1,0,0.003079,0.026846,corporate / office,Construction,2,True
3,1.00,/Bhubaneswar/India,AS,0.088889,4919,End-Customer,Enterprise,,,,...,LGEIL,less than 3 months,1,0,0.003079,0.026846,corporate / office,IT/Software,3,True
4,1.00,/Hyderabad/India,AS,0.088889,17126,Specifier/ Influencer,Enterprise,,,,...,LGEIL,less than 3 months,0,0,0.003079,0.026846,corporate / office,,4,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59294,1.00,/Sląskie/Poland,AS,,33747,End Customer,SMB,,,,...,LGEPL,3 months ~ 6 months,0,0,0.000026,0.028777,public facility,Others,694,False
59295,0.75,/Bogotá DC /Colombia,AS,0.040000,35420,Specifier/ Influencer,Enterprise,,,,...,LGECB,9 months ~ 1 year,0,0,0.000026,0.028777,public facility,,39,False
59296,0.75,/Pisco/Peru,AS,0.040000,19249,Specifier/ Influencer,Enterprise,,,,...,LGEPR,less than 3 months,0,0,0.000026,0.028777,public facility,,125,False
59297,1.00,/santa cruz bolivia/Peru,AS,0.040000,40327,,Enterprise,,,,...,LGEPR,more than a year,0,0,0.000026,0.028777,public facility,,134,False


# expected_timeline

In [4]:
# Count the rows whose expected_timeline is missing.
df_train['expected_timeline'].isnull().sum()

30863

In [5]:
# Keyword-based normalisation rules for free-text expected_timeline values.
# Each entry is (substrings to look for, replacement label). Rules are tried
# in this order and the first rule with a matching substring wins.
_TIMELINE_KEYWORD_RULES = (
    (('requi', 'reqi'), 'requirement'),
    (('no', 'not'), 'no requirement'),
    (('follow',), 'follow'),
    (('details',), 'details shared'),
    (('call back', 'call later', 'call after'), 'call back'),
)


def update_expected_timeline(value):
    """Collapse a free-text timeline value onto a canonical keyword label.

    Args:
        value: A single ``expected_timeline`` cell.

    Returns:
        The replacement label of the first rule in ``_TIMELINE_KEYWORD_RULES``
        that has a substring contained in ``value``; otherwise ``value``
        itself. Anything that is not a ``str`` (NaN, ``None``, numbers) is
        returned unchanged, so a missing value stays the same missing marker
        instead of being rewritten to ``None``.

    NOTE(review): matching is plain substring matching and the 'requirement'
    rule is checked before the 'no requirement' rule, so the literal text
    "no requirement" is labelled 'requirement', and any text containing the
    letters "no" (for example "november") is labelled 'no requirement'.
    Left as-is to keep existing labels stable; confirm whether intended.
    """
    # Only text can contain keywords; pass missing/non-text values through.
    if not isinstance(value, str):
        return value

    for keywords, updated_value in _TIMELINE_KEYWORD_RULES:
        if any(keyword in value for keyword in keywords):
            return updated_value

    return value


# Values that occur exactly once are grouped into the 'Other' category
def classify_single_occurrence_as_other(df):
    """Replace every expected_timeline value seen exactly once with 'Other'.

    Args:
        df: Frame with an ``expected_timeline`` column; it is modified.

    Returns:
        The same ``df`` object, for call chaining.
    """
    timeline = df['expected_timeline']
    frequencies = timeline.value_counts()
    rare_values = frequencies[frequencies == 1].index
    # Vectorised membership test; rows that are not rare are left untouched.
    df.loc[timeline.isin(rare_values), 'expected_timeline'] = 'Other'
    return df



# Applies the keyword rules, then the explicit value-to-label mapping
def apply_mapping_and_fillna(df, timeline_mapping):
    """Normalise the expected_timeline column of ``df``.

    The column is first passed element-wise through
    ``update_expected_timeline``. Each result is then looked up in
    ``timeline_mapping``; values without an entry keep their
    keyword-normalised form.

    Args:
        df: Frame with an ``expected_timeline`` column; it is modified.
        timeline_mapping: Dict from a timeline value to its canonical label.

    Returns:
        The same ``df`` object.
    """
    keyword_normalised = df['expected_timeline'].apply(update_expected_timeline)
    df['expected_timeline'] = keyword_normalised

    looked_up = df['expected_timeline'].map(timeline_mapping)
    # Entries missing from the mapping come back as NaN; restore them.
    df['expected_timeline'] = looked_up.fillna(df['expected_timeline'])
    return df
    
# Explicit value -> canonical label table for expected_timeline.
# apply_mapping_and_fillna looks values up here after the keyword rules of
# update_expected_timeline have run; values without an entry are kept as-is.
timeline_mapping = {
    # Period buckets, in both the spaced and the underscore spelling.
    "less than 3 months": "0-3m",
    "less_than_3_months": "0-3m",
    
    "3 months ~ 6 months": "3-6m",
    "3_months_~_6_months": "3-6m",
    
    "6 months ~ 9 months": "6-9m",
    "6_months_~_9_months": "6-9m",
    
    "9 months ~ 1 year": "9-12m",
    "9_months_~_1_year": "9-12m",
    
    "up to december": "1y",
    "more than a year": "1y",
    "more_than_a_year": "1y",
    
    # Contact-status notes found in the same column.
    "couldn't connect": "no response",
    "didn't respond" : "no response",
    "rnr": "no response",
    # NOTE(review): key is spelled "reponse"; presumably it matches a raw
    # value spelled that way in the data -- confirm before "correcting" it.
    "reponse": "response"
}

# Normalise both frames with the same rules.
df_train = apply_mapping_and_fillna(df_train, timeline_mapping)
df_test = apply_mapping_and_fillna(df_test, timeline_mapping)

# Single-occurrence values are determined per frame, so a value can become
# 'Other' in one frame and stay as-is in the other.
df_train = classify_single_occurrence_as_other(df_train)
df_test = classify_single_occurrence_as_other(df_test)

In [6]:

# Number of distinct expected_timeline values.
unique_values = df_train['expected_timeline'].nunique()
print(f"unique 값: {unique_values}")

# Frequency of every value; dropna=False also lists the missing ones.
value_counts = df_train['expected_timeline'].value_counts(dropna=False)

# Print the result, one "value: count" line per value.
print("빈도:")
for value, count in value_counts.items():
    print(f"{value}: {count}")

unique 값: 28
빈도:
None: 30863
0-3m: 17326
3-6m: 5035
1y: 3028
9-12m: 1107
6-9m: 1102
Other: 176
no requirement: 120
requirement: 112
less than 6 months: 108
follow: 108
etc.: 95
details shared: 45
no response: 17
call back: 17
duplicate lead: 6
quote send: 5
budget issue: 4
december 2022: 3
already touch with customers: 3
assigned to partner. intial meeting done. will convert to opp post complete info: 3
october 2022: 2
fu under progress.: 2
less then 6 months: 2
more then 3 months: 2
price shared: 2
low budget: 2
less than 5 months: 2
fu under progress: 2


# business 관련 결측치 처리

In [7]:
# Count the rows whose business_area is missing.
df_train['business_area'].isnull().sum()

40882

In [8]:
# Count the rows whose business_subarea is missing (before imputation).
df_train['business_subarea'].isnull().sum()

53773

In [9]:
# df_train
grouped_data_train = df_train.groupby('business_area')['business_subarea'].apply(list)
mode_subarea_by_area_train = df_train.groupby('business_area')['business_subarea'].apply(lambda x: x.mode().iloc[0] if not x.mode().empty else None)
df_train['business_subarea'] = df_train.apply(lambda row: mode_subarea_by_area_train.get(row['business_area'], row['business_subarea']), axis=1)

# df_test
grouped_data_test = df_test.groupby('business_area')['business_subarea'].apply(list)
mode_subarea_by_area_test = df_test.groupby('business_area')['business_subarea'].apply(lambda x: x.mode().iloc[0] if not x.mode().empty else None)
df_test['business_subarea'] = df_test.apply(lambda row: mode_subarea_by_area_test.get(row['business_area'], row['business_subarea']), axis=1)



In [10]:
# Re-count the missing business_subarea values after the imputation step.
df_train['business_subarea'].isnull().sum()

40882

## 2. 데이터 전처리

### 2-1. 레이블 인코딩

In [None]:
def label_encoding(series: pd.Series) -> pd.Series:
    """Encode a categorical series as integer codes.

    Every element is first cast to ``str``. The distinct strings are sorted
    and numbered from 0 in that order, and each element is replaced by the
    number assigned to its string form.
    """
    as_text = series.astype(str)
    codes = {label: code for code, label in enumerate(sorted(as_text.unique()))}
    return as_text.map(codes)

In [None]:
# 레이블 인코딩할 칼럼들
# Columns to label-encode
label_columns = [
    "customer_country",
    "business_subarea",
    "business_area",
    "business_unit",
    "customer_type",
    "enterprise",
    "customer_job",
    "inquiry_type",
    "product_category",
    "product_subcategory",
    "product_modelname",
    "customer_country.1",
    "customer_position",
    "response_corporate",
    "expected_timeline",
]

# Stack the train rows on top of the test rows so each column is encoded
# against the values of both frames with one shared code table.
df_all = pd.concat([frame[label_columns] for frame in (df_train, df_test)])

for col in label_columns:
    encoded = label_encoding(df_all[col])
    df_all[col] = encoded

In [None]:
# Display the combined, label-encoded columns.
df_all

다시 학습 데이터와 제출 데이터를 분리합니다.

In [None]:
# df_all holds the train rows first, followed by the test rows.
train_rows = len(df_train)
encoded_train = df_all.iloc[:train_rows]
encoded_test = df_all.iloc[train_rows:]

# Copy every encoded column back into the frame it came from.
for col in label_columns:
    df_train[col] = encoded_train[col]
    df_test[col] = encoded_test[col]

In [None]:
# Display the training frame after label encoding.
df_train

In [None]:
# Display the test frame after label encoding.
df_test

### 2-2. 학습, 검증 데이터 분리

In [None]:
# Split the training frame into features (everything except is_converted)
# and the is_converted target, holding out 20% of the rows for validation.
# The rows are shuffled with a fixed seed so the split is repeatable.
x_train, x_val, y_train, y_val = train_test_split(
    df_train.drop("is_converted", axis=1),
    df_train["is_converted"],
    test_size=0.2,
    shuffle=True,
    random_state=684050,
)





## 3. 모델 학습

### 모델 정의 

In [None]:
# Decision tree with all default hyper-parameters.
# NOTE(review): no random_state is passed, so repeated runs are not pinned to
# a seed -- consider setting one if results need to be reproducible.
model = DecisionTreeClassifier()

### 모델 학습

In [None]:
# Fit on the training split; remaining missing feature values are set to 0.
model.fit(x_train.fillna(0), y_train)

### 모델 성능 보기

In [None]:
def get_clf_eval(y_test, y_pred=None):
    """Print the confusion matrix, accuracy, precision, recall and F1 score.

    Args:
        y_test: True labels.
        y_pred: Predicted labels for the same rows.

    All metrics are computed before anything is printed.
    """
    # labels=[True, False] lists the True class first in the matrix.
    confusion = confusion_matrix(y_test, y_pred, labels=[True, False])

    # (output template, score) pairs, in the order they are printed.
    report = (
        ("\n정확도: {:.4f}", accuracy_score(y_test, y_pred)),
        ("정밀도: {:.4f}", precision_score(y_test, y_pred, labels=[True, False])),
        ("재현율: {:.4f}", recall_score(y_test, y_pred)),
        ("F1: {:.4f}", f1_score(y_test, y_pred, labels=[True, False])),
    )

    print("오차행렬:\n", confusion)
    for template, score in report:
        print(template.format(score))

In [None]:
# Predict on the validation split (missing values set to 0, as in fitting)
# and print the evaluation metrics.
pred = model.predict(x_val.fillna(0))
get_clf_eval(y_val, pred)

## 4. 제출하기

### 테스트 데이터 예측

In [None]:
# 예측에 필요한 데이터 분리
x_test = df_test.drop(["is_converted", "id"], axis=1)

In [None]:
# Predict on the test features (missing values set to 0, as in fitting).
test_pred = model.predict(x_test.fillna(0))
sum(test_pred) # number of rows predicted as True

### 제출 파일 작성

In [None]:
# 제출 데이터 읽어오기 (df_test는 전처리된 데이터가 저장됨)
df_sub = pd.read_csv("submission.csv")
df_sub["is_converted"] = test_pred

# 제출 파일 저장
df_sub.to_csv("submission.csv", index=False)

**우측 상단의 제출 버튼을 클릭해 결과를 확인하세요**