# 영업 성공 여부 분류 경진대회

## 1. 데이터 확인

### 필수 라이브러리

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

### 데이터 셋 읽어오기

In [2]:
df_train = pd.read_csv("train.csv") # training data
df_test = pd.read_csv("submission.csv") # test data (the rows of the submission file)

In [3]:
# Show the raw training frame as loaded from train.csv.
df_train

Unnamed: 0,bant_submit,customer_country,business_unit,com_reg_ver_win_rate,customer_idx,customer_type,enterprise,historical_existing_cnt,id_strategic_ver,it_strategic_ver,...,response_corporate,expected_timeline,ver_cus,ver_pro,ver_win_rate_x,ver_win_ratio_per_bu,business_area,business_subarea,lead_owner,is_converted
0,1.00,/Quezon City/Philippines,AS,0.066667,32160,End-Customer,Enterprise,,,,...,LGEPH,less than 3 months,1,0,0.003079,0.026846,corporate / office,Engineering,0,True
1,1.00,/PH-00/Philippines,AS,0.066667,23122,End-Customer,Enterprise,12.0,,,...,LGEPH,less than 3 months,1,0,0.003079,0.026846,corporate / office,Advertising,1,True
2,1.00,/Kolkata /India,AS,0.088889,1755,End-Customer,Enterprise,144.0,,,...,LGEIL,less than 3 months,1,0,0.003079,0.026846,corporate / office,Construction,2,True
3,1.00,/Bhubaneswar/India,AS,0.088889,4919,End-Customer,Enterprise,,,,...,LGEIL,less than 3 months,1,0,0.003079,0.026846,corporate / office,IT/Software,3,True
4,1.00,/Hyderabad/India,AS,0.088889,17126,Specifier/ Influencer,Enterprise,,,,...,LGEIL,less than 3 months,0,0,0.003079,0.026846,corporate / office,,4,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59294,1.00,/Sląskie/Poland,AS,,33747,End Customer,SMB,,,,...,LGEPL,3 months ~ 6 months,0,0,0.000026,0.028777,public facility,Others,694,False
59295,0.75,/Bogotá DC /Colombia,AS,0.040000,35420,Specifier/ Influencer,Enterprise,,,,...,LGECB,9 months ~ 1 year,0,0,0.000026,0.028777,public facility,,39,False
59296,0.75,/Pisco/Peru,AS,0.040000,19249,Specifier/ Influencer,Enterprise,,,,...,LGEPR,less than 3 months,0,0,0.000026,0.028777,public facility,,125,False
59297,1.00,/santa cruz bolivia/Peru,AS,0.040000,40327,,Enterprise,,,,...,LGEPR,more than a year,0,0,0.000026,0.028777,public facility,,134,False


# expected_timeline - 4 + business관련 결측치 처리

expected_timeline : 고객이 요청한 처리 일정

1. 처리 날짜 : 0 ~ 3m 0 ~ 6m ...

2. 처리 상태 : requirement, response ...

3. 
4. 리드의 상태 기준 (Lead Status):
5. 기타 기준:

In [4]:
# Count how many rows have no value in the 'expected_timeline' column.
df_train['expected_timeline'].isnull().sum()



30863

In [5]:
from collections import Counter

import nltk
from nltk.tokenize import word_tokenize

# Make sure the 'punkt' NLTK package is available locally.
nltk.download('punkt')

# Keep only the rows that actually carry an 'expected_timeline' value.
df_valid_rows = df_train[df_train['expected_timeline'].notna()]

# Glue every remaining value into one space-separated string.
text_data = ' '.join(df_valid_rows['expected_timeline'].astype(str))

# Split the text into tokens and tally how often each token occurs.
tokens = word_tokenize(text_data)
word_freq = Counter(tokens)

word_freq

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/dessert_gomjelly/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Counter({'months': 30725,
         '3': 22288,
         'than': 20390,
         'less': 17366,
         '~': 7225,
         '6': 6235,
         'year': 4125,
         'a': 3035,
         'more': 3027,
         '9': 2200,
         '1': 1107,
         '.': 511,
         'to': 186,
         'the': 166,
         ',': 146,
         'client': 145,
         'with': 136,
         'not': 124,
         'for': 122,
         'is': 121,
         'details': 117,
         'up': 115,
         'in': 114,
         'followed': 98,
         'being': 97,
         'he': 95,
         'etc': 95,
         'shared': 85,
         'requirement': 81,
         'less_than_3_months': 76,
         'of': 75,
         'customer': 74,
         'and': 74,
         'demo': 65,
         'hence': 64,
         'will': 63,
         'on': 62,
         'system': 59,
         'call': 57,
         'no': 56,
         'are': 54,
         'idb': 50,
         'mailed': 46,
         'have': 43,
         'any': 41,
         'partner': 3

In [6]:
# Number of distinct non-null labels in the column.
unique_values = df_train['expected_timeline'].nunique()
print(f"unique 값: {unique_values}")

# Frequency of every label; missing values are counted as their own entry.
value_counts = df_train['expected_timeline'].value_counts(dropna=False)
print("빈도:")
for value, count in value_counts.items():
    print(f"{value}: {count}")


unique 값: 449
빈도:
nan: 30863
less than 3 months: 17250
3 months ~ 6 months: 5026
more than a year: 3023
9 months ~ 1 year: 1101
6 months ~ 9 months: 1098
less than 6 months: 108
etc.: 95
less_than_3_months: 76
being followed up: 66
being followed up.: 24
the client is not having any requirement hence closig in system. although the details of idb are mailed to client.: 21
no requirement: 12
3_months_~_6_months: 9
didn't respond: 7
duplicate lead: 6
9_months_~_1_year: 6
forwarded to bdo, being followed up: 5
details send: 5
couldn't connect: 5
the client is not having any requirement, he was only browsing through the produt hence closig in system. although the details of idb are mailed to client.: 5
quote send: 5
details shared: 5
rnr: 5
6_months_~_9_months: 4
not answering call: 4
budget issue: 4
more_than_a_year: 4
details shared.: 3
not answering call, lead shared with rd: 3
assigned to partner. intial meeting done. will convert to opp post complete info: 3
not required: 3
already tou

In [None]:
# Exact-match table that folds the different spellings of the same
# 'expected_timeline' answer into one canonical label.
timeline_mapping = {
    # Time windows (space-separated and underscore-separated spellings)
    "less than 3 months": "0-3m",
    "less_than_3_months": "0-3m",

    "3 months ~ 6 months": "3-6m",
    "3_months_~_6_months": "3-6m",

    "6 months ~ 9 months": "6-9m",
    "6_months_~_9_months": "6-9m",

    "9 months ~ 1 year": "9-12m",
    "9_months_~_1_year": "9-12m",

    "up to december": "1y",
    "more than a year": "1y",
    "more_than_a_year": "1y",

    # Could not reach the customer
    "couldn't connect": "no response",
    "rnr": "no response",

    # Misspelling
    "reponse": "response",
}

# Rewrite the labels through the table; values without an entry (including
# NaN) are kept as they are. Both frames are normalized because they are
# label-encoded together later: leaving the test frame raw would give its
# rows codes that never occur in the training data.
for frame in (df_train, df_test):
    frame['expected_timeline'] = [
        timeline_mapping.get(label, label) for label in frame['expected_timeline']
    ]

# 요구 사항 분류 (requirement)
- not, no 로 부정 분류
- requi, reqi 로 시작하는 오타가 있음

In [7]:
# Collapse free-text notes that mention a requirement ('requi' / 'reqi', the
# latter covering a misspelling) into 'no requirement' or 'requirement'.
# The rule is applied to both frames so train and test keep the same labels.
for frame in (df_train, df_test):
    timeline = frame['expected_timeline']
    mentions_requirement = (
        timeline.str.contains('requi', regex=False, na=False)
        | timeline.str.contains('reqi', regex=False, na=False)
    )
    # Plain substring test: 'no' also covers 'not'.
    # NOTE(review): it fires inside words such as "now" or "know" as well —
    # confirm whether a whole-word match is wanted.
    negated = timeline.str.contains('no', regex=False, na=False)

    # Missing values match neither mask (na=False) and stay untouched.
    frame.loc[mentions_requirement & negated, 'expected_timeline'] = 'no requirement'
    frame.loc[mentions_requirement & ~negated, 'expected_timeline'] = 'requirement'


In [18]:
# Notes containing 'not intere' (e.g. "not interested") are folded into
# 'no requirement'. Applied to both frames so train and test stay consistent;
# missing values never match (na=False) and are left untouched.
for frame in (df_train, df_test):
    not_interested = frame['expected_timeline'].str.contains(
        'not intere', regex=False, na=False
    )
    frame.loc[not_interested, 'expected_timeline'] = 'no requirement'

# 커뮤니케이션 사항 분류 (response)
- didn't, not, no 로 부정 분류

In [19]:
# Collapse notes that mention a response ('resp') into 'no response' when a
# negation is present, otherwise into 'response'. Applied to both frames so
# train and test keep the same labels.
for frame in (df_train, df_test):
    timeline = frame['expected_timeline']
    mentions_response = timeline.str.contains('resp', regex=False, na=False)
    # Plain substring tests: 'no' also covers 'not'; "didn't" is checked
    # separately because it does not contain 'no'.
    negated = (
        timeline.str.contains('no', regex=False, na=False)
        | timeline.str.contains("didn't", regex=False, na=False)
    )

    # Missing values match neither mask (na=False) and stay untouched.
    frame.loc[mentions_response & negated, 'expected_timeline'] = 'no response'
    # The label is spelled 'response' (the previous 'reponse' was a typo).
    frame.loc[mentions_response & ~negated, 'expected_timeline'] = 'response'


# 요구 사항 분류 (follow)

In [20]:
# Every note containing 'follow' (e.g. "being followed up") becomes the
# single label 'follow'. Applied to both frames so train and test stay
# consistent; missing values never match (na=False) and are left untouched.
for frame in (df_train, df_test):
    mentions_follow = frame['expected_timeline'].str.contains(
        'follow', regex=False, na=False
    )
    frame.loc[mentions_follow, 'expected_timeline'] = 'follow'


# 요구 사항 분류 (details)

In [21]:
# Every note containing 'details' becomes the single label 'details shared'.
# Applied to both frames so train and test stay consistent; missing values
# never match (na=False) and are left untouched.
for frame in (df_train, df_test):
    mentions_details = frame['expected_timeline'].str.contains(
        'details', regex=False, na=False
    )
    frame.loc[mentions_details, 'expected_timeline'] = 'details shared'


# 커뮤니케이션 분류 (not answering == no response)

In [22]:
# Notes containing 'not answering' are treated as 'no response'.
# Applied to both frames so train and test stay consistent; missing values
# never match (na=False) and are left untouched.
for frame in (df_train, df_test):
    not_answering = frame['expected_timeline'].str.contains(
        'not answering', regex=False, na=False
    )
    frame.loc[not_answering, 'expected_timeline'] = 'no response'

# 커뮤니케이션 분류 (call back, call later)

In [23]:
# Notes containing 'call back' or 'call later' both become 'call back'.
# Applied to both frames so train and test stay consistent; missing values
# never match (na=False) and are left untouched.
for frame in (df_train, df_test):
    timeline = frame['expected_timeline']
    wants_callback = (
        timeline.str.contains('call back', regex=False, na=False)
        | timeline.str.contains('call later', regex=False, na=False)
    )
    frame.loc[wants_callback, 'expected_timeline'] = 'call back'

In [24]:
# Any note containing 'duplicate lead' is reduced to exactly that label.
# Applied to both frames so train and test stay consistent; missing values
# never match (na=False) and are left untouched.
for frame in (df_train, df_test):
    is_duplicate = frame['expected_timeline'].str.contains(
        'duplicate lead', regex=False, na=False
    )
    frame.loc[is_duplicate, 'expected_timeline'] = 'duplicate lead'

In [29]:
# Number of distinct non-null labels left after the clean-up steps.
unique_values = df_train['expected_timeline'].nunique()
print(f"unique 값: {unique_values}")

# Frequency of every label; missing values are counted as their own entry.
value_counts = df_train['expected_timeline'].value_counts(dropna=False)
print("빈도:")
for value, count in value_counts.items():
    print(f"{value}: {count}")


unique 값: 249
빈도:
nan: 30863
0-3m: 17326
3-6m: 5035
1y: 3028
9-12m: 1107
6-9m: 1102
follow: 109
less than 6 months: 108
etc.: 95
no requirement: 73
details shared: 66
no response: 56
requirement: 46
call back: 18
duplicate lead: 8
quote send: 5
budget issue: 4
december 2022: 3
already touch with customers: 3
assigned to partner. intial meeting done. will convert to opp post complete info: 3
fu under progress: 2
not lifted: 2
less then 6 months: 2
more then 3 months: 2
not reachable: 2
price shared: 2
less than 5 months: 2
october 2022: 2
low budget: 2
november 2022: 2
fu under progress.: 2
uttarpradesh state lead: 1
drop, now not intrested: 1
quotation shared with him for idb. he dont have enough fund as of now. he is just evaluating options.: 1
called to know the price of idb: 1
quotation shared for ultra stretch and 49vl5g. he will check with management and update us: 1
quote shared with him and he will update us after demo: 1
update- as per customer no enquiry was made.: 1
quote sha

In [17]:
# Keep only the rows that actually carry an 'expected_timeline' value.
df_valid_rows = df_train[df_train['expected_timeline'].notna()]

# Glue every remaining value into one space-separated string.
text_data = ' '.join(df_valid_rows['expected_timeline'].astype(str))

# Split the text into tokens and tally how often each token occurs.
tokens = word_tokenize(text_data)
word_freq = Counter(tokens)

word_freq

Counter({'0-3m': 17326,
         '3-6m': 5035,
         '1y': 3028,
         '9-12m': 1107,
         '6-9m': 1102,
         '.': 247,
         'no': 143,
         'months': 122,
         'requirement': 119,
         'less': 112,
         'than': 112,
         '6': 110,
         'follow': 109,
         'etc': 95,
         'shared': 90,
         'for': 69,
         'to': 68,
         'with': 67,
         'details': 66,
         'response': 57,
         'the': 50,
         ',': 49,
         'is': 45,
         'in': 42,
         'he': 41,
         'customer': 36,
         'demo': 36,
         'and': 34,
         'call': 32,
         'will': 31,
         'client': 30,
         'not': 28,
         'partner': 25,
         'on': 24,
         'budget': 24,
         'they': 24,
         'already': 23,
         'back': 23,
         'quote': 19,
         'lead': 19,
         'discussed': 17,
         'have': 16,
         'as': 15,
         'we': 15,
         'update': 14,
         'looking': 14,
 

## 2. 데이터 전처리

### 레이블 인코딩

In [None]:
def label_encoding(series: pd.Series) -> pd.Series:
    """Map a categorical series to integer codes.

    Every element is first converted to its string form. The distinct strings
    are sorted and numbered from 0 in that order, and each element is replaced
    by the number of its string. The index of the input is preserved.
    """
    as_text = series.astype(str)
    codes = {label: code for code, label in enumerate(sorted(as_text.unique()))}
    return as_text.map(codes)

In [None]:
# Columns that are label-encoded.
label_columns = [
    "customer_country",
    "business_subarea",
    "business_area",
    "business_unit",
    "customer_type",
    "enterprise",
    "customer_job",
    "inquiry_type",
    "product_category",
    "product_subcategory",
    "product_modelname",
    "customer_country.1",
    "customer_position",
    "response_corporate",
    "expected_timeline",
]

# Stack the train rows on top of the test rows so that both frames are
# encoded with one shared code table per column.
df_all = pd.concat([frame[label_columns] for frame in (df_train, df_test)])

for col in label_columns:
    df_all[col] = label_encoding(df_all[col])

In [None]:
# Show the stacked train + test frame after label encoding.
df_all

다시 학습 데이터와 제출 데이터를 분리합니다.

In [None]:
# The first len(df_train) rows of df_all came from the training frame and the
# rest from the test frame; copy the encoded columns back accordingly.
n_train = len(df_train)
train_part = df_all.iloc[:n_train]
test_part = df_all.iloc[n_train:]

for col in label_columns:
    df_train[col] = train_part[col]
    df_test[col] = test_part[col]

In [None]:
# Show the training frame with the label-encoded columns written back.
df_train

In [None]:
# Show the test frame with the label-encoded columns written back.
df_test

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Horizontal box plots (orient="h") of the training columns on one large figure.
fig, ax = plt.subplots(figsize=(20, 15))
sns.boxplot(data=df_train, orient="h", ax=ax)
ax.set_title('Boxplots for Outliers in All Columns')
plt.show()


In [None]:
# Print the summary statistics of the training frame.
print(df_train.describe())

In [None]:

# Print the summary statistics of the training frame.
print(df_train.describe())

# Draw one small histogram (with a KDE curve) for every column.
for column in df_train.columns:
    fig, ax = plt.subplots(figsize=(5, 2))
    sns.histplot(df_train[column], kde=True, ax=ax)
    ax.set_title(f'Distribution of {column}')
    plt.show()

In [None]:
# Pairwise correlation between the columns, drawn as an annotated heat map.
correlation_matrix = df_train.corr()
fig, ax = plt.subplots(figsize=(14, 12))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()

In [None]:
# Report how many missing entries each column has ...
nan_values = df_train.isna().sum()
print("NaN Values:\n", nan_values)

# ... then replace every missing entry with 0.
df_train = df_train.fillna(value=0)

In [None]:
# Count the infinite entries (+inf and -inf) in each numeric column.
# The previous expression summed the matching cells instead of counting them
# and only looked at +inf. Non-numeric columns are left out of the report.
numeric_part = df_train.select_dtypes(include=[np.number])
inf_values = np.isinf(numeric_part).sum()
print("Infinite Values:\n", inf_values)

1. 결측값 처리


2. 표준편차가 큰 열 처리:
해결방안:
표준화 (Standardization) 또는 정규화 (Normalization): 데이터의 스케일을 맞춰주어 표준편차가 큰 열의 영향을 줄일 수 있습니다. 주로 Z-score 표준화나 Min-Max 정규화를 사용합니다.



3. 이진 변수의 클래스 불균형 처리:
해결방안:
클래스 가중치 설정: 클래스의 불균형이 큰 경우, 모델 학습 시 클래스에 가중치를 부여하여 균형을 맞출 수 있습니다.

In [None]:
import pandas as pd

# df_train is expected to be a DataFrame at this point.
# Correlation of every column with the 'expected_timeline' column.
timeline_column = df_train['expected_timeline']
correlation_matrix = df_train.corrwith(timeline_column)

# Print the result.
print("상관 관계:")
print(correlation_matrix)


In [None]:
import numpy as np
import pandas as pd

# df_train is expected to be a DataFrame at this point.
unique_values = df_train['expected_timeline'].unique()

# Smallest and largest of the distinct values.
min_value = unique_values.min()
max_value = unique_values.max()

print("최솟값:", min_value)
print("최댓값:", max_value)

# How often each value occurs.
value_counts = df_train['expected_timeline'].value_counts()
print("각 숫자의 빈도:")
print(value_counts)



### 2-2. 학습, 검증 데이터 분리

In [None]:
# Features are every column except the target 'is_converted'.
features = df_train.drop(columns="is_converted")
target = df_train["is_converted"]

# Hold out 20% of the shuffled rows for validation, with a fixed seed.
x_train, x_val, y_train, y_val = train_test_split(
    features,
    target,
    test_size=0.2,
    shuffle=True,
    random_state=400,
)





## 3. 모델 학습

### 모델 정의 

In [None]:
# Decision tree classifier created with the library's default settings.
# NOTE(review): no random_state is passed, so repeated runs may not produce
# the same tree — confirm whether reproducibility is needed.
model = DecisionTreeClassifier()

### 모델 학습

In [None]:
# Train on the training split; any remaining missing values are replaced by 0.
model.fit(x_train.fillna(0), y_train)

### 모델 성능 보기

In [None]:
def get_clf_eval(y_test, y_pred=None):
    """Print the confusion matrix, accuracy, precision, recall and F1 score.

    Args:
        y_test: True labels.
        y_pred: Predicted labels for the same samples. The default of ``None``
            is kept for backward compatibility but is not a usable value.

    Raises:
        ValueError: If ``y_pred`` is not supplied.
    """
    if y_pred is None:
        raise ValueError("y_pred must be provided")

    # Rows and columns are ordered [True, False].
    confusion = confusion_matrix(y_test, y_pred, labels=[True, False])
    accuracy = accuracy_score(y_test, y_pred)
    # The three scores below are called the same way. The `labels` argument
    # that only some of them used to receive is dropped because it has no
    # effect with the default binary averaging.
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    F1 = f1_score(y_test, y_pred)

    print("오차행렬:\n", confusion)
    print(f"\n정확도: {accuracy:.4f}")
    print(f"정밀도: {precision:.4f}")
    print(f"재현율: {recall:.4f}")
    print(f"F1: {F1:.4f}")

In [None]:
# Predict on the validation split (missing values replaced by 0) and print
# the evaluation metrics.
pred = model.predict(x_val.fillna(0))
get_clf_eval(y_val, pred)

## 4. 제출하기

### 테스트 데이터 예측

In [None]:
# Keep only the columns needed for prediction: drop the target and the id.
# NOTE(review): assumes the remaining columns match the training features in
# name and order — confirm against train.csv / submission.csv.
x_test = df_test.drop(["is_converted", "id"], axis=1)

In [None]:
# Predict on the test rows (missing values replaced by 0).
test_pred = model.predict(x_test.fillna(0))
sum(test_pred) # number of rows predicted as True

### 제출 파일 작성

In [None]:
# Re-read the submission file (df_test now holds the preprocessed data) and
# fill in the predictions.
df_sub = pd.read_csv("submission.csv")
df_sub["is_converted"] = test_pred

# Save the submission file. This overwrites the same "submission.csv" that
# was read above.
df_sub.to_csv("submission.csv", index=False)

**우측 상단의 제출 버튼을 클릭해 결과를 확인하세요**