In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

In [2]:
# Random seed shared by the train/validation split and the XGBoost model.
seed = 42
# Number of boosting rounds (passed to the model as n_estimators).
iteration = 3_000
# Cut-off applied to predicted scores: values above it are labelled 1.
threshold = 0.055

In [3]:
# Load the labelled training rows and the rows to be predicted.
# "submission.csv" is both the source of the test features here and the file
# that the final cell of this notebook writes the predictions back into.
df_train, df_test = (
    pd.read_csv(path) for path in ("train.csv", "submission.csv")
)

In [4]:
def split_customer_country_in_dataframe(df, column_name='customer_country'):
    """Add ``city`` and ``country`` columns parsed from a slash-separated column.

    Every non-null value of ``df[column_name]`` is split on ``'/'``:

    * with three or more parts, the second part (stripped) is the city and the
      third part (stripped) is the country; an empty city becomes the literal
      string ``'NaN'``;
    * with fewer than three parts, both city and country are the literal
      string ``'NaN'``.

    Null values produce ``None`` for both new columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``column_name``. It is modified in place.
    column_name : str, default 'customer_country'
        Name of the column holding the slash-separated text.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``city`` and ``country`` added (or
        overwritten if they already existed).
    """
    def split_customer_country(customer_country):
        parts = customer_country.split('/')
        if len(parts) >= 3:
            # An empty middle part is recorded as the string 'NaN', not a real NaN.
            city = parts[1].strip() or 'NaN'
            country = parts[2].strip()
            return city, country
        return 'NaN', 'NaN'

    # Build all (city, country) pairs in one pass. Wrapping each row in a
    # pd.Series inside Series.apply is far slower, and on an empty frame it
    # yields an empty Series that cannot be assigned to two columns.
    pairs = [
        split_customer_country(value) if pd.notnull(value) else (None, None)
        for value in df[column_name]
    ]
    # Plain lists are assigned positionally, so the frame's index is irrelevant.
    df['city'] = [city for city, _ in pairs]
    df['country'] = [country for _, country in pairs]
    return df

In [5]:
# Derive the city/country columns for both the training and the test frame.
df_train, df_test = (
    split_customer_country_in_dataframe(frame, 'customer_country')
    for frame in (df_train, df_test)
)

In [6]:
# The raw slash-separated columns are no longer needed once city/country exist.
df_train = df_train.drop(['customer_country', 'customer_country.1'], axis=1)

In [7]:
# Same clean-up for the test frame so both frames keep matching columns.
df_test = df_test.drop(['customer_country', 'customer_country.1'], axis=1)

In [8]:
def label_encoding(series: pd.Series) -> pd.Series:
    """Replace each value with the rank of its string form.

    Values are first converted with ``astype(str)``; the distinct strings are
    sorted (plain string ordering, so ``'10'`` sorts before ``'9'``) and each
    one is mapped to its position in that ordering.

    Parameters
    ----------
    series : pandas.Series
        Values to encode.

    Returns
    -------
    pandas.Series
        Integer codes carrying the same index as ``series``.
    """
    as_text = series.astype(str)
    codes = {label: code for code, label in enumerate(sorted(as_text.unique()))}
    return as_text.map(codes)

In [9]:
# Columns that are label-encoded before modelling.
label_columns = [
    "city",
    "country",
    "business_subarea",
    "business_area",
    "business_unit",
    "customer_type",
    "enterprise",
    "customer_job",
    "inquiry_type",
    "product_category",
    "product_subcategory",
    "product_modelname",
    "customer_position",
    "response_corporate",
    "expected_timeline",
]

# Stack train on top of test so a value receives the same code in both frames.
df_all = pd.concat([frame[label_columns] for frame in (df_train, df_test)])

# Encode every listed column of the stacked frame.
df_all = df_all.assign(
    **{name: label_encoding(df_all[name]) for name in label_columns}
)

In [10]:
# Copy the encoded columns back into the original frames.
# df_all is df_train stacked on top of df_test, so the first len(df_train)
# rows belong to train and the remainder to test. The values are assigned as
# plain arrays (positionally): assigning a Series would align on index labels
# and silently misplace values or insert NaN if either frame's index were
# not the pristine 0..n-1 range.
n_train = len(df_train)
for col in label_columns:
    encoded = df_all[col].to_numpy()
    df_train[col] = encoded[:n_train]
    df_test[col] = encoded[n_train:]

In [11]:
# Hold out 20% of the labelled rows for validation; the split is shuffled and
# seeded so it is repeatable.
x_train, x_val, y_train, y_val = train_test_split(
    df_train.drop(columns="is_converted"),
    df_train["is_converted"],
    random_state=seed,
    shuffle=True,
    test_size=0.2,
)

In [12]:
# A regressor with a logistic objective: later cells compare the output of
# predict() against `threshold` to obtain 0/1 labels.
model_xgb = XGBRegressor(
    # learning task
    objective="binary:logistic",
    eval_metric='auc',
    # boosting schedule
    n_estimators=iteration,
    eta=0.01,
    # tree shape / regularisation
    max_depth=10,
    min_child_weight=50,
    # row and column sampling
    subsample=0.9,
    colsample_bytree=0.9,
    # reproducibility
    random_state=seed,
)

In [13]:
# Missing feature values are replaced by 0 before fitting.
model_xgb.fit(X=x_train.fillna(0), y=y_train)

In [14]:
def get_clf_eval(y_test, y_pred=None):
    """Print the confusion matrix and headline classification metrics.

    The printed labels are Korean: confusion matrix, accuracy, precision,
    recall and F1, in that order. Nothing is returned.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_pred : array-like
        Predicted labels. The ``None`` default is only a placeholder; the
        value is passed straight to the scikit-learn metric functions.
    """
    binary_labels = [0, 1]
    matrix = confusion_matrix(y_test, y_pred, labels=binary_labels)

    # All scores are computed before anything is printed.
    report = [
        ("\n정확도: {:.4f}", accuracy_score(y_test, y_pred)),
        ("정밀도: {:.4f}", precision_score(y_test, y_pred, labels=binary_labels)),
        ("재현율: {:.4f}", recall_score(y_test, y_pred)),
        ("F1: {:.4f}", f1_score(y_test, y_pred, labels=binary_labels)),
    ]

    print("오차행렬:\n", matrix)
    for template, score in report:
        print(template.format(score))

In [15]:
# Score the validation rows (missing values filled with 0, as in training)
# and label every score above `threshold` as 1.
pred_xgb = (model_xgb.predict(x_val.fillna(0)) > threshold).astype(int)
pred_xgb

array([0, 0, 0, ..., 1, 0, 0])

In [16]:
# Report validation metrics for the thresholded predictions.
get_clf_eval(y_test=y_val, y_pred=pred_xgb)

오차행렬:
 [[9851 1024]
 [  49  936]]

정확도: 0.9095
정밀도: 0.4776
재현율: 0.9503
F1: 0.6357


In [17]:
# Test features: everything except the target placeholder and the row id.
x_test = df_test.drop(columns=["is_converted", "id"])

In [18]:
# Score the test rows and apply the same threshold used on the validation set.
test_pred_xgb = (model_xgb.predict(x_test.fillna(0)) > threshold).astype(int)
test_pred_xgb

array([1, 1, 0, ..., 0, 0, 1])

In [19]:
test_pred_xgb.sum()  # number of test rows predicted as True (label 1)

2168

In [20]:
# Re-read the submission template, fill in the predicted labels and write the
# result back to the same file (without the pandas index).
df_sub = pd.read_csv("submission.csv").assign(is_converted=test_pred_xgb)
df_sub.to_csv("submission.csv", index=False)

df_sub

Unnamed: 0,id,bant_submit,customer_country,business_unit,com_reg_ver_win_rate,customer_idx,customer_type,enterprise,historical_existing_cnt,id_strategic_ver,...,response_corporate,expected_timeline,ver_cus,ver_pro,ver_win_rate_x,ver_win_ratio_per_bu,business_area,business_subarea,lead_owner,is_converted
0,19844,0.00,/ / Brazil,ID,0.073248,47466,End Customer,Enterprise,53.0,,...,LGESP,,1,0,0.001183,0.049840,retail,Electronics & Telco,278,1
1,9738,0.25,400 N State Of Franklin Rd Cloud IT / Johnson...,IT,,5405,End Customer,SMB,,,...,LGEUS,,0,0,0.000013,,transportation,Others,437,1
2,8491,1.00,/ / U.A.E,ID,,13597,Specifier/ Influencer,SMB,,,...,LGEGF,less than 3 months,0,0,0.000060,0.131148,hospital & health care,General Hospital,874,0
3,19895,0.50,/ Madison / United States,ID,0.118644,17204,,Enterprise,,,...,LGEUS,more than a year,0,0,0.001183,0.049840,retail,,194,0
4,10465,1.00,/ Sao Paulo / Brazil,ID,0.074949,2329,End Customer,Enterprise,2.0,1.0,...,LGESP,less than 3 months,1,1,0.003079,0.064566,corporate / office,Engineering,167,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5266,13855,0.50,/São Paulo/Brazil,AS,,40292,,Enterprise,10.0,,...,LGESP,,0,0,,,,,97,0
5267,7979,0.25,General / / United States,IT,,47466,,Enterprise,0.0,,...,LGEUS,,0,0,,,,,438,1
5268,12887,0.75,/ OURO BRANCO / Brazil,AS,,46227,Specifier/ Influencer,Enterprise,,,...,LGESP,less than 3 months,0,0,,,,,97,0
5269,17530,0.00,/ / Germany,IT,,45667,End Customer,SMB,,,...,LGEDG,,0,0,,,,,429,0
