### 1. 라이브러리, 데이터 확인

In [6]:
import numpy as np
import pandas as pd
import random
import os
import torch
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier
from sentence_transformers import SentenceTransformer
import re

def seed_everything(seed: int = 1004):
    """Seed every random number generator used by this script.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and CUDA
    generators), exports ``PYTHONHASHSEED`` and puts cuDNN into deterministic,
    non-benchmarking mode.

    Args:
        seed: Value handed to each generator.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)

    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,      # seed for the current GPU
        torch.cuda.manual_seed_all,  # seed for all GPUs
    )
    for apply_seed in seeders:
        apply_seed(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True  # type: ignore
    # benchmark=True would let cuDNN pick whichever algorithm suits the GPU.
    cudnn.benchmark = False

# Seed for this run; passed to seed_everything() right below.
RANDOM_SEED = 1500
seed_everything(RANDOM_SEED)

# Load the data. Note that the test frame is read from "submission.csv".
df_train = pd.read_csv("train.csv") 
df_test = pd.read_csv("submission.csv") 

# Lowercase the string values of every text column
def lowercase_strings(df):
    """Lowercase the string values in every object/string-typed column.

    Only values that are actually ``str`` are changed. Non-string values in
    an object column (numbers, booleans, ``None``/NaN) are left untouched;
    the ``.str.lower()`` accessor would silently replace them with NaN.

    Args:
        df: DataFrame to process. It is modified in place.

    Returns:
        The same DataFrame object, for call chaining.
    """
    for column in df.columns:
        series = df[column]
        if pd.api.types.is_object_dtype(series) or pd.api.types.is_string_dtype(series):
            df[column] = series.map(
                lambda value: value.lower() if isinstance(value, str) else value
            )
    return df

# Apply the lowercasing helper to both the training and the test frame
df_train = lowercase_strings(df_train)
df_test = lowercase_strings(df_test)

### 2. 데이터 전처리

In [None]:
# 'customer_country.1' is (per the original note) identical to
# 'customer_country', so the duplicate column is dropped.
df_train = df_train.drop(columns='customer_country.1')
df_test = df_test.drop(columns='customer_country.1')

# The "'//' -> NaN" replacement and the "keep the text after the last '/'"
# step that used to be repeated here are performed by
# preprocess_customer_country(), which is applied to both frames right after
# it is defined. That step is idempotent, so running it once is sufficient.


# Values that are street addresses / postal fragments without a usable
# country name; they are turned into missing values.
_COUNTRY_ADDRESS_ONLY_VALUES = [
    '700 patroon creek blvdalbanyny12206',
    '100 vestavia parkwaybirminghamal35216',
    '1100 itbprovout84602',
    'fl 33772',
    '3000 montour church road',
    '1380 enterprise dr',
    '222 maxine dr',
    '210 route 4 east fl 4',
    '600 5th street',
    'ma 01851',
    'il 60069',
    '2900 highway 280suite 250birminghamal35223',
    '6601 carroll highlands rd',
    '275 mishawum road',
    '101 metlife way, cary, nc, 27513 – met1',
    '3 nasson avenue',
    '2877 prospect rd, fort lauderdale, fl 33309',
    '810 n kingston dr peoria, il 61604-2145',
    'mi 48827',
    '1001 main st',
    '152 bowdoin street',
    'ca 91915-6002',
    '300 east park drive',
    'mo 64108.',
    '6564 headquarters drplanotx75051',
    'via e. de amicis, 23 . 90044 carini (pa)',
    '45 n 200 wwillardut84340',
    'ironhorse customs llc 4443 genella way north las vegas, nv 89031',
    '7700 west sunrise blvdplantationfl33322',
    '230 highland ave, suite 531somervillema2143',
    '5301 stevens creek blvdsanta claraca95051',
]

# Values that are mapped to 'united states'.
_COUNTRY_UNITED_STATES_VALUES = [
    '717 general booth blvdvirginia beach, va 23451, usa',
    '3804 w broadway st, ardmore, ok 73401, united states',
    'delmar, ny 12054 united states',
    'richardson, texas, united states',
    'oh 45215, usa',
    'united states 14503.',
    '239 court st, brooklyn, ny 11201, united states',
    '1110 morse rd, columbus, ohio, 43229, united states',
    'ca 92618 united states',
    'jeffersonville, in united states',
    'united states 32901',
    '2529 w busch blvd suite 1000, tampa, fl 33618, united states',
    '28001 238th st, le claire, ia 52753, usa',
    'grapevine, texas, united states united states',
    '6398 college blvd, overland park, ks 66211, united states',
    '993 niagara ave, san diego, ca 92107 united states',
    'san francisco, ca 94128, united states',
    # Merge the abbreviated spelling into the full country name.
    # NOTE(review): the original listed the 'us' replacement twice; the second
    # entry may have been meant for another alias (e.g. 'usa') - TODO confirm.
    'us',
]

# Values that are mapped to 'italy'.
_COUNTRY_ITALY_VALUES = [
    "via dell'informatica 10 - 37036 san martino buon albergo (veneto), italy",
]


def preprocess_customer_country(df):
    """Clean the ``customer_country`` column.

    Steps:
      1. ``'//'`` becomes NaN.
      2. Only the text after the last ``'/'`` is kept and stripped.
      3. Known address-only values become NaN; known address strings that
         contain a country are replaced by that country; ``'us'`` is merged
         into ``'united states'``.

    The replacements are exact, whole-value matches. They are issued as three
    list-based ``replace`` calls instead of one full-column pass per value;
    the result is the same because no replacement target is itself a value
    that gets replaced.

    Args:
        df: DataFrame with a ``customer_country`` column. Modified in place.

    Returns:
        The same DataFrame object.
    """
    country = df['customer_country'].replace('//', np.nan)
    country = country.str.split('/').str[-1].str.strip()

    country = country.replace(_COUNTRY_ADDRESS_ONLY_VALUES, np.nan)
    country = country.replace(_COUNTRY_UNITED_STATES_VALUES, 'united states')
    country = country.replace(_COUNTRY_ITALY_VALUES, 'italy')

    df['customer_country'] = country
    return df

df_train = preprocess_customer_country(df_train)
df_test = preprocess_customer_country(df_test)


# Replace countries that occur fewer than 2 times in the training data, and
# empty strings, with 'others'. The list of values is derived from df_train
# only and then applied to both frames.
value_counts = df_train['customer_country'].value_counts()
values_to_replace = value_counts[value_counts < 2].index.tolist()
values_to_replace.append('')

# One vectorised membership test per frame instead of a Python-level list
# scan per row. Missing values are not in the list, so they stay missing.
df_train['customer_country'] = df_train['customer_country'].mask(
    df_train['customer_country'].isin(values_to_replace), 'others'
)
df_test['customer_country'] = df_test['customer_country'].mask(
    df_test['customer_country'].isin(values_to_replace), 'others'
)


def preprocess_business_unit(df):
    """Expand abbreviated ``business_unit`` codes into full division names.

    Values that are not one of the known codes are left as they are.

    Args:
        df: DataFrame with a ``business_unit`` column. Modified in place.

    Returns:
        The same DataFrame object.
    """
    codes = ('as', 'id', 'it', 'cm', 'solution')
    division_names = (
        "air solution business division",
        "information display business division",
        "information technology business division",
        "camera module business division",
        "solution business division",
    )
    code_to_division = dict(zip(codes, division_names))

    expanded = df['business_unit'].replace(code_to_division)
    df['business_unit'] = expanded
    return df

# Expand the business_unit codes in both frames
df_train = preprocess_business_unit(df_train)
df_test = preprocess_business_unit(df_test)


def preprocess_customer_type(df):
    """Merge spelling variants of ``customer_type`` into canonical labels.

    The groups are applied one after another, in the order listed, as exact
    whole-value replacements.

    Args:
        df: DataFrame with a ``customer_type`` column. Modified in place.

    Returns:
        The same DataFrame object.
    """
    canonical_groups = (
        ('end customer',
         ['end-customer', 'homeowner', 'home owner', 'end-user', 'commercial end-user']),
        ('influencers',
         ['specifier/ influencer', 'specifier / influencer', 'consultant', 'architect/consultant']),
        ('others', ['etc.', 'other']),
        ('software/solution provider', ['software / solution provider']),
        ('installer/contractor', ['installer']),
        ('distributor', ['dealer/distributor']),
    )

    customer_type = df['customer_type']
    for canonical, variants in canonical_groups:
        customer_type = customer_type.replace(variants, canonical)
    df['customer_type'] = customer_type

    return df

# Canonicalise customer_type in both frames
df_train = preprocess_customer_type(df_train)
df_test = preprocess_customer_type(df_test)



# Rough grouping criteria: group name -> keywords that identify the group.
# Groups are checked in the order listed and the first matching keyword wins.
grouped_positions = {
    'Executive': ['ceo', 'founder', 'c-level', 'president', 'vice president', 'director',
                  'executive', 'manager', 'vp', 'the big boss', 'proprietário(a)', 'co-founder', 'chief executive officer',
                  'principal & director', 'ceo/fundador', 'entrepreneurship', 'decision-maker',
                  'c-levelexecutive', 'decision-influencer', 'leadership/executive office/owner',
                  'decision influencer', 'decision maker', 'vicepresident', 'c-level executive',
                  'ceo/founder', 'director'],
    'Technical': ['engineer', 'technical', 'developer', 'architect', 'installer', 'técnico', 'engineering',
                  'medical device manufacturer', 'manufacturer', 'software /solution provider',
                  'it', 'information technology', 'sysadmin'],
    'Sales_Marketing': ['sales', 'marketing', 'business development', 'business development/sales', 'subsidiary sales (ise)'],
    'Consulting': ['consultant', 'advisor', 'commercial consultant', 'consulting', 'associate/analyst'],
    'Education': ['teacher', 'professor', 'educator', 'lecturer', 'trainer', 'education professional',
                  'associate professor', 'quantitative aptitude faculty', 'maths lecturer', 'senior lecturer',
                  'education', 'neet/ olympiad expert faculty', 'associate professor in electronics engg',
                  'asst prof.', 'professor of mathematics', 'physics and mathematics teacher',  'science teacher',
                  'math and physics teacher', 'principal at oxford integrated pu science college', 'academic specialist',
                  'prof.', 'physics teacher', 'assistant professor'],
    'Administrative': ['administrative', 'secretary', 'assistant', 'professional trainer'],

    'Healthcare': ['doctor', 'nurse', 'medical', 'healthcare', 'pathologist', 'surgery professional', 'tierarzt',
                   'medical imaging specialist', 'hospital'],
    'Intern' : ['intern', 'trainee', 'entry level', 'employee', 'entrylevel']
}

# Keywords that must match as a whole word. As a plain substring, 'it' also
# hits unrelated titles such as 'hospital' or 'quantitative aptitude faculty',
# which made those later keywords unreachable.
_WHOLE_WORD_KEYWORDS = frozenset({'it'})


def _normalize_position_text(text):
    """Lowercase, turn '-' and '/' into spaces and collapse whitespace runs."""
    return ' '.join(text.lower().replace('-', ' ').replace('/', ' ').split())


# Keywords normalised exactly like incoming positions. Without this, keywords
# containing '-' or '/' (e.g. 'c-level', 'ceo/founder') could never match,
# because those characters are removed from the position before comparison.
_normalized_position_groups = [
    (group, [_normalize_position_text(keyword) for keyword in keywords])
    for group, keywords in grouped_positions.items()
]


# Grouping function; '-' and '/' are treated as spaces.
def preprocess_customer_position(position):
    """Map a raw customer position to one of the ``grouped_positions`` groups.

    The position and every keyword are normalised the same way (lowercased,
    '-' and '/' replaced by spaces, whitespace collapsed). A keyword matches
    when it occurs anywhere in the normalised position, except for the
    keywords in ``_WHOLE_WORD_KEYWORDS``, which must appear as a separate word.

    Args:
        position: Raw position value. Non-string values (e.g. NaN) are
            accepted.

    Returns:
        The name of the first group with a matching keyword, or ``'Others'``
        when nothing matches or ``position`` is not a string.
    """
    if not isinstance(position, str):
        return 'Others'

    normalized = _normalize_position_text(position)
    words = set(normalized.split())

    for group, keywords in _normalized_position_groups:
        for keyword in keywords:
            if keyword in _WHOLE_WORD_KEYWORDS:
                matched = keyword in words
            else:
                matched = keyword in normalized
            if matched:
                return group
    return 'Others'

# Map every customer_position value to its position group, in both frames
df_train['customer_position'] = df_train['customer_position'].apply(preprocess_customer_position)
df_test['customer_position'] = df_test['customer_position'].apply(preprocess_customer_position)


def preprocess_inquiry_type(df):
    """Merge spelling variants of ``inquiry_type`` into canonical labels.

    Each alias is replaced, as an exact whole-value match, by the canonical
    label of its group; all other values are left unchanged.

    Args:
        df: DataFrame with an ``inquiry_type`` column. Modified in place.

    Returns:
        The same DataFrame object.
    """
    aliases_by_label = {
        'quotation or purchase consultation': (
            'request for quotation or purchase',
            'quotation_or_purchase_consultation',
            'purchase or quotation',
            'purchase',
        ),
        'usage or technical consultation': (
            'technical consultation',
            'technical support',
            'request for technical consulting',
            'usage_or_technical_consultation',
            'technical',
            'technical_consultation',
        ),
        'sales inquiry': ('sales',),
        'other': ('etc.', 'others', 'other_'),
    }
    # Flatten into the alias -> label form that Series.replace expects.
    alias_to_label = {
        alias: label
        for label, aliases in aliases_by_label.items()
        for alias in aliases
    }

    df["inquiry_type"] = df["inquiry_type"].replace(alias_to_label)
    return df


# Canonicalise inquiry_type in both frames
df_train = preprocess_inquiry_type(df_train)
df_test = preprocess_inquiry_type(df_test)



# Text normalisation helper
_DISALLOWED_CHARS = re.compile(r'[^a-z0-9\s]')
_WHITESPACE_RUN = re.compile(r'\s+')


def normalize_text(text):
    """Return ``text`` lowercased, stripped of symbols and whitespace-collapsed.

    Missing values (anything ``pd.isnull`` reports as null) become the empty
    string. Otherwise the text is lowercased, every character other than
    ``a-z``, ``0-9`` and whitespace is removed, runs of whitespace become a
    single space, and leading/trailing whitespace is trimmed.
    """
    if pd.isnull(text):
        return ""

    lowered = text.lower()
    symbols_removed = _DISALLOWED_CHARS.sub('', lowered)
    return _WHITESPACE_RUN.sub(' ', symbols_removed).strip()

# Apply the complete set of merge rules
_FULL_MERGE_RULES = (
    (r'details\s+send', 'details shared'),
    (r'details\s+shared', 'details shared'),
    (r'3\s*months\s*6\s*months', '3 months - 6 months'),
    (r'quote\s+has\s+been\s+sent\s+to\s+customer', 'quote shared'),
    (r'quotation\s+shared', 'quote shared'),
    (r'being\s+followed\s+up', 'following up'),
    (r'following\s+up', 'following up'),
    (r'no\s+response', 'no response'),
    (r'not\s+responding', 'no response'),
    (r'budget\s+problem', 'budget issue'),
)
_FULL_OTHERS_RULES = (
    'not require', 'budget issue', 'duplicate lead', 'rnr', 'demo scheduled',
)


def apply_full_merge_rules(text):
    """Unify equivalent phrasings in ``text``, or collapse it to ``'others'``.

    Every merge rule is applied in order as a regex substitution. If the
    result then contains any of the "others" markers, the whole value becomes
    ``'others'``; otherwise the substituted text is returned.
    """
    merged = text
    for pattern, replacement in _FULL_MERGE_RULES:
        merged = re.sub(pattern, replacement, merged)

    if any(re.search(marker, merged) for marker in _FULL_OTHERS_RULES):
        return 'others'
    return merged


# Normalise expected_timeline and then apply the full merge rules, in both frames
df_train['expected_timeline'] = df_train['expected_timeline'].apply(lambda x: apply_full_merge_rules(normalize_text(x)))
df_test['expected_timeline'] = df_test['expected_timeline'].apply(lambda x: apply_full_merge_rules(normalize_text(x)))


# Additional refinement rules for expected_timeline values
_REFINE_RULES = {
    r'lessthan3months': 'less than 3 months',
    r'etc': 'others',
    r'the client is not having any requirement hence closig in system although the details of idb are mailed to client': 'no requirement',
    r'the client is not having any requirement he was only browsing through the produt hence closig in system although the details of idb are mailed to client': 'no requirement',
    r'he client is not having any requirement hence closig in system although the details of idb are mailed to client': 'no requirement',
    r"didn't respond": 'no response',
    r'couldnt connect': 'no response',
    r'not answering call': 'no response',
    r'not answering call lead shared with rd': 'no response',
    r'not reachable': 'no response',
    r'tried to reach several times but no response': 'no response',
    r'tried couple of times but he is no response we will try again': 'no response',
    r'didnt respond': 'no response',
    r'no response to calls': 'no response',
    r'not answering call sales remarks tried to reach him multiple times but he is no response request to shailja to reconnect with customer we are dropping this lead for now': 'no response',
    r'9months1year': '9 months 1 year',
    r'forwarded to bdo following up': 'following up',
    r'forwarded to bdo to followup': 'following up',
    r'6months9months': '6 months 9 months',
    r'morethanayear': 'more than a year',
    r'less then 6 months': 'less than 6 months',
    r'less than 5 months': 'less than 6 months',
    r'dicsussed with clientdetails shared on mail client have no budgets to buy now hence closing in the system': 'low budget',
    r'quote shared': 'quote send',
    r'quote shared for ultra strothersh and 49vl5g he will check with management and update us': 'quote send',
}

# Longest pattern first: several short patterns ('not answering call',
# 'quote shared') are prefixes of longer ones. In declaration order the short
# rule rewrote the text first, so the longer rule could never match and its
# trailing text was left behind (e.g. 'no response lead shared with rd').
_COMPILED_REFINE_RULES = [
    (re.compile(pattern, flags=re.IGNORECASE), replacement)
    for pattern, replacement in sorted(
        _REFINE_RULES.items(), key=lambda rule: len(rule[0]), reverse=True
    )
]


def refine_expected_timeline(text):
    """Apply the refinement rules to an ``expected_timeline`` value.

    Each rule is a case-insensitive regex substitution. Rules are applied
    one after another, most specific (longest pattern) first.

    Args:
        text: The value to refine (a string).

    Returns:
        The text after all substitutions.
    """
    for pattern, replacement in _COMPILED_REFINE_RULES:
        text = pattern.sub(replacement, text)
    return text

# Apply the refinement rules to expected_timeline in both frames
df_train['expected_timeline'] = df_train['expected_timeline'].apply(refine_expected_timeline)
df_test['expected_timeline'] = df_test['expected_timeline'].apply(refine_expected_timeline)

# The ver_cus and ver_pro updates originally spelled the corporate/office
# business area differently ('corporate/office' vs 'corporate / office'), so
# at most one of the two filters could match the data. Both spellings are
# accepted in both filters.
corporate_office_spellings = ['corporate/office', 'corporate / office']
ver_cus_business_areas = corporate_office_spellings + ['retail', 'education', 'hotel & accommodation']
ver_pro_business_areas = corporate_office_spellings + ['retail', 'hotel & accommodation']

# ver_cus: set to 1 for rows whose business_area is one of the listed areas
# and whose customer_type is 'end customer'.
df_train.loc[(df_train['business_area'].isin(ver_cus_business_areas) &
              (df_train['customer_type'] == 'end customer')), 'ver_cus'] = 1

df_test.loc[(df_test['business_area'].isin(ver_cus_business_areas) &
             (df_test['customer_type'] == 'end customer')), 'ver_cus'] = 1


# Product categories that carry ver_pro == 1 in the training data. Missing
# values are dropped because isin() would otherwise also match rows whose
# product_category is NaN; duplicates are removed because only membership
# matters.
product_category_list = (
    df_train.loc[df_train['ver_pro'] == 1, 'product_category'].dropna().unique().tolist()
)

# ver_pro: set to 1 for rows whose product_category is in
# product_category_list and whose business_area is one of the listed areas.
df_train.loc[(df_train['business_area'].isin(ver_pro_business_areas) &
              df_train['product_category'].isin(product_category_list)), 'ver_pro'] = 1

df_test.loc[(df_test['business_area'].isin(ver_pro_business_areas) &
             df_test['product_category'].isin(product_category_list)), 'ver_pro'] = 1


# Conversion rate per business area: for each business_area, the share of
# training leads that converted (mean of is_converted).
# NOTE(review): the rates are computed from all of df_train, before the
# train/validation split and the cross-validation further down, so every
# training row's own label contributes to its feature value. Validation
# scores may therefore be optimistic - consider out-of-fold encoding.
business_area_conversion_rate = df_train.groupby('business_area')['is_converted'].mean().reset_index()
business_area_conversion_rate.rename(columns={'is_converted': 'business_area_conversion_rate'}, inplace=True)

# Attach the per-business-area rate to both frames (left join, so rows whose
# business_area has no rate keep a missing value).
df_train = pd.merge(df_train, business_area_conversion_rate, on='business_area', how='left')
df_test = pd.merge(df_test, business_area_conversion_rate, on='business_area', how='left')

# Conversion rate per lead owner: for each lead_owner, the share of the
# training leads they handled that converted.
lead_owner_conversion_rate = df_train.groupby('lead_owner')['is_converted'].mean().reset_index()
lead_owner_conversion_rate.rename(columns={'is_converted': 'lead_owner_conversion_rate'}, inplace=True)

# Attach the per-lead-owner rate to both frames (left join as above).
df_train = pd.merge(df_train, lead_owner_conversion_rate, on='lead_owner', how='left')
df_test = pd.merge(df_test, lead_owner_conversion_rate, on='lead_owner', how='left')

### 3. 텍스트 임베딩

In [None]:

def preprocess_and_embed(df, model=None):
    """Replace the text columns of ``df`` by the mean of their sentence embedding.

    For every known text column that is present in ``df``, each value is
    encoded with the sentence-embedding model and the mean over the embedding
    dimensions is stored as one scalar in ``<column>_embedding_mean``. The
    processed text columns are then dropped.

    Args:
        df: DataFrame to process. Modified in place.
        model: Optional object with an ``encode(list_of_str)`` method that
            returns a 2-D array (one row per text). When omitted, the
            'sentence-transformers/all-mpnet-base-v2' model is loaded and
            moved to the GPU if one is available. Passing a model lets
            callers load it once and reuse it across several frames.

    Returns:
        The same DataFrame object.
    """
    if model is None:
        # Use the GPU when Torch reports one, otherwise fall back to the CPU.
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2').to(device)

    # Columns for which sentence embeddings are generated
    text_columns = ['customer_country', 'business_unit', 'customer_type', 'enterprise', 'customer_job', 'inquiry_type',
                    'product_category', 'product_subcategory', 'product_modelname', 'customer_position', 'response_corporate',
                    'expected_timeline', 'business_area', 'business_subarea']

    # Only touch columns that exist. The final drop uses the same list, so a
    # missing column is skipped consistently instead of raising KeyError.
    present_columns = [column for column in text_columns if column in df.columns]

    for column in present_columns:
        # Missing values are encoded as the empty string.
        texts = df[column].fillna("").astype(str).tolist()
        embeddings = model.encode(texts)
        # Collapse each embedding vector to a single scalar (its mean).
        df[f"{column}_embedding_mean"] = np.mean(embeddings, axis=1)

    # Remove the raw text columns that were embedded.
    df.drop(columns=present_columns, inplace=True)
    return df


# Embed the text columns of both frames.
df_train = preprocess_and_embed(df_train)
df_test = preprocess_and_embed(df_test)

# Convert 'is_converted' from True/False to 1/0
df_train['is_converted'] = df_train['is_converted'].astype(int)
df_test['is_converted'] = df_test['is_converted'].astype(int)

# Any remaining missing value, in any column, becomes 0.
df_train.fillna(0, inplace=True)
df_test.fillna(0, inplace=True)

# Features / target, then an 80/20 shuffled hold-out split.
# NOTE(review): the split is not stratified on y, and x_val / y_val are not
# used anywhere in the code shown in this file - TODO confirm whether the
# hold-out set is still needed.
X = df_train.drop("is_converted", axis=1)
y = df_train["is_converted"]
x_train, x_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=1500
)


### 4. 모델링

In [12]:
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
import numpy as np

# Base models and the stacking ensemble built from them
rf = RandomForestClassifier(random_state=6666)
xgb = XGBClassifier(random_state=6666)
cat = CatBoostClassifier(random_state=6666, verbose=0)  # verbose=0 suppresses CatBoost's output
lgbm = LGBMClassifier(random_state=6666)
estimator_stacking = [('rf', rf), ('xgb', xgb), ('cat', cat), ('lgbm', lgbm)]
stacking = StackingClassifier(estimators=estimator_stacking, final_estimator=rf)

# StratifiedKFold for cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=6666)

# Per-fold evaluation scores
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []

rus = RandomUnderSampler(random_state=6666)

for i, (train_index, val_index) in enumerate(cv.split(x_train, y_train)):
    x_train_fold, x_val_fold = x_train.iloc[train_index], x_train.iloc[val_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

    # Undersample only the training part of the fold; the validation part
    # keeps its original class distribution.
    x_train_res, y_train_res = rus.fit_resample(x_train_fold, y_train_fold)

    # np.asarray(...).ravel() yields the same 1-D label array without calling
    # Series.ravel(), which is deprecated in recent pandas releases.
    stacking.fit(x_train_res, np.asarray(y_train_res).ravel())
    y_pred = stacking.predict(x_val_fold)

    accuracy_scores.append(accuracy_score(y_val_fold, y_pred))
    precision_scores.append(precision_score(y_val_fold, y_pred, average='macro'))
    recall_scores.append(recall_score(y_val_fold, y_pred, average='macro'))
    f1_scores.append(f1_score(y_val_fold, y_pred, average='macro'))

# Build the prediction matrix. Selecting x_train.columns guarantees the test
# features are in the same order as the features the model was fitted on.
# NOTE(review): `stacking` at this point is the model fitted in the LAST
# cross-validation fold only (4/5 of x_train, undersampled); consider
# refitting on all training data before predicting.
x_test = df_test.drop(["is_converted", "id"], axis=1)[x_train.columns]
test_pred = stacking.predict(x_test)

# Print the mean of each evaluation metric
print("평균 평가 지표:")
print("평균 정확도:", np.mean(accuracy_scores))
print("평균 정밀도:", np.mean(precision_scores))
print("평균 재현율:", np.mean(recall_scores))
print("평균 F1 스코어:", np.mean(f1_scores))
print(int(np.sum(test_pred)))  # number of rows predicted as True

[LightGBM] [Info] Number of positive: 3084, number of negative: 3084
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000753 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2201
[LightGBM] [Info] Number of data points in the train set: 6168, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 2467, number of negative: 2467
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001053 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2115
[LightGBM] [Info] Number of data points in the train set: 4934, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 2467, number of negative: 

[LightGBM] [Info] Number of positive: 3084, number of negative: 3084
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001352 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2187
[LightGBM] [Info] Number of data points in the train set: 6168, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 2467, number of negative: 2467
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001071 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2086
[LightGBM] [Info] Number of data points in the train set: 4934, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 2467, number of negative: 2467
[LightGBM] [Info] Auto-choosing col-wise multi-threading, t

### 5. 제출

In [8]:
# Read the submission data and fill in the predictions
df_sub = pd.read_csv("submission.csv")
df_sub["is_converted"] = test_pred

# Save the submission file (this overwrites the "submission.csv" read above)
df_sub.to_csv("submission.csv", index=False)