### Import

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
cd /content/drive/MyDrive/Commit_test_folder/LGAimers-06-2/src

/content/drive/MyDrive/Commit_test_folder/LGAimers-06-2/src


In [3]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import  OrdinalEncoder
import xgboost as xgb

### Data Load

In [4]:
train = pd.read_csv('data/train.csv').drop(columns=['ID'])
test = pd.read_csv('data/test.csv').drop(columns=['ID'])

In [5]:
X = train.drop('임신 성공 여부', axis=1)
y = train['임신 성공 여부']

### Data Pre-processing

In [6]:
categorical_columns = [
    "시술 시기 코드",
    "시술 당시 나이",
    "시술 유형",
    "특정 시술 유형",
    "배란 자극 여부",
    "배란 유도 유형",
    "단일 배아 이식 여부",
    "착상 전 유전 검사 사용 여부",
    "착상 전 유전 진단 사용 여부",
    "남성 주 불임 원인",
    "남성 부 불임 원인",
    "여성 주 불임 원인",
    "여성 부 불임 원인",
    "부부 주 불임 원인",
    "부부 부 불임 원인",
    "불명확 불임 원인",
    "불임 원인 - 난관 질환",
    "불임 원인 - 남성 요인",
    "불임 원인 - 배란 장애",
    "불임 원인 - 여성 요인",
    "불임 원인 - 자궁경부 문제",
    "불임 원인 - 자궁내막증",
    "불임 원인 - 정자 농도",
    "불임 원인 - 정자 면역학적 요인",
    "불임 원인 - 정자 운동성",
    "불임 원인 - 정자 형태",
    "배아 생성 주요 이유",
    "총 시술 횟수",
    "클리닉 내 총 시술 횟수",
    "IVF 시술 횟수",
    "DI 시술 횟수",
    "총 임신 횟수",
    "IVF 임신 횟수",
    "DI 임신 횟수",
    "총 출산 횟수",
    "IVF 출산 횟수",
    "DI 출산 횟수",
    "난자 출처",
    "정자 출처",
    "난자 기증자 나이",
    "정자 기증자 나이",
    "동결 배아 사용 여부",
    "신선 배아 사용 여부",
    "기증 배아 사용 여부",
    "대리모 여부",
    "PGD 시술 여부",
    "PGS 시술 여부"
]

In [7]:
# 카테고리형 컬럼들을 문자열로 변환
for col in categorical_columns:
    X[col] = X[col].astype(str)
    test[col] = test[col].astype(str)

In [8]:
ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

X_train_encoded = X.copy()
X_train_encoded[categorical_columns] = ordinal_encoder.fit_transform(X[categorical_columns])

X_test_encoded = test.copy()
X_test_encoded[categorical_columns] = ordinal_encoder.transform(test[categorical_columns])

In [9]:
numeric_columns = [
    "임신 시도 또는 마지막 임신 경과 연수",
    "총 생성 배아 수",
    "미세주입된 난자 수",
    "미세주입에서 생성된 배아 수",
    "이식된 배아 수",
    "미세주입 배아 이식 수",
    "저장된 배아 수",
    "미세주입 후 저장된 배아 수",
    "해동된 배아 수",
    "해동 난자 수",
    "수집된 신선 난자 수",
    "저장된 신선 난자 수",
    "혼합된 난자 수",
    "파트너 정자와 혼합된 난자 수",
    "기증자 정자와 혼합된 난자 수",
    "난자 채취 경과일",
    "난자 해동 경과일",
    "난자 혼합 경과일",
    "배아 이식 경과일",
    "배아 해동 경과일"
]

In [10]:
X_train_encoded[numeric_columns] = X_train_encoded[numeric_columns].fillna(0)
X_test_encoded[numeric_columns] = X_test_encoded[numeric_columns].fillna(0)

### Train

In [20]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X_train_encoded, y, test_size=0.2, random_state=42)
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)
param = {'max_depth': 40, 'eta': 0.05, 'objective': 'binary:logistic',
         'min_child_weight':0.5,'num_parallel_tree':10}
model = xgb.train(params = param, dtrain=dtrain, num_boost_round=50,
                  early_stopping_rounds=20, evals=[(dtrain, 'train'), (dval, 'val')])

[0]	train-logloss:0.55596	val-logloss:0.56415
[1]	train-logloss:0.54030	val-logloss:0.55927
[2]	train-logloss:0.52550	val-logloss:0.55495
[3]	train-logloss:0.51141	val-logloss:0.55108
[4]	train-logloss:0.49802	val-logloss:0.54769
[5]	train-logloss:0.48525	val-logloss:0.54467
[6]	train-logloss:0.47312	val-logloss:0.54204
[7]	train-logloss:0.46144	val-logloss:0.53975
[8]	train-logloss:0.45032	val-logloss:0.53758
[9]	train-logloss:0.43966	val-logloss:0.53573
[10]	train-logloss:0.42945	val-logloss:0.53406
[11]	train-logloss:0.41962	val-logloss:0.53265
[12]	train-logloss:0.41020	val-logloss:0.53130
[13]	train-logloss:0.40118	val-logloss:0.53019
[14]	train-logloss:0.39244	val-logloss:0.52928
[15]	train-logloss:0.38406	val-logloss:0.52838
[16]	train-logloss:0.37595	val-logloss:0.52765
[17]	train-logloss:0.36811	val-logloss:0.52711
[18]	train-logloss:0.36052	val-logloss:0.52654
[19]	train-logloss:0.35321	val-logloss:0.52604
[20]	train-logloss:0.34616	val-logloss:0.52569
[21]	train-logloss:0.33

In [21]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X_train_encoded, y, test_size=0.2, random_state=42,
                                                  stratify=y)
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)
param = {'max_depth': 30, 'eta': 0.1, 'objective': 'binary:logistic',
         'min_child_weight':2,'num_parallel_tree':10}
model2 = xgb.train(params = param, dtrain=dtrain, num_boost_round=40,
                  early_stopping_rounds=10, evals=[(dtrain, 'train'), (dval, 'val')])

[0]	train-logloss:0.54954	val-logloss:0.56101
[1]	train-logloss:0.52930	val-logloss:0.55217
[2]	train-logloss:0.51136	val-logloss:0.54505
[3]	train-logloss:0.49501	val-logloss:0.53939
[4]	train-logloss:0.48032	val-logloss:0.53463
[5]	train-logloss:0.46694	val-logloss:0.53082
[6]	train-logloss:0.45449	val-logloss:0.52761
[7]	train-logloss:0.44298	val-logloss:0.52496
[8]	train-logloss:0.43210	val-logloss:0.52282
[9]	train-logloss:0.42192	val-logloss:0.52092
[10]	train-logloss:0.41261	val-logloss:0.51947
[11]	train-logloss:0.40391	val-logloss:0.51847
[12]	train-logloss:0.39613	val-logloss:0.51761
[13]	train-logloss:0.38866	val-logloss:0.51705
[14]	train-logloss:0.38113	val-logloss:0.51657
[15]	train-logloss:0.37426	val-logloss:0.51629
[16]	train-logloss:0.36787	val-logloss:0.51604
[17]	train-logloss:0.36176	val-logloss:0.51604
[18]	train-logloss:0.35662	val-logloss:0.51605
[19]	train-logloss:0.35173	val-logloss:0.51612
[20]	train-logloss:0.34671	val-logloss:0.51616
[21]	train-logloss:0.34

### Predict

In [23]:
dtest = xgb.DMatrix(X_test_encoded)
pred_proba = model2.predict(dtest)
pred_proba

array([0.01703444, 0.01703444, 0.10139286, ..., 0.23045024, 0.13781676,
       0.01703444], dtype=float32)

### Submission

In [24]:
sample_submission = pd.read_csv('data/sample_submission.csv')
sample_submission['probability'] = pred_proba

In [25]:
sample_submission.to_csv('data/0217xgboost_submit.csv', index=False)