In [1]:
# Mount Google Drive at /content/drive in the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
cd /content/drive/MyDrive/Commit_test_folder/LGAimers-06-2/src

/content/drive/MyDrive/Commit_test_folder/LGAimers-06-2/src


# file import

In [3]:
import pandas as pd
import numpy as np
import lgAimersDPP as dpp

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from lightgbm import LGBMClassifier


# 데이터 전처리

- 코드가 길어지는 것을 방지하기 위해 'lgAimersDPP.py' 파일에 전처리 함수 패키지화.
- 자세한 코드는 'lgAimersDPP.py' 또는 'Data Pre-Processing.ipynb' 파일 참고.

In [4]:
# Load the raw training CSV and pass it straight through the shared
# preprocessing entry point from lgAimersDPP.
train = dpp.dpp(pd.read_csv('data/train.csv'))
print(train.shape)
# Last expression of the cell: rich preview of the first rows.
train.head()

(256351, 89)


Unnamed: 0,시술 시기 코드,시술 당시 나이,임신 시도 또는 마지막 임신 경과 연수,시술 유형,배란 자극 여부,배란 유도 유형,단일 배아 이식 여부,착상 전 유전 검사 사용 여부,착상 전 유전 진단 사용 여부,남성 주 불임 원인,...,기증용 배아,난자 저장용 배아,배아 저장용 배아,연구용 배아,현재 시술용 배아,sum_male,sum_female,sum_both,sum_all,mean_age
0,6,1,-1.0,1,1,0,0.0,0.0,0.0,0,...,0,0,0,0,1,1,1,0,2,2.333333
1,5,6,-1.0,1,0,-1,0.0,0.0,0.0,0,...,0,0,0,0,1,0,0,0,0,6.0
2,3,1,-1.0,1,1,0,0.0,0.0,0.0,0,...,0,0,0,0,1,1,0,0,1,2.333333
3,2,2,-1.0,1,1,0,0.0,0.0,0.0,0,...,0,0,0,0,1,1,0,0,1,4.0
4,3,1,-1.0,1,1,0,0.0,0.0,0.0,0,...,0,0,0,0,1,0,2,0,2,2.333333


In [5]:
# Preprocess the test set with the same dpp.dpp entry point used for train.
test = dpp.dpp(pd.read_csv('data/test.csv'))
print(test.shape)

(90067, 88)


# 모델 학습(LightGBM)

1. Grid Search
2. K-FOLD
3. 최종 학습 및 제출 데이터 생성

In [6]:
import warnings
from sklearn.model_selection import GridSearchCV

# Suppress all Python warnings for the rest of the session.
warnings.filterwarnings('ignore')

# Hold out 30% of the rows; the grid search below only sees x_train / y_train.
x_train, x_val, y_train, y_val = train_test_split(
    train.drop("임신 성공 여부", axis=1),
    train["임신 성공 여부"],
    test_size=0.3,
    random_state=42,
)

# Candidate column-subsampling and row-subsampling ratios.
parm_gbm = {"feature_fraction":[0.9, 0.8, 0.7],
            "bagging_fraction":[1.0, 0.9]}
# LightGBM only applies bagging_fraction when bagging_freq is non-zero.
# With the default bagging_freq=0 both bagging_fraction candidates would train
# identical models, so bagging is enabled here to make the search meaningful.
gbm = LGBMClassifier(max_depth=20, n_estimators=100, num_leaves=20, random_state=42,
                     bagging_freq=1)
# 2-fold cross-validated grid search scored by ROC-AUC.
gscv = GridSearchCV(gbm, parm_gbm, cv=2, scoring='roc_auc')
gscv.fit(x_train, y_train)

[LightGBM] [Info] Number of positive: 23170, number of negative: 66552
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.066232 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 734
[LightGBM] [Info] Number of data points in the train set: 89722, number of used features: 80
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258242 -> initscore=-1.055125
[LightGBM] [Info] Start training from score -1.055125
[LightGBM] [Info] Number of positive: 23170, number of negative: 66553
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.063975 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 732
[LightGBM] [Info] Number of data points in the train set: 89723, number of used features: 80
[LightGBM] [Info] [b

In [7]:
# Report the winning parameter combination and its mean cross-validated score.
for label, value in (
    ('Best Parameters:', gscv.best_params_),
    ('Best Score:', gscv.best_score_),
):
    print(label, value)

Best Parameters: {'bagging_fraction': 1.0, 'feature_fraction': 0.7}
Best Score: 0.7383264794156323


In [8]:
# Evaluate a fixed LightGBM configuration with 5-fold cross-validation.
from sklearn.model_selection import KFold
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score

X = np.array(train.drop('임신 성공 여부', axis=1))
y = train['임신 성공 여부']

# Number of splits, shuffling and seed.
kf = KFold(n_splits = 5, shuffle = True, random_state = 42)

accuracy_history = []
auc_history = []
# Train one LightGBM model per fold and record accuracy and ROC-AUC
# on that fold's held-out rows.
for train_index, test_index in kf.split(X):

    X_train, X_test = X[train_index], X[test_index]
    # KFold yields positional indices, so select rows by position (.iloc);
    # plain y[...] would look the integers up as index labels.
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    model = LGBMClassifier(max_depth= 20, n_estimators= 100, num_leaves= 20,random_state=42,
                      feature_fraction=0.7)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)  # hard class labels, used for accuracy
    # ROC-AUC ranks samples by score, so it needs the predicted probability of
    # the second class (model.classes_[1]); hard labels would collapse the ROC
    # curve to a single operating point and understate the AUC.
    y_score = model.predict_proba(X_test)[:, 1]

    accuracy_history.append(accuracy_score(y_test, y_pred))
    auc_history.append(roc_auc_score(y_test, y_score))

print("각 분할의 정확도 기록 :", accuracy_history)
print("평균 정확도 :", np.mean(accuracy_history))
print("각 분할의 AUC 기록 :", auc_history)
print("평균 AUC :", np.mean(auc_history))


[LightGBM] [Info] Number of positive: 53102, number of negative: 151978
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.208300 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 773
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 80
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258933 -> initscore=-1.051521
[LightGBM] [Info] Start training from score -1.051521
[LightGBM] [Info] Number of positive: 52877, number of negative: 152204
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.152517 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 775
[LightGBM] [Info] Number of data points in the train set: 205081, number of used features: 81
[LightGBM] [Info

In [9]:
# Hyperparameters of the final model, trained on every labelled row.
# NOTE(review): feature_fraction here (0.8) differs from the 0.7 used in the
# K-fold evaluation cell, and LightGBM ignores bagging_fraction unless
# bagging_freq is non-zero - confirm both are intended.
final_params = dict(
    max_depth=20,
    n_estimators=100,
    num_leaves=20,
    random_state=42,
    feature_fraction=0.8,
    bagging_fraction=0.9,
)
clf = LGBMClassifier(**final_params)

# Features are all columns except the target; the target is the label column.
X_train = train.drop(columns="임신 성공 여부")
y_train = train["임신 성공 여부"]
clf.fit(X_train, y_train)

[LightGBM] [Info] Number of positive: 66228, number of negative: 190123
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.271225 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 779
[LightGBM] [Info] Number of data points in the train set: 256351, number of used features: 81
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258349 -> initscore=-1.054568
[LightGBM] [Info] Start training from score -1.054568


In [10]:
# predict_proba returns one column per class in clf.classes_ order;
# keep the second column (index 1) as the submitted probability.
class_probabilities = clf.predict_proba(test)
pred_proba = class_probabilities[:, 1]



In [11]:
# Fill the submission template's 'probability' column with the predictions
# and write the result next to the input data.
sample_submission = pd.read_csv('data/sample_submission.csv').assign(
    probability=pred_proba
)
sample_submission.to_csv('data/final_submission.csv', index=False)