In [7]:
# 라이브러리 

import numpy as np
import pandas as pd
import lightgbm as lgb
from tqdm import tqdm
from collections import Counter
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold,KFold
from sklearn.model_selection import train_test_split


import warnings
warnings.filterwarnings('ignore')
np.random.seed(42)

In [4]:
# 데이터 불러오기

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
train = train.drop(['ID_code'],1)
test = test.drop(['ID_code'],1)

# train data의 feature 변수와 target 변수 분리
X = train.drop(['target'], axis=1)
y = train['target']

In [12]:
# 클래스 불균형을 해소하기 위한 oversampling

from imblearn.over_sampling import RandomOverSampler

oversample = RandomOverSampler(sampling_strategy=0.33,random_state= 42)
X_over, y_over = oversample.fit_resample(X, y)

# 샘플링 후 클래스 0과 클래스 1의 갯수 확인 

print(Counter(y_over))

Counter({0: 143922, 1: 47494})


In [13]:
# train - test로 데이터 split

from sklearn.model_selection import train_test_split
train_x, test_x, train_y, test_y = train_test_split(X_over, y_over, test_size=0.2, random_state=42)


In [14]:
# RandomSearchCV()를 통해 최적 파라미터 설정 후 모델 생성

clf = lgb.LGBMClassifier(bagging_fraction=0.85, bagging_freq=1, colsample_bytree=1,
               feature_fraction=1, importance_type='split',
               learning_rate=0.1, max_bin=256, max_depth=1,
               min_child_samples=153, min_child_weight=0.1,
               n_estimators=4000, n_jobs=-1, num_leaves=4, num_threads=8,
               objective='binary', reg_alpha=0.1,
               reg_lambda=0, seed=500, silent=True, subsample=0.8)

#모델 적합

clf.fit(train_x, train_y)
pred = clf.predict(test_x)


# validation set으로 계산한 accuracy와 f1 score

print(clf.score(train_x, train_y))
print(f1_score(test_y, pred, average='macro'))


0.8686427395972103
0.7988496157221074
