In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler, RobustScaler, QuantileTransformer
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split, cross_val_score, KFold, StratifiedKFold
from sklearn.metrics import roc_auc_score

from lightgbm import LGBMClassifier

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [3]:
test_id = test.id
train.drop(['id'], axis=1, inplace=True)
test.drop(['id'] ,axis=1, inplace=True)

X_train = train.drop(['target'], axis=1)
y_train = train.target
X_test = test

In [4]:
h_skew = X_train.loc[:, train.skew() >= 2].columns # with skewed
l_skew = X_train.loc[:, train.skew() < 2].columns # Bimodal

# high skewd columns:
# add mean, median, skew, min, max, var, std
X_train['median_h'] = X_train[h_skew].median(axis=1)
X_test['median_h'] = X_test[h_skew].median(axis=1)
X_train['var_h'] = X_train[h_skew].var(axis=1)
X_test['var_h'] = X_test[h_skew].var(axis=1)

# bimodal columns:

X_train['mean_l'] = X_train[l_skew].mean(axis=1)
X_test['mean_l'] = X_test[l_skew].mean(axis=1)
X_train['std_l'] = X_train[l_skew].std(axis=1)
X_test['std_l'] = X_test[l_skew].std(axis=1)
X_train['median_l'] = X_train[l_skew].median(axis=1)
X_test['median_l'] = X_test[l_skew].median(axis=1)
X_train['skew_l'] = X_train[l_skew].skew(axis=1)
X_test['skew_l'] = X_test[l_skew].skew(axis=1)
X_train['max_l'] = X_train[l_skew].max(axis=1)
X_test['max_l'] = X_test[l_skew].max(axis=1)
X_train['var_l'] = X_train[l_skew].var(axis=1)
X_test['var_l'] = X_test[l_skew].var(axis=1)

In [5]:
X_train.shape, X_test.shape

((600000, 108), (540000, 108))

In [6]:
X_train2 = X_train.iloc[:1000]
y_train2 = y_train[:1000]

---

In [7]:
# 훈련데이터와 테스트 데이터 구축 후 훈련데이터를 축소훈련데이터와 축소테스트데이터로 구분해준다.
xtrain, xtest, ytrain, ytest = train_test_split(X_train2, y_train2, test_size=0.1, random_state=42)

In [8]:
xtrain.shape, xtest.shape, ytrain.shape, ytest.shape

((900, 108), (100, 108), (900,), (100,))

In [9]:
# 사용할 모델은 lgbm이다.
model = LGBMClassifier()
model.get_params()

{'boosting_type': 'gbdt',
 'class_weight': None,
 'colsample_bytree': 1.0,
 'importance_type': 'split',
 'learning_rate': 0.1,
 'max_depth': -1,
 'min_child_samples': 20,
 'min_child_weight': 0.001,
 'min_split_gain': 0.0,
 'n_estimators': 100,
 'n_jobs': -1,
 'num_leaves': 31,
 'objective': None,
 'random_state': None,
 'reg_alpha': 0.0,
 'reg_lambda': 0.0,
 'silent': 'warn',
 'subsample': 1.0,
 'subsample_for_bin': 200000,
 'subsample_freq': 0}

In [10]:
# lgbm의 서치 범위이다.
param_grid = {
    'boosting_type': ['gbdt', 'goss', 'dart'],
    'num_leaves': list(range(20, 150)),
    'learning_rate': list(np.logspace(np.log10(0.005), np.log10(0.5), base = 10, num = 1000)),
    'subsample_for_bin': list(range(20000, 300000, 20000)),
    'min_child_samples': list(range(20, 500, 5)),
    'reg_alpha': list(np.linspace(0, 1)),
    'reg_lambda': list(np.linspace(0, 1)),
    'colsample_bytree': list(np.linspace(0.6, 1, 10)),
    'subsample': list(np.linspace(0.5, 1, 100)),
    'is_unbalance': [True, False]
}

In [11]:
cv = StratifiedKFold(n_splits=5, shuffle=True,
                    random_state=42)

In [12]:
search_model = RandomizedSearchCV(model, param_grid, cv=cv,
                                  scoring='roc_auc',
                                  n_jobs=-1,
                                  verbose=0,n_iter=1
                                 ).fit(xtrain, ytrain,
                                       early_stopping_rounds=15,
                                       eval_set=[(xtest, ytest)],
                                       eval_metric=['AUC'])

OverflowError: Python int too large to convert to C long

In [None]:
pd.DataFrame(search_model.cv_results_)