In [1]:
import pandas as pd
import numpy as np

train = pd.read_csv('./data/train.csv', index_col='id')
test = pd.read_csv('./data/test.csv', index_col='id')
submission = pd.read_csv('./data/sample_submission.csv', index_col='id')


In [2]:
all_data = pd.concat([train, test], ignore_index=True)
all_data.drop(columns=['target'], inplace=True)
all_features = all_data.columns

In [3]:
from sklearn.preprocessing import OneHotEncoder
# 명목형 피처 추출
cat_features = [feature for feature in all_features if 'cat' in feature]
onehot_encoder = OneHotEncoder()
encoded_cat_matrix = onehot_encoder.fit_transform(all_data[cat_features])


In [4]:
all_data['num_missing'] = (all_data==-1).sum(axis=1)

In [5]:
# 추가로 제거할 피처
drop_features = ['ps_ind_10_bin', 'ps_ind_11_bin', 'ps_ind_12_bin','ps_ind_13_bin', 'ps_ind_14', 'ps_car_14']

remaining_features = [feature for feature in all_features 
                      if ('cat' not in feature and 'calc' not in feature and feature not in drop_features)]
remaining_features.append('num_missing')

In [6]:
ind_features = [feature for feature in all_features if 'ind' in feature]

is_first_feature = True
for ind_feature in ind_features:
    if is_first_feature:
        all_data['mix_ind'] = all_data[ind_feature].astype(str) + '_'
        is_first_feature = False
    else : 
        all_data['mix_ind'] += all_data[ind_feature].astype(str) + '_'
        

In [7]:
cat_count_features = []
for feature in cat_features + ['mix_ind']:
    val_counts_dict = all_data[feature].value_counts().to_dict()
    all_data[f'{feature}_count'] = all_data[feature].map(lambda x: val_counts_dict[x])
    cat_count_features.append(f'{feature}_count')


In [8]:
from scipy import sparse
drop_features = ['ps_ind_10_bin', 'ps_ind_11_bin', 'ps_ind_12_bin','ps_ind_13_bin', 'ps_ind_14', 'ps_car_14']

all_data_remaining = all_data[remaining_features+cat_count_features]#.drop(drop_features, axis=1)

all_data_sprs = sparse.hstack([sparse.csr_matrix(all_data_remaining), encoded_cat_matrix], format='csr')


In [9]:
num_train = len(train)

X = all_data_sprs[:num_train]
X_test = all_data_sprs[num_train:]
y = train['target'].values

In [10]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=1004)
bayes_dtrain = lgb.Dataset(X_train,y_train)
bayes_dvalid = lgb.Dataset(X_valid,y_valid)



In [30]:
fixed_params = {
    'objective':'binary',
    'learning_rate': 0.005,
    'bagging_freq':1,
    'force_row_wise':True,
    'random_state' : 1004    
}
param_bounds = {
    'num_leaves' : (30,40),
    'lambda_l1' : (0.7, 0.9),
    'lambda_l2' : (0.9, 1),
    'feature_fraction' : (0.6, 0.7),
    'bagging_fraction': (0.6, 0.9),
    'min_child_samples' : (6,10),
    'min_child_weight' : (10,40),
}

In [13]:
def eval_gini(y_true, y_pred):
    assert y_true.shape == y_pred.shape
    
    n_samples = y_true.shape[0]
    L_mid = np.linspace(1 / n_samples, 1, n_samples)
    
    pred_order = y_true[y_pred.argsort()]
    L_pred = np.cumsum(pred_order) / np.sum(pred_order)
    G_pred = np.sum(L_mid - L_pred) 
    
    true_order = y_true[y_true.argsort()]
    L_true = np.cumsum(true_order) / np.sum(true_order)
    G_true = np.sum(L_mid - L_true) 
    return G_pred/G_true

def gini(preds, dtrain):
    labels = dtrain.get_label()
    return 'gini', eval_gini(labels, preds), True

In [34]:
def eval_function(num_leaves, lambda_l1, lambda_l2, feature_fraction, bagging_fraction, min_child_samples, min_child_weight):
    params = {
        'num_leaves' : int(round(num_leaves)),
        'lambda_l1' : lambda_l1,
        'lambda_l2' : lambda_l2,
        'feature_fraction' : feature_fraction,
        'bagging_fraction': bagging_fraction,
        'min_child_samples' : int(round(min_child_samples)),
        'min_child_weight' : min_child_weight,
        'feature_pre_filter': False        
        }
    params.update(fixed_params)
    print(params)
    
    lgb_model = lgb.train(params=params,
                          train_set=bayes_dtrain,
                          num_boost_round=2500,
                          valid_sets=bayes_dvalid,
                          feval=gini,
                          early_stopping_rounds=300,
                          verbose_eval=100,               
                                                    
                         )

    preds = lgb_model.predict(X_valid)
    
    gini_score = eval_gini(y_valid, preds)
    print(f'지니계수 : {gini_score}\n')
                  

!pip install bayesian-optimization


In [35]:
from bayes_opt import BayesianOptimization
optimizer = BayesianOptimization(f=eval_function, # 평가 지표 계산함수
                                 pbounds=param_bounds, # 하이퍼 파라미터 범위
                                 random_state=1004
                                )

In [37]:
optimizer.maximize(init_points=1, n_iter=3)

|   iter    |  target   | baggin... | featur... | lambda_l1 | lambda_l2 | min_ch... | min_ch... | num_le... |
-------------------------------------------------------------------------------------------------------------
{'num_leaves': 39, 'lambda_l1': 0.8214218185782893, 'lambda_l2': 0.9104182029133968, 'feature_fraction': 0.661550001288853, 'bagging_fraction': 0.672679206034253, 'min_child_samples': 8, 'min_child_weight': 29.64768513657372, 'feature_pre_filter': False, 'objective': 'binary', 'learning_rate': 0.005, 'bagging_freq': 1, 'force_row_wise': True, 'random_state': 1004}
[LightGBM] [Info] Number of positive: 17377, number of negative: 458792
[LightGBM] [Info] Total Bins 1558
[LightGBM] [Info] Number of data points in the train set: 476169, number of used features: 217
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.036493 -> initscore=-3.273449
[LightGBM] [Info] Start training from score -3.273449
Training until validation scores don't improve for 300 rounds
[100]	valid_0's 

TypeError: '>=' not supported between instances of 'NoneType' and 'NoneType'

In [21]:
optimizer.max['params']

{'bagging_fraction': 0.6116854915458474,
 'feature_fraction': 0.685649586008824,
 'lambda_l1': 0.8493000615914119,
 'lambda_l2': 0.9863927893244906,
 'min_child_samples': 7.679953216979513,
 'min_child_weight': 12.41449220662578,
 'num_leaves': 32.31636298494031}