<a href="https://colab.research.google.com/github/choeuneheol/python-practice/blob/master/%EB%AC%B8%EC%A0%9C%ED%95%B4%EA%B2%B0%EC%B1%85DAY9(%EC%95%88%EC%A0%84_%EC%9A%B4%EC%A0%84%EC%9E%90_%EC%98%88%EC%B8%A1).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
submission = pd.read_csv('sample_submission.csv')

In [None]:
all_data = pd.concat([train, test], ignore_index=True)
all_data = all_data.drop('target', axis = 1) # 타깃값 제거

all_features = all_data.columns # 전체 피처

In [None]:
from sklearn.preprocessing import OneHotEncoder

# 명목형 피처
cat_features = [feature for feature in all_features if 'cat' in feature]

# 원-핫 인코딩 적용
onehot_encoder = OneHotEncoder()
encoded_cat_matrix = onehot_encoder.fit_transform(all_data[cat_features])

In [None]:
# '데이터 하나당 결측값 개수'를 파생 피처로 추가
all_data['num_missing'] = (all_data==-1).sum(axis=1)

In [None]:
# 명목형 피처, calc 분류의 피처를 제외한 피처
remaining_features = [feature for feature in all_features
                      if ('cat' not in feature and 'calc' not in feature)]

# num_missing을 remaining_features에 추가
remaining_features.append('num_missing')

In [None]:
ind_features = [feature for feature in all_features if 'ind' in feature]

is_first_feature = True
for ind_feature in ind_features:
  if is_first_feature:
    all_data['mix_ind'] = all_data[ind_feature].astype(str) + '_'
    is_first_feature = False
  else:
    all_data['mix_ind'] += all_data[ind_feature].astype(str) + '_'

In [None]:
all_data['mix_ind']

In [None]:
all_data['ps_ind_02_cat'].value_counts()

In [None]:
all_data['ps_ind_02_cat'].value_counts().to_dict()

In [None]:
cat_count_features = []
for feature in cat_features+['mix_ind']:
  val_counts_dict = all_data[feature].value_counts().to_dict()
  all_data[f'{feature}_count'] = all_data[feature].apply(lambda x:
                                                         val_counts_dict[x])
  cat_count_features.append(f'{feature}_count')

In [None]:
cat_count_features

['ps_ind_02_cat_count',
 'ps_ind_04_cat_count',
 'ps_ind_05_cat_count',
 'ps_car_01_cat_count',
 'ps_car_02_cat_count',
 'ps_car_03_cat_count',
 'ps_car_04_cat_count',
 'ps_car_05_cat_count',
 'ps_car_06_cat_count',
 'ps_car_07_cat_count',
 'ps_car_08_cat_count',
 'ps_car_09_cat_count',
 'ps_car_10_cat_count',
 'ps_car_11_cat_count',
 'mix_ind_count']

In [None]:
from scipy import sparse
#필요 없는 피처들
drop_features = ['ps_ind_14','ps_ind_10_bin','ps_ind_11_bin',
                 'ps_ind_12_bin','ps_ind_13_bin','ps_car_14']
                 
#remaining_featrues, cat_count_features에서 drop_features를 제거한 데이터
all_data_remaining = all_data[remaining_features+cat_count_features].drop(drop_features, axis=1)

# 데이터 합치기
all_data_sprs = sparse.hstack([sparse.csr_matrix(all_data_remaining),
                               encoded_cat_matrix],
                              format='csr')

In [None]:
#e데이터 나누기
num_train = len(train)

x = all_data_sprs[:num_train]
x_test = all_data_sprs[num_train:]

y = train['target'].values

In [None]:
#하이퍼파라미터 최적화

import lightgbm as lgb
from sklearn.model_selection import train_test_split

x_train, x_valid, y_train, y_valid = train_test_split(x, y,
                                                      test_size=0.2,
                                                      random_state=0)

bayes_dtrain = lgb.Dataset(x_train, y_train)
bayes_dvalid = lgb.Dataset(x_valid, y_valid)

In [None]:
param_bounds = {'num_leaves':(30, 40),
                'lambda_l1':(0.7, 0.9),
                'lambda_l1':(0.9,1),
                'feature_fraction':(0.6, 0.7),
                'bagging_fraction':(0.6, 0.7),
                'min_child_samples':(6,10),
                'min_child_weight':(10,40)}

fixed_params = {'objective' : 'binary',
                'learning_rate':0.005,
                'bagging_freq':1,
                'force_row_wise':True,
                'random_state':1991}

In [None]:
def eval_function(num_leaves, lambda_l1, lambda_l2, feature_fraction,
                  bagging_fraction, min_child_samples, min_child_weight):
  '''최적화하려는 평가지표(지니계수) 계산 함수'''

  # 베이지안 최적화를 수행할 하이퍼파라미터
  params = {'num_leaves' : int(round(num_leaves)),
            'lambda_l1':lambda_l1,
            'lambda_l2':lambda_l2,
            'feature_fraction':feature_fraction,
            'bagging_fraction':bagging_fraction,
            'min_child_samples':int(round(min_child_samples)),
            'min_child_weight':min_child_weight,
            'feature_pre_filter':False}
  # 고정된 하이퍼파라미터도 추가
  params.update(fixed_params)

  print('하이퍼파라미터:',params)

  # LightGBM 모델 훈련
  lgb_model = lgb.train(params=params,
                        train_set=bayes_dtrain,
                        num_boost_round=2500,
                        valid_sets=bayes_dvalid,
                        feval=gini,
                        early_stopping_rounds=300,
                        verbose_eval=False)
  # 검증 데이터로 예측 수행
  preds = lgb_model.predict(x_valid)

  #지니계수 계산
  gini_score = eval_gini(y_valid, preds)

  print(f'지니계수 : {gini_score}\n')

  return gini_score

In [None]:
#최적화 수행

!pip3 install bayesian-optimization

from bayes_opt import BayesianOptimization

optimizer = BayesianOptimization(f=eval_function,
                                 pbounds=param_bounds,
                                 random_state=0)


In [None]:
from bayes_opt import BayesianOptimization

optimizer = BayesianOptimization(f=eval_function,
                                 pbounds=param_bounds,
                                 random_state=0)

In [19]:
optimizer.maximize(init_points=3, n_iter=6)

|   iter    |  target   | baggin... | featur... | lambda_l1 | min_ch... | min_ch... | num_le... |
-------------------------------------------------------------------------------------------------


TypeError: ignored

In [20]:
# 성능개선 2  : XGBoost 모델

#피처 엔지니어링

def gini(preds, dtrrain):
  labels = dtrain.get_label()
  return 'gini',eval_gini(labels, preds), True

In [21]:
def gini(preds, dtrain):
  labels = dtrain.get_label()
  return 'gini', elval_gini(labels, preds)


In [24]:
#데이터셋 준비
import xgboost as xgb
from sklearn.model_selection import train_test_split

# 8:2 비율로 훈련 데이터, 검증 데이터 분리(베이지안 최적화 수행용)
x_train, x_valid, y_train, y_valid = train_test_split(x,y,
                                                      test_size=0.2,
                                                      random_state=0)

# 베이지안 최적화용 데이터셋
bayes_dtrain = xgb.DMatrix(x_train, y_train)
bayes_dvalid = xgb.DMatrix(x_valid, y_valid)

In [25]:
# 하이퍼파라미터 범위 설정

# 베이지안 최적화를 위한 하이퍼파라미터 범위

param_bounds = {'max_depth':(4,8),
                'subsample':(0.6,0.9),
                'colsample_bytree':(0.7,1.0),
                'min_child_weight':(5,7),
                'gamma':(8,11),
                'reg_alpha':(7,9),
                'reg_lambda':(1.1,1.5),
                'scale_pos_weight':(1.4, 1.6)}

#값이 고정된 하이퍼파라미터
fixed_params = {'bojective':'binary:logistic',
                'learning_rate':0.02,
                'random_state':1991}

In [28]:
#(베이지안 최적화용) 평가지표 계산 함수 작성

def eval_function(max_depth, subsample, colsample_bytree, min_child_weight,
                  reg_alpha, gamma, reg_lambda, scale_pos_weight):
  '''최적화하려는 평가지표(지니계수) 계산 함수'''

  # 베이지안 최적화를 수행할 하이퍼파라미터
  params = {'max_depth':int(round(max_depth)),
            'subsample':subsample,
            'colsample_bytree':colsample_bytree,
            'gamma':gamma,
            'reg_alpha':reg_alpha,
            'reg_lambda':reg_lambda,
            'scale_pos_weight':scale_pos_weight}
  # 값이 고정된 하이퍼 파라미터도 추가
  params.update(fixed_params)

  print('하이퍼파라미터 : ',params)

  # XGBoost 모델 훈련
  xgb_model = xgb.train(params=params,
                        dtrain=bayes_dtrain,
                        num_boost_round=2000,
                        evals=[(bayes_dvalid, 'bayes_dvalid')],
                        maximize=True,
                        feval=gini,
                        early_stopping_rounds=200,
                        verbose_eval=False)
  best_iter = xgb_model.best_iteration

  #검정 데이터로 예측 수행
  preds = xgb_model.predict(bayes_dvalid,
                            iteration_range=(0, best_iter))
  
  #지니계수 계산
  gini_score = eval_gini(y_valid, preds)
  print(f'지니계수: {gini_score}\n')

  return gini_score


In [30]:
#최적화 수행
from bayes_opt import BayesianOptimization

#베이지안 최적화 객체 생성
optiminzer = BayesianOptimization(f=eval_function,
                                  pbounds=param_bounds,
                                  random_state=0)

# 베이지안 최적화 수행
optimizer.maximize(init_points=3, n_iter=6)

|   iter    |  target   | baggin... | featur... | lambda_l1 | min_ch... | min_ch... | num_le... |
-------------------------------------------------------------------------------------------------


TypeError: ignored