<a href="https://colab.research.google.com/github/choeuneheol/python-practice/blob/master/%EB%AC%B8%EC%A0%9C%ED%95%B4%EA%B2%B0%EC%B1%85DAY9(%EC%95%88%EC%A0%84_%EC%9A%B4%EC%A0%84%EC%9E%90_%EC%98%88%EC%B8%A1).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
submission = pd.read_csv('sample_submission.csv')

In [3]:
all_data = pd.concat([train, test], ignore_index=True)
all_data = all_data.drop('target', axis = 1) # 타깃값 제거

all_features = all_data.columns # 전체 피처

In [4]:
from sklearn.preprocessing import OneHotEncoder

# 명목형 피처
cat_features = [feature for feature in all_features if 'cat' in feature]

# 원-핫 인코딩 적용
onehot_encoder = OneHotEncoder()
encoded_cat_matrix = onehot_encoder.fit_transform(all_data[cat_features])

In [5]:
# '데이터 하나당 결측값 개수'를 파생 피처로 추가
all_data['num_missing'] = (all_data==-1).sum(axis=1)

In [6]:
# 명목형 피처, calc 분류의 피처를 제외한 피처
remaining_features = [feature for feature in all_features
                      if ('cat' not in feature and 'calc' not in feature)]

# num_missing을 remaining_features에 추가
remaining_features.append('num_missing')

In [7]:
ind_features = [feature for feature in all_features if 'ind' in feature]

is_first_feature = True
for ind_feature in ind_features:
  if is_first_feature:
    all_data['mix_ind'] = all_data[ind_feature].astype(str) + '_'
    is_first_feature = False
  else:
    all_data['mix_ind'] += all_data[ind_feature].astype(str) + '_'

In [8]:
all_data['mix_ind']

0          2_2_5_1_0_0_1_0_0_0_0_0_0_0_11_0_1_0_
1           1_1_7_0_0_0_0_1_0_0_0_0_0_0_3_0_0_1_
2          5_4_9_1_0_0_0_1_0_0_0_0_0_0_12_1_0_0_
3           0_1_2_0_0_1_0_0_0_0_0_0_0_0_8_1_0_0_
4           0_2_0_1_0_1_0_0_0_0_0_0_0_0_9_1_0_0_
                           ...                  
1488023     0_1_6_0_0_0_1_0_0_0_0_0_0_0_2_0_0_1_
1488024    5_3_5_1_0_0_0_1_0_0_0_0_0_0_11_1_0_0_
1488025     0_1_5_0_0_1_0_0_0_0_0_0_0_0_5_0_0_1_
1488026    6_1_5_1_0_0_0_0_1_0_0_0_0_0_13_1_0_0_
1488027    7_1_4_1_0_0_0_0_1_0_0_0_0_0_12_1_0_0_
Name: mix_ind, Length: 1488028, dtype: object

In [9]:
all_data['ps_ind_02_cat'].value_counts()

 1    1079327
 2     309747
 3      70172
 4      28259
-1        523
Name: ps_ind_02_cat, dtype: int64

In [10]:
all_data['ps_ind_02_cat'].value_counts().to_dict()

{-1: 523, 1: 1079327, 2: 309747, 3: 70172, 4: 28259}

In [11]:
cat_count_features = []
for feature in cat_features+['mix_ind']:
  val_counts_dict = all_data[feature].value_counts().to_dict()
  all_data[f'{feature}_count'] = all_data[feature].apply(lambda x:
                                                         val_counts_dict[x])
  cat_count_features.append(f'{feature}_count')

In [12]:
cat_count_features

['ps_ind_02_cat_count',
 'ps_ind_04_cat_count',
 'ps_ind_05_cat_count',
 'ps_car_01_cat_count',
 'ps_car_02_cat_count',
 'ps_car_03_cat_count',
 'ps_car_04_cat_count',
 'ps_car_05_cat_count',
 'ps_car_06_cat_count',
 'ps_car_07_cat_count',
 'ps_car_08_cat_count',
 'ps_car_09_cat_count',
 'ps_car_10_cat_count',
 'ps_car_11_cat_count',
 'mix_ind_count']

In [13]:
from scipy import sparse
#필요 없는 피처들
drop_features = ['ps_ind_14','ps_ind_10_bin','ps_ind_11_bin',
                 'ps_ind_12_bin','ps_ind_13_bin','ps_car_14']
                 
#remaining_featrues, cat_count_features에서 drop_features를 제거한 데이터
all_data_remaining = all_data[remaining_features+cat_count_features].drop(drop_features, axis=1)

# 데이터 합치기
all_data_sprs = sparse.hstack([sparse.csr_matrix(all_data_remaining),
                               encoded_cat_matrix],
                              format='csr')

In [14]:
#e데이터 나누기
num_train = len(train)

x = all_data_sprs[:num_train]
x_test = all_data_sprs[num_train:]

y = train['target'].values

In [15]:
#하이퍼파라미터 최적화

import lightgbm as lgb
from sklearn.model_selection import train_test_split

x_train, x_valid, y_train, y_valid = train_test_split(x, y,
                                                      test_size=0.2,
                                                      random_state=0)

bayes_dtrain = lgb.Dataset(x_train, y_train)
bayes_dvalid = lgb.Dataset(x_valid, y_valid)

In [16]:
param_bounds = {'num_leaves':(30, 40),
                'lambda_l1':(0.7, 0.9),
                'lambda_l1':(0.9,1),
                'feature_fraction':(0.6, 0.7),
                'bagging_fraction':(0.6, 0.7),
                'min_child_samples':(6,10),
                'min_child_weight':(10,40)}

fixed_params = {'objective' : 'binary',
                'learning_rate':0.005,
                'bagging_freq':1,
                'force_row_wise':True,
                'random_state':1991}

In [39]:
def eval_function(num_leaves, lambda_l1, lambda_l2, feature_fraction,
                  bagging_fraction, min_child_samples, min_child_weight):
  '''최적화하려는 평가지표(지니계수) 계산 함수'''

  # 베이지안 최적화를 수행할 하이퍼파라미터
  params = {'num_leaves' : int(round(num_leaves)),
            'lambda_l1':lambda_l1,
            'lambda_l2':lambda_l2,
            'feature_fraction':feature_fraction,
            'bagging_fraction':bagging_fraction,
            'min_child_samples':int(round(min_child_samples)),
            'min_child_weight':min_child_weight,
            'feature_pre_filter':False}
  # 고정된 하이퍼파라미터도 추가
  params.update(fixed_params)

  print('하이퍼파라미터:',params)

  # LightGBM 모델 훈련
  lgb_model = lgb.train(params=params,
                        train_set=bayes_dtrain,
                        num_boost_round=2500,
                        valid_sets=bayes_dvalid,
                        feval=gini,
                        early_stopping_rounds=300,
                        verbose_eval=False)
  # 검증 데이터로 예측 수행
  preds = lgb_model.predict(x_valid)

  #지니계수 계산
  gini_score = eval_gini(y_valid, preds)

  print(f'지니계수 : {gini_score}\n')

  return gini_score

In [40]:
#최적화 수행

!pip3 install bayesian-optimization

from bayes_opt import BayesianOptimization

optimizer = BayesianOptimization(f=eval_function,
                                 pbounds=param_bounds,
                                 random_state=0)


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [41]:
from bayes_opt import BayesianOptimization

optimizer = BayesianOptimization(f=eval_function,
                                 pbounds=param_bounds,
                                 random_state=0)

In [42]:
optimizer.maximize(init_points=3, n_iter=6)

|   iter    |  target   | baggin... | featur... | lambda_l1 | min_ch... | min_ch... | num_le... |
-------------------------------------------------------------------------------------------------


TypeError: ignored