# 피처 엔지니어링
## 1.데이터 합치기

In [1]:
import pandas as pd

# Kaggle input directory that holds the competition CSV files
data_path = '/kaggle/input/porto-seguro-safe-driver-prediction/'
# Read the three CSV files in order, each indexed by its "id" column
train, test, submission = (
    pd.read_csv(data_path + file_name, index_col="id")
    for file_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [2]:
# Stack train on top of test with a fresh 0..n-1 index, then remove the
# target column so only feature columns remain
all_data = pd.concat([train, test], ignore_index=True).drop(columns="target")

In [3]:
# Column labels of the combined data as of this cell; the feature-selection
# cells below filter this collection by substring of the column name
all_features = all_data.columns # all features
all_features

Index(['ps_ind_01', 'ps_ind_02_cat', 'ps_ind_03', 'ps_ind_04_cat',
       'ps_ind_05_cat', 'ps_ind_06_bin', 'ps_ind_07_bin', 'ps_ind_08_bin',
       'ps_ind_09_bin', 'ps_ind_10_bin', 'ps_ind_11_bin', 'ps_ind_12_bin',
       'ps_ind_13_bin', 'ps_ind_14', 'ps_ind_15', 'ps_ind_16_bin',
       'ps_ind_17_bin', 'ps_ind_18_bin', 'ps_reg_01', 'ps_reg_02', 'ps_reg_03',
       'ps_car_01_cat', 'ps_car_02_cat', 'ps_car_03_cat', 'ps_car_04_cat',
       'ps_car_05_cat', 'ps_car_06_cat', 'ps_car_07_cat', 'ps_car_08_cat',
       'ps_car_09_cat', 'ps_car_10_cat', 'ps_car_11_cat', 'ps_car_11',
       'ps_car_12', 'ps_car_13', 'ps_car_14', 'ps_car_15', 'ps_calc_01',
       'ps_calc_02', 'ps_calc_03', 'ps_calc_04', 'ps_calc_05', 'ps_calc_06',
       'ps_calc_07', 'ps_calc_08', 'ps_calc_09', 'ps_calc_10', 'ps_calc_11',
       'ps_calc_12', 'ps_calc_13', 'ps_calc_14', 'ps_calc_15_bin',
       'ps_calc_16_bin', 'ps_calc_17_bin', 'ps_calc_18_bin', 'ps_calc_19_bin',
       'ps_calc_20_bin'],
      dtype='object')

## 2.명목형 피처 원-핫 인코딩

In [4]:
from sklearn.preprocessing import OneHotEncoder

# Nominal features are selected by name: every column containing "cat"
cat_features = [feature for feature in all_features if "cat" in feature] # select the nominal features
onehot_encoder = OneHotEncoder() # create the one-hot encoder
# Fit the encoder on the combined (train + test) nominal columns and encode them
encoded_cat_matrix = onehot_encoder.fit_transform(all_data[cat_features])
encoded_cat_matrix

<1488028x184 sparse matrix of type '<class 'numpy.float64'>'
	with 20832392 stored elements in Compressed Sparse Row format>

## 3.파생 피처 추가

In [5]:
# Derived feature: number of missing values per row (cells equal to -1)
all_data["num_missing"] = all_data.eq(-1).sum(axis=1)

In [6]:
# Features that are neither nominal ("cat") nor in the "calc" group,
# followed by the derived num_missing feature
remaining_features = [
    feature
    for feature in all_features
    if not ("cat" in feature or "calc" in feature)
] + ["num_missing"]

In [7]:
# Features belonging to the "ind" group (selected by name)
ind_features = [feature for feature in all_features if "ind" in feature]

# Build mix_ind by joining the string form of every ind feature, each value
# followed by "_"
for position, ind_feature in enumerate(ind_features):
    piece = all_data[ind_feature].astype(str) + "_"
    if position == 0:
        # The first feature creates the column
        all_data["mix_ind"] = piece
    else:
        # Later features are appended to what has been built so far
        all_data["mix_ind"] = all_data["mix_ind"] + piece

In [8]:
# Number of rows for each unique value of one nominal feature
all_data["ps_ind_02_cat"].value_counts()

ps_ind_02_cat
 1    1079327
 2     309747
 3      70172
 4      28259
-1        523
Name: count, dtype: int64

In [9]:
# The same per-value counts, converted to a {value: count} dictionary
all_data["ps_ind_02_cat"].value_counts().to_dict()

{1: 1079327, 2: 309747, 3: 70172, 4: 28259, -1: 523}

In [10]:
# Count encoding: for every nominal feature (and the combined mix_ind
# feature) add a "<feature>_count" column holding how many rows share that
# row's value.
cat_count_features = []

for feature in cat_features+["mix_ind"]:
    count_feature = f'{feature}_count'
    # value_counts() is indexed by the unique values, so Series.map looks each
    # row's value up in it in one vectorised pass instead of calling a Python
    # lambda once per row.
    all_data[count_feature] = all_data[feature].map(all_data[feature].value_counts())
    cat_count_features.append(count_feature)
cat_count_features

['ps_ind_02_cat_count',
 'ps_ind_04_cat_count',
 'ps_ind_05_cat_count',
 'ps_car_01_cat_count',
 'ps_car_02_cat_count',
 'ps_car_03_cat_count',
 'ps_car_04_cat_count',
 'ps_car_05_cat_count',
 'ps_car_06_cat_count',
 'ps_car_07_cat_count',
 'ps_car_08_cat_count',
 'ps_car_09_cat_count',
 'ps_car_10_cat_count',
 'ps_car_11_cat_count',
 'mix_ind_count']

## 4.필요 없는 피처 제거

In [11]:
from scipy import sparse

# Features treated as unnecessary and left out of the model input
drop_features = [
    "ps_ind_14",
    "ps_ind_10_bin",
    "ps_ind_11_bin",
    "ps_ind_12_bin",
    "ps_ind_13_bin",
    "ps_car_14",
]
# remaining_features + cat_count_features, minus the columns in drop_features
selected_columns = remaining_features + cat_count_features
all_data_remaining = all_data[selected_columns].drop(columns=drop_features)
# Combine the data: selected columns as a sparse matrix, with the one-hot
# encoded nominal features appended to the right
all_data_sprs = sparse.hstack(
    [sparse.csr_matrix(all_data_remaining), encoded_cat_matrix],
    format="csr",
)

## 5.데이터 나누기

In [12]:
# all_data was built as train rows followed by test rows, so the first
# num_train rows of the stacked matrix are the training rows
num_train = train.shape[0]
X, X_test = all_data_sprs[:num_train], all_data_sprs[num_train:]
# Target values of the training rows as a NumPy array
y = train["target"].to_numpy()

# 평가지표 계산 함수 작성
## 1.정규화된 지니계수 계산 함수

In [13]:
import numpy as np

def eval_gini(y_true, y_pred):
    """Return the normalized Gini coefficient of ``y_pred`` against ``y_true``.

    The Gini coefficient of the predictions is divided by the Gini
    coefficient of a perfect ordering (``y_true`` sorted by itself), so a
    perfect ranking scores 1.0.

    Args:
        y_true: Array-like of true target values.
        y_pred: Array-like of predicted scores, same shape as ``y_true``.

    Returns:
        The normalized Gini coefficient as a float.

    Raises:
        ValueError: If ``y_true`` and ``y_pred`` have different shapes.

    Note:
        If ``y_true`` sums to zero the Lorenz curves divide by zero and the
        result is not a finite number.
    """
    # Accept lists / tuples as well as arrays
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    # Explicit check instead of `assert`, which is stripped under `python -O`
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, "
            f"got {y_true.shape} and {y_pred.shape}"
        )

    n_samples = y_true.shape[0]  # number of samples
    L_mid = np.linspace(1/n_samples, 1, n_samples)  # values on the diagonal

    # 1) Gini coefficient of the predictions
    pred_order = y_true[y_pred.argsort()]  # y_true ordered by ascending y_pred
    L_pred = np.cumsum(pred_order)/np.sum(pred_order)  # Lorenz curve
    G_pred = np.sum(L_mid - L_pred)

    # 2) Gini coefficient of a perfect prediction
    true_order = y_true[y_true.argsort()]  # y_true ordered by ascending y_true
    L_true = np.cumsum(true_order)/np.sum(true_order)  # Lorenz curve
    G_true = np.sum(L_mid - L_true)

    # 3) Normalized Gini coefficient
    return G_pred/G_true

## 2.LightGBM용 지니계수

In [14]:
def gini(preds, dtrain):
    """Normalized Gini coefficient wrapped as a LightGBM custom metric.

    Args:
        preds: Predicted values handed over by LightGBM.
        dtrain: Dataset being evaluated; its labels are read with
            ``get_label()``.

    Returns:
        A tuple ``("gini", score, True)``; the trailing ``True`` is meant to
        tell LightGBM that a higher score is better.
    """
    score = eval_gini(dtrain.get_label(), preds)
    return "gini", score, True

# 하이퍼파라미터 최적화
## 1.데이터셋 준비

In [15]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split

# Hold out 20% of the training rows as a validation set (fixed seed)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=0)
# LightGBM datasets built from the two splits, for the hyperparameter search
bayes_dtrain = lgb.Dataset(X_train, y_train)
bayes_dvalid = lgb.Dataset(X_valid, y_valid)

## 2.하이퍼파라미터 범위 설정

In [16]:
# Hyperparameter ranges for Bayesian optimization, as (lower, upper) pairs
param_bounds = {"num_leaves": (30, 40),
                "lambda_l1": (0.7, 0.9),
                "lambda_l2": (0.9, 1),
                "feature_fraction": (0.6, 0.7),
                "bagging_fraction": (0.6, 0.9),
                "min_child_samples": (6, 10),
                "min_child_weight": (10, 40)}

# Hyperparameters whose values stay fixed
fixed_params = {"objective": "binary",
                "learning_rate": 0.005,
                "bagging_freq": 1,
                "force_row_wise": True,
                "random_state": 1991}

## 3.(베이지안 최적화용) 평가지표 계산함수 작성

In [17]:
def eval_function(num_leaves, lambda_l1, lambda_l2, feature_fraction, 
                  bagging_fraction, min_child_samples, min_child_weight):
    """Compute the metric to maximize (normalized Gini) for one candidate.

    Trains a LightGBM model on ``bayes_dtrain`` with the given
    hyperparameters merged with ``fixed_params``, then scores its
    predictions on the module-level ``X_valid`` / ``y_valid``.

    Args:
        num_leaves: Candidate value; rounded to the nearest int.
        lambda_l1: Candidate value, passed through unchanged.
        lambda_l2: Candidate value, passed through unchanged.
        feature_fraction: Candidate value, passed through unchanged.
        bagging_fraction: Candidate value, passed through unchanged.
        min_child_samples: Candidate value; rounded to the nearest int.
        min_child_weight: Candidate value, passed through unchanged.

    Returns:
        The normalized Gini coefficient on the validation data.
    """
    # Hyperparameters under Bayesian optimization
    params = {"num_leaves": int(round(num_leaves)),
              "lambda_l1": lambda_l1,
              "lambda_l2": lambda_l2,
              "feature_fraction": feature_fraction,
              "bagging_fraction": bagging_fraction,
              "min_child_samples": int(round(min_child_samples)),
              # NOTE(review): presumably disabled because the same Dataset
              # objects are reused with different min_child_samples values;
              # confirm against the LightGBM parameter docs.
              "feature_pre_filter": False,
              "min_child_weight": min_child_weight}
    params.update(fixed_params)  # add the fixed hyperparameters as well
    print("하이퍼파라미터:", params)

    # Train the LightGBM model. Early stopping and log silencing are passed as
    # callbacks: the `early_stopping_rounds` / `verbose_eval` keyword
    # arguments of lgb.train were removed in LightGBM 4.0, and the OOF
    # training cell already uses the callback form.
    lgb_model = lgb.train(params=params,
                          train_set=bayes_dtrain,
                          num_boost_round=2500,
                          valid_sets=[bayes_dvalid],
                          feval=gini,
                          callbacks=[lgb.early_stopping(300, verbose=False),
                                     lgb.log_evaluation(period=0)])
    preds = lgb_model.predict(X_valid)  # predict on the validation data
    gini_score = eval_gini(y_valid, preds)  # normalized Gini coefficient
    print(f'지니계수: {gini_score}\n')

    return gini_score

## 4.최적화 수행

In [18]:
from bayes_opt import BayesianOptimization

# Create the Bayesian optimization object: maximize eval_function over the
# ranges in param_bounds
optimizer = BayesianOptimization(f=eval_function, pbounds=param_bounds, random_state=0)
# Run the optimization (3 initial points, then 6 further iterations)
optimizer.maximize(init_points=3, n_iter=6)

|   iter    |  target   | baggin... | featur... | lambda_l1 | lambda_l2 | min_ch... | min_ch... | num_le... |
-------------------------------------------------------------------------------------------------------------
하이퍼파라미터: {'num_leaves': 34, 'lambda_l1': 0.8205526752143287, 'lambda_l2': 0.9544883182996897, 'feature_fraction': 0.6715189366372419, 'bagging_fraction': 0.7646440511781974, 'min_child_samples': 8, 'min_child_weight': 29.376823391999682, 'feature_pre_filter': False, 'objective': 'binary', 'learning_rate': 0.005, 'bagging_freq': 1, 'force_row_wise': True, 'random_state': 1991}




[LightGBM] [Info] Number of positive: 17383, number of negative: 458786
[LightGBM] [Info] Total Bins 1555
[LightGBM] [Info] Number of data points in the train set: 476169, number of used features: 217
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.036506 -> initscore=-3.273091
[LightGBM] [Info] Start training from score -3.273091
지니계수: 0.2855811556220905

| [0m1        [0m | [0m0.2856   [0m | [0m0.7646   [0m | [0m0.6715   [0m | [0m0.8206   [0m | [0m0.9545   [0m | [0m7.695    [0m | [0m29.38    [0m | [0m34.38    [0m |
하이퍼파라미터: {'num_leaves': 39, 'lambda_l1': 0.7766883037651555, 'lambda_l2': 0.9791725038082665, 'feature_fraction': 0.6963662760501029, 'bagging_fraction': 0.867531900234624, 'min_child_samples': 8, 'min_child_weight': 27.04133683281797, 'feature_pre_filter': False, 'objective': 'binary', 'learning_rate': 0.005, 'bagging_freq': 1, 'force_row_wise': True, 'random_state': 1991}
[LightGBM] [Info] Number of positive: 17383, number of negative: 458786
[LightGB



[LightGBM] [Info] Total Bins 1555
[LightGBM] [Info] Number of data points in the train set: 476169, number of used features: 217
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.036506 -> initscore=-3.273091
[LightGBM] [Info] Start training from score -3.273091
지니계수: 0.2828993761731121

| [0m4        [0m | [0m0.2829   [0m | [0m0.8978   [0m | [0m0.6594   [0m | [0m0.8445   [0m | [0m0.9234   [0m | [0m8.619    [0m | [0m10.55    [0m | [0m30.09    [0m |
하이퍼파라미터: {'num_leaves': 37, 'lambda_l1': 0.7738449330497988, 'lambda_l2': 0.9032695189818599, 'feature_fraction': 0.6606341064409726, 'bagging_fraction': 0.7666713964943057, 'min_child_samples': 9, 'min_child_weight': 29.306172421380474, 'feature_pre_filter': False, 'objective': 'binary', 'learning_rate': 0.005, 'bagging_freq': 1, 'force_row_wise': True, 'random_state': 1991}
[LightGBM] [Info] Number of positive: 17383, number of negative: 458786




[LightGBM] [Info] Total Bins 1555
[LightGBM] [Info] Number of data points in the train set: 476169, number of used features: 217
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.036506 -> initscore=-3.273091
[LightGBM] [Info] Start training from score -3.273091
지니계수: 0.28513273331754563

| [0m5        [0m | [0m0.2851   [0m | [0m0.7667   [0m | [0m0.6606   [0m | [0m0.7738   [0m | [0m0.9033   [0m | [0m8.769    [0m | [0m29.31    [0m | [0m36.6     [0m |
하이퍼파라미터: {'num_leaves': 33, 'lambda_l1': 0.878140825240546, 'lambda_l2': 0.9, 'feature_fraction': 0.6949207801131031, 'bagging_fraction': 0.6580631827594777, 'min_child_samples': 10, 'min_child_weight': 35.85667779964393, 'feature_pre_filter': False, 'objective': 'binary', 'learning_rate': 0.005, 'bagging_freq': 1, 'force_row_wise': True, 'random_state': 1991}
[LightGBM] [Info] Number of positive: 17383, number of negative: 458786




[LightGBM] [Info] Total Bins 1555
[LightGBM] [Info] Number of data points in the train set: 476169, number of used features: 217
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.036506 -> initscore=-3.273091
[LightGBM] [Info] Start training from score -3.273091
지니계수: 0.28531708475434286

| [0m6        [0m | [0m0.2853   [0m | [0m0.6581   [0m | [0m0.6949   [0m | [0m0.8781   [0m | [0m0.9      [0m | [0m9.826    [0m | [0m35.86    [0m | [0m32.8     [0m |
하이퍼파라미터: {'num_leaves': 37, 'lambda_l1': 0.8433793375135147, 'lambda_l2': 0.9479651949974717, 'feature_fraction': 0.6859622896374784, 'bagging_fraction': 0.8362539818721497, 'min_child_samples': 6, 'min_child_weight': 39.77484183530247, 'feature_pre_filter': False, 'objective': 'binary', 'learning_rate': 0.005, 'bagging_freq': 1, 'force_row_wise': True, 'random_state': 1991}
[LightGBM] [Info] Number of positive: 17383, number of negative: 458786




[LightGBM] [Info] Total Bins 1555
[LightGBM] [Info] Number of data points in the train set: 476169, number of used features: 217
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.036506 -> initscore=-3.273091
[LightGBM] [Info] Start training from score -3.273091
지니계수: 0.2854766974907317

| [0m7        [0m | [0m0.2855   [0m | [0m0.8363   [0m | [0m0.686    [0m | [0m0.8434   [0m | [0m0.948    [0m | [0m6.002    [0m | [0m39.77    [0m | [0m36.8     [0m |
하이퍼파라미터: {'num_leaves': 30, 'lambda_l1': 0.7243619242443197, 'lambda_l2': 0.9, 'feature_fraction': 0.6, 'bagging_fraction': 0.6, 'min_child_samples': 10, 'min_child_weight': 27.951241679061347, 'feature_pre_filter': False, 'objective': 'binary', 'learning_rate': 0.005, 'bagging_freq': 1, 'force_row_wise': True, 'random_state': 1991}
[LightGBM] [Info] Number of positive: 17383, number of negative: 458786




[LightGBM] [Info] Total Bins 1555
[LightGBM] [Info] Number of data points in the train set: 476169, number of used features: 217
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.036506 -> initscore=-3.273091
[LightGBM] [Info] Start training from score -3.273091
지니계수: 0.28455469364758784

| [0m8        [0m | [0m0.2846   [0m | [0m0.6      [0m | [0m0.6      [0m | [0m0.7244   [0m | [0m0.9      [0m | [0m10.0     [0m | [0m27.95    [0m | [0m30.0     [0m |
하이퍼파라미터: {'num_leaves': 36, 'lambda_l1': 0.7, 'lambda_l2': 1.0, 'feature_fraction': 0.7, 'bagging_fraction': 0.9, 'min_child_samples': 6, 'min_child_weight': 33.90131741687068, 'feature_pre_filter': False, 'objective': 'binary', 'learning_rate': 0.005, 'bagging_freq': 1, 'force_row_wise': True, 'random_state': 1991}
[LightGBM] [Info] Number of positive: 17383, number of negative: 458786




[LightGBM] [Info] Total Bins 1555
[LightGBM] [Info] Number of data points in the train set: 476169, number of used features: 217
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.036506 -> initscore=-3.273091
[LightGBM] [Info] Start training from score -3.273091
지니계수: 0.2840251406982248

| [0m9        [0m | [0m0.284    [0m | [0m0.9      [0m | [0m0.7      [0m | [0m0.7      [0m | [0m1.0      [0m | [0m6.0      [0m | [0m33.9     [0m | [0m36.05    [0m |


## 5.결과 확인

In [19]:
# Hyperparameters at which the evaluation function scored highest
max_params = optimizer.max["params"]
max_params

{'bagging_fraction': 0.6213108174593661,
 'feature_fraction': 0.608712929970154,
 'lambda_l1': 0.7040436794880651,
 'lambda_l2': 0.9832619845547939,
 'min_child_samples': 9.112627003799401,
 'min_child_weight': 36.10036444740457,
 'num_leaves': 39.78618342232764}

In [20]:
# Convert the integer-valued hyperparameters: round each one to an int
for int_param in ("num_leaves", "min_child_samples"):
    max_params[int_param] = int(round(max_params[int_param]))
# Add the hyperparameters whose values were fixed
max_params.update(fixed_params)
max_params

{'bagging_fraction': 0.6213108174593661,
 'feature_fraction': 0.608712929970154,
 'lambda_l1': 0.7040436794880651,
 'lambda_l2': 0.9832619845547939,
 'min_child_samples': 9,
 'min_child_weight': 36.10036444740457,
 'num_leaves': 40,
 'objective': 'binary',
 'learning_rate': 0.005,
 'bagging_freq': 1,
 'force_row_wise': True,
 'random_state': 1991}

# 모델 훈련 및 성능 검증
## 1.최적 하이퍼파라미터를 이용해 LightGBM 훈련

In [21]:
from sklearn.model_selection import StratifiedKFold

# Stratified K-fold cross-validator (5 shuffled folds, fixed seed)
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=1991)
# 1-D array for the out-of-fold (OOF) predicted probabilities of the
# validation rows, one entry per training row
oof_val_preds = np.zeros(X.shape[0])
# 1-D array for the predicted probabilities of the test rows produced by the
# OOF-trained models
oof_test_preds = np.zeros(X_test.shape[0])

In [22]:
# Train, validate and predict in out-of-fold (OOF) fashion: one model per fold
for idx, (train_idx, valid_idx) in enumerate(folds.split(X, y)):
    # Banner that separates the output of each fold
    print('#'*40, f'폴드 {idx+1} / 폴드 {folds.n_splits}', '#'*40)

    # Rows of this fold used for training and for validation
    X_train, X_valid = X[train_idx], X[valid_idx]
    y_train, y_valid = y[train_idx], y[valid_idx]
    # LightGBM datasets for the two splits
    dtrain = lgb.Dataset(X_train, y_train)
    dvalid = lgb.Dataset(X_valid, y_valid)

    # Early-stopping callback with a patience of 300 rounds
    early_stopper = lgb.early_stopping(300, first_metric_only=False)
    # Train the LightGBM model with the tuned hyperparameters
    lgb_model = lgb.train(
        params=max_params,
        train_set=dtrain,
        num_boost_round=2500,
        valid_sets=[dvalid],  # note: must be passed as a list
        feval=gini,           # custom evaluation metric
        callbacks=[early_stopper],
    )

    # Predict the validation rows of this fold for model evaluation
    oof_val_preds[valid_idx] += lgb_model.predict(X_valid)
    # Each fold contributes 1/n_splits of the averaged test prediction
    oof_test_preds += lgb_model.predict(X_test)/folds.n_splits
    # Normalized Gini coefficient of the validation predictions
    gini_score = eval_gini(y_valid, oof_val_preds[valid_idx])
    print(f'폴드 {idx+1} 지니계수: {gini_score}\n')

######################################## 폴드 1 / 폴드 5 ########################################
[LightGBM] [Info] Number of positive: 17355, number of negative: 458814
[LightGBM] [Info] Total Bins 1554
[LightGBM] [Info] Number of data points in the train set: 476169, number of used features: 216
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.036447 -> initscore=-3.274764
[LightGBM] [Info] Start training from score -3.274764
Training until validation scores don't improve for 300 rounds
Did not meet early stopping. Best iteration is:
[2458]	valid_0's binary_logloss: 0.151355	valid_0's gini: 0.29865
폴드 1 지니계수: 0.2986504843987991

######################################## 폴드 2 / 폴드 5 ########################################
[LightGBM] [Info] Number of positive: 17355, number of negative: 458814
[LightGBM] [Info] Total Bins 1560
[LightGBM] [Info] Number of data points in the train set: 476169, number of used features: 216
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.036447 -> initscore=-3.274764

## 2.OOF 검증 데이터 지니계수

In [23]:
# Normalized Gini coefficient of the OOF validation predictions over all
# training rows
print("OOF 검증 데이터 지니계수:", eval_gini(y, oof_val_preds))

OOF 검증 데이터 지니계수: 0.2889651000887542


# 예측 및 결과 제출

In [24]:
# Fill the submission's target column with the fold-averaged test predictions
submission["target"] = oof_test_preds
# Write the submission file
submission.to_csv('submission.csv')