In [1]:
# 라이브러리 설치
!pip install gdrive_dataset



In [2]:
from gdrivedataset import loader

file_id = "1A3_7oKAjZPntGPxTPxSD-kbfrEszy-6l"
loader.load_from_google_drive(file_id)


data/train.csv
data/test.csv
data/sample_submission.csv



In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
import os

In [4]:
DATA_DIR = 'data'

In [5]:
train = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))

In [6]:
train.head()

Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Seat comfort,Departure/Arrival time convenient,Food and drink,Gate location,Inflight wifi service,Inflight entertainment,Online support,Ease of Online booking,On-board service,Leg room service,Baggage handling,Checkin service,Cleanliness,Online boarding,Departure Delay in Minutes,Arrival Delay in Minutes,target
0,1,Female,disloyal Customer,22,Business travel,Eco,1599,3,0,3,3,4,3,4,4,5,4,4,4,5,4,0,0.0,0
1,2,Female,Loyal Customer,37,Business travel,Business,2810,2,4,4,4,1,4,3,5,5,4,2,1,5,2,18,18.0,0
2,3,Male,Loyal Customer,46,Business travel,Business,2622,1,1,1,1,4,5,5,4,4,4,4,5,4,3,0,0.0,1
3,4,Female,disloyal Customer,24,Business travel,Eco,2348,3,3,3,3,3,3,3,3,2,4,5,3,4,3,10,2.0,0
4,5,Female,Loyal Customer,58,Business travel,Business,105,3,3,3,3,4,4,5,4,4,4,4,4,4,5,0,0.0,1


In [7]:
# 데이터 전처리
train_x = train.drop(["id", "target"], axis=1)
train_y = train['target']

# 라벨 인코딩을 하기 위한 dictionary map 생성 함수
def make_label_map(dataframe):
  label_maps = {}
  for col in dataframe.columns:
    if dataframe[col].dtype == 'object':
      label_map = {'unknown': 0}
      for i, key in enumerate(dataframe[col].unique()):
        label_map[key] = i+1
      label_maps[col] = label_map
  return label_maps

# 각 범주형 변수에 인코딩 값을 부여하는 함수
def label_encoder(dataframe, label_map):
  for col in dataframe.columns:
    if dataframe[col].dtype == 'object':
      dataframe[col] = dataframe[col].map(label_map[col])
      dataframe[col] = dataframe[col].fillna(label_map[col]['unknown'])
  return dataframe

# train 데이터 라벨 인코딩
label_map = make_label_map(train_x)
train_x = label_encoder(train_x, label_map)
train_x.head()

Unnamed: 0,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Seat comfort,Departure/Arrival time convenient,Food and drink,Gate location,Inflight wifi service,Inflight entertainment,Online support,Ease of Online booking,On-board service,Leg room service,Baggage handling,Checkin service,Cleanliness,Online boarding,Departure Delay in Minutes,Arrival Delay in Minutes
0,1,1,22,1,1,1599,3,0,3,3,4,3,4,4,5,4,4,4,5,4,0,0.0
1,1,2,37,1,2,2810,2,4,4,4,1,4,3,5,5,4,2,1,5,2,18,18.0
2,2,2,46,1,2,2622,1,1,1,1,4,5,5,4,4,4,4,5,4,3,0,0.0
3,1,1,24,1,1,2348,3,3,3,3,3,3,3,3,2,4,5,3,4,3,10,2.0
4,1,2,58,1,2,105,3,3,3,3,4,4,5,4,4,4,4,4,4,5,0,0.0


In [9]:
from sklearn.preprocessing import StandardScaler

num_features = ['Age','Flight Distance','Departure Delay in Minutes','Arrival Delay in Minutes']

scaler = StandardScaler()

scaler.fit(train_x[num_features])
train_x[num_features] = scaler.transform(train_x[num_features])

train_x.head()

Unnamed: 0,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Seat comfort,Departure/Arrival time convenient,Food and drink,Gate location,Inflight wifi service,Inflight entertainment,Online support,Ease of Online booking,On-board service,Leg room service,Baggage handling,Checkin service,Cleanliness,Online boarding,Departure Delay in Minutes,Arrival Delay in Minutes
0,1,1,-1.138798,1,1,-0.373641,3,0,3,3,4,3,4,4,5,4,4,4,5,4,-0.346846,-0.352289
1,1,2,-0.145833,1,2,0.804446,2,4,4,4,1,4,3,5,5,4,2,1,5,2,0.052482,0.045978
2,2,2,0.449945,1,2,0.621556,1,1,1,1,4,5,5,4,4,4,4,5,4,3,-0.346846,-0.352289
3,1,1,-1.006402,1,1,0.355003,3,3,3,3,3,3,3,3,2,4,5,3,4,3,-0.124997,-0.308037
4,1,2,1.244317,1,2,-1.827036,3,3,3,3,4,4,5,4,4,4,4,4,4,5,-0.346846,-0.352289


In [12]:
!pip install bayesian-optimization

Collecting bayesian-optimization
  Downloading bayesian-optimization-1.2.0.tar.gz (14 kB)
Building wheels for collected packages: bayesian-optimization
  Building wheel for bayesian-optimization (setup.py) ... [?25l[?25hdone
  Created wheel for bayesian-optimization: filename=bayesian_optimization-1.2.0-py3-none-any.whl size=11685 sha256=52f80c51dea296cb40b87d0dcb90ac10e271b5c1f90c8f462cf9e4a70fa91f46
  Stored in directory: /root/.cache/pip/wheels/fd/9b/71/f127d694e02eb40bcf18c7ae9613b88a6be4470f57a8528c5b
Successfully built bayesian-optimization
Installing collected packages: bayesian-optimization
Successfully installed bayesian-optimization-1.2.0


In [13]:
from sklearn.tree import DecisionTreeClassifier
from bayes_opt import BayesianOptimization
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [14]:
# 의사결정나무의 하이퍼 파라미터의 범위를 dictionary로 지정
# Key는 의사결정나무의 hyperparameter 이름이고, value는 탐색할 범위
parameter_bound = {
    'max_depth': (1, 3), # 나무의 깊이
    'min_samples_split': (10, 30), # 데이터가 분할하는 데 필요한 샘플 데이터의 수
}

In [15]:
# bayesian optimization
def bo(max_depth, min_samples_split):
  params = {
      'max_depth': int(round(max_depth)),
      'min_samples_split': int(round(min_samples_split)),
  }
  clf = DecisionTreeClassifier(**params)

  X_train, X_valid, y_train, y_valid = train_test_split(train_x, train_y, test_size=0.2)

  clf.fit(X_train, y_train)
  score = accuracy_score(y_valid, clf.predict(X_valid))
  return score

In [16]:
BO = BayesianOptimization(f=bo, pbounds=parameter_bound, random_state=0)

In [17]:
BO.maximize(init_points=5, n_iter=5)

|   iter    |  target   | max_depth | min_sa... |
-------------------------------------------------
| [0m 1       [0m | [0m 0.8283  [0m | [0m 2.098   [0m | [0m 24.3    [0m |
| [0m 2       [0m | [0m 0.8183  [0m | [0m 2.206   [0m | [0m 20.9    [0m |
| [0m 3       [0m | [0m 0.8233  [0m | [0m 1.847   [0m | [0m 22.92   [0m |
| [0m 4       [0m | [0m 0.815   [0m | [0m 1.875   [0m | [0m 27.84   [0m |
| [95m 5       [0m | [95m 0.865   [0m | [95m 2.927   [0m | [95m 17.67   [0m |
| [0m 6       [0m | [0m 0.8433  [0m | [0m 3.0     [0m | [0m 16.08   [0m |
| [0m 7       [0m | [0m 0.8383  [0m | [0m 1.631   [0m | [0m 17.66   [0m |
| [0m 8       [0m | [0m 0.8383  [0m | [0m 2.986   [0m | [0m 18.46   [0m |
| [0m 9       [0m | [0m 0.8567  [0m | [0m 3.0     [0m | [0m 17.14   [0m |
| [0m 10      [0m | [0m 0.815   [0m | [0m 2.489   [0m | [0m 17.46   [0m |


In [18]:
# 하이퍼파라미터의 결과값을 불러와 'max_params'라는 변수에 저장
max_params = BO.max['params']

max_params['max_depth'] = int(max_params['max_depth'])
max_params['min_samples_split'] = int(max_params['min_samples_split'])
print('최적 파라미터: ', max_params)

최적 파라미터:  {'max_depth': 2, 'min_samples_split': 17}


In [20]:
# Bayesian Optimization의 결과를 "BO_tuned"라는 변수에 저장
BO_tuned = DecisionTreeClassifier(**max_params)
BO_tuned.fit(train_x, train_y)

DecisionTreeClassifier(max_depth=2, min_samples_split=17)

In [22]:
test = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))
test_x = test.drop(['id'], axis=1)
test_x = label_encoder(test_x, label_map)
test_x[num_features] = scaler.transform(test_x[num_features])

pred = BO_tuned.predict(test_x)

sample_submission = pd.read_csv(os.path.join(DATA_DIR, 'sample_submission.csv'))

sample_submission.target = pred
sample_submission.to_csv('submission.csv', index=False)