# Default

In [1]:
import os
import numpy as np 
import pandas as pd

import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt

from scipy import stats

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, RandomizedSearchCV, GroupKFold
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.ensemble import StackingRegressor, ExtraTreesRegressor, RandomForestRegressor

import tensorflow as tf
import json
from statsmodels.stats.outliers_influence import variance_inflation_factor

from keras.models import Sequential
from keras.layers import Dense
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers import LSTM

from statsmodels.graphics.mosaicplot import mosaic
from statistics import stdev

import xgboost as xgb 
import lightgbm as lgb

import itertools

from korean_lunar_calendar import KoreanLunarCalendar
from pycaret.classification import *
from pycaret.regression import *
from time import time

from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn import metrics
from sklearn.metrics import *

from bayes_opt import BayesianOptimization
# from hyperopt import fmin, tpe, hp,Trials
from catboost import Pool, CatBoostRegressor

# Global matplotlib defaults applied to every figure drawn in this notebook.
plt.rcParams.update({
    'font.family': 'NanumGothic',    # font setting (original note: for Windows)
    'figure.figsize': (12, 6),       # default figure size
    'font.size': 14,                 # default text size
    'axes.unicode_minus': False,     # keep the minus sign from rendering as a broken glyph after the font change
})

import seaborn as sns

# Silence warning messages so they do not clutter the cell outputs.
import warnings
warnings.filterwarnings('ignore')

# Module used for telling seasonal divisions (solar terms) apart.
import datetime
dateformat = '%Y-%m-%d'

# scipy
from scipy.stats import boxcox, yeojohnson
from scipy.special import inv_boxcox

In [2]:
# Directory containing the input files that the following cells join paths against.
BASE_DIR = './data'

In [3]:
train_path = os.path.join(BASE_DIR, '2021 빅콘테스트_데이터분석분야_퓨처스리그_홍수ZERO_댐유입량,강우,수위데이터_210902_update.xlsx')

# Read the workbook and discard its first data row
# (presumably a second header row inside the sheet — TODO confirm against the file).
data = pd.read_excel(train_path)
data = data.iloc[1:].reset_index(drop=True)

# Convert the measurement columns (position 6 onward) to numeric dtypes.
# The converted columns are concatenated back as new columns instead of being
# assigned through `data.iloc[:, 6:] = ...`: from pandas 2.0 that form writes
# in place and leaves the columns with their original object dtype.
data = pd.concat(
    [data.iloc[:, :6], data.iloc[:, 6:].apply(pd.to_numeric)],
    axis=1,
)

# Column layout: 6 leading columns followed by 6 numbered groups that each
# repeat the same 7 measurement names, prefixed "1_" .. "6_" (48 columns in total).
id_columns = ['홍수사상번호', '연', '월', '일', '시간', '유입량']
measure_names = ['유역평균강수', '강우(A지역)', '강우(B지역)', '강우(C지역)', '강우(D지역)', '수위(E지역)', '수위(D지역)']
data.columns = id_columns + [
    f'{group}_{name}' for group in range(1, 7) for name in measure_names
]

In [4]:
# Load the weather observations (the CSV is EUC-KR encoded).
weather_path = os.path.join(BASE_DIR, '기상데이터.csv')
weather_data = pd.read_csv(weather_path, encoding='euc-kr')

print('기상데이터 적용 전 :', data.shape)

# Left join on the row index: weather row i is attached to flood-data row i.
data = data.merge(weather_data, how='left', left_index=True, right_index=True)

print('기상데이터 적용 후 :', data.shape)

기상데이터 적용 전 : (3051, 48)
기상데이터 적용 후 : (3051, 52)


# Labeling

In [7]:
# Flood event 26 is held out as the prediction target; every other row is used for labelling.
is_target_event = data['홍수사상번호'] == 26
data_label = data[~is_target_event]
data_target = data[is_target_event]

In [9]:
# Threshold per flood event: half of the difference between the event's
# maximum and minimum inflow.
inflow_by_event = data_label.groupby(['홍수사상번호'])['유입량']
maxmin_diff = (inflow_by_event.max() - inflow_by_event.min()) / 2
maxmin_diff.iloc[0]

10657.65113

In [11]:
# Take an independent copy: a later cell adds a column to `df` with `.loc`,
# which must not write into a slice of `data_label`.
df = data_label[['홍수사상번호', '유입량']].copy()

In [12]:
# Threshold per flood event: half of the event's inflow range (max - min).
inflow_by_event = data_label.groupby(['홍수사상번호'])['유입량']
maxmin_diff = (inflow_by_event.max() - inflow_by_event.min()) / 2

# Look up each row's threshold by its event id rather than by position, so the
# result does not depend on the ids being exactly 1..25 and contiguous.
row_threshold = df['홍수사상번호'].map(maxmin_diff)

# Rows whose inflow exceeds their event threshold get label 1; all other rows
# stay NaN. `assign` builds a new frame, so nothing is written into a slice.
df = df.assign(label_mm=np.where(df['유입량'] > row_threshold, 1.0, np.nan))

In [13]:
# Rows that were not labelled 1 are still missing at this point;
# replacing the missing values with 0 yields a 0/1 label.
df = df.fillna(value=0)
df.loc[:, ['label_mm']]

Unnamed: 0,label_mm
0,0.0
1,0.0
2,0.0
3,0.0
4,0.0
...,...
2886,0.0
2887,0.0
2888,0.0
2889,0.0


In [14]:
df['label_mm'].mean()       # About 27.2% of the rows have an inflow above their event threshold.

0.27187824282255274

In [15]:
# Attach the generated label to the original columns.
# `assign` aligns on the shared row index and overwrites an existing `label_mm`,
# so re-running this cell does not append a duplicate column the way
# `pd.concat(..., axis=1)` would.
data_label = data_label.assign(label_mm=df['label_mm'])

In [16]:
# Display the labelled frame; `label_mm` is now its last column.
data_label

Unnamed: 0,홍수사상번호,연,월,일,시간,유입량,1_유역평균강수,1_강우(A지역),1_강우(B지역),1_강우(C지역),1_강우(D지역),1_수위(E지역),1_수위(D지역),2_유역평균강수,2_강우(A지역),2_강우(B지역),2_강우(C지역),2_강우(D지역),2_수위(E지역),2_수위(D지역),3_유역평균강수,3_강우(A지역),3_강우(B지역),3_강우(C지역),3_강우(D지역),3_수위(E지역),3_수위(D지역),4_유역평균강수,4_강우(A지역),4_강우(B지역),4_강우(C지역),4_강우(D지역),4_수위(E지역),4_수위(D지역),5_유역평균강수,5_강우(A지역),5_강우(B지역),5_강우(C지역),5_강우(D지역),5_수위(E지역),5_수위(D지역),6_유역평균강수,6_강우(A지역),6_강우(B지역),6_강우(C지역),6_강우(D지역),6_수위(E지역),6_수위(D지역),일시,기온,풍속,습도,label_mm
0,1.0,2006.0,7.0,10.0,8.0,189.100000,6.4000,7,7,7,8,2.54,122.56875,6.3000,7,7,7,8,2.54,122.541667,6.3000,7,7,7,8,2.54,122.550000,6.4000,7,7,8,8,2.54,122.675000,6.4000,7,7,8,8,2.54,122.660,6.4000,7,7,8,8,2.54,122.610,2006-07-10 08:00,24.3,2.0,85.0,0.0
1,1.0,2006.0,7.0,10.0,9.0,216.951962,6.3000,7,8,7,8,2.53,122.56250,6.4000,7,8,7,8,2.53,122.550000,6.4000,7,8,7,8,2.53,122.558333,7.3000,7,8,10,10,2.53,122.667857,7.3000,7,8,10,10,2.53,122.648,7.3000,7,8,10,10,2.53,122.600,2006-07-10 09:00,24.6,1.5,83.0,0.0
2,1.0,2006.0,7.0,10.0,10.0,251.424419,6.4000,7,9,7,8,2.53,122.55625,7.3000,7,9,7,8,2.53,122.558333,7.3000,7,9,8,8,2.53,122.566667,8.2000,7,9,10,11,2.53,122.660714,8.2000,7,9,10,11,2.53,122.636,8.2000,7,9,10,11,2.53,122.590,2006-07-10 10:00,25.4,1.6,79.0,0.0
3,1.0,2006.0,7.0,10.0,11.0,302.812199,7.3000,7,10,7,8,2.53,122.55625,8.2000,7,10,8,8,2.53,122.566667,8.2000,7,10,10,10,2.53,122.575000,11.3000,9,10,15,14,2.53,122.653571,11.3000,9,10,15,14,2.53,122.620,11.3000,9,10,15,14,2.53,122.585,2006-07-10 11:00,25.4,0.9,82.0,0.0
4,1.0,2006.0,7.0,10.0,12.0,384.783406,8.2000,7,12,8,10,2.53,122.55625,11.3000,9,12,10,10,2.53,122.575000,11.3000,9,12,10,11,2.53,122.575000,14.4000,12,12,18,16,2.53,122.639286,14.4000,12,12,18,16,2.53,122.604,14.4000,12,12,18,16,2.53,122.575,2006-07-10 12:00,25.3,0.8,81.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2886,25.0,2017.0,7.0,18.0,18.0,513.368437,22.7836,6,0,1,1,3.03,137.16875,8.2586,6,0,1,1,3.03,137.200000,8.2586,6,0,1,1,3.03,137.200000,4.1089,6,0,1,1,3.03,137.050000,3.2841,6,0,1,1,3.03,137.088,1.7366,6,0,1,1,3.03,137.130,2017-07-18 18:00,29.8,2.5,72.0,0.0
2887,25.0,2017.0,7.0,18.0,19.0,502.846843,8.2586,2,0,1,1,3.00,137.18125,4.1089,2,0,1,1,3.00,137.200000,4.1089,2,0,1,1,3.00,137.200000,3.3854,2,0,1,1,3.00,137.067857,2.7514,2,0,1,1,3.00,137.104,1.7366,2,0,1,1,3.00,137.145,2017-07-18 19:00,29.1,1.9,76.0,0.0
2888,25.0,2017.0,7.0,18.0,20.0,491.954805,4.1089,1,0,1,1,2.98,137.18750,3.3854,1,0,1,1,2.98,137.200000,3.3854,1,0,1,1,2.98,137.200000,3.2841,1,0,1,1,2.98,137.085714,2.1142,1,0,1,1,2.98,137.120,1.7366,1,0,1,1,2.98,137.155,2017-07-18 20:00,28.4,0.6,80.0,0.0
2889,25.0,2017.0,7.0,18.0,21.0,481.103083,3.3854,1,0,1,1,2.96,137.19375,3.2841,1,0,1,1,2.96,137.200000,3.2841,1,0,1,1,2.96,137.208333,2.7514,1,0,1,1,2.96,137.103571,1.8734,1,0,1,1,2.96,137.136,1.7366,1,0,1,1,2.96,137.165,2017-07-18 21:00,27.4,0.6,92.0,0.0


# pycaret을 이용한 target데이터 labeling 예측

- 홍수사상 26은 유입량이 결측이기 때문에, 나머지 사상의 유입량을 제외한 정보로 'label_mm'을 학습한 뒤, pycaret을 이용하여 예측값을 홍수사상 26에 적용시킨다.

In [17]:
from pycaret.classification import *
from time import time

In [18]:
# Drop the inflow column so the classifier is trained without it
# (inflow is what the label was derived from and is missing for event 26).
data_mm = data_label.drop(columns=['유입량'])
target_mm = data_target.drop(columns=['유입량'])

cell_start_time = time()
# Use `label_mm` as the classification target.
# `session_id` pins PyCaret's random seed so the split and the models are
# reproducible across runs; 6158 is the seed shown in this cell's saved output.
clf = setup(data=data_mm, target='label_mm', silent=True, session_id=6158)
cell_end_time = time()
print("CELL RUN TIME : ",cell_end_time - cell_start_time)

Unnamed: 0,Description,Value
0,session_id,6158
1,Target,label_mm
2,Target Type,Binary
3,Label Encoded,"0.0: 0, 1.0: 1"
4,Original Data,"(2891, 52)"
5,Missing Values,False
6,Numeric Features,50
7,Categorical Features,0
8,Ordinal Features,False
9,High Cardinality Features,False


CELL RUN TIME :  1.2508771419525146


In [19]:
cell_start_time = time()
# Cross-validate the candidate classifiers with 5 folds, rank them by recall
# (metrics rounded to 3 decimals) and keep the five best.
top5_models = compare_models(
    sort='Recall',
    n_select=5,
    fold=5,
    round=3,
)
cell_end_time = time()
print("CELL RUN TIME : ",cell_end_time - cell_start_time)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.981,0.997,0.962,0.969,0.965,0.952,0.952,0.064
xgboost,Extreme Gradient Boosting,0.976,0.996,0.962,0.95,0.955,0.939,0.939,0.184
lightgbm,Light Gradient Boosting Machine,0.979,0.997,0.962,0.96,0.961,0.946,0.946,0.038
rf,Random Forest Classifier,0.975,0.996,0.945,0.963,0.954,0.937,0.937,0.076
gbc,Gradient Boosting Classifier,0.966,0.993,0.932,0.942,0.937,0.913,0.914,0.144
knn,K Neighbors Classifier,0.944,0.986,0.896,0.899,0.897,0.858,0.859,0.548
dt,Decision Tree Classifier,0.945,0.926,0.885,0.908,0.896,0.858,0.859,0.008
ada,Ada Boost Classifier,0.944,0.983,0.879,0.91,0.894,0.856,0.856,0.05
lr,Logistic Regression,0.893,0.936,0.751,0.835,0.791,0.719,0.721,0.82
lda,Linear Discriminant Analysis,0.886,0.941,0.724,0.833,0.774,0.699,0.702,0.016


CELL RUN TIME :  19.820796489715576


상위 3개의 모델 생성

In [35]:
cell_start_time = time()
# Train the Extra Trees classifier with 5-fold cross-validation and start the
# list of created models with it.
model_et = create_model('et', fold=5)
total_models = [model_et]
cell_end_time = time()
print("CELL RUN TIME : ",cell_end_time - cell_start_time)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9827,0.9978,0.9636,0.9725,0.968,0.9562,0.9562
1,0.9852,0.9986,0.9636,0.9815,0.9725,0.9623,0.9624
2,0.9802,0.9983,0.9633,0.9633,0.9633,0.9498,0.9498
3,0.9802,0.9976,0.9817,0.9469,0.964,0.9503,0.9506
4,0.9777,0.9947,0.9358,0.9808,0.9577,0.9426,0.9431
Mean,0.9812,0.9974,0.9616,0.969,0.9651,0.9523,0.9524
SD,0.0025,0.0014,0.0147,0.0129,0.0049,0.0066,0.0065


CELL RUN TIME :  0.7385349273681641


In [36]:
cell_start_time = time()
# Train the XGBoost classifier with 5-fold cross-validation and add it to the
# list of created models.
model_xgb = create_model('xgboost', fold=5)
total_models += [model_xgb]
cell_end_time = time()
print("CELL RUN TIME : ",cell_end_time - cell_start_time)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9728,0.998,0.9727,0.9304,0.9511,0.9323,0.9328
1,0.9753,0.9935,0.9545,0.9545,0.9545,0.9376,0.9376
2,0.9802,0.9981,0.9908,0.9391,0.9643,0.9506,0.9513
3,0.9752,0.9967,0.9633,0.9459,0.9545,0.9375,0.9376
4,0.9752,0.9956,0.9266,0.9806,0.9528,0.9361,0.9368
Mean,0.9758,0.9964,0.9616,0.9501,0.9555,0.9388,0.9392
SD,0.0024,0.0017,0.0212,0.0172,0.0046,0.0062,0.0063


CELL RUN TIME :  1.6542713642120361


In [37]:
cell_start_time = time()
# Train the LightGBM classifier with 5-fold cross-validation and add it to the
# list of created models.
model_lgbm = create_model('lightgbm', fold=5)
total_models += [model_lgbm]
cell_end_time = time()
print("CELL RUN TIME : ",cell_end_time - cell_start_time)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9802,0.9979,0.9818,0.9474,0.9643,0.9506,0.9509
1,0.9728,0.9952,0.9455,0.9541,0.9498,0.9312,0.9312
2,0.9852,0.9989,0.9725,0.9725,0.9725,0.9623,0.9623
3,0.9777,0.9975,0.9725,0.9464,0.9593,0.9439,0.9441
4,0.9777,0.9956,0.9358,0.9808,0.9577,0.9426,0.9431
Mean,0.9787,0.997,0.9616,0.9602,0.9607,0.9461,0.9463
SD,0.004,0.0014,0.0177,0.0139,0.0075,0.0102,0.0102


CELL RUN TIME :  0.8887083530426025


In [38]:
# Show the estimators collected by the three create_model cells.
total_models

[ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                      criterion='gini', max_depth=None, max_features='auto',
                      max_leaf_nodes=None, max_samples=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
                      oob_score=False, random_state=6158, verbose=0,
                      warm_start=False),
 XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
               colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
               importance_type='gain', interaction_constraints='',
               learning_rate=0.300000012, max_delta_step=0, max_depth=6,
               min_child_weight=1, missing=nan, monotone_constraints='()',
               n_estimators=100, n_jobs=-1, num_parallel_tree=1,
               objective='binary:log

In [39]:
# Model tuning: search hyperparameters with 5-fold CV, optimising recall.
# NOTE(review): the saved output shows a tuned mean Recall of 0.9343, below the
# untuned 0.9616; with choose_better=True the untuned model is presumably the
# one returned — confirm against the PyCaret version in use.
model_et = tune_model(model_et, fold=5, optimize = 'Recall', choose_better = True)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.8741,0.9729,0.9636,0.6928,0.8061,0.7165,0.7379
1,0.8667,0.9502,0.8636,0.709,0.7787,0.6846,0.6914
2,0.916,0.9774,0.9633,0.7778,0.8607,0.8016,0.8109
3,0.8837,0.9678,0.945,0.7153,0.8142,0.7319,0.7469
4,0.8936,0.9715,0.9358,0.7391,0.8259,0.7508,0.7616
Mean,0.8868,0.968,0.9343,0.7268,0.8171,0.7371,0.7498
SD,0.0172,0.0094,0.0369,0.0295,0.0268,0.0389,0.0386


In [40]:
# Tune LightGBM with 5-fold CV, optimising recall.
# NOTE(review): the saved output shows a tuned mean Recall of 0.9452, below the
# untuned 0.9616; with choose_better=True the untuned model is presumably kept.
model_lgbm = tune_model(model_lgbm, fold=5, optimize = 'Recall', choose_better = True)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9704,0.9949,0.9364,0.9537,0.945,0.9247,0.9248
1,0.963,0.99,0.9182,0.9439,0.9309,0.9056,0.9057
2,0.9778,0.9981,0.9817,0.9386,0.9596,0.9443,0.9448
3,0.9678,0.9947,0.9541,0.9286,0.9412,0.919,0.9192
4,0.9777,0.9949,0.9358,0.9808,0.9577,0.9426,0.9431
Mean,0.9713,0.9945,0.9452,0.9491,0.9469,0.9273,0.9275
SD,0.0058,0.0026,0.0215,0.0178,0.0107,0.0146,0.0148


In [41]:
# Tune XGBoost with 5-fold CV, optimising recall.
# NOTE(review): the saved output shows mean Recall 0.9982 but Precision 0.4461
# and Accuracy 0.6638 for the tuned model, far below the untuned 0.9501 / 0.9758.
# Because only Recall is compared, choose_better=True presumably keeps this
# tuned model — confirm that this trade-off is intended.
model_xgb = tune_model(model_xgb, fold=5, optimize = 'Recall', choose_better = True)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.6716,0.923,1.0,0.4527,0.6232,0.3982,0.4986
1,0.684,0.9083,0.9909,0.4619,0.6301,0.4123,0.5055
2,0.6765,0.9365,1.0,0.4542,0.6246,0.404,0.5032
3,0.6411,0.9183,1.0,0.4291,0.6006,0.3582,0.4671
4,0.646,0.9306,1.0,0.4325,0.6039,0.3645,0.4721
Mean,0.6638,0.9234,0.9982,0.4461,0.6165,0.3875,0.4893
SD,0.0171,0.0098,0.0036,0.0129,0.0119,0.0219,0.0163


In [43]:
# Score each tuned model with predict_model (no data argument) and display the
# scored frame together with its metrics.
tuned_models = [model_et, model_lgbm, model_xgb]
for model in tuned_models:
    holdout_scored = predict_model(model)
    display(holdout_scored)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Extra Trees Classifier,0.9735,0.9904,0.9456,0.9576,0.9516,0.9333,0.9334


Unnamed: 0,홍수사상번호,연,월,일,시간,1_강우(A지역),1_강우(B지역),1_강우(C지역),1_강우(D지역),2_강우(C지역),...,일시_hour_3,일시_hour_4,일시_hour_5,일시_hour_6,일시_hour_7,일시_hour_8,일시_hour_9,label_mm,Label,Score
0,20.0,2012.0,8.0,21.0,2.0,21.0,5.0,3.0,4.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.00
1,15.0,2011.0,7.0,13.0,5.0,63.0,26.0,48.0,44.0,48.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.00
2,22.0,2013.0,7.0,16.0,13.0,322.0,205.0,12.0,87.0,9.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.00
3,24.0,2017.0,7.0,3.0,18.0,156.0,135.0,108.0,48.0,107.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.89
4,17.0,2011.0,8.0,21.0,7.0,0.0,0.0,6.0,13.0,6.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
863,20.0,2012.0,8.0,16.0,7.0,30.0,39.0,57.0,43.0,58.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.92
864,8.0,2009.0,7.0,14.0,8.0,219.0,126.0,64.0,38.0,62.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.95
865,9.0,2009.0,7.0,19.0,6.0,93.0,80.0,21.0,31.0,21.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.98
866,20.0,2012.0,8.0,25.0,16.0,13.0,11.0,7.0,4.0,7.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.00


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Light Gradient Boosting Machine,0.97,0.9903,0.9331,0.9571,0.9449,0.9244,0.9245


Unnamed: 0,홍수사상번호,연,월,일,시간,1_강우(A지역),1_강우(B지역),1_강우(C지역),1_강우(D지역),2_강우(C지역),...,일시_hour_3,일시_hour_4,일시_hour_5,일시_hour_6,일시_hour_7,일시_hour_8,일시_hour_9,label_mm,Label,Score
0,20.0,2012.0,8.0,21.0,2.0,21.0,5.0,3.0,4.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0000
1,15.0,2011.0,7.0,13.0,5.0,63.0,26.0,48.0,44.0,48.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.9998
2,22.0,2013.0,7.0,16.0,13.0,322.0,205.0,12.0,87.0,9.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.9997
3,24.0,2017.0,7.0,3.0,18.0,156.0,135.0,108.0,48.0,107.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.9994
4,17.0,2011.0,8.0,21.0,7.0,0.0,0.0,6.0,13.0,6.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
863,20.0,2012.0,8.0,16.0,7.0,30.0,39.0,57.0,43.0,58.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.9992
864,8.0,2009.0,7.0,14.0,8.0,219.0,126.0,64.0,38.0,62.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.9999
865,9.0,2009.0,7.0,19.0,6.0,93.0,80.0,21.0,31.0,21.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.9981
866,20.0,2012.0,8.0,25.0,16.0,13.0,11.0,7.0,4.0,7.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0000


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Extreme Gradient Boosting,0.6682,0.9166,0.9874,0.453,0.6211,0.3912,0.4873


Unnamed: 0,홍수사상번호,연,월,일,시간,1_강우(A지역),1_강우(B지역),1_강우(C지역),1_강우(D지역),2_강우(C지역),...,일시_hour_3,일시_hour_4,일시_hour_5,일시_hour_6,일시_hour_7,일시_hour_8,일시_hour_9,label_mm,Label,Score
0,20.0,2012.0,8.0,21.0,2.0,21.0,5.0,3.0,4.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5196
1,15.0,2011.0,7.0,13.0,5.0,63.0,26.0,48.0,44.0,48.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.5151
2,22.0,2013.0,7.0,16.0,13.0,322.0,205.0,12.0,87.0,9.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.5138
3,24.0,2017.0,7.0,3.0,18.0,156.0,135.0,108.0,48.0,107.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.5177
4,17.0,2011.0,8.0,21.0,7.0,0.0,0.0,6.0,13.0,6.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.5196
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
863,20.0,2012.0,8.0,16.0,7.0,30.0,39.0,57.0,43.0,58.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.5082
864,8.0,2009.0,7.0,14.0,8.0,219.0,126.0,64.0,38.0,62.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.5111
865,9.0,2009.0,7.0,19.0,6.0,93.0,80.0,21.0,31.0,21.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.5154
866,20.0,2012.0,8.0,25.0,16.0,13.0,11.0,7.0,4.0,7.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5196


In [44]:
# Finalise the XGBoost model for use on the target data.
# NOTE(review): in the saved predict_model output this model reaches
# Accuracy 0.6682 with scores clustered near 0.5, versus 0.9735 for Extra Trees —
# confirm that XGBoost is the intended final model.
final_model = finalize_model(model_xgb)

- predict를 진행하면, 데이터의 마지막열에 "Label", "Score" 2가지 열이 추가됩니다.
  - Label은 예측한 결과값 입니다.
  - Score는 예측된 Label에 대한 확률입니다(양성 클래스의 확률이 아닙니다). 따라서 Label이 0인 행도 Score가 0.5 이상일 수 있습니다. 양성(1) 클래스의 확률이 0.5를 넘기면 1, 넘기지 못하면 0으로 Label이 적용됩니다.

In [45]:
# Apply the finalised model to the held-out flood event 26 rows (inflow column
# removed); the result carries the added "Label" and "Score" columns.
prediction = predict_model(final_model, data = target_mm)
prediction.head()

Unnamed: 0,홍수사상번호,연,월,일,시간,1_유역평균강수,1_강우(A지역),1_강우(B지역),1_강우(C지역),1_강우(D지역),...,6_강우(C지역),6_강우(D지역),6_수위(E지역),6_수위(D지역),일시,기온,풍속,습도,Label,Score
2891,26.0,2018.0,7.0,1.0,6.0,14.2576,32,0,0,0,...,1,0,1.93,120.515,2018-07-01 06:00,21.8,1.3,100.0,0.0,0.5185
2892,26.0,2018.0,7.0,1.0,7.0,10.9657,20,1,0,0,...,14,8,1.93,120.51,2018-07-01 07:00,21.8,0.9,100.0,0.0,0.5185
2893,26.0,2018.0,7.0,1.0,8.0,7.9442,11,5,0,0,...,19,24,1.92,120.505,2018-07-01 08:00,21.9,0.5,100.0,0.0,0.5181
2894,26.0,2018.0,7.0,1.0,9.0,7.8916,3,11,0,0,...,22,33,1.92,120.5,2018-07-01 09:00,22.1,0.6,100.0,0.0,0.5181
2895,26.0,2018.0,7.0,1.0,10.0,13.3484,4,25,1,8,...,23,36,1.92,120.495,2018-07-01 10:00,22.2,0.8,100.0,0.0,0.5162
