# Default

In [3]:
import os
import numpy as np 
import pandas as pd

import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, RandomizedSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor

from sklearn.pipeline import make_pipeline, Pipeline

from sklearn.preprocessing import OneHotEncoder, LabelEncoder, MinMaxScaler, StandardScaler

import tensorflow as tf
import json

from keras.models import Sequential
from keras.layers import Dense
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers import LSTM

from statsmodels.graphics.mosaicplot import mosaic
from statistics import stdev

import xgboost as xgb 
import lightgbm as lgb

import itertools

from korean_lunar_calendar import KoreanLunarCalendar
from pycaret.classification import *
from pycaret.regression import *
from time import time

from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn import metrics
from sklearn.metrics import *

# Font setup (Windows): use the NanumGothic family for all plot text.
# NOTE(review): presumably chosen so Korean labels render; the font must be installed locally.
plt.rcParams['font.family'] = 'NanumGothic'

# Default figure size (width, height) in inches.
plt.rcParams['figure.figsize'] = 12, 6

# Default font size for plot text.
plt.rcParams['font.size'] = 14

# With a custom font set, keep the minus sign from rendering as a broken glyph.
plt.rcParams['axes.unicode_minus'] = False

import seaborn as sns

# Suppress warning messages considered unnecessary.
# NOTE(review): 'ignore' with no category silences every warning, including deprecations.
import warnings
warnings.filterwarnings('ignore')

# Module used for distinguishing seasonal divisions (solar terms).
import datetime
# Date format string: four-digit year, month, day separated by hyphens.
dateformat = '%Y-%m-%d'

In [4]:
# Directory holding the input data files; file names are joined onto it with os.path.join.
BASE_DIR = './data'

In [6]:
# Path of the contest workbook (dam inflow, rainfall and water-level records).
train_path = os.path.join(BASE_DIR, '2021 빅콘테스트_데이터분석분야_퓨처스리그_홍수ZERO_댐유입량,강우,수위데이터_210902_update.xlsx')

data = pd.read_excel(train_path)
# Drop the first row of the sheet and renumber the rows from 0.
# NOTE(review): presumably that row is a secondary header line — confirm against the workbook.
data = data[1:].reset_index(drop=True)

# Flat column names: five identifier columns, the target (inflow), then seven
# measurements for each of the six data groups, prefixed with the group number.
data.columns = ['홍수사상번호', '연', '월', '일', '시간', '유입량', 
                '1_유역평균강수', '1_강우(A지역)', '1_강우(B지역)', '1_강우(C지역)', '1_강우(D지역)', '1_수위(E지역)', '1_수위(D지역)', 
                '2_유역평균강수', '2_강우(A지역)', '2_강우(B지역)', '2_강우(C지역)', '2_강우(D지역)', '2_수위(E지역)', '2_수위(D지역)', 
                '3_유역평균강수', '3_강우(A지역)', '3_강우(B지역)', '3_강우(C지역)', '3_강우(D지역)', '3_수위(E지역)', '3_수위(D지역)',
                '4_유역평균강수', '4_강우(A지역)', '4_강우(B지역)', '4_강우(C지역)', '4_강우(D지역)', '4_수위(E지역)', '4_수위(D지역)',
                '5_유역평균강수', '5_강우(A지역)', '5_강우(B지역)', '5_강우(C지역)', '5_강우(D지역)', '5_수위(E지역)', '5_수위(D지역)',
                '6_유역평균강수', '6_강우(A지역)', '6_강우(B지역)', '6_강우(C지역)', '6_강우(D지역)', '6_수위(E지역)', '6_수위(D지역)']

# Convert every measurement column (position 6 onward) to a numeric dtype.
# Assign by column label rather than `data.iloc[:, 6:] = ...`: since pandas 2.0
# an iloc/loc assignment first tries to write into the existing (object) columns
# in place, which would leave their dtype as object and silently skip the
# conversion. Label assignment replaces the columns, so the new dtype sticks.
numeric_cols = list(data.columns[6:])
data[numeric_cols] = data[numeric_cols].apply(pd.to_numeric)

# 데이터 집단7 만들어보기
- 집단7의 각 변수는 같은 시점의 집단1~6 값 중 최댓값과 최솟값을 하나씩 제외한 나머지 4개 값의 평균입니다.
- 수위(E지역)은 모든 집단에서 같은 값을 가지므로 만들지 않았습니다.

In [9]:
def _trimmed_row_mean(frame):
    """Return the per-row mean after discarding one maximum and one minimum.

    Parameters
    ----------
    frame : pandas.DataFrame
        Numeric columns to aggregate row by row (here: the same measurement
        taken from data groups 1-6).

    Returns
    -------
    numpy.ndarray
        One float per row of ``frame``: the mean of the values that remain
        once the single largest and single smallest entries are removed.
        Rows containing NaN yield NaN.
    """
    values = frame.to_numpy(dtype=float)
    ordered = np.sort(values, axis=1)
    # After sorting, column 0 is the row minimum and the last column the row
    # maximum; average everything in between.
    trimmed = ordered[:, 1:-1].mean(axis=1)
    # np.sort pushes NaN to the end, where it would be dropped as the
    # "maximum"; keep such rows as NaN instead of hiding the missing value.
    trimmed[np.isnan(values).any(axis=1)] = np.nan
    return trimmed


# Measurements that receive a synthetic "group 7" column, in output order.
GROUP7_MEASUREMENTS = ['유역평균강수', '강우(A지역)', '강우(B지역)', '강우(C지역)', '강우(D지역)', '수위(D지역)']
# Numeric prefixes of the existing data groups that are combined.
GROUP_IDS = range(1, 7)

# Build every 7_* column in one pass instead of six copy-pasted loops.
# Indexing by data.index keeps the new columns aligned with `data` in the
# concat below without relying on row labels being 0..n-1.
data_feat_7 = pd.DataFrame(
    {
        f'7_{name}': _trimmed_row_mean(data[[f'{group}_{name}' for group in GROUP_IDS]])
        for name in GROUP7_MEASUREMENTS
    },
    index=data.index,
)

# Original columns followed by the six group-7 features.
data_feat = pd.concat([data, data_feat_7], axis=1)

In [11]:
# Print the column dtypes and non-null counts of the combined frame.
data_feat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3051 entries, 0 to 3050
Data columns (total 58 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   홍수사상번호     3051 non-null   float64
 1   연          3051 non-null   float64
 2   월          3051 non-null   float64
 3   일          3051 non-null   float64
 4   시간         3051 non-null   float64
 5   유입량        2891 non-null   float64
 6   1_유역평균강수   3051 non-null   float64
 7   1_강우(A지역)  3051 non-null   int64  
 8   1_강우(B지역)  3051 non-null   int64  
 9   1_강우(C지역)  3051 non-null   int64  
 10  1_강우(D지역)  3051 non-null   int64  
 11  1_수위(E지역)  3051 non-null   float64
 12  1_수위(D지역)  3051 non-null   float64
 13  2_유역평균강수   3051 non-null   float64
 14  2_강우(A지역)  3051 non-null   int64  
 15  2_강우(B지역)  3051 non-null   int64  
 16  2_강우(C지역)  3051 non-null   int64  
 17  2_강우(D지역)  3051 non-null   int64  
 18  2_수위(E지역)  3051 non-null   float64
 19  2_수위(D지역)  3051 non-null   float64
 20  3_유역평균강수