# 대청댐 망간 분석
* Kwater_input.csv 활용
* mn_h, mn_d, mn_c 평균해서 사용

In [1]:
!pip install gdown

!gdown https://drive.google.com/uc?id=13xLgCRGuQLEuCjjmKQ88Y0845Z3Xqj25 --output kwater_input.csv
!gdown https://drive.google.com/uc?id=18JcFLhkKURlSvcmOzaSjvspv15MvQ_3_ --output 대청댐_자동수질측정_2008_2023.csv

!gdown https://drive.google.com/uc?id=1JhYAbpVn7_DtW5PxHuOCvDUYncsY0cST --output NanumGothic.ttf    # 한글 그래프용 나눔 고딕 다운로드
!gdown https://drive.google.com/uc?id=1g3UeYk3SxlO8-Y9kzKCM3DNmajmg4eWR --output NanumGothic.otf
!gdown https://drive.google.com/uc?id=14XaW_5r2zGJ-h_ImK6e7ZE24Q7EBQup- --output water_function.py  # water_function.py 다운로드

Downloading...
From: https://drive.google.com/uc?id=13xLgCRGuQLEuCjjmKQ88Y0845Z3Xqj25
To: /content/kwater_input.csv
100% 94.7k/94.7k [00:00<00:00, 107MB/s]
Downloading...
From: https://drive.google.com/uc?id=18JcFLhkKURlSvcmOzaSjvspv15MvQ_3_
To: /content/대청댐_자동수질측정_2008_2023.csv
100% 55.3M/55.3M [00:00<00:00, 160MB/s]
Access denied with the following error:

 	Cannot retrieve the public link of the file. You may need to change
	the permission to 'Anyone with the link', or have had many accesses. 

You may still be able to access the file from the browser:

	 https://drive.google.com/uc?id=1JhYAbpVn7_DtW5PxHuOCvDUYncsY0cST 

Downloading...
From: https://drive.google.com/uc?id=1g3UeYk3SxlO8-Y9kzKCM3DNmajmg4eWR
To: /content/NanumGothic.otf
100% 2.27M/2.27M [00:00<00:00, 193MB/s]
Downloading...
From: https://drive.google.com/uc?id=14XaW_5r2zGJ-h_ImK6e7ZE24Q7EBQup-
To: /content/water_function.py
100% 5.65k/5.65k [00:00<00:00, 27.8MB/s]


In [2]:
import os
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
from matplotlib import rc
import missingno as msno
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
import xgboost

from itertools import combinations
from tqdm import tqdm

import warnings
# from pandas.core.common import SettingWithCopyWarning
# warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

import joblib
import pickle


from sklearn.impute import KNNImputer
from water_function import basic_chart, buildDataSet_DT
from water_function import total_chart, AL_GradientBoosting, AL_XGBoosting, AL_SVR, AL_GradientBoosting, AL_RandomForest, Performance_index

In [3]:
# matplotlib 한글화
import matplotlib.font_manager as fm
import os
import matplotlib.pyplot as plt

fe = fm.FontEntry(
    fname=r'/content/NanumGothic.otf',
    name='NanumGothic')                        # 이 폰트의 원하는 이름 설정
fm.fontManager.ttflist.insert(0, fe)              # Matplotlib에 폰트 추가
plt.rcParams.update({'font.size': 20, 'font.family': 'NanumGothic'}) # 폰트 설정

### 데이터 수집 및 가공

In [4]:
# 데이터 범위 지정
start_date = '2013-01-07'
end_date = '2023-06-19'

In [5]:
kwater_input = pd.read_csv('./kwater_input.csv', sep=',', encoding='CP949')
daechung = pd.read_csv('./대청댐_자동수질측정_2008_2023.csv', thousands=r',', sep=',', encoding='utf-8')

In [6]:
daechung.replace('-', np.nan, inplace=True)
daechung

Unnamed: 0,수질관측일시,수질측정일시,수심,탁도,저수위,pH,수온,전기전도도,DO,Chl-a,TP,TN,TOC
0,2008-01-15 22:20,2008-01-15 18:00,1,1.0,72.54,7.40,7.00,114.0,8.90,0.0,,,
1,2008-01-15 22:20,2008-01-15 18:01,2,1.0,72.54,7.40,7.00,114.1,9.10,0.0,,,
2,2008-01-15 22:20,2008-01-15 18:02,3,1.2,72.54,7.40,7.00,114.1,9.10,0.0,,,
3,2008-01-15 22:20,2008-01-15 18:03,4,1.6,72.54,7.40,7.00,114.1,8.90,0.0,,,
4,2008-01-15 22:20,2008-01-15 18:04,5,1.2,72.54,7.30,7.00,114.1,8.90,0.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
635661,2023-08-31 14:15,2023-08-10 13:06,10,7.5,74.50,6.73,24.22,118.0,6.94,0.0,,,
635662,2023-08-31 14:15,2023-08-10 13:07,11,7.1,74.50,6.69,23.87,117.0,6.71,0.0,,,
635663,2023-08-31 14:15,2023-08-10 13:07,12,6.8,74.50,6.65,23.64,117.0,6.55,0.0,,,
635664,2023-08-31 14:15,2023-08-10 13:08,13,6.9,74.50,6.63,23.38,117.0,6.42,0.0,,,


In [7]:
# drop
daechung.drop(['TP','TN','TOC'], axis=1, inplace=True)
daechung

Unnamed: 0,수질관측일시,수질측정일시,수심,탁도,저수위,pH,수온,전기전도도,DO,Chl-a
0,2008-01-15 22:20,2008-01-15 18:00,1,1.0,72.54,7.40,7.00,114.0,8.90,0.0
1,2008-01-15 22:20,2008-01-15 18:01,2,1.0,72.54,7.40,7.00,114.1,9.10,0.0
2,2008-01-15 22:20,2008-01-15 18:02,3,1.2,72.54,7.40,7.00,114.1,9.10,0.0
3,2008-01-15 22:20,2008-01-15 18:03,4,1.6,72.54,7.40,7.00,114.1,8.90,0.0
4,2008-01-15 22:20,2008-01-15 18:04,5,1.2,72.54,7.30,7.00,114.1,8.90,0.0
...,...,...,...,...,...,...,...,...,...,...
635661,2023-08-31 14:15,2023-08-10 13:06,10,7.5,74.50,6.73,24.22,118.0,6.94,0.0
635662,2023-08-31 14:15,2023-08-10 13:07,11,7.1,74.50,6.69,23.87,117.0,6.71,0.0
635663,2023-08-31 14:15,2023-08-10 13:07,12,6.8,74.50,6.65,23.64,117.0,6.55,0.0
635664,2023-08-31 14:15,2023-08-10 13:08,13,6.9,74.50,6.63,23.38,117.0,6.42,0.0


In [8]:
# 망강 데이터 평균화하여 사용
mn_data = kwater_input[['tdy_dt','mn_h','mn_d','mn_c']].groupby(['tdy_dt']).mean().dropna().mean(axis='columns')
mn_data.index = pd.to_datetime(mn_data.index)
mn_data.name = 'mn'
mn_data

tdy_dt
2014-02-04    0.026000
2014-02-17    0.013000
2014-02-24    0.006333
2014-03-03    0.026667
2014-03-10    0.026333
                ...   
2023-05-22    0.030667
2023-05-30    0.041333
2023-06-07    0.039667
2023-06-12    0.046000
2023-06-19    0.039333
Name: mn, Length: 484, dtype: float64

In [9]:
print(mn_data.mean())

0.04778126721763086


In [10]:
# 중복데이터 확인
print("대청댐")
display(daechung[daechung.duplicated(keep=False)])

대청댐


Unnamed: 0,수질관측일시,수질측정일시,수심,탁도,저수위,pH,수온,전기전도도,DO,Chl-a


In [11]:
# 날짜 데이터 0 .. 23 형식 바꾸기
daechung['수질관측일시'] = daechung['수질관측일시'].apply(lambda x: pd.to_datetime(str(x.split()[0] + ' 00:' + x.split()[1].split(':')[1])) + timedelta(days=1) if x.split()[1].split(':')[0] == '24' else x)
daechung['수질관측일시'] = pd.to_datetime(daechung['수질관측일시'])
daechung.drop(['수질측정일시'], axis=1, inplace=True)
display(daechung)

Unnamed: 0,수질관측일시,수심,탁도,저수위,pH,수온,전기전도도,DO,Chl-a
0,2008-01-15 22:20:00,1,1.0,72.54,7.40,7.00,114.0,8.90,0.0
1,2008-01-15 22:20:00,2,1.0,72.54,7.40,7.00,114.1,9.10,0.0
2,2008-01-15 22:20:00,3,1.2,72.54,7.40,7.00,114.1,9.10,0.0
3,2008-01-15 22:20:00,4,1.6,72.54,7.40,7.00,114.1,8.90,0.0
4,2008-01-15 22:20:00,5,1.2,72.54,7.30,7.00,114.1,8.90,0.0
...,...,...,...,...,...,...,...,...,...
635661,2023-08-31 14:15:00,10,7.5,74.50,6.73,24.22,118.0,6.94,0.0
635662,2023-08-31 14:15:00,11,7.1,74.50,6.69,23.87,117.0,6.71,0.0
635663,2023-08-31 14:15:00,12,6.8,74.50,6.65,23.64,117.0,6.55,0.0
635664,2023-08-31 14:15:00,13,6.9,74.50,6.63,23.38,117.0,6.42,0.0


In [12]:
daechung.dtypes

수질관측일시    datetime64[ns]
수심                 int64
탁도               float64
저수위              float64
pH               float64
수온               float64
전기전도도            float64
DO               float64
Chl-a            float64
dtype: object

In [13]:
data = daechung[daechung.columns[:-1]].groupby(['수질관측일시', '수심']).mean().reset_index(level=1) # 이유는 모르겠으나 탁도, 전기전도도, DO가 삭제됨 @2023-10-13 중간 데이터가 datetime형식이 다름. xxxx.00
data.index = data.index.strftime("%Y-%m-%d") # 망간 데이터가 일데이터이기 떄문에 날짜 형식을 바꿈
data.index.name = 'date'
display(data)

Unnamed: 0_level_0,수심,탁도,저수위,pH,수온,전기전도도,DO
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2008-01-15,1,1.0,72.54,7.40,7.00,114.0,8.90
2008-01-15,2,1.0,72.54,7.40,7.00,114.1,9.10
2008-01-15,3,1.2,72.54,7.40,7.00,114.1,9.10
2008-01-15,4,1.6,72.54,7.40,7.00,114.1,8.90
2008-01-15,5,1.2,72.54,7.30,7.00,114.1,8.90
...,...,...,...,...,...,...,...
2023-08-31,10,7.5,74.50,6.73,24.22,118.0,6.94
2023-08-31,11,7.1,74.50,6.69,23.87,117.0,6.71
2023-08-31,12,6.8,74.50,6.65,23.64,117.0,6.55
2023-08-31,13,6.9,74.50,6.63,23.38,117.0,6.42


In [14]:
data = data.groupby(['date', '수심']).mean().reset_index(level=1)
data.index = pd.to_datetime(data.index)
display(data)

Unnamed: 0_level_0,수심,탁도,저수위,pH,수온,전기전도도,DO
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2008-01-15,1,1.0,72.540,7.40,7.00,114.0,8.90
2008-01-15,2,1.0,72.540,7.40,7.00,114.1,9.10
2008-01-15,3,1.2,72.540,7.40,7.00,114.1,9.10
2008-01-15,4,1.6,72.540,7.40,7.00,114.1,8.90
2008-01-15,5,1.2,72.540,7.30,7.00,114.1,8.90
...,...,...,...,...,...,...,...
2023-08-31,10,7.5,74.404,6.73,24.22,118.0,6.94
2023-08-31,11,7.1,74.404,6.69,23.87,117.0,6.71
2023-08-31,12,6.8,74.404,6.65,23.64,117.0,6.55
2023-08-31,13,6.9,74.404,6.63,23.38,117.0,6.42


In [15]:
data2 = data.loc[start_date:end_date]
data2

Unnamed: 0_level_0,수심,탁도,저수위,pH,수온,전기전도도,DO
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2013-01-07,10,2.725000,74.015000,7.2000,6.300000,121.000000,10.600000
2013-01-07,22,2.187500,74.051250,7.2000,6.325000,120.375000,10.475000
2013-01-08,10,2.641667,73.982500,7.2000,6.183333,120.300000,10.608333
2013-01-09,10,2.608333,73.920000,7.2000,6.033333,120.283333,10.708333
2013-01-10,10,2.575000,73.868333,7.2000,5.900000,120.166667,10.758333
...,...,...,...,...,...,...,...
2023-06-19,10,4.000000,67.992500,8.6225,18.197500,175.000000,6.650000
2023-06-19,11,7.175000,67.992500,8.5150,16.877500,174.500000,7.040000
2023-06-19,12,3.425000,67.992500,8.4200,15.917500,173.000000,7.457500
2023-06-19,13,3.225000,67.992500,8.2750,13.962500,171.750000,8.017500


$$ PEA = \frac{1}{h} \int_{-h}^0 (\bar{\rho} - \rho)gzdz $$  
$$ \bar{\rho} = \frac{1}{h} \int_{-h}^0 \rho dz$$  
$ h $ : 총 수심  
$ \bar{\rho} $ : 평균밀도  
$ \rho $ : 밀도  
$ g $ : 중력가속도

In [16]:
# 각 날짜별 MAX 수심 h 계산
# data = data.reset_index()
data2 = pd.concat([data2, data2.groupby('date').max()['수심']], axis=1)
data2.columns = list(data2.columns[:-1]) + ['DEPTH_MAX']

In [17]:
# Z(팔길이) 계산
COUNT = 0 # list_z 인덱스 지정용
DAY_COUNT = 0 # 날짜 넘어가는 거 체크용 (처음 열에서는 0으로)
depth_before = 0

list_z = [np.nan] * len(data2)
for date in sorted(set(data2.index)):
    data_day = data2.loc[date]
    # print(data_day)
    # break

    # 수심자료가 1개일 경우에는 pd.Series로 넘어옴...
    if type(data_day) == pd.Series:
        list_z[COUNT] = np.nan
        COUNT += 1
        continue

    # 수심자료가 2개 이상이어서 pd.DataFrmae 으로 넘어옴
    else:

        # 수심자료가 20개 미만
        if len(data_day) <= 10:
            for _ in data_day['수심']:
                list_z[COUNT] = np.nan
                COUNT += 1

        # (정상) 수심자료가 20개 이상
        else:
            for idx, row in data_day.iterrows():
                # 간단하게 (최대수심-관측수심) 으로 수행
                # 향후 필요 시 조금 더 디테일하게 계산
                list_z[COUNT] = row['DEPTH_MAX'] - row['수심']
                COUNT += 1

data2['z'] = list_z

In [18]:
# 수심별 밀도 계산 함수
C1 = 0.000055
C2 = 0.008436
C3 = 0.064579
C4 = 999.842381
def calculate_density(depth):
    return C1 * (depth ** 3) - C2 * (depth ** 2) + C3 * depth + C4

In [19]:
data2['density'] = data2['수온'].apply(calculate_density)

In [20]:
GRAVITY_ACC = 9.81

# PEA 계산
data_pea = pd.DataFrame(None, columns=['pea'], index=pd.date_range(start=start_date, end=end_date))

for date in sorted(set(data2.index)):
    data_day = data2.loc[date]

    # 수심별 밀도가 계산되었을 경우 (수심이 21개 이상인 경우)
    if not np.isnan(data_day['z']).any():
        pea = 0
        mean_density = data_day['density'].mean()
        for depth, density in zip(data_day['z'], data_day['density']):
            pea += (mean_density - density) * GRAVITY_ACC * depth

        # print(date)
        data_pea.loc[date]['pea'] = pea

In [21]:
data_pea.dropna(inplace=True)
data_pea

Unnamed: 0,pea
2013-02-28,-0.121224
2013-03-01,0.612682
2013-03-02,0.042982
2013-03-03,0.061593
2013-03-04,0.134598
...,...
2023-06-15,503.26327
2023-06-16,518.077865
2023-06-17,535.129729
2023-06-18,531.853238


In [22]:
data3 = data2.groupby('date').mean()[['수심','탁도','저수위','pH','수온','전기전도도','DO']]
data3

Unnamed: 0_level_0,수심,탁도,저수위,pH,수온,전기전도도,DO
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2013-01-07,16.0,2.456250,74.033125,7.200000,6.312500,120.687500,10.537500
2013-01-08,10.0,2.641667,73.982500,7.200000,6.183333,120.300000,10.608333
2013-01-09,10.0,2.608333,73.920000,7.200000,6.033333,120.283333,10.708333
2013-01-10,10.0,2.575000,73.868333,7.200000,5.900000,120.166667,10.758333
2013-01-11,10.0,2.541667,73.806667,7.200000,5.800000,120.233333,10.808333
...,...,...,...,...,...,...,...
2023-06-15,7.5,3.141071,68.305000,9.406071,20.322500,167.232143,9.744643
2023-06-16,7.5,2.869048,68.240000,9.522857,20.302143,167.285714,9.861429
2023-06-17,7.5,3.266071,68.150000,9.134821,20.528929,167.821429,9.269821
2023-06-18,7.5,3.378571,68.072500,8.967500,20.540357,168.035714,9.213929


In [23]:
tmp1 = pd.concat([data3, data_pea], axis=1).loc[start_date:end_date]
data4 = pd.concat([tmp1, mn_data], axis=1).loc[start_date:end_date]
display(data4)

Unnamed: 0,수심,탁도,저수위,pH,수온,전기전도도,DO,pea,mn
2013-01-07,16.0,2.456250,74.033125,7.200000,6.312500,120.687500,10.537500,,
2013-01-08,10.0,2.641667,73.982500,7.200000,6.183333,120.300000,10.608333,,
2013-01-09,10.0,2.608333,73.920000,7.200000,6.033333,120.283333,10.708333,,
2013-01-10,10.0,2.575000,73.868333,7.200000,5.900000,120.166667,10.758333,,
2013-01-11,10.0,2.541667,73.806667,7.200000,5.800000,120.233333,10.808333,,
...,...,...,...,...,...,...,...,...,...
2023-06-15,7.5,3.141071,68.305000,9.406071,20.322500,167.232143,9.744643,503.26327,
2023-06-16,7.5,2.869048,68.240000,9.522857,20.302143,167.285714,9.861429,518.077865,
2023-06-17,7.5,3.266071,68.150000,9.134821,20.528929,167.821429,9.269821,535.129729,
2023-06-18,7.5,3.378571,68.072500,8.967500,20.540357,168.035714,9.213929,531.853238,


In [24]:
data = data4

In [25]:
data.describe()

Unnamed: 0,수심,탁도,저수위,pH,수온,전기전도도,DO,mn
count,3415.0,3415.0,3415.0,3415.0,3415.0,3415.0,3415.0,484.0
mean,10.800357,30.157802,71.267797,8.311171,14.24072,147.461871,31.007856,0.047781
std,2.70207,159.343262,4.317534,2.736578,6.750224,22.130583,36.040669,0.035239
min,0.0,0.0,0.0,4.227679,3.8,10.0,1.378819,0.0
25%,10.0,1.744444,69.29767,7.2,7.824263,132.437866,8.272083,0.023917
50%,11.5,2.648958,71.830833,7.885577,13.930682,149.214286,11.25,0.041
75%,12.0,3.919772,73.502679,8.76631,20.073958,165.241379,65.692776,0.0605
max,17.5,1260.071429,135.529292,35.051607,30.42,241.483516,172.066667,0.2099


In [26]:
data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 3468 entries, 2013-01-07 to 2023-06-19
Data columns (total 9 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   수심      3415 non-null   float64
 1   탁도      3415 non-null   float64
 2   저수위     3415 non-null   float64
 3   pH      3415 non-null   float64
 4   수온      3415 non-null   float64
 5   전기전도도   3415 non-null   float64
 6   DO      3415 non-null   float64
 7   pea     2466 non-null   object 
 8   mn      484 non-null    float64
dtypes: float64(8), object(1)
memory usage: 270.9+ KB


### 망간 머신러닝 예측

In [27]:
train_data = data

In [28]:
# # KNN Imputer
# knn_imputer = KNNImputer(n_neighbors=6)

# train_columns = train_data.drop(['mn'], axis=1).columns
# train_data[train_columns] = knn_imputer.fit_transform(train_data[train_columns])
# display(train_data)
# print(train_data.info())
# mn 선형보간
train_data['mn_linear'] = train_data['mn'].interpolate(method='linear')
train_data['mn_reg1'] = train_data['mn_linear'].shift(1)
train_data['mn_reg2'] = train_data['mn_linear'].shift(2)
train_data['mn_reg3'] = train_data['mn_linear'].shift(3)
train_data['mn_reg4'] = train_data['mn_linear'].shift(4)
train_data['mn_reg5'] = train_data['mn_linear'].shift(5)
train_data

Unnamed: 0,수심,탁도,저수위,pH,수온,전기전도도,DO,pea,mn,mn_linear,mn_reg1,mn_reg2,mn_reg3,mn_reg4,mn_reg5
2013-01-07,16.0,2.456250,74.033125,7.200000,6.312500,120.687500,10.537500,,,,,,,,
2013-01-08,10.0,2.641667,73.982500,7.200000,6.183333,120.300000,10.608333,,,,,,,,
2013-01-09,10.0,2.608333,73.920000,7.200000,6.033333,120.283333,10.708333,,,,,,,,
2013-01-10,10.0,2.575000,73.868333,7.200000,5.900000,120.166667,10.758333,,,,,,,,
2013-01-11,10.0,2.541667,73.806667,7.200000,5.800000,120.233333,10.808333,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-06-15,7.5,3.141071,68.305000,9.406071,20.322500,167.232143,9.744643,503.26327,,0.043143,0.044095,0.045048,0.046000,0.044733,0.043467
2023-06-16,7.5,2.869048,68.240000,9.522857,20.302143,167.285714,9.861429,518.077865,,0.042190,0.043143,0.044095,0.045048,0.046000,0.044733
2023-06-17,7.5,3.266071,68.150000,9.134821,20.528929,167.821429,9.269821,535.129729,,0.041238,0.042190,0.043143,0.044095,0.045048,0.046000
2023-06-18,7.5,3.378571,68.072500,8.967500,20.540357,168.035714,9.213929,531.853238,,0.040286,0.041238,0.042190,0.043143,0.044095,0.045048


In [29]:
train_data

Unnamed: 0,수심,탁도,저수위,pH,수온,전기전도도,DO,pea,mn,mn_linear,mn_reg1,mn_reg2,mn_reg3,mn_reg4,mn_reg5
2013-01-07,16.0,2.456250,74.033125,7.200000,6.312500,120.687500,10.537500,,,,,,,,
2013-01-08,10.0,2.641667,73.982500,7.200000,6.183333,120.300000,10.608333,,,,,,,,
2013-01-09,10.0,2.608333,73.920000,7.200000,6.033333,120.283333,10.708333,,,,,,,,
2013-01-10,10.0,2.575000,73.868333,7.200000,5.900000,120.166667,10.758333,,,,,,,,
2013-01-11,10.0,2.541667,73.806667,7.200000,5.800000,120.233333,10.808333,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-06-15,7.5,3.141071,68.305000,9.406071,20.322500,167.232143,9.744643,503.26327,,0.043143,0.044095,0.045048,0.046000,0.044733,0.043467
2023-06-16,7.5,2.869048,68.240000,9.522857,20.302143,167.285714,9.861429,518.077865,,0.042190,0.043143,0.044095,0.045048,0.046000,0.044733
2023-06-17,7.5,3.266071,68.150000,9.134821,20.528929,167.821429,9.269821,535.129729,,0.041238,0.042190,0.043143,0.044095,0.045048,0.046000
2023-06-18,7.5,3.378571,68.072500,8.967500,20.540357,168.035714,9.213929,531.853238,,0.040286,0.041238,0.042190,0.043143,0.044095,0.045048


In [30]:
# 정규화
display(train_data)
scaler = MinMaxScaler()
scaler.fit(train_data[train_data.columns])
train_data_ = scaler.transform(train_data[train_data.columns])
train_data = pd.DataFrame(train_data_, columns=train_data.columns, index=train_data.index)
train_data

Unnamed: 0,수심,탁도,저수위,pH,수온,전기전도도,DO,pea,mn,mn_linear,mn_reg1,mn_reg2,mn_reg3,mn_reg4,mn_reg5
2013-01-07,16.0,2.456250,74.033125,7.200000,6.312500,120.687500,10.537500,,,,,,,,
2013-01-08,10.0,2.641667,73.982500,7.200000,6.183333,120.300000,10.608333,,,,,,,,
2013-01-09,10.0,2.608333,73.920000,7.200000,6.033333,120.283333,10.708333,,,,,,,,
2013-01-10,10.0,2.575000,73.868333,7.200000,5.900000,120.166667,10.758333,,,,,,,,
2013-01-11,10.0,2.541667,73.806667,7.200000,5.800000,120.233333,10.808333,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-06-15,7.5,3.141071,68.305000,9.406071,20.322500,167.232143,9.744643,503.26327,,0.043143,0.044095,0.045048,0.046000,0.044733,0.043467
2023-06-16,7.5,2.869048,68.240000,9.522857,20.302143,167.285714,9.861429,518.077865,,0.042190,0.043143,0.044095,0.045048,0.046000,0.044733
2023-06-17,7.5,3.266071,68.150000,9.134821,20.528929,167.821429,9.269821,535.129729,,0.041238,0.042190,0.043143,0.044095,0.045048,0.046000
2023-06-18,7.5,3.378571,68.072500,8.967500,20.540357,168.035714,9.213929,531.853238,,0.040286,0.041238,0.042190,0.043143,0.044095,0.045048


Unnamed: 0,수심,탁도,저수위,pH,수온,전기전도도,DO,pea,mn,mn_linear,mn_reg1,mn_reg2,mn_reg3,mn_reg4,mn_reg5
2013-01-07,0.914286,0.001949,0.546252,0.096429,0.094384,0.478166,0.053657,,,,,,,,
2013-01-08,0.571429,0.002096,0.545878,0.096429,0.089532,0.476492,0.054072,,,,,,,,
2013-01-09,0.571429,0.002070,0.545417,0.096429,0.083897,0.476420,0.054658,,,,,,,,
2013-01-10,0.571429,0.002044,0.545036,0.096429,0.078888,0.475916,0.054951,,,,,,,,
2013-01-11,0.571429,0.002017,0.544581,0.096429,0.075131,0.476204,0.055244,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-06-15,0.428571,0.002493,0.503987,0.167999,0.620680,0.679237,0.049012,0.382619,,0.205540,0.210077,0.214615,0.219152,0.213117,0.207083
2023-06-16,0.428571,0.002277,0.503507,0.171788,0.619915,0.679468,0.049697,0.385419,,0.201003,0.205540,0.210077,0.214615,0.219152,0.213117
2023-06-17,0.428571,0.002592,0.502843,0.159199,0.628435,0.681783,0.046231,0.388641,,0.196465,0.201003,0.205540,0.210077,0.214615,0.219152
2023-06-18,0.428571,0.002681,0.502271,0.153771,0.628864,0.682708,0.045903,0.388022,,0.191928,0.196465,0.201003,0.205540,0.210077,0.214615


In [31]:
# 종속 변수를 다시 집어 넣기
train_data['mn'] = data['mn']
train_data

# leadtime 적용
train_data['mn2real'] = train_data['mn'].shift(-2)
train_data

# 종속변수의 min max 설정
mn_max = train_data['mn'].max()
mn_min = train_data['mn'].min()

print(mn_max)
print(mn_min)


0.2099
0.0


In [32]:
train_data.describe()

Unnamed: 0,수심,탁도,저수위,pH,수온,전기전도도,DO,pea,mn,mn_linear,mn_reg1,mn_reg2,mn_reg3,mn_reg4,mn_reg5,mn2real
count,3415.0,3415.0,3415.0,3415.0,3415.0,3415.0,3415.0,2466.0,484.0,3075.0,3074.0,3073.0,3072.0,3071.0,3070.0,484.0
mean,0.617163,0.023933,0.525848,0.132478,0.392213,0.59383,0.173586,0.432581,0.047781,0.227135,0.227148,0.227159,0.227169,0.227178,0.227185,0.047781
std,0.154404,0.126456,0.031857,0.088781,0.253577,0.095603,0.21115,0.159824,0.035239,0.162208,0.162233,0.162258,0.162284,0.16231,0.162336,0.035239
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.571429,0.001384,0.511311,0.096429,0.151174,0.528927,0.040385,0.295546,0.023917,0.111867,0.111856,0.111845,0.111818,0.111792,0.111766,0.023917
50%,0.657143,0.002102,0.530002,0.118671,0.380567,0.6014,0.057832,0.371768,0.041,0.198507,0.198507,0.198507,0.198571,0.198507,0.198507,0.041
75%,0.685714,0.003111,0.542338,0.147244,0.611343,0.670637,0.376793,0.541545,0.0605,0.291749,0.291749,0.291749,0.291862,0.291976,0.292089,0.0605
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.2099,1.0,1.0,1.0,1.0,1.0,1.0,0.2099


In [33]:
train_data.dropna(subset=['mn2real'], inplace=True)
train_data

Unnamed: 0,수심,탁도,저수위,pH,수온,전기전도도,DO,pea,mn,mn_linear,mn_reg1,mn_reg2,mn_reg3,mn_reg4,mn_reg5,mn2real
2014-02-02,0.571429,0.001250,0.506975,0.125627,0.060105,0.617755,0.054902,,,,,,,,,0.026000
2014-02-15,0.571429,0.001065,0.505014,0.128871,0.044766,0.617755,0.058808,,,0.071463,0.076227,0.080991,0.085755,0.090519,0.095283,0.013000
2014-02-22,0.571429,0.000967,0.503789,0.129166,0.041664,0.617755,0.061054,,,0.039248,0.043785,0.048322,0.052860,0.057397,0.061934,0.006333
2014-03-01,0.571429,0.000743,0.502602,0.128871,0.045079,0.617755,0.061533,,,0.099367,0.085528,0.071689,0.057851,0.044012,0.030173,0.026667
2014-03-08,0.771429,0.001357,0.501497,0.081291,0.044413,0.623816,0.063470,0.288105,,0.125910,0.126137,0.126364,0.126591,0.126818,0.127045,0.026333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-05-08,,,,,,,,,0.053667,0.255677,0.331904,0.265206,0.181039,0.166746,0.198507,0.030667
2023-05-28,0.428571,0.002090,0.497697,0.156036,0.477561,0.689651,0.059640,0.366622,,0.184215,0.177862,0.171510,0.165158,0.158806,0.152454,0.041333
2023-06-05,0.428571,0.002419,0.508765,0.159402,0.568155,0.675920,0.056476,0.364937,,0.190964,0.191956,0.192949,0.193942,0.194934,0.195927,0.039667
2023-06-10,0.428571,0.002701,0.506201,0.170021,0.598295,0.678543,0.052913,0.379634,,0.207083,0.201048,0.195013,0.188979,0.189971,0.190964,0.046000


In [34]:
# def buildDataSet_MN(timeSeries, y, target=0, seqLength=1, pre=0, shuffle=False):
#     xdata = pd.DataFrame()
#     ydata = pd.DataFrame()

#     count = 0
#     for i in range(seqLength, len(timeSeries)):
#         if np.isnan(timeSeries.iloc[i][y]):
#             continue
#         tmp = pd.DataFrame()
#         for j in range(seqLength+1):
#             tmp = pd.concat([tmp, timeSeries.iloc[i-(seqLength - j)]], axis=0)
#         tmp = tmp.T

#         ydata = pd.concat([ydata, pd.Series(timeSeries.iloc[i][y])], axis=0, ignore_index=False)
#         tmp = tmp.drop([y], axis=1)
#         xdata = pd.concat([xdata, tmp], axis=0, ignore_index=False)

#     trainX, testX, trainY, testY = train_test_split(xdata, ydata,
#                                                     test_size=0.2,
#                                                     shuffle=shuffle,
#                                                    random_state=45)

#     return trainX, testX, trainY, testY

# trainX, testX, trainY, testY = buildDataSet_MN(train_data, 'mn', seqLength=0)
# display(train_data.head(10))
# display(trainX.head(5))
# display(trainY.head(5))

In [35]:
### 추가 : Mn은 있는데 대청 수질측정망 자료가 없는 경우이고, 수심이 측정이 안되면, 다른 일반 x변수도 없다
train_data.dropna(subset=['수심', 'mn_reg1'], inplace=True)
train_data

Unnamed: 0,수심,탁도,저수위,pH,수온,전기전도도,DO,pea,mn,mn_linear,mn_reg1,mn_reg2,mn_reg3,mn_reg4,mn_reg5,mn2real
2014-02-15,0.571429,0.001065,0.505014,0.128871,0.044766,0.617755,0.058808,,,0.071463,0.076227,0.080991,0.085755,0.090519,0.095283,0.013000
2014-02-22,0.571429,0.000967,0.503789,0.129166,0.041664,0.617755,0.061054,,,0.039248,0.043785,0.048322,0.052860,0.057397,0.061934,0.006333
2014-03-01,0.571429,0.000743,0.502602,0.128871,0.045079,0.617755,0.061533,,,0.099367,0.085528,0.071689,0.057851,0.044012,0.030173,0.026667
2014-03-08,0.771429,0.001357,0.501497,0.081291,0.044413,0.623816,0.063470,0.288105,,0.125910,0.126137,0.126364,0.126591,0.126818,0.127045,0.026333
2014-03-15,0.771429,0.001265,0.501674,0.082984,0.056465,0.622768,0.065635,0.288586,,0.092561,0.099140,0.105719,0.112298,0.118877,0.125457,0.016667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-01-24,0.428571,0.001820,0.523577,0.153852,0.089675,0.609115,0.054752,0.287486,,0.371429,0.371253,0.371076,0.370900,0.370723,0.370547,0.060333
2023-05-28,0.428571,0.002090,0.497697,0.156036,0.477561,0.689651,0.059640,0.366622,,0.184215,0.177862,0.171510,0.165158,0.158806,0.152454,0.041333
2023-06-05,0.428571,0.002419,0.508765,0.159402,0.568155,0.675920,0.056476,0.364937,,0.190964,0.191956,0.192949,0.193942,0.194934,0.195927,0.039667
2023-06-10,0.428571,0.002701,0.506201,0.170021,0.598295,0.678543,0.052913,0.379634,,0.207083,0.201048,0.195013,0.188979,0.189971,0.190964,0.046000


In [36]:
train_data

Unnamed: 0,수심,탁도,저수위,pH,수온,전기전도도,DO,pea,mn,mn_linear,mn_reg1,mn_reg2,mn_reg3,mn_reg4,mn_reg5,mn2real
2014-02-15,0.571429,0.001065,0.505014,0.128871,0.044766,0.617755,0.058808,,,0.071463,0.076227,0.080991,0.085755,0.090519,0.095283,0.013000
2014-02-22,0.571429,0.000967,0.503789,0.129166,0.041664,0.617755,0.061054,,,0.039248,0.043785,0.048322,0.052860,0.057397,0.061934,0.006333
2014-03-01,0.571429,0.000743,0.502602,0.128871,0.045079,0.617755,0.061533,,,0.099367,0.085528,0.071689,0.057851,0.044012,0.030173,0.026667
2014-03-08,0.771429,0.001357,0.501497,0.081291,0.044413,0.623816,0.063470,0.288105,,0.125910,0.126137,0.126364,0.126591,0.126818,0.127045,0.026333
2014-03-15,0.771429,0.001265,0.501674,0.082984,0.056465,0.622768,0.065635,0.288586,,0.092561,0.099140,0.105719,0.112298,0.118877,0.125457,0.016667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-01-24,0.428571,0.001820,0.523577,0.153852,0.089675,0.609115,0.054752,0.287486,,0.371429,0.371253,0.371076,0.370900,0.370723,0.370547,0.060333
2023-05-28,0.428571,0.002090,0.497697,0.156036,0.477561,0.689651,0.059640,0.366622,,0.184215,0.177862,0.171510,0.165158,0.158806,0.152454,0.041333
2023-06-05,0.428571,0.002419,0.508765,0.159402,0.568155,0.675920,0.056476,0.364937,,0.190964,0.191956,0.192949,0.193942,0.194934,0.195927,0.039667
2023-06-10,0.428571,0.002701,0.506201,0.170021,0.598295,0.678543,0.052913,0.379634,,0.207083,0.201048,0.195013,0.188979,0.189971,0.190964,0.046000


In [37]:
train_data.describe()

Unnamed: 0,수심,탁도,저수위,pH,수온,전기전도도,DO,pea,mn,mn_linear,mn_reg1,mn_reg2,mn_reg3,mn_reg4,mn_reg5,mn2real
count,454.0,454.0,454.0,454.0,454.0,454.0,454.0,333.0,6.0,454.0,454.0,454.0,454.0,454.0,454.0,454.0
mean,0.600535,0.032597,0.527257,0.138536,0.397513,0.592725,0.197949,0.424099,0.059778,0.228142,0.228154,0.22806,0.227622,0.22731,0.227525,0.047929
std,0.158566,0.147133,0.033428,0.096153,0.249549,0.101248,0.22512,0.154283,0.025293,0.159471,0.157858,0.158893,0.16143,0.165198,0.167736,0.035996
min,0.0,0.0,0.28881,0.029946,0.005901,0.0,0.004485,0.28743,0.037333,0.010926,0.01123,0.006352,0.006352,0.00574,0.0,0.0
25%,0.571429,0.001395,0.516802,0.098745,0.161558,0.519413,0.042372,0.29221,0.042083,0.112564,0.116617,0.117516,0.117777,0.110415,0.109735,0.022725
50%,0.657143,0.002171,0.532059,0.126653,0.383822,0.610582,0.058869,0.366622,0.048667,0.199571,0.200447,0.200581,0.197458,0.196919,0.196721,0.041
75%,0.685714,0.003327,0.54428,0.155553,0.612622,0.671394,0.439671,0.529654,0.0775,0.290274,0.294131,0.295458,0.291531,0.290274,0.290615,0.061525
max,0.857143,0.990817,0.884866,0.994983,0.915008,0.803297,1.0,0.985701,0.097,0.907303,0.875723,0.844143,0.830872,0.901155,1.0,0.2099


In [38]:
# 망간 농도 예측
model_list = ["GBM", "RF", "XGB"]   # 분석 모델 리스트 설정 : LSTM, GBM, RF, SVR
performance_list = ["RMSE", "R2", "MSE", "MAE"]    # 분석 성능평가 리스트 설정 : RMSE, R2, MSE, MAE

# var_list = [train_data.columns]
# var_list = [train_data.drop(['pea'], axis=1).columns]
var_list = [['탁도', '저수위', 'pH', '수온', '전기전도도', 'DO', 'mn_reg1','mn_reg2', 'mn_reg3', 'mn_reg4', 'mn_reg5', 'mn2real']]
y_var = "mn2real"

target=0  # 0은 당일 mn, 1은 다음 주프레임 mn 예측
seqLength=0

temp_list_name = ["Mn"]
trainSize_rate = 0.8  # 학습 및 예측 셋 구분

In [39]:
# 일반
best_list = list()
best_r2 = 0
best_md = ""
for md in model_list:
    count = 0
    for df in var_list:
        """
        Modulation 2 : 학습데이터 정제
        """
        # trainX, testX, trainY, testY = train_test_split(
        #     train_data2[df[:-1]],
        #     train_data2[y_var],
        #     test_size=0.2,
        #     random_state=42,
        #     shuffle=True)

        # xy = train_data[df].iloc[(train_data.index >= start_date) & (train_data.index <= end_date)]
        xy = train_data[df]

        trainX, testX, trainY, testY = train_test_split(xy.drop([y_var], axis=1), xy[y_var],
                                                    test_size=0.2,
                                                    shuffle=True,
                                                   random_state=45)

#         print(trainX)
#         break

#         print("@ :",trainY)
#         break

        """
        Modulation 3 : 모델 학습
        """
        # print("Modulation 3 : 모델 학습")

        if md == "GBM":
            model, predict = AL_GradientBoosting(trainX, testX, trainY, testY)
        elif md == "RF":
            model, predict = AL_RandomForest(trainX, testX, trainY, testY)
        elif md == "SVR":
            model, predict = AL_SVR(trainX, testX, trainY, testY)
        elif md == "XGB":
            model, predict = AL_XGBoosting(trainX, testX, trainY, testY)

        yhat = predict
        actual = testY

        # print(yhat)
        # print(actual)
        # pd.DataFrame(yhat).to_csv("대청댐_lead0_pred.csv", encoding='cp949')
        # actual.to_csv('대청댐_lead0_actual.csv', encoding='cp949')
        # break


        # 성과지표 표출 부분 : 적용 항목은 confing > performance_list[] 참조
        for pi in performance_list:
            # rmse = Performance_index(actual_back, yhat_back, pi)
            # print(temp_list_name[count] + " " + md + ' 예측 ' + pi + ' : ', rmse)
            rmse = Performance_index(actual, yhat, pi)
            print(temp_list_name[count] + " " + md + ' 예측 ' + pi + ' : ', rmse)
        break
        # r2 = Performance_index(actual_back, yhat_back, "R2")
        r2 = Performance_index(actual, yhat, "R2")
        if r2 > best_r2:
            best_r2 = r2
            best_list = df
            best_md = md

        """
        Modulation 4 : 결과 데이터 저장
        """
        # print("Modulation 4 : 결과 데이터 저장")
        # print("")

        # 복원된 데이터 저장
        # pd_actual_save = pd.DataFrame(actual)
        # pd_actual_save.to_csv(dir_list['data'][0] + temp_list_name[count] + "_actual", mode='w')
        # pd_predict_save = pd.DataFrame(yhat)
        # pd_predict_save.to_csv(dir_list['data'][0] + temp_list_name[count], mode='w')

        # 모델 저장
        # joblib(sv_regressor, "svr_model.pkl")
        with open(md+'_mn.pkl', 'wb') as f:
            pickle.dump(model, f)

      # ''' # feature importance 2023-08-11
      #  ser = pd.Series(model.feature_importances_, index=xy.drop([y_var], axis=1).columns)
      #  feature_importances = ser.sort_values(ascending=False) # 내림차순 정렬
      #  plt.figure(figsize=(8,6))
      #  plt.title("Feature Importances")
      #  sns.barplot(x=feature_importances, y=feature_importances.index)'''

        # 그래프 저장
        basic_chart(actual, yhat, 'line')
        # plt.title(md + " : " + temp_list_name[0] + "\n" + start_date.split('-')[0] + "-" + end_date.split('-')[0] + "\n" + str(Performance_index(actual_back, yhat_back, "R2")))
        plt.title(md + " : " + temp_list_name[0] + "\n" + start_date.split('-')[0] + "-" + end_date.split('-')[0] + "\n" + str(Performance_index(actual, yhat, "R2")))
        # plt.savefig(dir_list['graph'][0] + temp_list_name[0] + '.png')
        plt.show()

        count += 1

# plt.show()
print("일반")
print("best list : ", best_list)
print("best R2 : ", best_r2)
print("best Model :", best_md)

Mn GBM 예측 RMSE :  0.010251364148927382
Mn GBM 예측 R2 :  0.8866514745472198
Mn GBM 예측 MSE :  0.0001050904669139136
Mn GBM 예측 MAE :  0.005528263359376898
Mn RF 예측 RMSE :  0.012423166052772095
Mn RF 예측 R2 :  0.8335372237072461
Mn RF 예측 MSE :  0.000154335054774749
Mn RF 예측 MAE :  0.006844217582417576
Mn XGB 예측 RMSE :  0.012327424558830995
Mn XGB 예측 R2 :  0.8360930911144296
Mn XGB 예측 MSE :  0.00015196539625366955
Mn XGB 예측 MAE :  0.006931932498727526
일반
best list :  []
best R2 :  0
best Model : 


In [40]:
# plt.scatter(actual_back, yhat_back)
# plt.show()

NameError: ignored

In [41]:
# month 데이터 추가
train_data['month'] = train_data.index.month

In [42]:
# 망간 농도 예측 + month 추가
model_list = ["GBM", "RF", "XGB"]   # 분석 모델 리스트 설정 : LSTM, GBM, RF, SVR
performance_list = ["RMSE", "R2", "MSE", "MAE"]    # 분석 성능평가 리스트 설정 : RMSE, R2, MSE, MAE

# var_list = [train_data.columns]
# var_list = [train_data.drop(['pea'], axis=1).columns]
#var_list = [['month', '탁도', '저수위', 'pH', '수온', '전기전도도', 'DO', 'mn_reg1', 'mn_reg5', 'mn']]
var_list = [['month', '탁도', '저수위', 'pH', '수온', '전기전도도', 'DO', 'mn_reg1','mn_reg2', 'mn_reg3', 'mn_reg4', 'mn_reg5', 'mn2real']]

y_var = "mn2real"

target=2   # 0은 당일 mn, 1은 다음 주프레임 mn 예측
seqLength=0

temp_list_name = ["Mn"]
trainSize_rate = 0.8  # 학습 및 예측 셋 구분

In [43]:
# 동절기 하절기 일반
best_list1 = list()
best_r2_1 = 0
best_md1 = ""

best_list2 = list()
best_r2_2 = 0
best_md2 = ""
for md in model_list:
    count = 0
    for df in var_list:
        """
        Modulation 2 : 학습데이터 정제
        """
        # trainX, testX, trainY, testY = train_test_split(
        #     train_data2[df[:-1]],
        #     train_data2[y_var],
        #     test_size=0.2,
        #     random_state=42,
        #     shuffle=True)

        # xy = train_data[df].iloc[(train_data.index >= start_date) & (train_data.index <= end_date)]
        xy = train_data[df]

        # 동절기
        xy1 = xy[((xy['month'] >= 10) & (xy['month'] <= 12)) |
                 ((xy['month'] >= 1) & (xy['month'] < 4))]

        # 하절기
        xy2 = xy[(xy['month'] >= 4) & (xy['month'] < 10)]

        # month 제거
        xy1 = xy1.drop("month", axis=1)
        xy2 = xy2.drop("month", axis=1)

        trainX1, testX1, trainY1, testY1 = train_test_split(xy1.drop([y_var], axis=1), xy1[y_var],
                                                    test_size=0.2,
                                                    shuffle=True,
                                                   random_state=45)
        trainX2, testX2, trainY2, testY2 = train_test_split(xy2.drop([y_var], axis=1), xy2[y_var],
                                                    test_size=0.2,
                                                    shuffle=True,
                                                   random_state=45)

        # print(trainX)
        # break

#         print("@ :",trainY)
#         break

        """
        Modulation 3 : 모델 학습
        """
        # print("Modulation 3 : 모델 학습")
        if md == "GBM":
            model1, predict1 = AL_GradientBoosting(trainX1, testX1, trainY1, testY1)
            model2, predict2 = AL_GradientBoosting(trainX2, testX2, trainY2, testY2)
        elif md == "RF":
            model1, predict1 = AL_RandomForest(trainX1, testX1, trainY1, testY1)
            model2, predict2 = AL_RandomForest(trainX2, testX2, trainY2, testY2)
        elif md == "SVR":
            model1, predict1 = AL_SVR(trainX1, testX1, trainY1, testY1)
            model2, predict2 = AL_SVR(trainX2, testX2, trainY2, testY2)
        elif md == "XGB":
            model1, predict1 = AL_XGBoosting(trainX1, testX1, trainY1, testY1)
            model2, predict2 = AL_XGBoosting(trainX2, testX2, trainY2, testY2)

        yhat1 = predict1
        actual1 = testY1
        yhat2 = predict2
        actual2 = testY2

        # # back from minmaxscaler
        # actual_back1 = [((mn_min) + val*(mn_max - mn_min))  for val in actual1.to_numpy().ravel()]
        # yhat_back1 = [((mn_min) + val*(mn_max - mn_min))  for val in yhat1]
        # actual_back2 = [((mn_min) + val*(mn_max - mn_min))  for val in actual2.to_numpy().ravel()]
        # yhat_back2 = [((mn_min) + val*(mn_max - mn_min))  for val in yhat2]

        # 성과지표 표출 부분 : 적용 항목은 confing > performance_list[] 참조
        for pi in performance_list:
            rmse1 = Performance_index(actual1, yhat1, pi)
            rmse2 = Performance_index(actual2, yhat2, pi)
            print("동절기 " + md + ' 예측 ' + pi + ' : ', rmse1)
            print("하절기 " + md + ' 예측 ' + pi + ' : ', rmse2)
        break
        r2_1 = Performance_index(actual_back1, yhat_back1, "R2")
        r2_2 = Performance_index(actual_back2, yhat_back2, "R2")
        if r2_1 > best_r2_1:
            best_r2_1 = r2_1
            best_list1 = df
            best_md1 = md

        if r2_2 > best_r2_2:
            best_r2_2 = r2_2
            best_list2 = df
            best_md2 = md

        """
        Modulation 4 : 결과 데이터 저장
        """
        # print("Modulation 4 : 결과 데이터 저장")
        # print("")

        # 복원된 데이터 저장
        # pd_actual_save = pd.DataFrame(actual)
        # pd_actual_save.to_csv(dir_list['data'][0] + temp_list_name[count] + "_actual", mode='w')
        # pd_predict_save = pd.DataFrame(yhat)
        # pd_predict_save.to_csv(dir_list['data'][0] + temp_list_name[count], mode='w')

        # 모델 저장
        # joblib(sv_regressor, "svr_model.pkl")
        with open(md+'_mn_동절기.pkl', 'wb') as f:
            pickle.dump(model1, f)

        with open(md+'_mn_하절기.pkl', 'wb') as f:
            pickle.dump(model2, f)

        # # feature importance 2023-08-11
#         ser = pd.Series(model1.feature_importances_, index=xy.drop([y_var, 'month'], axis=1).columns)
#         feature_importances = ser.sort_values(ascending=False) # 내림차순 정렬
#         plt.figure(figsize=(8,6))
#         plt.title("동절기 Feature Importances")
#         sns.barplot(x=feature_importances, y=feature_importances.index)

#         ser = pd.Series(model2.feature_importances_, index=xy.drop([y_var, 'month'], axis=1).columns)
#         feature_importances = ser.sort_values(ascending=False) # 내림차순 정렬
#         plt.figure(figsize=(8,6))
#         plt.title("하절기 Feature Importances")
#         sns.barplot(x=feature_importances, y=feature_importances.index)

        # 그래프 저장
        basic_chart(actual1, yhat1, 'line')
        plt.title("동절기 " + md + " : " + temp_list_name[0] + "\n" + start_date.split('-')[0] + "-" + end_date.split('-')[0] + "\n" + str(Performance_index(actual_back1, yhat_back1, "R2")))
        # plt.savefig(dir_list['graph'][0] + temp_list_name[0] + '.png')
        basic_chart(actual2, yhat2, 'line')
        plt.title("하절기 " + md + " : " + temp_list_name[0] + "\n" + start_date.split('-')[0] + "-" + end_date.split('-')[0] + "\n" + str(Performance_index(actual_back2, yhat_back2, "R2")))
        # plt.savefig(dir_list['graph'][0] + temp_list_name[0] + '.png')
        plt.show()

        count += 1

# plt.show()
print("동절기")
print("best list : ", best_list1)
print("best R2 : ", best_r2_1)
print("best Model :", best_md1)

print("하절기")
print("best list : ", best_list2)
print("best R2 : ", best_r2_2)
print("best Model :", best_md2)

동절기 GBM 예측 RMSE :  0.012006029467154995
하절기 GBM 예측 RMSE :  0.010672551341670472
동절기 GBM 예측 R2 :  0.9270130702696042
하절기 GBM 예측 R2 :  0.8812929786222827
동절기 GBM 예측 MSE :  0.00014414474356619404
하절기 GBM 예측 MSE :  0.00011390335214059218
동절기 GBM 예측 MAE :  0.007641724270401778
하절기 GBM 예측 MAE :  0.006109155149331386
동절기 RF 예측 RMSE :  0.014148046477419034
하절기 RF 예측 RMSE :  0.011090954655939003
동절기 RF 예측 R2 :  0.8986463856029732
하절기 RF 예측 R2 :  0.8718030296383414
동절기 RF 예측 MSE :  0.00020016721912720914
하절기 RF 예측 MSE :  0.00012300927518009506
동절기 RF 예측 MAE :  0.009059444444444413
하절기 RF 예측 MAE :  0.006919188405797059
동절기 XGB 예측 RMSE :  0.01507335971502239
하절기 XGB 예측 RMSE :  0.010365936653858331
동절기 XGB 예측 R2 :  0.8849553540422086
하절기 XGB 예측 R2 :  0.8880157351318495
동절기 XGB 예측 MSE :  0.0002272061730984599
하절기 XGB 예측 MSE :  0.00010745264271180365
동절기 XGB 예측 MAE :  0.009808220363303467
하절기 XGB 예측 MAE :  0.006452097363740314
동절기
best list :  []
best R2 :  0
best Model : 
하절기
best list :  []
best R2

In [None]:
tmp = pd.DataFrame(actual_back)
tmp.describe()

In [None]:
tmp = pd.DataFrame(yhat_back)
tmp.describe()