In [None]:
!pip install -r requirements.txt

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from glob import glob
import warnings
warnings.filterwarnings('ignore')

## 기상청 예보 데이터 (3시간 간격, 6시간 간격) 전처리

* 당진태양광 : dangjin
* 당진수상태양광 : dangjin_floating
* 당진자재창고태양광 : dangjin_warehouse
* 울산태양광 : ulsan

기상청 데이터 경로 및 파일 이름 : [개인정보 모자이크]
* 당진 데이터 폴더 /dangjin
* 울산 데이터 폴더 /ulsan

#### <span style="color:red"> <b>해당 코드는 API 사용이 포함되어 있습니다.</b> </span> 
따라서 'API 키 값을 넣어주세요' 라는 문구를 포함하였습니다. <br>
dacon 측의 요청대로 API 키 값을 가린 부분입니다. <br>
코드 사용 시 반드시 API 키 값 첨부 후 사용 바랍니다.

In [3]:
# Load the recorded NMAE scores per site for the h20 and h2320 datasets
# and display the table as the cell output.
nmae_record_path = './new_data/nmae_score_record.csv'
nsr = pd.read_csv(nmae_record_path)
nsr

Unnamed: 0.1,Unnamed: 0,h20_case1,h20_case2,h2320_case1,h2320_case2
0,dangjin_floating,7.2046,6.9884,7.2001,6.9184
1,dangjin_warehouse,8.6418,8.5444,8.4232,8.3451
2,dangjin,9.0658,8.9849,9.0923,8.7924
3,ulsan,7.1799,6.846,6.9341,6.9859
4,average_nmae,8.023025,7.840925,7.912425,7.76045


#### <span style="color:blue">* 위 표는 h2320 데이터셋을 만들어 사용하는 이유를 보여줍니다. </span> 
h2320은 기본적으로 23시에 발표된 예보 데이터를 사용합니다. <br>
다만 23시 예보(+4~+22시간)로는 다음 날 03시~21시만 채울 수 있고 00시의 데이터는 채울 수 없어, <br>
해당 부분만 20시 예보 데이터(+4시간)를 사용하기 때문에 이름이 h2320입니다. <br>

이러한 데이터셋을 만들어 모델을 학습시키면 API 기상예보에서 23시의 기상예보를 사용할 수 있습니다. <br>
또한 20시의 예보만을 사용하는 것보다 평균 NMAE가 더 낮은(성능이 더 좋은) 것을 위 표에서 확인할 수 있습니다.

<h3>당진지역 3시간 간격 예보 데이터 전처리</h3>

In [None]:
def preprocessing_3h(path_list, y):
    """Build a long-format table of 3-hourly forecast variables for one data year.

    Parameters
    ----------
    path_list : sequence of str
        The per-variable CSV paths of one data year, ordered as
        Temperature, Precipitation, Snow, PrecipitationForm, PrecipitationProb,
        Humidity, MaxTemp, MinTemp, WindSpeed, WindDirection, Cloud.
        Only the 3-hourly variables (positions 0, 3, 4, 5, 8, 9, 10) are read.
    y : int
        Calendar year of the first month segment. Segments are numbered from
        March (segment 0); the 11th and 12th segments are dated January and
        February of ``y + 1``.

    Returns
    -------
    pandas.DataFrame
        Columns 'Forecast time' (issue time, shifted by +9 hours), 'forecast'
        and the seven value columns. The index keeps the row labels of the
        source CSV, so it is not unique across months/years.
    """
    key_columns = [' format: day', 'hour', 'forecast']
    # Output column -> position of its CSV inside path_list.
    value_sources = {
        'Temperature': 0,
        'PrecipitationForm': 3,
        'PrecipitationProb': 4,
        'Humidity': 5,
        'WindSpeed': 8,
        'WindDirection': 9,
        'Cloud': 10,
    }
    value_columns = ['forecast'] + list(value_sources)
    out_columns = ['Forecast time'] + value_columns

    # The humidity file supplies the shared day/hour/forecast key columns.
    humidity_raw = pd.read_csv(path_list[value_sources['Humidity']])
    # .copy() so the column assignments below do not write into a slice.
    data_year = humidity_raw[key_columns].copy()
    for column, position in value_sources.items():
        raw = humidity_raw if column == 'Humidity' else pd.read_csv(path_list[position])
        # The measured value is always the last column of each file.
        data_year[column] = raw[raw.columns[-1]]

    # Rows with a missing 'hour' are the separators between months.
    separator_rows = list(data_year.index[data_year['hour'].isna()])
    bounds = [-1] + separator_rows + [data_year.shape[0]]
    month_data = [
        data_year.loc[bounds[k] + 1:bounds[k + 1] - 1]
        for k in range(len(bounds) - 1)
    ]

    frames = []
    for i, df in enumerate(month_data):
        if df.empty:
            # e.g. a trailing separator row: nothing to date or append.
            continue

        month = i + 3
        year = y
        if month in (13, 14):
            # January / February belong to the following calendar year.
            month -= 12
            year += 1

        date = (
            f'{year}-{month}-'
            + df[' format: day'].str.split(' ').str[-1]
            + ' '
            + (df['hour'].astype(int) // 100).astype(str)
            + ':00'
        )

        data_fcst = df[value_columns].copy()
        # +9h shift: presumably UTC -> KST (UTC+9); the original comment said
        # "UTC => GMT", which does not match a 9 hour offset.
        data_fcst.insert(0, 'Forecast time', pd.to_datetime(date) + pd.Timedelta(hours=9))
        frames.append(data_fcst)

    if not frames:
        return pd.DataFrame(columns=out_columns)
    # Single concat instead of growing a frame inside the loop.
    return pd.concat(frames)


In [None]:
# Collect every Dangjin forecast CSV (one sub-folder level deep) in sorted
# order; later cells slice this list in groups of 11 files per data year.
dangjin_data_path = './MA_data/dangjin'

dangjin_csv_pattern = f'{dangjin_data_path}/*/*.csv'
csv_list = glob(dangjin_csv_pattern)
csv_list = sorted(csv_list)

In [None]:
# Stack the five data years (first year 2016, 11 CSV files per year) with a
# single concat instead of growing a frame from an empty seed inside a loop.
# The explicit column selection pins the column set and order.
fcst_3h_columns = ['Forecast time', 'forecast', 'Temperature', 'PrecipitationForm', 'PrecipitationProb', 'Humidity', 'WindSpeed', 'WindDirection', 'Cloud']
fcst_data_3h = pd.concat(
    [preprocessing_3h(csv_list[i*11:i*11+11], 2016+i) for i in range(5)]
)[fcst_3h_columns]

In [None]:
def to_date(x):
    """Wrap a lead time given in hours as a ``pandas.DateOffset``."""
    hour_offset = pd.DateOffset(hours=x)
    return hour_offset

def forecasting_interpolation_h23(fcst_data, hour, start='2016-03-02 00:00:00', end='2021-03-01 23:00:00'):
    """Place the forecasts issued at 23:00 onto an hourly time grid.

    Parameters
    ----------
    fcst_data : pandas.DataFrame
        Long-format forecasts with 'Forecast time' (issue time), 'forecast'
        (lead time in hours) and the value columns. It is not modified.
    hour : int
        3 selects the 3-hourly value columns, 6 selects
        'Precipitation' and 'Snow'. Any other value keeps every column.
    start, end : str or datetime-like, optional
        Bounds of the hourly grid. The defaults reproduce the previously
        hard-coded range.

    Returns
    -------
    pandas.DataFrame
        Outer merge of the hourly grid with the selected forecasts on
        'Forecast_time' (the forecast target time). Grid hours without a
        forecast hold NaN and are meant to be interpolated by the caller.
    """
    # Work on a copy so the caller's frame does not gain a 'Forecast_time' column.
    work = fcst_data.assign(Forecast_time=pd.to_datetime(fcst_data['Forecast time']).values)

    # Keep forecasts issued at 23:00 with a lead time of 4 to 22 hours.
    fcst_23 = work[work['Forecast_time'].dt.hour == 23]
    fcst_23 = fcst_23[(fcst_23['forecast'] >= 4) & (fcst_23['forecast'] <= 22)].copy()

    # Target time = issue time + lead time (vectorised, no per-row offsets).
    lead_time = pd.to_timedelta(pd.to_numeric(fcst_23['forecast']), unit='h')
    fcst_23['Forecast_time'] = fcst_23['Forecast_time'] + lead_time

    if hour == 3:
        fcst_23 = fcst_23[['Forecast_time', 'Temperature', 'PrecipitationForm', 'PrecipitationProb', 'Humidity', 'WindSpeed', 'WindDirection', 'Cloud']]
        fcst_23 = fcst_23.astype({'Temperature': float, 'PrecipitationForm': float, 'Cloud': float})
    elif hour == 6:
        fcst_23 = fcst_23[['Forecast_time', 'Precipitation', 'Snow']]

    # Hourly grid; a Timedelta frequency avoids the deprecated 'H' alias.
    hourly_grid = pd.DataFrame(
        {'Forecast_time': pd.date_range(start=start, end=end, freq=pd.Timedelta(hours=1))}
    )

    # Outer merge (as before): forecasts outside the grid are kept too.
    return pd.merge(hourly_grid, fcst_23, on='Forecast_time', how='outer')




def forecasting_interpolation_h20(fcst_data, df, hour):
    """Fill rows of ``df`` with the +4h forecast issued at 20:00.

    The 23:00 forecasts selected by ``forecasting_interpolation_h23`` start at
    a 4 hour lead time, so they never cover 00:00. This function fills those
    rows from the 20:00 forecast with lead time 4 (target time 00:00).

    Rows are matched on the 'Forecast_time' timestamp rather than by position,
    so a missing or duplicated 20:00 forecast no longer shifts every later day
    (the positional version also raised IndexError when a forecast was missing).

    Parameters
    ----------
    fcst_data : pandas.DataFrame
        Long-format forecasts with 'Forecast time', 'forecast' and the value
        columns. It is not modified.
    df : pandas.DataFrame
        Hourly frame with a 'Forecast_time' column, as returned by
        ``forecasting_interpolation_h23``. It is not modified.
    hour : int
        3 selects the 3-hourly value columns, 6 selects
        'Precipitation' and 'Snow'.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` whose rows with a matching 20:00(+4h) forecast carry
        that forecast's values.
    """
    work = fcst_data.assign(Forecast_time=pd.to_datetime(fcst_data['Forecast time']).values)

    # Forecasts issued at 20:00 with a lead time of exactly 4 hours.
    fcst_20 = work[work['Forecast_time'].dt.hour == 20]
    fcst_20 = fcst_20[fcst_20['forecast'] == 4].copy()

    # Target time = issue time + lead time (always 4 hours after the filter).
    fcst_20['Forecast_time'] = fcst_20['Forecast_time'] + pd.Timedelta(hours=4)

    if hour == 3:
        fcst_20 = fcst_20[['Forecast_time', 'Temperature', 'PrecipitationForm', 'PrecipitationProb', 'Humidity', 'WindSpeed', 'WindDirection', 'Cloud']]
        fcst_20 = fcst_20.astype({'Temperature': float, 'PrecipitationForm': float, 'Cloud': float})
    elif hour == 6:
        fcst_20 = fcst_20[['Forecast_time', 'Precipitation', 'Snow']]

    filled = df.copy()

    # One forecast per target timestamp, indexed by that timestamp.
    fill = fcst_20.drop_duplicates(subset='Forecast_time').set_index('Forecast_time')
    targets = pd.DatetimeIndex(filled['Forecast_time'])
    matched = targets.isin(fill.index)
    aligned = fill.reindex(targets[matched])

    for col in [c for c in fill.columns if c in filled.columns]:
        # Boolean-array indexing is positional, so the index of `filled` is irrelevant.
        filled.loc[matched, col] = aligned[col].to_numpy()

    return filled

In [None]:
# Dangjin, 3-hourly variables:
#   1) put the 23:00-issued forecasts onto the hourly grid,
#   2) fill the 00:00 rows from the 20:00-issued (+4h) forecast,
#   3) interpolate the remaining gaps (DataFrame.interpolate defaults).
dangjin_3Interval_h23 = forecasting_interpolation_h23(fcst_data_3h, 3)
dangjin_3Interval_h20 = forecasting_interpolation_h20(fcst_data_3h, dangjin_3Interval_h23, 3)
dangjin_3Interval_h2320 = dangjin_3Interval_h20.interpolate()

<h3>당진지역 6시간 간격 예보 데이터 전처리</h3>

In [None]:
def preprocessing_6h(path_list, y):
    """Build a long-format table of 6-hourly forecast variables for one data year.

    Parameters
    ----------
    path_list : sequence of str
        The per-variable CSV paths of one data year. Only position 1
        (precipitation) and position 2 (snow) are read.
    y : int
        Calendar year of the first month segment. Segments are numbered from
        March (segment 0); the 11th and 12th segments are dated January and
        February of ``y + 1``.

    Returns
    -------
    pandas.DataFrame
        Columns 'Forecast time' (issue time, shifted by +9 hours), 'forecast',
        'Precipitation' and 'Snow'. The index keeps the row labels of the
        source CSV, so it is not unique across months/years.
    """
    value_columns = ['forecast', 'Precipitation', 'Snow']
    out_columns = ['Forecast time'] + value_columns

    precipitation_raw = pd.read_csv(path_list[1])
    snow_raw = pd.read_csv(path_list[2])

    # .copy() so the column assignments below do not write into a slice.
    data_year = precipitation_raw[[' format: day', 'hour', 'forecast']].copy()
    # The measured value is always the last column of each file.
    data_year['Precipitation'] = precipitation_raw[precipitation_raw.columns[-1]]
    data_year['Snow'] = snow_raw[snow_raw.columns[-1]]

    # Rows with a missing 'hour' are the separators between months.
    separator_rows = list(data_year.index[data_year['hour'].isna()])
    bounds = [-1] + separator_rows + [data_year.shape[0]]
    month_data = [
        data_year.loc[bounds[k] + 1:bounds[k + 1] - 1]
        for k in range(len(bounds) - 1)
    ]

    frames = []
    for i, df in enumerate(month_data):
        if df.empty:
            # e.g. a trailing separator row: nothing to date or append.
            continue

        month = i + 3
        year = y
        if month in (13, 14):
            # January / February belong to the following calendar year.
            month -= 12
            year += 1

        date = (
            f'{year}-{month}-'
            + df[' format: day'].str.split(' ').str[-1]
            + ' '
            + (df['hour'].astype(int) // 100).astype(str)
            + ':00'
        )

        data_fcst = df[value_columns].copy()
        # +9h shift: presumably UTC -> KST (UTC+9); the original comment said
        # "UTC => GMT", which does not match a 9 hour offset.
        data_fcst.insert(0, 'Forecast time', pd.to_datetime(date) + pd.Timedelta(hours=9))
        frames.append(data_fcst)

    if not frames:
        return pd.DataFrame(columns=out_columns)
    # Single concat instead of growing a frame inside the loop.
    return pd.concat(frames)


In [None]:
# Stack the five data years (first year 2016, 11 CSV files per year) with a
# single concat instead of growing a frame from an empty seed inside a loop.
fcst_6h_columns = ['Forecast time', 'forecast', 'Precipitation', 'Snow']
fcst_data_6h = pd.concat(
    [preprocessing_6h(csv_list[i*11:i*11+11], 2016+i) for i in range(5)]
)[fcst_6h_columns]

In [None]:
# Dangjin, 6-hourly variables (precipitation / snow): same three steps as the
# 3-hourly pipeline - 23:00 forecasts onto the hourly grid, 00:00 rows from
# the 20:00 (+4h) forecast, then interpolation of the remaining gaps.
dangjin_6Interval_h23 = forecasting_interpolation_h23(fcst_data_6h, 6)
dangjin_6Interval_h20 = forecasting_interpolation_h20(fcst_data_6h, dangjin_6Interval_h23, 6)
dangjin_6Interval_h2320 = dangjin_6Interval_h20.interpolate()

In [None]:
# Combine the 3-hourly and 6-hourly Dangjin features into one table keyed on 'Forecast_time'.
dangjin_fcst = pd.merge(dangjin_3Interval_h2320, dangjin_6Interval_h2320, on='Forecast_time', how='outer')

<h3>울산지역 3시간 간격 기상예보 데이터 전처리</h3>

In [None]:
# Collect every Ulsan forecast CSV in sorted order.
# NOTE: this rebinds `csv_list`, which previously held the Dangjin files, so
# the Dangjin cells must not be re-run after this cell without re-globbing.
ulsan_data_path = './MA_data/ulsan'
ulsan_csv_pattern = f'{ulsan_data_path}/*/*.csv'
csv_list = glob(ulsan_csv_pattern)
csv_list = sorted(csv_list)

In [None]:
# Stack the five data years (first year 2016, 11 CSV files per year) with a
# single concat instead of growing a frame from an empty seed inside a loop.
# The explicit column selection pins the column set and order.
fcst_3h_columns = ['Forecast time', 'forecast', 'Temperature', 'PrecipitationForm', 'PrecipitationProb', 'Humidity', 'WindSpeed', 'WindDirection', 'Cloud']
fcst_data_3h = pd.concat(
    [preprocessing_3h(csv_list[i*11:i*11+11], 2016+i) for i in range(5)]
)[fcst_3h_columns]

In [None]:
# Ulsan, 3-hourly variables:
#   1) put the 23:00-issued forecasts onto the hourly grid,
#   2) fill the 00:00 rows from the 20:00-issued (+4h) forecast,
#   3) interpolate the remaining gaps (DataFrame.interpolate defaults).
ulsan_3Interval_h23 = forecasting_interpolation_h23(fcst_data_3h, 3)
ulsan_3Interval_h20  = forecasting_interpolation_h20(fcst_data_3h,ulsan_3Interval_h23,3)
ulsan_3Interval_h2320 = ulsan_3Interval_h20.interpolate()

<h3>울산지역 6시간 간격 기상예보 데이터 전처리</h3>

In [None]:
# Stack the five data years (first year 2016, 11 CSV files per year) with a
# single concat instead of growing a frame from an empty seed inside a loop.
fcst_6h_columns = ['Forecast time', 'forecast', 'Precipitation', 'Snow']
fcst_data_6h = pd.concat(
    [preprocessing_6h(csv_list[i*11:i*11+11], 2016+i) for i in range(5)]
)[fcst_6h_columns]

In [None]:
# Ulsan, 6-hourly variables (precipitation / snow): same three steps as the
# 3-hourly pipeline - 23:00 forecasts onto the hourly grid, 00:00 rows from
# the 20:00 (+4h) forecast, then interpolation of the remaining gaps.
ulsan_6Interval_h23 = forecasting_interpolation_h23(fcst_data_6h, 6)
ulsan_6Interval_h20 = forecasting_interpolation_h20(fcst_data_6h, ulsan_6Interval_h23, 6)
ulsan_6Interval_h2320 = ulsan_6Interval_h20.interpolate()

In [None]:
# Combine the 3-hourly and 6-hourly Ulsan features into one table keyed on 'Forecast_time'.
ulsan_fcst = pd.merge(ulsan_3Interval_h2320, ulsan_6Interval_h2320, on='Forecast_time', how='outer')

<h3> API를 이용한 일출일몰 데이터 추가 - 당진</h3>

In [None]:
from urllib.request import urlopen
from urllib.request import Request 
from urllib.parse import urlencode, quote_plus

import urllib.request as ul
import xmltodict
import json
import sys
import io

# Plant metadata; the cells below read its 'Id', 'Longitude' and 'Latitude' columns.
site_info = pd.read_csv('./data/site_info.csv')

In [None]:
def make_locdate(x):
    """Convert a timestamp string to the 'YYYYMMDD' form used as `locdate`.

    Accepts 'YYYY-MM-DD HH:MM:SS' as well as a bare 'YYYY-MM-DD' (pandas can
    render a datetime column without the time part); the time part is ignored.

    Raises
    ------
    ValueError
        If the date part does not consist of exactly three '-'-separated fields.
    """
    date_part = x.split(' ')[0]
    Y, M, D = date_part.split('-')
    return Y + M + D

In [None]:
# Store the timestamps as text and derive the per-row 'YYYYMMDD' key used
# for the sunrise/sunset API requests.
dangjin_time_text = dangjin_fcst['Forecast_time'].astype(str)
dangjin_fcst['Forecast_time'] = dangjin_time_text
dangjin_fcst['locdate'] = dangjin_time_text.apply(make_locdate)

In [None]:
# Coordinates of the Dangjin plant (looked up once).
dangjin_site = site_info[site_info['Id'] == '당진태양광']
longi = dangjin_site['Longitude'].values[0]
lati = dangjin_site['Latitude'].values[0]

# One request per day: the frame is hourly, so every 24th row starts a new
# date. Only that row receives the values; the other hours are forward-filled
# in the next cell.
for i in range(0, dangjin_fcst.shape[0], 24):
    locdate = dangjin_fcst.loc[i, 'locdate']
    url = (
        'http://apis.data.go.kr/B090041/openapi/service/RiseSetInfoService/getLCRiseSetInfo?longitude='
        + str(longi) + '&latitude=' + str(lati) + '&locdate=' + str(locdate)
        + '&dnYn=Y&ServiceKey=API 키 값을 넣어주세요'
    )
    requestd = Request(url, method='GET')
    # Context manager closes the HTTP response; the timeout keeps a stalled
    # request from hanging the whole loop.
    with urlopen(requestd, timeout=30) as response:
        response_body = response.read()
    item = xmltodict.parse(response_body)['response']['body']['items']['item']
    dangjin_fcst.loc[i, 'sunrise'] = item['sunrise']
    dangjin_fcst.loc[i, 'sunset'] = item['sunset']
print('end')

In [None]:
# Propagate each day's sunrise/sunset (set on the first row of the day) to
# the remaining hours. A single forward fill covers the whole column, so no
# per-day loop is needed.
dangjin_fcst['sunrise'] = dangjin_fcst['sunrise'].ffill()
dangjin_fcst['sunset'] = dangjin_fcst['sunset'].ffill()

In [None]:
# Reduce the sunrise/sunset values to their leading digits (integer // 100).
for sun_col in ('sunrise', 'sunset'):
    dangjin_fcst[sun_col] = dangjin_fcst[sun_col].astype(int) // 100

In [None]:
# Print column dtypes and non-null counts of the final Dangjin table.
dangjin_fcst.info()

<h3> API를 이용한 일출일몰 데이터 추가 - 울산</h3>

In [None]:
# Store the timestamps as text and derive the per-row 'YYYYMMDD' key used
# for the sunrise/sunset API requests.
ulsan_time_text = ulsan_fcst['Forecast_time'].astype(str)
ulsan_fcst['Forecast_time'] = ulsan_time_text
ulsan_fcst['locdate'] = ulsan_time_text.apply(make_locdate)

In [None]:
# Coordinates of the Ulsan plant (looked up once).
ulsan_site = site_info[site_info['Id'] == '울산태양광']
longi = ulsan_site['Longitude'].values[0]
lati = ulsan_site['Latitude'].values[0]

# One request per day: the frame is hourly, so every 24th row starts a new
# date. Only that row receives the values; the other hours are forward-filled
# in the next cell.
for i in range(0, ulsan_fcst.shape[0], 24):
    locdate = ulsan_fcst.loc[i, 'locdate']
    url = (
        'http://apis.data.go.kr/B090041/openapi/service/RiseSetInfoService/getLCRiseSetInfo?longitude='
        + str(longi) + '&latitude=' + str(lati) + '&locdate=' + str(locdate)
        + '&dnYn=Y&ServiceKey=API 키 값을 넣어주세요'
    )
    requestd = Request(url, method='GET')
    # Context manager closes the HTTP response; the timeout keeps a stalled
    # request from hanging the whole loop.
    with urlopen(requestd, timeout=30) as response:
        response_body = response.read()
    item = xmltodict.parse(response_body)['response']['body']['items']['item']
    ulsan_fcst.loc[i, 'sunrise'] = item['sunrise']
    ulsan_fcst.loc[i, 'sunset'] = item['sunset']
print('end')

In [None]:
# Propagate each day's sunrise/sunset (set on the first row of the day) to
# the remaining hours, then reduce the values to their leading digits
# (integer // 100). A single forward fill covers the whole column, so no
# per-day loop is needed.
ulsan_fcst['sunrise'] = ulsan_fcst['sunrise'].ffill()
ulsan_fcst['sunset'] = ulsan_fcst['sunset'].ffill()

ulsan_fcst['sunrise'] = ulsan_fcst['sunrise'].astype(int)//100
ulsan_fcst['sunset'] = ulsan_fcst['sunset'].astype(int)//100

In [None]:
# Print column dtypes and non-null counts of the final Ulsan table.
ulsan_fcst.info()

<h3> 최종 저장</h3>

In [None]:
# Write both h2320 feature tables to disk without the index column.
h2320_outputs = (
    (dangjin_fcst, './new_data/dangjin_fcst_h2320.csv'),
    (ulsan_fcst, './new_data/ulsan_fcst_h2320.csv'),
)
for fcst_frame, out_path in h2320_outputs:
    fcst_frame.to_csv(out_path, index=False)