# About this notebook
이 노트북은 새로운 70:30 split dataset에 대해 기존 GRU, LSTM을 적용했던 방법을 그대로 적용하기 위해 만들어졌습니다. 따라서 먼저 데이터셋의 기간으로 정렬하고 지역별로 나눠서 새로운 데이터셋으로 저장합니다.

그 다음 wandb를 사용해 hyper parameter sweep을 합니다. 다만, best validation acc을 log에 추가하고, GRU/LSTM layer 선택을 hyper parameter로 넣어버립시다.

## 데이터셋별 merge, interpolate

In [1]:
import pandas as pd
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt
import numpy as np
import wandb
from wandb.keras import WandbCallback
import os


In [47]:
observe=False
if observe:
    #ds=pd.read_csv(os.path.join(os.getenv("HOME"),"aiffel/aiffelthon/sample_data/(class1 실측_70)_train.csv"),encoding='euc-kr')
    ds=pd.read_csv(os.path.join(os.getenv("HOME"),"aiffel/aiffelthon/sample_data/(class1 실측_30)_test.csv"),encoding='euc-kr')
else:
    #ds=pd.read_csv(os.path.join(os.getenv("HOME"),"aiffel/aiffelthon/sample_data/(class1 조건_70)_train.csv"),encoding='euc-kr')
    ds=pd.read_csv(os.path.join(os.getenv("HOME"),"aiffel/aiffelthon/sample_data/(class1 조건_30)_test.csv"),encoding='euc-kr')

In [48]:

ds['year']=ds['일시'].apply(lambda x: pd.Timestamp(x).year)
ds['mm']=ds['일시'].apply(lambda x: pd.Timestamp(x).month)
ds['dd']=ds['일시'].apply(lambda x: pd.Timestamp(x).day)
ds['hh']=ds['일시'].apply(lambda x: pd.Timestamp(x).hour)

In [49]:
dt_index=['year', 'mm', 'dd', 'hh']

if observe:
    df_default_index=['지점',  'year', 'mm', 'dd', 'hh', '풍속(m/s)', '풍향(deg)', '기온(°C)', '수온(°C)', '강수량(mm)', '적조발생(실측)']
else:
    df_default_index=['지점',  'year', 'mm', 'dd', 'hh', '풍속(m/s)', '풍향(deg)', '기온(°C)', '수온(°C)', '강수량(mm)', '적조발생(조건)']


#reset index
ds.sort_values(by=dt_index, inplace=True)
ds.reset_index(inplace=True, drop=True) #'if drop'=True, drop current index. if False, it becomes a new column.
ds=ds[df_default_index]

ds

Unnamed: 0,지점,year,mm,dd,hh,풍속(m/s),풍향(deg),기온(°C),수온(°C),강수량(mm),적조발생(조건)
0,거문도,2017,1,1,1,4.7,310.0,10.1,13.9,0.0,0
1,거제도,2017,1,1,1,5.5,157.0,9.3,15.9,0.0,0
2,통영,2017,1,1,1,,,8.8,17.5,0.0,0
3,거제도,2017,1,1,1,5.5,157.0,9.3,15.9,0.0,0
4,추자도,2017,1,1,1,4.2,306.0,11.0,15.8,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...
166518,거제도,2022,9,30,22,3.2,266.0,23.5,24.0,0.0,0
166519,통영,2022,9,30,23,4.2,284.0,23.6,24.3,0.0,0
166520,추자도,2022,9,30,23,2.2,348.0,17.7,23.4,0.0,0
166521,거제도,2022,9,30,23,2.6,274.0,23.3,24.0,0.0,0


In [50]:

class Lin_Interpolate:
    """Linear interpolate nan datetimes
    """
    def __init__(self, dataframe):
        """receive 'datetime dateframe'. It should include
        yyyy, mm, dd, hh as columns of which type is int64"""
        self.dtd=dataframe
        self.time_start=self.dtd[dt_index].loc[0].to_numpy()
        self.time_end=self.dtd[dt_index].loc[len(self.dtd)-1].to_numpy()
    
    def check_sorted(self):
        #return (self.dtd.sort_values(by=dt_index,ascending=True) == self.dtd).all()
        if (self.dtd.sort_values(by=dt_index,ascending=True)['dd'] == self.dtd['dd']).all() \
            and (self.dtd.sort_values(by=dt_index,ascending=True)['year'] == self.dtd['year']).all()\
            and (self.dtd.sort_values(by=dt_index,ascending=True)['mm'] == self.dtd['mm']).all():
            return True
        else:
            return False
    def next_hour(self, array):
        '''def next_day(self, array)
        return next day as numpy ndarray.
        '''
        next_hour=pd.Timestamp(*array)+pd.Timedelta(1, 'h')
        return np.array([next_hour.year, next_hour.month, next_hour.day, next_hour.hour])
        
    
    def missing_hours(self, t0, t1):
        t=t0
        missing_hours_list=[]
        missing_hours_number=0
        missing_hours_df=pd.DataFrame()

        for idx in range(len(self.dtd)):
            target_date=self.dtd.loc[idx][dt_index].astype(int).values
            while pd.Timestamp(*t)<=pd.Timestamp(*target_date):
                if (t==target_date).all():
                    t=self.next_hour(t)
                else:
                    missing_hours_list.append(t)
                    t=self.next_hour(t)
                    missing_hours_number+=1
        print('total {} missing hours found'.format(missing_hours_number))
        
        return pd.DataFrame(missing_hours_list, columns=dt_index)

    def fill_hours(self, target_df, time_start, time_end):
        return pd.concat([target_df, self.missing_hours(time_start, time_end)])
    
#ds0=ds.loc[ds['지점']=='추자도']

In [51]:
locations=['거문도', '울산', '거제도', '통영', '추자도']

lin_interpolate=Lin_Interpolate(ds)
t0_total=lin_interpolate.time_start
t1_total=lin_interpolate.time_end

if observe:
    target_col='적조발생(실측)'
else:
    target_col='적조발생(조건)'


for pos in locations:
    ds0=ds.loc[ds['지점']==pos]

    ds0.reset_index(inplace=True)
    ds0=ds0[df_default_index]

    lin_interpolate=Lin_Interpolate(ds0)

    print('previous num rows: {} in location {}'.format(len(ds0), pos))
    ds0=lin_interpolate.fill_hours(ds0, time_start=t0_total, time_end=t1_total)
    print('num rows:', len(ds0))

    #reset index
    ds0.sort_values(by=dt_index, inplace=True)
    ds0.reset_index(inplace=True, drop=True) #'if drop'=True, drop current index. if False, it becomes a new column.
    ds0=ds0[df_default_index]
    
    ds0[target_col] = ds0.groupby(dt_index[:-1], sort=False)[target_col]\
                    .apply(lambda x: x.ffill().bfill())

    ds0.interpolate(limit_area='inside', inplace=True)
    label=ds0[target_col]
    ds0.drop(columns=['지점', target_col, 'year', 'mm', 'dd', 'hh'], inplace=True)
    
    if pos==locations[0]:
        df_merged=ds0
    else:
        df_merged=pd.concat([df_merged, ds0], axis=1)

#merge label
df_merged_store=pd.concat([df_merged, label], axis=1)

previous num rows: 34601 in location 거문도
total 21678 missing hours found
num rows: 56279
previous num rows: 33035 in location 울산
total 22960 missing hours found
num rows: 55995
previous num rows: 33173 in location 거제도
total 22865 missing hours found
num rows: 56038
previous num rows: 32891 in location 통영
total 23152 missing hours found
num rows: 56043
previous num rows: 32823 in location 추자도
total 23191 missing hours found
num rows: 56014


In [52]:
df_merged=df_merged_store
print(len(df_merged), '\n\n')
df_merged.dropna(inplace=True)
print(df_merged.isna().sum(), '\n total_rows: ',len(df_merged))


56279 


풍속(m/s)     0
풍향(deg)     0
기온(°C)      0
수온(°C)      0
강수량(mm)     0
풍속(m/s)     0
풍향(deg)     0
기온(°C)      0
수온(°C)      0
강수량(mm)     0
풍속(m/s)     0
풍향(deg)     0
기온(°C)      0
수온(°C)      0
강수량(mm)     0
풍속(m/s)     0
풍향(deg)     0
기온(°C)      0
수온(°C)      0
강수량(mm)     0
풍속(m/s)     0
풍향(deg)     0
기온(°C)      0
수온(°C)      0
강수량(mm)     0
적조발생(조건)    0
dtype: int64 
 total_rows:  55951


In [53]:
df_merged.iloc[:,:-1]=(df_merged.iloc[:,:-1]-df_merged.iloc[:,:-1].mean())/df_merged.iloc[:,:-1].std()
df_merged

Unnamed: 0,풍속(m/s),풍향(deg),기온(°C),수온(°C),강수량(mm),풍속(m/s).1,풍향(deg).1,기온(°C).1,수온(°C).1,강수량(mm).1,...,풍향(deg).2,기온(°C).2,수온(°C).2,강수량(mm).2,풍속(m/s).2,풍향(deg).3,기온(°C).3,수온(°C).3,강수량(mm).3,적조발생(조건)
44,-0.857092,1.234176,-0.602987,-0.347803,-0.137883,-0.383268,0.915232,-0.800610,-0.682343,-0.138748,...,0.696303,-0.451149,-0.236806,-0.146593,-0.797501,-1.353042,-0.409916,-0.491549,-0.119235,0.0
45,-1.129258,1.234176,-0.616960,-0.476561,-0.137883,-0.289615,1.103445,-0.765468,-0.682343,-0.138748,...,0.714360,-0.423461,-0.236806,-0.146593,-0.414941,-1.390045,-0.387034,-0.510435,-0.119235,0.0
46,-1.159499,0.930152,-0.561069,-0.304884,-0.137883,-1.351011,1.002453,-0.737354,-0.682343,-0.138748,...,0.759501,-0.395772,-0.255897,-0.146593,-0.404601,-0.900464,-0.428222,-0.497844,-0.119235,0.0
47,-1.159499,0.930152,-0.561069,-0.304884,-0.137883,-1.600752,0.800468,-0.653013,-0.659628,-0.138748,...,0.837746,-0.418846,-0.262261,-0.146593,-0.394262,-0.410883,-0.469410,-0.485254,-0.119235,0.0
48,-0.857092,0.921466,-0.449287,-0.283425,-0.137883,-1.725622,0.699476,-0.582728,-0.614199,-0.138748,...,0.915991,-0.441920,-0.268625,-0.146593,-0.383922,0.078698,-0.510598,-0.472663,-0.119235,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55990,1.743609,1.138626,1.367169,1.390422,-0.137883,-1.132489,-0.374713,0.907303,1.271111,-0.138748,...,1.210119,1.022215,1.265938,0.106143,-0.928961,-1.302213,0.548520,0.811576,-0.119235,0.0
55991,1.894813,1.121253,1.353197,1.411882,-0.137883,-1.038836,-0.328807,0.865132,1.248396,-0.138748,...,1.213577,1.021746,1.265614,0.100125,-0.955548,-1.290015,0.544597,0.849348,-0.119235,0.0
55992,1.501683,1.121253,1.367169,1.390422,-0.137883,-1.194924,-0.328807,0.865132,1.225682,-0.138748,...,1.217036,1.021277,1.265291,0.094108,-0.982135,-1.277816,0.540674,0.887120,-0.119235,0.0
55993,1.501683,1.060448,1.367169,1.390422,-0.137883,-1.475882,-0.007469,0.822961,1.225682,-0.138748,...,1.220494,1.020808,1.264967,0.088090,-1.008722,-1.265617,0.536752,0.924892,-0.119235,0.0


In [54]:
df_merged.to_csv("class1_test_refined.csv")