# 이 노트북에 관해
이 노트북은 새로운 70:30 split dataset에 대해 기존 GRU, LSTM을 적용했던 방법을 그대로 적용하기 위해 만들어졌습니다. 따라서 먼저 데이터셋의 기간으로 정렬하고 지역별로 나눠서 새로운 데이터셋으로 저장합니다.

그 다음 wandb를 사용해 hyper parameter sweep을 합니다. 다만, best validation acc을 log에 추가하고, GRU/LSTM layer 선택을 hyper parameter로 넣어버립시다.

# About this notebook
This notebook generate dataset for [class1_LSTM&GRU_sweep](https://github.com/chhyyi/aiffelthon/blob/main/class1_LSTM%26GRU_sweep.ipynb)  
Also changes, descriptions are there.

## 데이터셋별 merge, interpolate

In [1]:
import pandas as pd
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt
import numpy as np
import wandb
from wandb.keras import WandbCallback
import os


In [12]:
observe=True
if observe:
    ds=pd.read_csv(os.path.join(os.getenv("HOME"),"aiffel/aiffelthon/sample_data/(class1 실측_70)_train.csv"),encoding='euc-kr')
    #s=pd.read_csv(os.path.join(os.getenv("HOME"),"aiffel/aiffelthon/sample_data/(class1 실측_30)_test.csv"),encoding='euc-kr')
else:
    #ds=pd.read_csv(os.path.join(os.getenv("HOME"),"aiffel/aiffelthon/sample_data/(class1 조건_70)_train.csv"),encoding='euc-kr')
    ds=pd.read_csv(os.path.join(os.getenv("HOME"),"aiffel/aiffelthon/sample_data/(class1 조건_30)_test.csv"),encoding='euc-kr')

In [13]:

ds['year']=ds['일시'].apply(lambda x: pd.Timestamp(x).year)
ds['mm']=ds['일시'].apply(lambda x: pd.Timestamp(x).month)
ds['dd']=ds['일시'].apply(lambda x: pd.Timestamp(x).day)
ds['hh']=ds['일시'].apply(lambda x: pd.Timestamp(x).hour)

In [14]:
dt_index=['year', 'mm', 'dd', 'hh']

if observe:
    df_default_index=['지점',  'year', 'mm', 'dd', 'hh', '풍속(m/s)', '풍향(deg)', '기온(°C)', '수온(°C)', '강수량(mm)', '적조발생(실측)']
else:
    df_default_index=['지점',  'year', 'mm', 'dd', 'hh', '풍속(m/s)', '풍향(deg)', '기온(°C)', '수온(°C)', '강수량(mm)', '적조발생(조건)']


#reset index
ds.sort_values(by=dt_index, inplace=True)
ds.reset_index(inplace=True, drop=True) #'if drop'=True, drop current index. if False, it becomes a new column.
ds=ds[df_default_index]

ds

Unnamed: 0,지점,year,mm,dd,hh,풍속(m/s),풍향(deg),기온(°C),수온(°C),강수량(mm),적조발생(실측)
0,통영,2017,1,1,1,,,8.8,17.5,0.0,0
1,거제도,2017,1,1,1,5.5,157.0,9.3,15.9,0.0,0
2,추자도,2017,1,1,1,4.2,306.0,11.0,15.8,0.0,0
3,울산,2017,1,1,2,10.0,307.0,8.3,16.5,0.0,0
4,추자도,2017,1,1,2,2.6,315.0,10.8,15.7,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...
166518,울산,2022,9,30,22,1.8,179.0,23.4,25.1,0.0,0
166519,거제도,2022,9,30,22,3.2,266.0,23.5,24.0,0.0,0
166520,통영,2022,9,30,23,4.2,284.0,23.6,24.3,0.0,0
166521,거제도,2022,9,30,23,2.6,274.0,23.3,24.0,0.0,0


In [15]:

class Lin_Interpolate:
    """Linear interpolate nan datetimes
    """
    def __init__(self, dataframe):
        """receive 'datetime dateframe'. It should include
        yyyy, mm, dd, hh as columns of which type is int64"""
        self.dtd=dataframe
        self.time_start=self.dtd[dt_index].loc[0].to_numpy()
        self.time_end=self.dtd[dt_index].loc[len(self.dtd)-1].to_numpy()
    
    def check_sorted(self):
        #return (self.dtd.sort_values(by=dt_index,ascending=True) == self.dtd).all()
        if (self.dtd.sort_values(by=dt_index,ascending=True)['dd'] == self.dtd['dd']).all() \
            and (self.dtd.sort_values(by=dt_index,ascending=True)['year'] == self.dtd['year']).all()\
            and (self.dtd.sort_values(by=dt_index,ascending=True)['mm'] == self.dtd['mm']).all():
            return True
        else:
            return False
    def next_hour(self, array):
        '''def next_day(self, array)
        return next day as numpy ndarray.
        '''
        next_hour=pd.Timestamp(*array)+pd.Timedelta(1, 'h')
        return np.array([next_hour.year, next_hour.month, next_hour.day, next_hour.hour])
        
    
    def missing_hours(self, t0, t1):
        t=t0
        
        period=int((pd.Timestamp(*t1)-pd.Timestamp(*t0))/np.timedelta64(1, 'h'))
        
        missing_hours_list=[]
        missing_hours_number=0
        missing_hours_df=pd.DataFrame()

        iter_dates=self.dtd[dt_index].values.tolist()
        if (iter_dates[-1]!=t1).any():
            iter_dates.append(t1)
            missing_hours_list.append(t1)
            missing_hours_number+=1
        if (iter_dates[0]!=t0).any():
            iter_dates.insert(0, t0)
            missing_hours_list.append(t0)
            missing_hours_number+=1

        for target_date in iter_dates:
            #print(target_date, type(target_date[0]))
            #target_date=date_
            try:
                while pd.Timestamp(*t)<=pd.Timestamp(*target_date):
                    if (t==target_date).all():
                        t=self.next_hour(t)
                    else:
                        missing_hours_list.append(t)
                        t=self.next_hour(t)
                        missing_hours_number+=1
            except:
                print(target_date)
                break

        print('total {} missing hours found'.format(missing_hours_number))
        
        return pd.DataFrame(missing_hours_list, columns=dt_index)

    def fill_hours(self, target_df, time_start, time_end):
        target_df['interpolated']=target_df.isna().any(axis=1)
        missing_hours=self.missing_hours(time_start, time_end)
        missing_hours['interpolated']=np.ones(len(missing_hours))
        
        return pd.concat([target_df, missing_hours])
    
#ds0=ds.loc[ds['지점']=='추자도']

In [16]:
#print(ds0[dt_index].values.tolist())

In [17]:
locations=['거문도', '울산', '거제도', '통영', '추자도']

lin_interpolate=Lin_Interpolate(ds)
t0_total=lin_interpolate.time_start
t1_total=lin_interpolate.time_end
period=pd.Timestamp(*t1_total)-pd.Timestamp(*t0_total)


if observe:
    target_col=['적조발생(실측)', 'interpolated']
else:
    target_col=['적조발생(조건)', 'interpolated']


for pos in locations:
    ds0=ds.loc[ds['지점']==pos]

    ds0.reset_index(inplace=True)
    ds0=ds0[df_default_index]

    lin_interpolate=Lin_Interpolate(ds0)

    print('previous num rows: {} in location {}'.format(len(ds0), pos))
    ds0=lin_interpolate.fill_hours(ds0, time_start=t0_total, time_end=t1_total)
    print('num rows:', len(ds0))

    #reset index
    ds0.sort_values(by=dt_index, inplace=True)
    ds0.reset_index(inplace=True, drop=True) #'if drop'=True, drop current index. if False, it becomes a new column.
    ds0=ds0[df_default_index+target_col[1:]]
    
    ds0[target_col[0]] = ds0.groupby(dt_index[:-1], sort=False)[target_col[0]]\
                    .apply(lambda x: x.ffill().bfill())

    ds0.interpolate(limit_area='inside', inplace=True)
    if pos==locations[0]: #if first loop
        label=ds0[target_col]
    else:
        label['interpolated']=np.logical_or(label['interpolated'].values, ds0['interpolated'].values)

    ds0.drop(columns=['지점', 'year', 'mm', 'dd', 'hh']+target_col, inplace=True)
    
    if pos==locations[0]:
        df_merged=ds0
    else:
        df_merged=pd.concat([df_merged, ds0], axis=1)

#merge label
df_merged_store=pd.concat([df_merged, label], axis=1)

previous num rows: 34638 in location 거문도
total 15737 missing hours found
num rows: 50375
previous num rows: 33134 in location 울산
total 17241 missing hours found
num rows: 50375


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  label['interpolated']=np.logical_or(label['interpolated'].values, ds0['interpolated'].values)


previous num rows: 33046 in location 거제도
total 17329 missing hours found
num rows: 50375
previous num rows: 32846 in location 통영
total 17529 missing hours found
num rows: 50375
previous num rows: 32859 in location 추자도
total 17516 missing hours found
num rows: 50375


In [18]:
print(ds0.columns)

Index(['풍속(m/s)', '풍향(deg)', '기온(°C)', '수온(°C)', '강수량(mm)'], dtype='object')


In [19]:
df_merged=df_merged_store
print(len(df_merged), '\n\n')
df_merged.dropna(inplace=True)
print(df_merged.isna().sum(), '\n total_rows: ',len(df_merged))


50375 


풍속(m/s)         0
풍향(deg)         0
기온(°C)          0
수온(°C)          0
강수량(mm)         0
풍속(m/s)         0
풍향(deg)         0
기온(°C)          0
수온(°C)          0
강수량(mm)         0
풍속(m/s)         0
풍향(deg)         0
기온(°C)          0
수온(°C)          0
강수량(mm)         0
풍속(m/s)         0
풍향(deg)         0
기온(°C)          0
수온(°C)          0
강수량(mm)         0
풍속(m/s)         0
풍향(deg)         0
기온(°C)          0
수온(°C)          0
강수량(mm)         0
적조발생(실측)        0
interpolated    0
dtype: int64 
 total_rows:  50332


In [20]:
df_merged.iloc[:,:-2]=(df_merged.iloc[:,:-2]-df_merged.iloc[:,:-2].mean())/df_merged.iloc[:,:-2].std()
df_merged

Unnamed: 0,풍속(m/s),풍향(deg),기온(°C),수온(°C),강수량(mm),풍속(m/s).1,풍향(deg).1,기온(°C).1,수온(°C).1,강수량(mm).1,...,기온(°C).2,수온(°C).2,강수량(mm).2,풍속(m/s).2,풍향(deg).2,기온(°C).3,수온(°C).3,강수량(mm).3,적조발생(실측),interpolated
41,-0.431097,0.962426,-0.349040,-0.297696,-0.129298,-1.502294,0.571379,-0.616366,-0.643483,-0.137245,...,-0.456726,-0.243604,-0.154064,-0.162619,1.398206,-0.378656,-0.519575,-0.117603,0.0,False
42,-0.340680,0.841549,-0.293173,-0.404563,-0.129298,-1.595939,0.690575,-0.546142,-0.666158,-0.137245,...,-0.429098,-0.243604,-0.154064,-0.023648,0.010496,-0.358096,-0.510145,-0.117603,0.0,1.0
43,-0.099568,1.005596,-0.321106,-0.447310,-0.129298,0.058466,0.883122,-0.447830,-0.688834,-0.137245,...,-0.401469,-0.262643,-0.154064,0.115323,-1.377213,-0.337536,-0.500715,-0.117603,0.0,1.0
44,0.141544,1.169644,-0.349040,-0.490056,-0.129298,0.027251,0.901460,-0.504008,-0.711510,-0.137245,...,-0.360027,-0.262643,-0.154064,-0.533208,-1.317981,-0.597962,-0.462996,-0.117603,0.0,False
45,0.352517,1.129351,-0.386285,-0.547052,-0.129298,-0.035179,1.057332,-0.546142,-0.734185,-0.137245,...,-0.429098,-0.281682,-0.154064,-0.440561,0.103575,-0.618522,-0.453566,-0.117603,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50368,-1.757213,-0.859367,0.991783,1.112944,-0.129298,-1.585534,-0.363851,0.947270,1.306625,-0.137245,...,1.518707,1.336605,-0.154064,-1.614093,1.465899,1.033128,1.253243,-0.117603,0.0,1.0
50369,-1.817491,0.539355,1.019716,1.177064,-0.129298,-1.356623,-0.373020,0.923863,1.283949,-0.137245,...,1.290773,1.279489,-0.154064,-1.644976,1.423591,1.046834,1.215523,-0.117603,0.0,1.0
50370,-1.727074,0.116285,0.880047,1.219810,-0.129298,-1.127711,-0.382189,0.900455,1.261274,-0.137245,...,1.062838,1.222373,-0.154064,-1.552329,-1.377213,0.882355,1.196664,-0.117603,0.0,False
50371,-1.365406,0.185358,0.894014,1.177064,-0.129298,-1.158927,-0.359267,0.879388,1.238598,-0.137245,...,1.104281,1.241412,-0.154064,-1.567770,0.061266,0.615075,1.083505,-0.117603,0.0,1.0


In [21]:
df_merged.to_csv("class1_observe_train_refined.csv")

In [29]:
df_merged.isna().any()

풍속(m/s)         False
풍향(deg)         False
기온(°C)          False
수온(°C)          False
강수량(mm)         False
풍속(m/s)         False
풍향(deg)         False
기온(°C)          False
수온(°C)          False
강수량(mm)         False
풍속(m/s)         False
풍향(deg)         False
기온(°C)          False
수온(°C)          False
강수량(mm)         False
풍속(m/s)         False
풍향(deg)         False
기온(°C)          False
수온(°C)          False
강수량(mm)         False
풍속(m/s)         False
풍향(deg)         False
기온(°C)          False
수온(°C)          False
강수량(mm)         False
적조발생(실측)        False
interpolated    False
dtype: bool