In [1]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
import random
import os
import wandb
from wandb.keras import WandbCallback

import Sensory_GRU

In [6]:
def train():
    import pandas as pd
    import tensorflow as tf
    from tensorflow import keras
    import matplotlib.pyplot as plt
    import numpy as np
    import wandb
    from wandb.keras import WandbCallback
    
    def seq_acc(y_true, y_pred):
        y_bin=np.zeros_like(y_pred)
        for i, dd in enumerate(y_bin):
            for j in range(len(dd)):
                pred=y_pred[i][j]
                if pred>=0.5:
                    y_bin[i][j]=1
                else:
                    y_bin[i][j]=0

        predict_true = (y_true == y_bin)
        # multi-column일 경우에도 계산 할 수 있도록 np.average를 한번 더 씌움
        try:
            score = np.average(np.average(predict_true))
        except ValueError:
            score = mean_squared_error(y_true, y_bin)
        return score

    def my_seq_acc(y_true, y_pred):
        score = tf.py_function(func=seq_acc, inp=[y_true, y_pred], Tout=tf.float32,  name='custom_seq_acc') # tf 2.x
        #score = tf.py_func( lambda y_true, y_pred : mse_AIFrenz(y_true, y_pred) , [y_true, y_pred], 'float32', stateful = False, name = 'custom_mse' ) # tf 1.x
        return score


    # In[2]:

    class MySeqAccCallback(keras.callbacks.Callback):
        def on_epoch_end(self, epochs, logs=None):
            y_pred=self.model.predict(X_test)
            print('sequence accuracy is {}'.format(seq_acc(y_test, y_pred)))



    wandb.login()

    default_config={
                         'seq_field':72,
                         'stride_inside_seq':9,
                         'stride_between_seqs':2,
                         'learning_rate':0.01,
                         'split_train_ratio':0.8,
                         'epochs':20,
                         'batch_size':64,
                         'unit_gru0':64,
                         'unit_gru1':64}
    wandb.init(config = default_config)

    locations=['거문도', '울산', '거제도', '통영', '추자도']

    # load normalized data

    df_merged=pd.read_csv("sensory_preprocessed_df.csv")
    if df_merged.columns[0]=='Unnamed: 0':
        df_merged = df_merged.iloc[:, 1:]

    print('loaded dataset. Generating sequences')
    seq_length=wandb.config.seq_field//wandb.config.stride_inside_seq
    len_ds=len(df_merged)

    seqs_idx=[]

    start_idx=0
    while start_idx<=len_ds-wandb.config.seq_field:
        seqs_idx.append(list(range(start_idx, start_idx + wandb.config.seq_field, wandb.config.stride_inside_seq
    )))
        start_idx+=wandb.config.stride_between_seqs


    seqs_idx[100],len(seqs_idx[100])

    df_merged.reset_index(inplace=True, drop=True)
    print('no missing values:', df_merged.isna().any().any())


    #train_cols=['풍속(m/s)', '풍향(deg)', '기온(°C)', '수온(°C)', '강수량(mm)', '적조발생']
    ds_train_cols=df_merged
    ds_train_cols.reset_index(inplace=True, drop=True)
    print('train dataset columns:',ds_train_cols.columns)

    seq_dataset=np.zeros([len(seqs_idx), len(seqs_idx[0]), len(ds_train_cols.columns)])

    for i, seq in enumerate(seqs_idx):
        for j, row_number in enumerate(seq):
            seq_dataset[i, j]=ds_train_cols.loc[row_number].to_numpy()

    def not_bin_in_occurence(x):
        if x==1 or x==0:
            return x
        else:
            print('exceptional value(not 0 or 1) found. replaced by near one.')
            if x>=0.5:
                return 1
            else:
                return 0
            
    ds_train_cols['적조발생']=ds_train_cols['적조발생'].apply(not_bin_in_occurence)


    split_index=int(len(seq_dataset)*wandb.config.split_train_ratio)
    print(split_index, len(seq_dataset))

    train_xy=seq_dataset[:split_index]
    np.random.shuffle(train_xy)
    X_train=train_xy[:,:,0:-1]
    y_train=train_xy[:,:,-1]

    test_xy=seq_dataset[split_index:]
    np.random.shuffle(test_xy)
    X_test=test_xy[:,:,0:-1]
    y_test=test_xy[:,:,-1]

    print(X_train.shape, X_test.shape, y_train.shape, y_test.shape,'\n\n')

    model = keras.Sequential([
        keras.Input(shape=(seq_length, 25)),
        keras.layers.GRU(wandb.config.unit_gru0, return_sequences=True),
        keras.layers.GRU(wandb.config.unit_gru1),
        keras.layers.Dense(8, activation="sigmoid"),
    ]
    )


    # In[95]:
    optimizer=keras.optimizers.Adam(learning_rate=wandb.config.learning_rate)

    model.compile(optimizer=optimizer, loss="binary_crossentropy")

    # In[97]:


    # In[98]:


    model.fit(X_train, y_train,
            batch_size=wandb.config.batch_size,
            epochs=wandb.config.epochs, 
            validation_data=(X_test, y_test),
            callbacks=[WandbCallback(training_data = (X_train, y_train),
                                     validation_data = (X_test, y_test),
                                     log_weights = True,
                                     log_gradients = True), MySeqAccCallback()],)
    y_pred=self.model.predict(X_test)
    wandb.log({"Test Accuracy Rate:":seq_acc(y_test, y_pred)})

In [7]:
wandb.login()

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin


True

In [8]:
# yaml 파일로 만들어도 됩니다.

sweep_config = {
    "name": "sweep",
    "metric": {
        "goal": "minimize",
        "name": "val_loss"
    },
    "method": "random",
    "parameters": {
        "split_train_ratio":{'max':0.8, 'min':0.5},
        "epochs":{
            "values":[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20]
        },
        "learning_rate":{'max':0.01, 'min':0.0001}
    }
    }

In [9]:
sweep_id = wandb.sweep(sweep_config)

# run the sweep
wandb.agent(sweep_id,
            function=train,
            entity = 'chhyyi',
            project = 'redzone_gru')

404 response executing GraphQL.
{"errors":[{"message":"could not find sweep chhyyi/redzone_gru/bgutqso5 during createAgent","path":["createAgent"]}],"data":{"createAgent":null}}
[34m[1mwandb[0m: [32m[41mERROR[0m Error while calling W&B API: could not find sweep chhyyi/redzone_gru/bgutqso5 during createAgent (<Response [404]>)


Create sweep with ID: bgutqso5
Sweep URL: https://wandb.ai/chhyyi/uncategorized/sweeps/bgutqso5


CommError: could not find sweep chhyyi/redzone_gru/bgutqso5 during createAgent

In [None]:
wandb.finish()

# GRU 써보기    
이 노트북은 [Sensory_LSTM 노트북](https://github.com/chhyyi/aiffel/blob/main/aiffelthon/Sensory_LSTM.ipynb) 의 복제입니다.

## 작업 로그

### 2022-11-12
- 지난번에 하려고 했던, wandb를 이용해서 sweep을 해봅니다. wandb에 의한 sweep은 python module로 저장하고 임포트 하는 것이 더 적당해 보입니다. 이 부분은 천천히 생각합니다. 아무튼 그 경우, '\*.py' 파일을 내보내고 여기에서 wandb sweep 을 실행하도록 합니다.  

### 2022-11-10
- 원본의 LSTM을 GRU로 바꾸고, wandb를 써보려고 했는데 그냥 한 번 했습니다. 
- df_merged를 csv로 저장하고, 시퀀스 인덱스 생성 이전의 부분은 모두 삭제합니다. csv파일을 읽는 부분부터 씁니다.

In [None]:
import pandas as pd
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt
import numpy as np
import wandb
from wandb.keras import WandbCallback

In [None]:
wandb.login()

In [None]:
run = wandb.init(project = 'redzone_gru',
                 entity = 'chhyyi',
                 config = {
                     'seq_field':72,
                     'stride_inside_seq':9,
                     'stride_between_seqs':2,
                     'learning_rate':0.01,
                     'loss':'sparse_categorical_crossentropy',
                     'metrics':['accuracy'],
                     'epochs':20,
                     'batch_size':64,
                     'callbacks'=[MySeqAccCallback()]
                 })

In [None]:
locations=['거문도', '울산', '거제도', '통영', '추자도']

In [None]:
# load normalized data

df_merged=pd.read_csv("sensory_preprocessed_df.csv")
df_merged

In [None]:
# drop surplus columns, if required
df_merged = df_merged.iloc[:, 1:]



## 시퀀스로 전환
얼마의 길이로 자를 것인가 하는게 중요한 hyper-parameter가 될 수도 있겠습니다. 1시간이 1timestep입니다. 만약 다음과 같이 한다면 sequence의 구조는 이렇게 됩니다.  
- 한 시퀀스의 field: 72
- window size: 1
- stride between elements of sequence: 6 
- stride between sequences: 7
- sequence length $\frac{72}{6}=12$  
- 
그러면 1번째 시퀀스는 timestep=0, 2번째 시퀀스는 timtestep=7에서 시작합니다. 그렇게 하면 시퀀스의 배치가 이렇게 됩니다.
|timestep|0|1|2|3|4|5|6|7|8|9|1|11|12|13|14|15|16|17|18|19|20|21|22|23|24|
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|-|
|seq\[0\]|1||||||2||||||3||||||4||||||5|
|seq\[1\]||||||||1||||||2||||||3||||||
|seq\[2\]|||||||||||||||1||||||2|||||

또한, 전체 시퀀스가 시작되거나 끝나는 부분이 아닌 곳에서는 하나의 데이터 포인트가 두 개의 시퀀스에 포함되게 됩니다.

## 모델의 입력과 출력
입력은 일단 5개 지점의 5개 데이터를 통합, 인코더의 embedding size가 $5\times5=25$인 모델로 작동합니다.

In [None]:
wandb.config.seq_field
wandb.config.stride_inside_seq
wandb.config.stride_between_seqs
seq_length=seq_field//stride
len_ds=len(df_merged)

seqs_idx=[]

start_idx=0
while start_idx<=len_ds-wandb.config.seq_field:
    seqs_idx.append(list(range(start_idx, start_idx + wandb.config.seq_field, wandb.config.stride_inside_seq
)))
    start_idx+=wandb.config.stride_between_seqs

생성된 seqs_idx를 이미지로 바꿔서 한번 살펴봅니다.

In [None]:
seqs_idx[100],len(seqs_idx[100])

In [None]:
img_size_x = 263 #usually, match with seq_idx[-1][-1]
img_size_y = 100 #number of sequence to visuallize
seq_length_squeeze_ratio = 1

img = np.zeros([img_size_y, img_size_x])

for yidx, seq in enumerate(seqs_idx[:img_size_y]):
    #for i in range(seq[0]//seq_length_squeeze_ratio,seq[-1]//seq_length_squeeze_ratio + 1):
    for i in seq:
        if i<img_size_x:
            img[yidx, i]=1.0

plt.figure(figsize=(12,4))
plt.xticks(ticks=list(range(0,img_size_x,seq_field)))
plt.yticks(ticks=list(range(0,img_size_y,10)))

plt.grid(alpha=0.5)
plt.imshow(img)
plt.colorbar()
#plt.colorbar(location='bottom')
plt.show()

figure는 50행(수직축)까지 시퀀스별 time-index(수평축)를 점으로 표시합니다.

## train, test set 구성
먼저 index를 리셋하고 시작합니다..

In [None]:

df_merged.reset_index(inplace=True, drop=True)
#ds0=ds0[df_default_index]
print(df_merged.isna().any().any())
df_merged

In [None]:
df_merged.columns

In [None]:
#train_cols=['풍속(m/s)', '풍향(deg)', '기온(°C)', '수온(°C)', '강수량(mm)', '적조발생']
ds_train_cols=df_merged
ds_train_cols.reset_index(inplace=True, drop=True)
ds_train_cols

In [10]:
seq_dataset=np.zeros([len(seqs_idx), len(seqs_idx[0]), len(ds_train_cols.columns)])
seq_dataset.shape

NameError: name 'seqs_idx' is not defined

In [11]:
for i, seq in enumerate(seqs_idx):
    for j, row_number in enumerate(seq):
        seq_dataset[i, j]=ds_train_cols.loc[row_number].to_numpy()

seq_dataset[:2]

NameError: name 'seqs_idx' is not defined

In [12]:
ds_train_cols.loc[0].to_numpy()

NameError: name 'ds_train_cols' is not defined

### label 예외처리: 
이유를 찾지 못했는데 라벨에 0, 1아닌 값들이 있어 임시로 씁니다.

In [13]:
def not_bin_in_occurence(x):
    if x==1 or x==0:
        return x
    else:
        print('exceptional value(not 0 or 1) found. replaced by near one.')
        if x>=0.5:
            return 1
        else:
            return 0
        
ds_train_cols['적조발생']=ds_train_cols['적조발생'].apply(not_bin_in_occurence)

NameError: name 'ds_train_cols' is not defined

### train - test 셋 분리 :
특정 연도들에 대한 과적합을 없애기 위해 test와 train을 섞지 않고 시계열에서 일단 나눕니다. 그 이후 shuffle합니다.  
셔플 참고자료(stack overflow) https://stackoverflow.com/questions/35646908/numpy-shuffle-multidimensional-array-by-row-only-keep-column-order-unchanged

In [14]:
#from sklearn.model_selection import train_test_split
#X_train, X_test, y_train, y_test = train_test_split(seq_dataset[:,:,0:-1], seq_dataset[:,:,-1], test_size=0.20, random_state=42)

#1. 8:2 split
split_index=int(len(seq_dataset)*0.8)
print(split_index, len(seq_dataset))

#2. 5:5 split
#split_index=int(len(ds_train_cols)*0.5)
train_xy=seq_dataset[:split_index]
np.random.shuffle(train_xy)
X_train=train_xy[:,:,0:-1]
y_train=train_xy[:,:,-1]

test_xy=seq_dataset[split_index:]
np.random.shuffle(test_xy)
X_test=test_xy[:,:,0:-1]
y_test=test_xy[:,:,-1]

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape,'\n\n')


NameError: name 'seq_dataset' is not defined

In [15]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape,'\n\n')
print('print data for one sequence')

NameError: name 'X_train' is not defined

# model & train
간단한 GRU 모델을 훈련해봅니다.

In [16]:
from tensorflow import keras
from tensorflow.keras import layers


In [17]:
#copied from:
#https://www.dacon.io/competitions/official/235584/codeshare/738
def seq_acc(y_true, y_pred):
    y_bin=np.zeros_like(y_pred)
    for i, dd in enumerate(y_bin):
        for j in range(len(dd)):
            pred=y_pred[i][j]
            if pred>=0.5:
                y_bin[i][j]=1
            else:
                y_bin[i][j]=0
            
    predict_true = (y_true == y_bin)
    # multi-column일 경우에도 계산 할 수 있도록 np.average를 한번 더 씌움
    try:
        score = np.average(np.average(predict_true))
    except ValueError:
        score = mean_squared_error(y_true, y_bin)
    return score

def my_seq_acc(y_true, y_pred):
    score = tf.py_function(func=seq_acc, inp=[y_true, y_pred], Tout=tf.float32,  name='custom_seq_acc') # tf 2.x
    #score = tf.py_func( lambda y_true, y_pred : mse_AIFrenz(y_true, y_pred) , [y_true, y_pred], 'float32', stateful = False, name = 'custom_mse' ) # tf 1.x
    return score


In [18]:

model = keras.Sequential([
    keras.Input(shape=(8, 25)),
    layers.GRU(64, return_sequences=True),
    layers.GRU(64),
    layers.Dense(8, activation="sigmoid"),
]
)

In [19]:
model.compile(optimizer="adam", loss="binary_crossentropy")

In [20]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gru (GRU)                    (None, 8, 64)             17472     
_________________________________________________________________
gru_1 (GRU)                  (None, 64)                24960     
_________________________________________________________________
dense (Dense)                (None, 8)                 520       
Total params: 42,952
Trainable params: 42,952
Non-trainable params: 0
_________________________________________________________________


In [21]:
class MySeqAccCallback(keras.callbacks.Callback):
    def on_epoch_end(self, epochs, logs=None):
        y_pred=self.model.predict(X_test)
        print('sequence accuracy is {}'.format(seq_acc(y_test, y_pred)))

In [22]:
history = model.fit(X_train, y_train, 
                    batch_size=32, 
                    epochs=10, 
                    validation_data=(X_test, y_test),
                    callbacks=[MySeqAccCallback()],
                   )

NameError: name 'X_train' is not defined

In [23]:
#print(history.history['loss'])
#print(history.history['val_loss'])

plt.plot(history.history['loss'], label='loss')
plt.plot(history.history['val_loss'], label='val_loss')
plt.legend()
plt.show()

NameError: name 'history' is not defined

In [24]:
y_pred=model.predict(X_test)
print(seq_acc(y_test, y_pred))

NameError: name 'X_test' is not defined

In [None]:
del model

# WandB 써보기  
WandB는 hyper parameter search을 도와주는 라이브러리입니다.

In [None]:
def train():
    import pandas as pd
    import tensorflow as tf
    import matplotlib.pyplot as plt
    import numpy as np

    import wandb
    from wandb.keras import WandbCallback
    default_config = {
                     'num_filter_conv2D_1':30,
                     'num_filter_conv2D_2':30,
                     'kernel_size_conv2D_1':3,
                     'kernel_size_conv2D_2':3,
                     'pool_size_max_pool0':2,
                     'pool_size_max_pool1':2,
                     'num_unit_dense_1':128,
                     'num_unit_dense_output':10,
                     'input_shape':(28, 28, 1),
                     'learning_rate':0.01,
                     'loss':'sparse_categorical_crossentropy',
                     'metrics':['accuracy'],
                     'epochs':20,
                     'batch_size':64,
                 }
    wandb.init(config = default_config)