In [None]:
'''
Get resources - make the workshop code importable and clone the repository when no local copy is found
'''
import os
import subprocess
import sys

# Single place to edit: local folder that holds the workshop code.
REPO_DIR = "/Users/admin/Desktop/jasmine/lakes21_parquet/Eco_KGML_workshop"   #change path to local file
REPO_URL = "https://github.com/jasminehyu/Machine_Learning_Model_for_Predicting_Dissolved_Oxygen_in_Lakes.git"

# Position 1 keeps the notebook's own directory (position 0) first on the search path.
sys.path.insert(1, REPO_DIR)

if os.path.exists(REPO_DIR):
    print("Directory already exists.")
else:
    # NOTE(review): with no explicit target, git clones into a folder named after
    # the repository under the current working directory, not into REPO_DIR.
    # Point REPO_DIR at the clone afterwards if the imports below cannot be resolved.
    try:
        exit_code = subprocess.run(["git", "clone", REPO_URL]).returncode
    except OSError as err:
        # Raised when the git executable itself cannot be started.
        exit_code = None
        print(f"Could not run git: {err}")

    # Report success only when git actually succeeded; the previous version
    # printed the success message regardless of the outcome.
    if exit_code == 0:
        print("Repository cloned.")
    elif exit_code is not None:
        print(f"git clone failed with exit code {exit_code}.")

In [None]:
'''
Import necessary modules
'''
import random
import pandas as pd
import numpy as np
from tqdm import trange
import datetime
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
from torch import optim
from utils import  run_all, Utils
from encoder_decoder import seq2seq

import warnings
# No category or module filter is given, so every warning raised after this
# point is suppressed for the rest of the session.
# NOTE(review): this also hides warnings that would point at real problems in
# later cells; consider narrowing the filter once the notebook is stable.
warnings.filterwarnings('ignore')

In [None]:
'''
Set up environment - use the GPU when one is available, otherwise fall back to the CPU
'''
use_gpu = torch.cuda.is_available()
device = torch.device("cuda" if use_gpu else 'cpu')

if not use_gpu:
    # Training still works on the CPU; the message only tells the user how to get a GPU runtime.
    print("WARNING: For this notebook to perform best, "
          "if possible, in the menu under Runtime -> "
          "Change runtime type. select GPU.")
else:
    print('Computational device:GPU')

In [None]:
'''
Set up environment - keep runs reproducible by seeding every source of randomness
'''
seed = 2025

# Python's built-in RNG is imported by this notebook but was previously left unseeded.
random.seed(seed)
np.random.seed(seed)

# torch.manual_seed seeds the default generator; manual_seed_all covers every
# CUDA device and is silently ignored when CUDA is unavailable, which makes a
# separate torch.cuda.manual_seed call redundant.
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

# Trade cuDNN auto-tuning for deterministic kernel selection.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [None]:
'''
Define the hyperparameters that describe the model and how it is trained
'''
# ---- Model architecture ----
model_type = 'LSTM'            # type of the model
output_size = 1                # output size of the encoder-decoder model (number of target variables)
num_layer = 1                  # number of layers in the deep learning model
hidden_feature_size = 64       # hidden cell size
dropout = 0.05                 # dropout, a form of regularization

# ---- Batching and evaluation ----
batch_shuffle = True           # shuffle the batches while generating the training batches
eval_freq = 1                  # evaluate whenever iteration_num % eval_freq == 0

# ---- Learning rate scheduler parameters ----
# NOTE(review): the names mirror the arguments of a one-cycle learning-rate
# schedule; confirm in train_model how they are consumed.
max_lr = 5e-4
div_factor = 100
pct_start = 0.05
anneal_strategy = 'cos'
final_div_factor = 10000.0

# ---- Early stopping ----
early_stop = False
thres = 5                      # stop when there is no improvement for this many epochs
delta = 0.5                    # "no improvement" means loss > min_val_loss_so_far + delta

# ---- Decoding strategy during training ----
teacher_forcing_ratio = 0.0    # share of the time teacher forcing is enforced
training_prediction = 'recursive'

In [None]:
'''
Read the file with Lake Mendota's data and split it by year into training and testing frames
'''
file_name = 'lake_me_year.csv'
me_df = pd.read_csv(file_name)

# Rows from 2019 and 2020 are used for model fitting.
is_training_year = me_df['year'].isin([2019, 2020])

# .copy() gives each split its own data, so later column assignments do not
# operate on a slice of me_df.
me_df_training = me_df[is_training_year].copy()

# NOTE(review): the original cell referenced me_df_testing without ever
# assigning it, which raised a NameError. Every row outside the training years
# is used as the hold-out set here - confirm this is the intended test period.
me_df_testing = me_df[~is_training_year].copy()
'''
An adjustment for datetimes at 12:00 am, which appear as a bare date without a time part
'''
def add_missing_time(datetime):
    """Append a midnight time to a date-only string.

    Parameters
    ----------
    datetime : str
        Timestamp text. A 10-character value is treated as a date that is
        missing its time component. The parameter name is kept for
        compatibility; inside this function it shadows the imported
        ``datetime`` module.

    Returns
    -------
    The input with ``' 00:00:00'`` appended when it is a 10-character string;
    any other value (longer strings, NaN, None) is returned unchanged.
    """
    # Missing values arrive as float NaN or None when this is applied over a
    # column; len() would raise on those, so non-strings pass through untouched.
    if isinstance(datetime, str) and len(datetime) == 10:
        return datetime + ' 00:00:00'
    return datetime

# Apply the midnight fix to every frame that is later keyed on 'datetime', so
# the raw data, the training split and the testing split share one timestamp
# format (previously only the training split was patched, while me_df is the
# frame merged on 'datetime' further down).
me_df['datetime'] = me_df['datetime'].apply(add_missing_time)

# assign() returns a new frame instead of writing into a filtered slice.
me_df_training = me_df_training.assign(datetime=me_df_training['datetime'].apply(add_missing_time))
me_df_testing = me_df_testing.assign(datetime=me_df_testing['datetime'].apply(add_missing_time))

# Preview a few rows rather than rendering the whole frame.
me_df_training.head()

In [None]:
'''
Keep only the columns needed for training and testing
'''
# One shared list so both frames are guaranteed to end up with the same columns.
selected_columns = ["datetime","date","time","do","temp","tp","tn","do_lf","secchi"]

me_df_training = me_df_training.loc[:, selected_columns]
me_df_testing = me_df_testing.loc[:, selected_columns]

me_df_training.columns

In [None]:
'''
Name the columns by role: model inputs, time stamps, and the prediction target
'''
# Input variables fed to the model
feature_cols = ['temp', 'tp', 'tn', 'do_lf', 'secchi']

# Columns that carry time information
date_col = ['datetime', 'date', 'time']

# Variable the model predicts
target_col = ['do']

In [None]:
'''
Create the utility object that the preprocessing, plotting and model cells share
'''
utils = Utils(
    num_features=len(feature_cols),
    inp_cols=feature_cols,
    target_cols=target_col,
    date_col=date_col,
    num_out_features=output_size,
    device=device,
)

In [None]:
'''
Split the training-period data into a training part and a validation part by ratio
'''
# NOTE(review): presumably the share of rows that goes to the first (training)
# part - confirm against Utils.train_test_split.
split_ratio = 0.6

df_train, df_val = utils.train_test_split(me_df_training, split_ratio=split_ratio)

df_train

In [None]:
'''
Normalize the training, validation and testing data
'''
# The training split is normalized first, without use_stat.
df_train = utils.normalize(df_train)

# NOTE(review): use_stat=True presumably reuses the statistics computed on the
# training split above - confirm against Utils.normalize.
df_val = utils.normalize(df_val, use_stat=True)
df_test = utils.normalize(me_df_testing, use_stat=True)

df_test

In [None]:
'''
Generate the data samples with a sliding window
'''
# Lookback window: number of past steps given to the model
input_window = 14
# Horizon window: number of future steps predicted per sample
output_window = 7
# Number of steps the sliding window advances between consecutive samples
stride = 1

# 1 sample = lookback window + horizon window; the same settings are used for every split
window_settings = (input_window, output_window, stride)
x_train, y_train = utils.windowed_dataset(df_train, *window_settings)
x_val, y_val = utils.windowed_dataset(df_val, *window_settings)
x_test, y_test = utils.windowed_dataset(df_test, *window_settings)

In [None]:
'''
Modeling - define the hyperparameters for model training
'''
batch_size = 32          # batch size during training
epochs = 100             # 1 epoch = 1 pass of the complete training data through the model
learning_rate = 0.0001   # rate at which model parameters are updated after every training pass
weight_decay = 0.05      # amount of L2 regularization to apply

In [None]:
'''
Collect every setting defined above into the config dictionary passed to training
'''
config = dict(
    # training loop
    batch_size=batch_size,
    epochs=epochs,
    learning_rate=learning_rate,
    eval_freq=eval_freq,
    batch_shuffle=batch_shuffle,
    # model
    dropout=dropout,
    num_layers=num_layer,
    hidden_feature_size=hidden_feature_size,
    model_type=model_type,
    teacher_forcing_ratio=teacher_forcing_ratio,
    # learning rate scheduler
    max_lr=max_lr,
    div_factor=div_factor,
    pct_start=pct_start,
    anneal_strategy=anneal_strategy,
    final_div_factor=final_div_factor,
    # data
    dataset=file_name,
    split_ratio=split_ratio,
    input_window=input_window,
    output_window=output_window,
    # early stopping and regularization
    early_stop_thres=thres,
    early_stop_delta=delta,
    early_stop=early_stop,
    weight_decay=weight_decay,
)

In [None]:
'''
Create the seq2seq model
'''
model_kwargs = {
    # Size of the third axis of x_train.
    # NOTE(review): presumably the number of input features per step - confirm
    # against Utils.windowed_dataset.
    "input_size": x_train.shape[2],
    "hidden_size": hidden_feature_size,
    "output_size": output_size,
    "model_type": model_type,
    "num_layers": num_layer,
    "utils": utils,
    "dropout": dropout,
    "device": device,
}
model = seq2seq(**model_kwargs)

In [None]:
'''
Train the model on the training samples, passing the validation samples alongside
'''
training_history = model.train_model(
    x_train, y_train, x_val, y_val,
    target_len=output_window,
    config=config,
    training_prediction=training_prediction,
)

# train_model hands back three values, unpacked here in the order it returns them.
loss, val_rmse, train_rmse = training_history

In [None]:
'''
Plot the training and validation Root Mean Squared Error (RMSE)
'''
# Validation values are passed first, training values second.
utils.plot_RMSE_epochs(val_rmse, train_rmse)

In [None]:
'''
Evaluation - evaluate the trained model on the training, validation and testing samples
'''
def _evaluate_split(inputs, targets):
    """Move one split's inputs and targets to `device` and evaluate the model on them."""
    return model.evaluate_batch(inputs.to(device), targets.to(device))


train_eval_metrics = _evaluate_split(x_train, y_train)
val_eval_metrics = _evaluate_split(x_val, y_val)
test_eval_metrics = _evaluate_split(x_test, y_test)

In [None]:
'''
Visualization - plot the predictions on the test data
'''
# List of T+n horizons to plot; [1] selects the T+1 predictions
horizon_range = [1]

# Plots the T+1 predictions against the observed dissolved oxygen and returns a
# dataset with the timeline as label and the T+1 predictions as one column.
plot_df_test = utils.plot_predictions(df_test, test_eval_metrics, horizon_range, split='Test')

In [None]:
'''
Convert the timeline index into a regular column named 'datetime'
'''
# Guarded so that re-running this cell does not add a second 'datetime' column:
# after the first run the frame has a fresh integer index, and resetting it
# again would create another 'index' column that gets renamed to 'datetime'.
if 'datetime' not in plot_df_test.columns:
    # Naming the index before resetting it yields a 'datetime' column whether or
    # not the index already carried a name (a plain rename of 'index' would
    # silently do nothing for a named index).
    plot_df_test = plot_df_test.rename_axis('datetime').reset_index()

In [None]:
'''
Merge plot_df_test (the dataset with predictions) with the actual observations
'''
# Left join: every prediction row is kept, and observation columns from me_df
# are attached where the 'datetime' values match.
merged_df_test = plot_df_test.merge(me_df, how='left', on='datetime')

merged_df_test

In [None]:
'''
Save the trained model's weights
'''
MODEL_PATH_SAVE = "./current_model_weights"

# Only the state dict is written, not the model object itself.
model_weights = model.state_dict()
torch.save(model_weights, MODEL_PATH_SAVE)