In [None]:
## Necessary dependencies
from glob2 import glob
import numpy as np
import math
import numpy as np
import pandas as pd
import argparse

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

from utils import get_dataset_from_csv
from IPython import embed

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support


import matplotlib.pyplot as plt

In [None]:
# Directory whose direct children are globbed as saved models and later
# passed to keras.models.load_model.
ARG_MODELS_PATH = "models"

In [None]:
# Collect every entry directly under the models directory and print it
# next to its list position.
# NOTE(review): the order is whatever glob returns; a later cell indexes
# this list by a fixed position, so the order matters.
models = glob(f"{ARG_MODELS_PATH}/*")
print("Founded models:")
for i, model in enumerate(models):
    print(f"{i} {model}")

In [None]:
# CSV inputs, relative to the notebook's working directory.
# ARG_TEST_DATA_ORIG is defined for completeness; no visible cell reads it.
ARG_TRAIN_DATA_ORIG = "train_reg_data_orig.csv"
ARG_TRAIN_DATA_PATH = "train_reg_data.csv"
ARG_TEST_DATA_ORIG = "test_reg_data_orig.csv"
ARG_TEST_DATA_PATH = "test_reg_data.csv"

In [None]:
# Load the three CSV files into DataFrames (same read order as before:
# original train, train, test) and preview the test split.
train_data_orig, train_data, test_data = map(
    pd.read_csv,
    (ARG_TRAIN_DATA_ORIG, ARG_TRAIN_DATA_PATH, ARG_TEST_DATA_PATH),
)
test_data.head()

In [None]:
# Column names handed to get_dataset_from_csv; the last entry is the
# regression target.
CSV_HEADER = [
    "B2", "B3", "B4", "B5", "B6", "B7", "B8A",
    "B5_B4", "B3_B2", "B3_B4", "B4_B5_B6",
    "avw", "B3_B5", "diff_alg",
    # Currently excluded: 'Area_km2', 'Shoreline_development', 'Type'
    "chla_ug_L",
]

In [None]:
batch_size = 1000

# Both splits are built with identical arguments apart from the CSV path;
# the test split is constructed first, then the train split.
eval_dataset, train_dataset = (
    get_dataset_from_csv(csv_path, CSV_HEADER, "chla_ug_L", batch_size=batch_size)
    for csv_path in (ARG_TEST_DATA_PATH, ARG_TRAIN_DATA_PATH)
)


In [None]:
from tqdm import tqdm
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_log_error


def MAPE(y_true, y_pred):
    """Return the mean absolute percentage error, in percent.

    Computes ``mean(|y_true - y_pred| / |y_true|) * 100``.

    Args:
        y_true: Array-like of reference values.
        y_pred: Array-like of predictions with the same number of elements.

    Returns:
        The error as a percentage (e.g. ``10.0`` for 10 %).

    Note:
        Zeros in ``y_true`` are divided by as-is, so the result is then
        ``inf`` or ``nan``.
    """
    y_true, y_pred = np.asarray(y_true), np.asarray(y_pred)
    # A (n,) target against a (n, 1) prediction (or vice versa) would
    # broadcast to an (n, n) matrix and average over the wrong pairs, so
    # align the prediction with the target when the element counts match.
    if y_true.shape != y_pred.shape and y_true.size == y_pred.size:
        y_pred = y_pred.reshape(y_true.shape)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

def ReLU(x):
    """Zero out the non-positive part of ``x``.

    Multiplies ``x`` by the boolean mask ``x > 0``, so positive values pass
    through unchanged and everything else becomes zero. ``x`` must support
    ``>`` and ``*`` (a number or a NumPy array).
    """
    positive_mask = x > 0
    return x * positive_mask

In [None]:
# Evaluate each saved model on the train and test splits, collecting one
# entry per model in the parallel lists below.

# Plot/ROC leftovers: assigned here but not read by any cell in view.
lw = 2

fpr = dict()
tpr = dict()
roc_auc = dict()

model_list = []
mse_list = []
mse_train_list = []

mae_list = []
rmse_list = []
r2_list = []
msle_list = []
mape_list = []

n_param_list = []

# Passing the list itself (not enumerate(...)) lets tqdm display the total.
for model in tqdm(models):
    # Name = last path component up to the first "-". Using the last
    # component (rather than index 1) stays correct when ARG_MODELS_PATH
    # has more than one directory level or a trailing separator.
    model_name = model.replace("\\", "/").rstrip("/").split("/")[-1].split("-")[0]

    print("eval model: {}".format(model))
    m0 = keras.models.load_model(model)
    n_param = m0.count_params()

    # NOTE(review): scoring predictions against the CSV column assumes
    # get_dataset_from_csv yields rows in file order (no shuffling) — confirm.
    # ReLU zeroes negative predictions before they are scored.
    y_pred_train = ReLU(m0.predict(train_dataset))
    y_true_train = train_data["chla_ug_L"]

    mse_train = mean_squared_error(y_true_train, y_pred_train)

    y_pred = ReLU(m0.predict(eval_dataset))
    y_true = test_data["chla_ug_L"]

    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    # RMSE is the square root of the MSE. The previous call passed
    # squared=True, which simply returned the MSE a second time.
    rmse = float(np.sqrt(mse))

    # MAPE() returns a percentage; it is stored here as a fraction.
    mape = MAPE(y_true, y_pred) / 100

    r2 = r2_score(y_true, y_pred)
    # Mean squared log error (the variable name is historical).
    rsle = mean_squared_log_error(y_true, y_pred)

    model_list.append(model_name)
    mse_list.append(mse)
    mse_train_list.append(mse_train)
    mae_list.append(mae)
    rmse_list.append(rmse)
    r2_list.append(r2)
    msle_list.append(rsle)
    mape_list.append(mape)

    n_param_list.append(n_param)


In [None]:
# Position (within the per-model lists) of the lowest test MSE.
idx = np.asarray(mse_list).argmin()
idx

In [None]:
# Per-model overview, ordered by ascending test MSE.
summary_table = pd.DataFrame(
    {
        "Model": model_list,
        "MSE": mse_list,
        "MSE train": mse_train_list,
        "Number of params": n_param_list,
    }
).sort_values("MSE")
summary_table

In [None]:
# Emit the overview as LaTeX source, rounded to three decimals, without the index column.
print(summary_table.round(decimals=3).to_latex(index=False))

In [None]:
# Full metric table for the Keras models. "MAPE" holds a fraction, not a
# percentage, because the evaluation loop divides MAPE() by 100.
test_results = pd.DataFrame(
    {
        "Model": model_list,
        "MAE": mae_list,
        "MSE": mse_list,
        "R2": r2_list,
        "MAPE": mape_list,
        "MSE train": mse_train_list,
        "Number of params": n_param_list,
    }
)

# Displays a sorted copy; test_results itself keeps its original row order.
test_results.sort_values("MAPE")

In [None]:
# Show the path stored at position 89 of `models`.
# NOTE(review): 89 is a hard-coded position. Its meaning depends on the
# order glob returned, and it raises IndexError when fewer than 90 entries
# were found. Presumably it was read off the index of a results table
# above — confirm and derive it from that table instead.
models[89]

In [None]:
# Same table, displayed in ascending MAE order (sorted copy only).
test_results.sort_values("MAE")

In [None]:
import pycaret
from pycaret.regression import *

In [None]:
# Initialise the PyCaret regression experiment on the train frame, with
# the test frame supplied explicitly as the hold-out set and a fixed
# session_id for repeatable runs.
exp_clf101 = setup(
    data=train_data,
    target="chla_ug_L",
    test_data=test_data,
    session_id=123,
)

In [None]:
# Ask PyCaret for up to 19 ranked models. Despite the singular name, the
# scoring cell treats `best_model` as a sequence of models.
best_model = compare_models(n_select=19)

In [None]:
# Score every model returned by compare_models on both splits.
res_list = []   # one pull() metrics frame per model, from the test split
train_acc = []  # train-split MSE per model (name kept: a later cell reads it)

# Iterate over what compare_models actually returned instead of a fixed
# range(19): indexing a shorter list would raise IndexError.
for candidate in best_model:
    # predict_model's return value is not needed here; the metrics of the
    # most recent call are fetched with pull().
    predict_model(candidate, data=test_data)
    res_list.append(pull())

    predict_model(candidate, data=train_data)
    # .iloc[0] takes the single value explicitly; float() on a one-element
    # Series is deprecated in recent pandas.
    train_acc.append(float(pull()["MSE"].iloc[0]))

In [None]:
# Stack the per-model test metrics, attach the train MSE and a placeholder
# parameter count, and keep only the columns used in the final comparison.
# NOTE(review): this rebinds the name `pycaret`, shadowing the module
# imported earlier under the same name.
pycaret = pd.concat(res_list).assign(
    **{"MSE train": train_acc, "Number of params": "-"}
)[["Model", "MAE", "MSE", "R2", "MSE train", "Number of params"]]

In [None]:
# Names of the PyCaret models, as a plain list.
pycaret["Model"].tolist()

In [None]:
# Keras rows first, then the PyCaret rows. The PyCaret frame has no "MAPE"
# column, so those rows get NaN there.
final = pd.concat(objs=(test_results, pycaret))

In [None]:
# Rank all models (Keras and PyCaret) by ascending test MSE.
final = final.sort_values("MSE")
final