In [2]:
from __future__ import absolute_import, division, print_function

import numpy as np
%load_ext autoreload
%autoreload 2

import sys, os
script_dir  = os.path.normpath(os.path.abspath("."))
root_dir    = os.path.normpath(os.path.abspath(script_dir + "/../.."))
if root_dir in sys.path: sys.path.remove(root_dir)
sys.path.insert(1, root_dir)

import warnings
warnings.filterwarnings("ignore")

In [61]:
import glob, pandas as pd
import tqdm, numpy as np
from IPython import display
import cv2
import matplotlib.pyplot as plt
import seaborn as sns
from pytorch_tabnet.tab_model import TabNetRegressor
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error

In [4]:
data_root = f'{root_dir}/data/AI_competition'
prep_root = f'{data_root}/preprocessed'
df_data   = pd.read_excel(f'{prep_root}/final_info.xlsx')

In [23]:
df_data_day = df_data.groupby(['cdate', 'fplace']).mean().reset_index()

In [24]:
train_data = df_data_day.query('fplace=="고성"')[["Temperatue","DO","pH","salinity","NTU","ncells"]].values
test_data  = df_data_day.query('fplace=="일해"')[["Temperatue","DO","pH","salinity","NTU","ncells"]].values

In [59]:
x_train, y_train = train_data[:, :-1], train_data[:, -1]
x_test, y_test = test_data[:, :-1], test_data[:, -1]

y_test  = np.floor(y_test)
y_train = np.floor(y_train)

print(f'train: {x_train.shape}, {y_train.shape}')
print(f'test:  {x_test.shape},  {y_test.shape}')

standard_scaler = StandardScaler()
standard_scaler.fit(x_train)

x_train = standard_scaler.transform(x_train)
x_test = standard_scaler.transform(x_test)

y_train = y_train.reshape(-1, 1)
y_test = y_test.reshape(-1, 1)

train: (21, 5), (21,)
test:  (21, 5),  (21,)


In [60]:
cv = KFold(n_splits=10, random_state=42, shuffle=True)
predictions_array =[]
CV_score_array    =[]
CV_mape_array     =[]
CV_mse_array     =[]
for train_index, test_index in cv.split(x_train):
    x1_train, x1_valid = x_train[train_index], x_train[test_index]
    y1_train, y1_valid = y_train[train_index], y_train[test_index]
    regressor = TabNetRegressor(verbose=0,seed=42)
    regressor.fit(X_train=x1_train, y_train=y1_train,
              eval_set=[(x1_valid, y1_valid)],
              patience=300, max_epochs=2000,
              eval_metric=['rmse'])
    CV_score_array.append(regressor.best_cost)
    y1_pred_valid = regressor.predict(x1_valid)
    CV_mape_array.append(mean_absolute_percentage_error(y1_valid, y1_pred_valid))
    CV_mse_array.append(mean_squared_error(y1_valid, y1_pred_valid))



Early stopping occurred at epoch 356 with best_epoch = 56 and best_val_0_rmse = 39.73295
Best weights from best epoch are automatically used!

Early stopping occurred at epoch 370 with best_epoch = 70 and best_val_0_rmse = 9.3619
Best weights from best epoch are automatically used!

Early stopping occurred at epoch 460 with best_epoch = 160 and best_val_0_rmse = 21.33043
Best weights from best epoch are automatically used!

Early stopping occurred at epoch 409 with best_epoch = 109 and best_val_0_rmse = 12.24109
Best weights from best epoch are automatically used!

Early stopping occurred at epoch 422 with best_epoch = 122 and best_val_0_rmse = 8.68239
Best weights from best epoch are automatically used!

Early stopping occurred at epoch 339 with best_epoch = 39 and best_val_0_rmse = 44.1936
Best weights from best epoch are automatically used!

Early stopping occurred at epoch 429 with best_epoch = 129 and best_val_0_rmse = 43.4636
Best weights from best epoch are automatically used!


In [62]:
np.mean(CV_mape_array), np.mean(CV_mse_array)

(0.27807645453373714, 1056.7656595174326)

In [63]:
x_train, y_train = train_data[:, :-1], train_data[:, -1]
x_test, y_test = test_data[:, :-1], test_data[:, -1]

y_test  = np.floor(y_test)
y_train = np.floor(y_train)

print(f'train: {x_train.shape}, {y_train.shape}')
print(f'test:  {x_test.shape},  {y_test.shape}')

standard_scaler = StandardScaler()
standard_scaler.fit(x_test)

x_train = standard_scaler.transform(x_train)
x_test = standard_scaler.transform(x_test)

y_train = y_train.reshape(-1, 1)
y_test = y_test.reshape(-1, 1)

cv = KFold(n_splits=10, random_state=42, shuffle=True)
predictions_array =[]
CV_score_array    =[]
CV_mape_array     =[]
CV_mse_array      =[]
for train_index, test_index in cv.split(x_test):
    x1_train, x1_valid = x_test[train_index], x_test[test_index]
    y1_train, y1_valid = y_test[train_index], y_test[test_index]
    regressor = TabNetRegressor(verbose=0,seed=42)
    regressor.fit(X_train=x1_train, y_train=y1_train,
              eval_set=[(x1_valid, y1_valid)],
              patience=300, max_epochs=2000,
              eval_metric=['rmse'])
    CV_score_array.append(regressor.best_cost)
    y1_pred_valid = regressor.predict(x1_valid)
    CV_mape_array.append(mean_absolute_percentage_error(y1_valid, y1_pred_valid))
    CV_mse_array.append(mean_squared_error(y1_valid, y1_pred_valid))

train: (21, 5), (21,)
test:  (21, 5),  (21,)

Early stopping occurred at epoch 445 with best_epoch = 145 and best_val_0_rmse = 64.64029
Best weights from best epoch are automatically used!

Early stopping occurred at epoch 382 with best_epoch = 82 and best_val_0_rmse = 22.22554
Best weights from best epoch are automatically used!

Early stopping occurred at epoch 698 with best_epoch = 398 and best_val_0_rmse = 22.58709
Best weights from best epoch are automatically used!

Early stopping occurred at epoch 396 with best_epoch = 96 and best_val_0_rmse = 116.54479
Best weights from best epoch are automatically used!

Early stopping occurred at epoch 363 with best_epoch = 63 and best_val_0_rmse = 19.0457
Best weights from best epoch are automatically used!

Early stopping occurred at epoch 530 with best_epoch = 230 and best_val_0_rmse = 64.87496
Best weights from best epoch are automatically used!

Early stopping occurred at epoch 365 with best_epoch = 65 and best_val_0_rmse = 30.97446
Best

In [64]:
np.mean(CV_mape_array), np.mean(CV_mse_array)

(0.3970292661155891, 2834.1863348215556)