# Import Libraries

In [None]:
!pip install pytorch-tabnet

In [None]:
# Mount Google Drive inside the Colab runtime; the paths used later in this notebook live under it.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
import torch
from pytorch_tabnet.tab_model import TabNetRegressor
from pytorch_tabnet.augmentations import RegressionSMOTE

from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np
import random
import os
import warnings
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
from glob import glob

# Silence every warning for a cleaner notebook output.
# NOTE(review): this also hides deprecation and numerical warnings — narrow the filter when debugging.
warnings.filterwarnings('ignore')

# Seed the NumPy, Python and torch global RNGs with the same value.
# The torch RNG was previously left unseeded, although torch.randn(...) is used later in the notebook.
SEED = 530
np.random.seed(SEED)
random.seed(SEED)
torch.manual_seed(SEED)

In [None]:
# Report whether CUDA is usable, then select the first GPU when it is and the CPU otherwise.
has_cuda = torch.cuda.is_available()
print(has_cuda)
device = torch.device('cuda:0' if has_cuda else 'cpu')
print(device)

True
cuda:0


# Load & Split Dataset

In [None]:
# Read the full dataset from Drive and discard every row that contains a missing value.
data_path = "/content/drive/MyDrive/DKU/Ubiosis/total_data.csv"
data_df = pd.read_csv(data_path).dropna(axis=0)

# Show the last three rows as the cell output.
data_df.tail(3)

Unnamed: 0,RADIUS,CIS_1_1,CIS_1_2,CIS_1_3,CIS_1_4,CIS_1_5,CIS_1_6,CIS_1_7,CIS_1_8,CIS_1_9,...,CIS_2_6000,1000,300,150,100,50,10,5,2,1
79333,3.2,596.0,596.0,596.0,596.0,596.0,596.0,596.0,596.0,596.0,...,523.0,3.1,3.7,4.1,4.4,5.1,8.5,11.7,19.5,30.5
79334,3.16,630.0,630.0,630.0,630.0,630.0,630.0,630.0,630.0,630.0,...,517.0,2.6,2.6,2.6,2.7,2.7,3.2,3.5,4.2,5.1
79335,3.16,594.0,594.0,594.0,594.0,595.0,595.0,595.0,595.0,595.0,...,540.0,3.5,3.6,3.9,4.1,4.7,7.7,10.4,17.1,26.5


In [None]:
# Split the frame column-wise by the kind of information each column range holds:
# the first column, two consecutive runs of 6000 columns, and everything after them.
# NOTE(review): the positions assume the first column is the radius (going by the variable
# name) — verify against the CSV header, e.g. that no saved index column precedes it.
radius_end = 1
cis1_end = radius_end + 6000
cis2_end = cis1_end + 6000

radius_df = data_df.iloc[:, :radius_end]
cis1_df = data_df.iloc[:, radius_end:cis1_end]
cis2_df = data_df.iloc[:, cis1_end:cis2_end]
shear_df = data_df.iloc[:, cis2_end:]

In [None]:
# Shear-rate scaling: every shear column is divided by its own factor, matched by column position.
# NOTE(review): shear_df is overwritten, so re-running this cell on its own divides the values again.
scale_list = [10] * 7 + [15, 20]
shear_df = shear_df.div(scale_list, axis="columns")

# Preview the first three scaled rows.
shear_df.head(3)

In [None]:
# Reassemble the modelling table column-wise; the scaled shear columns go last.
# (pd.concat is a function: the frames are passed as one list argument inside parentheses.)
# NOTE(review): ohe_df is not created in any visible cell (presumably a one-hot encoding of
# radius_df) — the cell that builds it has to run before this one; confirm and restore it.
data_df = pd.concat([ohe_df, cis1_df, cis2_df, shear_df], axis=1)

In [None]:
# Hold out 20% of the rows for testing, then 20% of what remains for validation.
# Both splits use the same fixed random_state.
train_full, test = train_test_split(data_df, test_size=0.2, random_state=530)
train, valid = train_test_split(train_full, test_size=0.2, random_state=530)

# The last 9 columns are the targets; every column before them is a model input.
X_train, y_train = train.iloc[:, :-9].to_numpy(), train.iloc[:, -9:].to_numpy()
X_valid, y_valid = valid.iloc[:, :-9].to_numpy(), valid.iloc[:, -9:].to_numpy()
X_test, y_test = test.iloc[:, :-9].to_numpy(), test.iloc[:, -9:].to_numpy()

# TabNet Parameters

In [None]:
# Hyper-parameters of the TabNet regressor, collected in one mapping and unpacked into the constructor.
tabnet_params = dict(
    n_d=64,
    n_a=64,
    n_steps=3,
    gamma=1.3,
    # All three lists are empty: no input column is declared categorical.
    cat_dims=[],
    cat_emb_dim=[],
    cat_idxs=[],
    n_independent=5,
    n_shared=5,
    epsilon=1e-15,
    seed=530,
    momentum=0.4,
    # Adam optimiser whose learning rate is stepped by StepLR with the parameters below.
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2),
    scheduler_fn=torch.optim.lr_scheduler.StepLR,
    scheduler_params={"step_size": 50, "gamma": 0.9},
    mask_type='entmax',  # alternative: 'sparsemax'
    lambda_sparse=1e-3,
)
model = TabNetRegressor(**tabnet_params)

In [None]:
# Use a 2-epoch smoke run when the CI environment variable is set to a non-empty value, else 1000 epochs.
max_epochs = 2 if os.getenv("CI", False) else 1000

In [None]:
# Augmentation object that is handed to model.fit through its `augmentations=` argument.
# NOTE(review): p=0.2 is presumably the probability of applying the augmentation — confirm in the pytorch-tabnet docs.
aug = RegressionSMOTE(p=0.2)

## Model Training

In [None]:
# Train the regressor; metrics are reported on both the training and the validation split.
# NOTE(review): which metric/eval set drives early stopping (patience=50) is decided by
# pytorch-tabnet — presumably the last metric on the last eval set; confirm in its docs.
fit_options = dict(
    X_train=X_train,
    y_train=y_train,
    eval_set=[(X_train, y_train), (X_valid, y_valid)],
    eval_name=['train', 'valid'],
    eval_metric=['mse', 'rmse', 'mae'],
    max_epochs=max_epochs,
    patience=50,
    batch_size=128,
    virtual_batch_size=64,
    num_workers=0,
    drop_last=False,
    augmentations=aug,
)
model.fit(**fit_options)

# Model Test

In [None]:
# Run the fitted model on the held-out test inputs.
y_pred = model.predict(X_test)

In [None]:
# Preview the first five predicted rows as the cell output.
y_pred[:5]

array([[ 3.7629685,  4.195734 ,  4.6138444,  4.9465117,  5.7822623,
         9.964745 , 13.578662 , 22.664858 , 35.950493 ],
       [ 3.9935122,  4.458596 ,  4.9070125,  5.2589207,  6.1455417,
        10.53552  , 14.330327 , 23.866064 , 37.752083 ],
       [ 3.4783878,  3.8552284,  4.2199073,  4.508431 ,  5.2184453,
         8.686758 , 11.690649 , 19.014765 , 29.68295  ],
       [ 2.9485006,  3.2908857,  3.595682 ,  3.8183856,  4.366695 ,
         7.047172 ,  9.378632 , 14.870757 , 22.850574 ],
       [ 3.8118658,  4.255247 ,  4.6752343,  5.003948 ,  5.8254957,
         9.962169 , 13.554658 , 22.49678  , 35.537678 ]], dtype=float32)

In [None]:
def unscale_values(scaled_list, scales=None):
    """
    Undo the per-column scaling by multiplying each column by its factor.

    The input is never modified. The previous implementation wrote the products
    back into the rows it was given, so the caller's array was changed as a side
    effect and calling the function twice multiplied the values twice.

    :param scaled_list: 2-D array-like of scaled rows (one row per sample)
    :param scales: per-column factors; defaults to the notebook-level ``scale_list``.
                   Only the first ``len(scales)`` columns are multiplied, any further
                   columns are copied through unchanged.
    :return: list of 1-D NumPy arrays, one unscaled row per input row
    :raises ValueError: if the input is not 2-D or has fewer columns than there are factors
    """
    if scales is None:
        scales = scale_list

    values = np.asarray(scaled_list)
    if values.size == 0:
        return []
    if values.ndim != 2:
        raise ValueError("scaled_list must be 2-D (rows x columns)")

    factors = np.asarray(scales)
    n_scaled = factors.size
    if values.shape[1] < n_scaled:
        raise ValueError(
            f"rows have {values.shape[1]} columns but {n_scaled} scale factors were given"
        )

    # Work on a copy in a dtype wide enough for the products so the caller's data stays intact.
    unscaled = values.astype(np.result_type(values.dtype, factors.dtype), copy=True)
    unscaled[:, :n_scaled] *= factors
    return list(unscaled)

# Bring predictions and targets back to their original units.
# y_test is already a NumPy array (it was built with .to_numpy()), so it is passed
# directly — an ndarray has no `.values` attribute.
un_y_pred = unscale_values(y_pred)
un_y_test = unscale_values(y_test)

# Evaluation

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error, r2_score

def pearson_correlation_coefficient(X, Y):
    """
    Compute the Pearson correlation coefficient between two variables.

    :param X: values of the first variable (1-D array-like; plain lists are accepted)
    :param Y: values of the second variable, same number of values as X
    :return: Pearson correlation coefficient as a float; NaN when the inputs are
             empty or when either of them has zero variance
    :raises ValueError: if X and Y do not contain the same number of values
    """
    x = np.asarray(X, dtype=float).ravel()
    y = np.asarray(Y, dtype=float).ravel()
    if x.shape != y.shape:
        raise ValueError(
            f"X and Y must have the same length, got {x.size} and {y.size}"
        )
    if x.size == 0:
        return float("nan")

    # Deviations of every value from the mean of its variable.
    deviation_x = x - x.mean()
    deviation_y = y - y.mean()

    # Numerator: sum of the products of the paired deviations.
    numerator = np.sum(deviation_x * deviation_y)

    # Denominator: square root of the product of the two sums of squared deviations.
    denominator = np.sqrt(np.sum(deviation_x ** 2) * np.sum(deviation_y ** 2))

    # A constant input has no variance, so the coefficient is undefined; avoid dividing 0 by 0.
    if denominator == 0:
        return float("nan")

    return float(numerator / denominator)

In [None]:
# Wrap targets and predictions in DataFrames whose columns carry the nine target labels.
# NOTE(review): this uses y_test / y_pred, not un_y_test / un_y_pred from the unscaling
# cell — confirm which units the reported metrics are meant to be in.
col_list = ["1000", "300", "150", "100", "50", "10", "5", "2", "1"]
y_test_df = pd.DataFrame(y_test, columns=col_list)
y_pred_df = pd.DataFrame(y_pred, columns=col_list)

In [None]:
# Print the six evaluation metrics for each target column, one blank line between columns.
for col_name in col_list:
    print(f"{col_name}")
    real_v = list(y_test_df[col_name].to_numpy())
    pred_v = list(y_pred_df[col_name].to_numpy())

    # Error metrics; RMSE is derived from the MSE.
    mae = mean_absolute_error(real_v, pred_v)
    mse = mean_squared_error(real_v, pred_v)
    rmse = np.sqrt(mse)
    mape = mean_absolute_percentage_error(real_v, pred_v)

    # Goodness-of-fit metrics.
    r2_scores = r2_score(real_v, pred_v)
    pearson_scores = pearson_correlation_coefficient(real_v, pred_v)

    print(f"MAE : {mae}")
    print(f"MSE : {mse}")
    print(f"RMSE : {rmse}")
    print(f"MAPE : {mape}")
    print(f"R2 : {r2_scores}")
    print(f"Pearson : {pearson_scores}")
    print()

1000
MAE : 0.17395573514878734
MSE : 0.2016357204924405
RMSE : 0.4490386625808968
MAPE : 0.04839067892649853
R2 : 0.5781762180756737
Pearson : 0.8055990499413721

300
MAE : 0.16277245114545003
MSE : 0.1643931670363868
RMSE : 0.4054542724357295
MAPE : 0.04028709185202358
R2 : 0.7051377274408477
Pearson : 0.8601753259136318

150
MAE : 0.1755389670205411
MSE : 0.1251014928271698
RMSE : 0.3536968940027178
MAPE : 0.03979344128174019
R2 : 0.8184898573785219
Pearson : 0.9085599344949421

100
MAE : 0.1953959195626482
MSE : 0.17196040409291377
RMSE : 0.414681087213914
MAPE : 0.04114156345299086
R2 : 0.7949961385593383
Pearson : 0.8966354170258765

50
MAE : 0.2559212217530466
MSE : 0.20376974021196526
RMSE : 0.45140861778655184
MAPE : 0.04613586772482351
R2 : 0.8419894413146315
Pearson : 0.9192040159455762

10
MAE : 0.4683034089611074
MSE : 0.7343685263990655
RMSE : 0.8569530479548255
MAPE : 0.05362539077359263
R2 : 0.8576463087352738
Pearson : 0.9291518295862177

5
MAE : 0.6638786914759088
MSE 

# Model Save

In [None]:
# Export the trained TabNet network to ONNX.

# Destination file; create its directory first so the export does not fail on a missing folder.
onnx_path = "/content/drive/MyDrive/DKU/Ubiosis/onnx_model/tabnet_model_230702.onnx"
os.makedirs(os.path.dirname(onnx_path), exist_ok=True)

# Resolve the device before anything uses it (GPU when available, otherwise CPU).
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Put the underlying network in evaluation mode and move it to that device.
save_model = model.network.eval()
save_model.to(device)

# Example input used for tracing: a single random row with as many features as the
# training matrix, placed on the same device as the model.
example_input = torch.randn(1, X_train.shape[1]).to(device)

torch.onnx.export(save_model,                # model to run
                  example_input,             # model input (a tuple of several inputs is also possible)
                  onnx_path,                 # where to save the model (a path or a file-like object)
                  export_params=True,        # store the trained weights inside the model file
                  opset_version=11,          # ONNX operator-set version used for the conversion
                  do_constant_folding=True,  # apply constant folding as an optimisation
                  input_names=['input'],     # name of the model input
                  output_names=['output'],   # name of the model output
                  dynamic_axes={'input': {0: 'batch_size'},    # axes with variable length
                                'output': {0: 'batch_size'}})