# Import Libraries

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import warnings
warnings.filterwarnings('ignore')
import random
random.seed(530)

from glob import glob
from tqdm.auto import tqdm

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error, r2_score

import tensorflow as tf
from keras.layers import Lambda, Input, concatenate, Conv2D, MaxPooling2D, Dense, Embedding, Flatten, LSTM, TimeDistributed
from keras import backend as K
from keras.callbacks import ModelCheckpoint, EarlyStopping

# Data Load

In [3]:
data_path = "/content/drive/MyDrive/DKU/Ubiosis/total_data.csv"
data_df = pd.read_csv(data_path)

## Data Pre-Processing

In [4]:
def prepare_dataset(data_df, c_mode = "all", r_mode="org", scale_list = [1,1,1,1,1,1,1,1,1]):
    data = {}
    # 결측치 제거
    data_df.dropna(axis=0,inplace=True)

    # 데이터 정보 별 데이터프레임 분할
    radius_df = data_df.iloc[:,:1]
    cis1_df = data_df.iloc[:,1:6001]
    cis2_df = data_df.iloc[:,6001:12001]
    shear_df = data_df.iloc[:,12001:]
    shear_df = shear_df.div(scale_list, axis=1)
    re_cols = ["1","2","5","10","50","100","150","300","1000"]
    re_shear_df = shear_df[re_cols]

    if r_mode == "org":
        if c_mode == "all":
            data["radius"] = radius_df.values
            cis_arr = np.array([cis1_df.values, cis2_df.values])
            cis_arr = cis_arr.reshape((len(data_df), 2, cis2_df.shape[1],1))
            data["cis"] = cis_arr
            data["shear"] = re_shear_df.values
        elif c_mode == "only2":
            data["radius"] = radius_df.values
            cis_arr = np.array([cis2_df.values])
            cis_arr = cis_arr.reshape((len(data_df), 1, cis2_df.shape[1],1))
            data["cis"] = cis_arr
            data["shear"] = re_shear_df.values
        else:
            pass

    elif r_mode == "ohe":
        # one-hot encoding
        ohe_cols = []
        for i in range(0,19):
            ohe_cols.append(((310+i)/100))

        ohe_target = np.array(ohe_cols).reshape(-1,1)
        ohe_value = np.array(radius_df["RADIUS"]).reshape(-1,1)

        ohe = OneHotEncoder()
        ohe.fit(ohe_target)

        ohe_labels = ohe.transform(ohe_value)
        ohe_targets = ohe_labels.toarray()

        ohe_df = pd.DataFrame(columns=ohe_cols,data=ohe_targets)

        if c_mode == "all":
            data["radius"] = ohe_df.values
            cis_arr = np.array([cis1_df.values, cis2_df.values])
            cis_arr = cis_arr.reshape((len(data_df), 2, cis2_df.shape[1],1))
            data["cis"] = cis_arr
            data["shear"] = re_shear_df.values
        elif c_mode == "only2":
            data["radius"] = ohe_df.values
            cis_arr = np.array([cis2_df.values])
            cis_arr = cis_arr.reshape((len(data_df), 1, cis2_df.shape[1],1))
            data["cis"] = cis_arr
            data["shear"] = re_shear_df.values
        else:
            pass
    else:
        pass

    return data

c_mode = "all" # all / only2
r_mode = "org" # org / ohe
scale_list = [10,10,10,10,10,10,10,15,20]

data = prepare_dataset(data_df, c_mode, r_mode, scale_list)

## Dataset Split

In [5]:
r_data = data["radius"]
c_data = data["cis"]
s_data = data["shear"]

train_indicis = int(len(r_data) * 0.7)
train_r = r_data[:train_indicis]
train_c = c_data[:train_indicis]
train_s = s_data[:train_indicis]

test_r = r_data[train_indicis:]
test_c = c_data[train_indicis:]
test_s = s_data[train_indicis:]

In [18]:
def get_model(m_mode, input_dim_X1, input_dim_X2, output_dim):
    # 각 입력에 대한 입력 레이어 생성
    input_X1 = Input(shape=input_dim_X1) # Radius
    input_X2 = Input(shape=input_dim_X2) # CIS

    if m_mode == "org":
        radius = input_X1
    elif m_mode == "emb":
        # 임베딩 레이어 적용
        embedded = Embedding(input_dim=16, output_dim=8, input_length=input_dim_X1)(input_X1)
        radius = Flatten()(embedded)

    cis = Conv2D(1024, (1, 3), activation='relu')(input_X2)
    cis = MaxPooling2D((1, 2))(cis)
    cis = Conv2D(512, (1, 3), activation='relu')(cis)
    cis = MaxPooling2D((1, 2))(cis)
    cis = Conv2D(256, (1, 3), activation='relu')(cis)
    cis = MaxPooling2D((1, 2))(cis)
    # cis = Conv2D(128, (1, 3), activation='relu')(cis)
    # cis = MaxPooling2D((1, 2))(cis)
    # cis = Conv2D(64, (1, 3), activation='relu')(cis)
    # cis = MaxPooling2D((1, 2))(cis)
    # cis = Conv2D(32, (1, 3), activation='relu')(cis)
    # cis = MaxPooling2D((1, 2))(cis)
    # cis = Conv2D(16, (1, 3), activation='relu')(cis)
    # cis = MaxPooling2D((1, 2))(cis)
    # cis = Flatten()(cis)
    cis = TimeDistributed(Flatten())(cis)
    cis = LSTM(128, activation="relu", return_sequences=True)(cis) # relu -> GPU X
    cis = LSTM(64, activation="relu", return_sequences=True)(cis)
    cis = LSTM(32, activation="relu", return_sequences=True)(cis)
    cis = LSTM(16, activation="relu", return_sequences=False)(cis)


    # 모델 통합
    x = concatenate([radius, cis])
    # output = Dense(64, activation="relu")(x)
    output = Dense(output_dim, activation="linear")(x)

    model = tf.keras.models.Model(inputs=[input_X1, input_X2], outputs=output)

    # 모델 컴파일
    opt = tf.keras.optimizers.legacy.Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=True)
    model.compile(optimizer=opt, loss=['mse','mae'],
                  metrics=[tf.keras.metrics.MeanSquaredError(),tf.keras.metrics.RootMeanSquaredError(),
                           tf.keras.metrics.MeanAbsoluteError(),tf.keras.metrics.MeanAbsolutePercentageError()])

    return model

In [19]:
m_mode = "emb"
if r_mode =="ohe":
    m_mode = "org"
input_dim_X1, input_dim_X2, output_dim = train_r.shape[1], (train_c.shape[1], train_c.shape[2],1), train_s.shape[1]

model = get_model(m_mode, input_dim_X1, input_dim_X2, output_dim)



In [20]:
# 모델 학습
es = EarlyStopping(monitor='val_loss', patience=10, mode='min')
history = model.fit([train_r, train_c], train_s, epochs=100, batch_size=64,
                    validation_split=0.2, validation_batch_size=64,
                    verbose=1, callbacks=es)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100


# Model Evaluation

In [21]:
y_pred = model.predict([test_r, test_c])
print(f"Predict Result\n{y_pred[:5]}\n")
print(f"Test Dataset\n{test_s[:5]}")

Predict Result
[[1.5620614  1.3300738  1.2095766  0.8832962  0.5315983  0.4616878
  0.4319583  0.39401037 0.35356423]
 [1.5620614  1.3300738  1.2095766  0.8832962  0.5315983  0.4616878
  0.4319583  0.39401037 0.35356423]
 [1.5620614  1.3300738  1.2095766  0.8832962  0.5315983  0.4616878
  0.4319583  0.39401037 0.35356423]
 [1.5620614  1.3300738  1.2095766  0.8832962  0.5315983  0.4616878
  0.4319583  0.39401037 0.35356423]
 [1.5620614  1.3300738  1.2095766  0.8832962  0.5315983  0.4616878
  0.4319583  0.39401037 0.35356423]]

Test Dataset
[[1.48999996 1.26666667 1.13999996 0.83000002 0.5        0.43000002
  0.4        0.37       0.33      ]
 [1.31999998 1.13333333 1.03999996 0.76999998 0.46999998 0.40999999
  0.38       0.35       0.3       ]
 [1.49499998 1.27333336 1.15       0.83999996 0.5        0.43000002
  0.40999999 0.37       0.32      ]
 [1.15500002 1.         0.93000002 0.7        0.44000001 0.39000001
  0.35999999 0.33       0.3       ]
 [1.76499996 1.48000005 1.31000004 0.93

In [10]:
scale_list.sort(reverse=True)
def unscale_values(y_list):
    unscale_list = [x * y for x,y in zip(y_list,scale_list)]
    return unscale_list

un_y_pred = unscale_values(y_pred)
un_y_test = unscale_values(test_s)

col_list = ["1000","300", "150", "100", "50", "10", "5", "2", "1"]
y_real_df = pd.DataFrame(columns=col_list, data=un_y_test)
y_pred_df = pd.DataFrame(columns=col_list, data=un_y_pred)

In [11]:
def pearson_correlation_coefficient(X, Y):
    """
    두 변수 X와 Y 간의 피어슨 상관계수를 계산하는 함수

    :param X: 첫 번째 변수의 값들을 담은 1차원 NumPy 배열
    :param Y: 두 번째 변수의 값들을 담은 1차원 NumPy 배열
    :return: 피어슨 상관계수
    """
    # 변수들의 평균 계산
    mean_X = np.mean(X)
    mean_Y = np.mean(Y)

    # 각 변수들의 편차 계산
    deviation_X = X - mean_X
    deviation_Y = Y - mean_Y

    # 피어슨 상관계수의 분자 계산
    numerator = np.sum(deviation_X * deviation_Y)

    # 피어슨 상관계수의 분모 계산
    denominator = np.sqrt(np.sum(deviation_X ** 2) * np.sum(deviation_Y ** 2))

    # 피어슨 상관계수 계산
    pearson_coefficient = numerator / denominator

    return pearson_coefficient

In [12]:
mae_list = []
mse_list = []
rmse_list = []
mape_list = []
r2_list = []
pc_list = []
for i in range(len(col_list)):
    #print(f"{col_list[i]}")
    real_v = list(y_real_df[col_list[i]].values)
    pred_v = list(y_pred_df[col_list[i]].values)

    mae = mean_absolute_error(real_v, pred_v)
    mse = mean_squared_error(real_v, pred_v)
    rmse = np.sqrt(mse)
    mape = mean_absolute_percentage_error(real_v, pred_v)
    r2_scores = r2_score(real_v, pred_v)
    pearson_scores = pearson_correlation_coefficient(real_v, pred_v)

    mae_list.append(mae)
    mse_list.append(mse)
    rmse_list.append(rmse)
    mape_list.append(mape)
    r2_list.append(r2_scores)
    pc_list.append(pearson_scores)

ev_df = pd.DataFrame(columns=["MAE","MSE","RMSE","MAPE","R2","Pearson"])
ev_df["MAE"] = mae_list
ev_df["MSE"] = mse_list
ev_df["RMSE"] = rmse_list
ev_df["MAPE"] = mape_list
ev_df["R2"] = r2_list
ev_df["Pearson"] = pc_list
ev_df.index = col_list

In [13]:
ev_df

Unnamed: 0,MAE,MSE,RMSE,MAPE,R2,Pearson
1000,2.259364,7.887456,2.808461,0.13131,0.661325,0.84576
300,1.84018,5.321052,2.306741,0.125007,0.681255,0.857003
150,1.524931,3.815423,1.953311,0.112463,0.713674,0.877122
100,1.04728,1.835629,1.354854,0.10388,0.737109,0.891441
50,0.525729,0.508588,0.713153,0.085414,0.799505,0.917609
10,0.447241,0.341035,0.583982,0.082962,0.816964,0.934159
5,0.397029,0.272122,0.521653,0.080178,0.825921,0.933497
2,0.38294,0.235526,0.48531,0.084668,0.829862,0.945519
1,0.340328,0.196311,0.44307,0.084469,0.815617,0.937977


# Export Model

In [None]:
!pip install tf2onnx



In [None]:
import tf2onnx

# 변환할 모델을 입력합니다.
input_signature = [
    tf.TensorSpec(shape=(None, train_r.shape[1]), dtype=tf.float32),
    tf.TensorSpec(shape=(None, train_c.shape[1],train_c.shape[2],train_c.shape[3]), dtype=tf.float32)
]
# tf2onnx 변환 함수를 사용하여 모델을 ONNX로 변환합니다.
onnx_model, _ = tf2onnx.convert.from_keras(model, input_signature=input_signature)

# ONNX 모델을 파일로 저장합니다.
with open(f"/content/drive/MyDrive/DKU/Ubiosis/Method5_CNN+LSTM_{r_mode}_{c_mode}_{m_mode}.onnx", "wb") as f:
    f.write(onnx_model.SerializeToString())