# Import Libraries

In [None]:
# Colab-only step: attach Google Drive so files under /content/drive can be read.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
import random
random.seed(530)

from glob import glob
from tqdm.auto import tqdm

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error, r2_score

import tensorflow as tf
from keras.layers import Lambda, Input, concatenate, Conv1D, MaxPooling1D, Dense, Embedding, Flatten
from keras import backend as K
from keras.callbacks import ModelCheckpoint, EarlyStopping

# Data Preparation

## Data Load

In [None]:
# Location of the combined CSV on the mounted Google Drive.
data_path = "/content/drive/MyDrive/DKU/Ubiosis/total_data.csv"

# Read the whole table into memory; later steps slice it by column position.
data_df = pd.read_csv(filepath_or_buffer=data_path)

## Data Pre-Processing

In [None]:
def prepare_dataset(data_df, c_mode = "all", r_mode="org", scale_list = [1,1,1,1,1,1,1,1,1]):
    """Build the modelling table from the raw frame.

    The raw frame is read by column position: column 0 is the radius,
    columns 1-6000 are the first signal block ("cis1"), columns 6001-12000
    the second ("cis2"), and every column from 12001 on is a shear target.

    Args:
        data_df: Raw DataFrame. It is NOT modified; rows containing NaN are
            dropped on a copy.
        c_mode: "all" keeps both signal blocks, "only2" keeps only cis2.
        r_mode: "org" keeps the radius as one numeric column; "ohe" replaces
            it with 19 one-hot columns for the radii 3.10, 3.11, ..., 3.28
            (requires a column named "RADIUS").
        scale_list: Divisors applied position-wise to the shear columns, in
            the frame's own column order (one value per shear column).

    Returns:
        DataFrame laid out as
        [radius or one-hot block, cis1 (only for "all"), cis2, 9 shear columns],
        keeping the row labels of the surviving rows. An empty DataFrame is
        returned when c_mode or r_mode is not recognised.
    """
    # Drop incomplete rows on a copy: the caller's frame stays intact, so
    # re-running the cell always starts from the same raw data.
    clean_df = data_df.dropna(axis=0)

    # Split the frame into its logical column groups.
    radius_df = clean_df.iloc[:, :1]
    cis1_df = clean_df.iloc[:, 1:6001]
    cis2_df = clean_df.iloc[:, 6001:12001]
    shear_df = clean_df.iloc[:, 12001:].div(scale_list, axis=1)
    re_cols = ["1","2","5","10","50","100","150","300","1000"]
    re_shear_df = shear_df[re_cols]  # fixes the order of the target columns

    if c_mode == "all":
        signal_parts = [cis1_df, cis2_df]
    elif c_mode == "only2":
        signal_parts = [cis2_df]
    else:
        return pd.DataFrame()

    if r_mode == "org":
        lead_df = radius_df
    elif r_mode == "ohe":
        # One-hot encode the radius against the fixed category list 3.10 ... 3.28.
        ohe_cols = [(310 + i) / 100 for i in range(19)]
        ohe = OneHotEncoder()
        ohe.fit(np.array(ohe_cols).reshape(-1, 1))
        ohe_value = np.array(radius_df["RADIUS"]).reshape(-1, 1)
        ohe_targets = ohe.transform(ohe_value).toarray()

        # Reuse the cleaned frame's row labels. After dropna the labels can
        # have gaps; a fresh 0..n-1 index would make concat(axis=1) pair the
        # one-hot rows with the wrong samples and introduce NaN rows.
        lead_df = pd.DataFrame(columns=ohe_cols, data=ohe_targets, index=radius_df.index)
    else:
        return pd.DataFrame()

    return pd.concat([lead_df, *signal_parts, re_shear_df], axis=1)

# Preprocessing configuration.
c_mode = "all"  # signal blocks to keep: "all" or "only2"
r_mode = "org"  # radius representation: "org" (numeric) or "ohe" (one-hot)
# One divisor per shear column, applied position-wise inside prepare_dataset.
scale_list = [10] * 7 + [15, 20]

data = prepare_dataset(data_df, c_mode=c_mode, r_mode=r_mode, scale_list=scale_list)

# Dataset Split

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
train, test = train_test_split(data, test_size=0.2, random_state=530)


def _block(frame, start, stop):
    """Return the columns frame.iloc[:, start:stop] as a fresh NumPy array."""
    return np.array(frame.iloc[:, start:stop].values)


def _column_blocks(frame):
    """Cut a prepared frame into (radius, cis1, cis2, shear) arrays.

    Columns are addressed from the right edge: the last 9 columns are the
    shear targets, preceded by 6000 cis2 columns and 6000 cis1 columns;
    whatever remains on the left is the radius block.
    NOTE(review): the shear slice -9:-8 keeps only the first of the nine
    trailing target columns - confirm a single-target model is intended.
    """
    return (
        _block(frame, None, -12009),
        _block(frame, -12009, -6009),
        _block(frame, -6009, -9),
        _block(frame, -9, -8),
    )


tr_radius, tr_cis1, tr_cis2, tr_shear = _column_blocks(train)
te_radius, te_cis1, te_cis2, te_shear = _column_blocks(test)

# Model Definition

In [None]:
def _conv_tower(x):
    """Apply six Conv1D(kernel 3, stride 2, relu) + MaxPooling1D(2) stages, then flatten.

    The filter count halves at every stage: 512, 256, 128, 64, 32, 16.
    """
    for n_filters in (512, 256, 128, 64, 32, 16):
        x = Conv1D(n_filters, 3, activation='relu', strides=2, padding="same")(x)
        x = MaxPooling1D(2)(x)
    return Flatten()(x)


def get_model(m_mode, input_dim_X1, input_dim_X2, input_dim_X3, output_dim):
    """Build and compile the two-tower 1-D CNN regressor.

    Each signal input (X1 = cis 1, X2 = cis 2) is prefixed with the extra
    input X3 (either raw or embedded), run through its own convolutional
    tower, and the two flattened towers are concatenated into one Dense
    output layer.

    Args:
        m_mode: "org" concatenates X3 to the signals as-is; "emb" first
            passes X3 through an Embedding layer and flattens the result.
        input_dim_X1: Number of features of the cis 1 input.
        input_dim_X2: Number of features of the cis 2 input.
        input_dim_X3: Number of features of the extra input.
        output_dim: Number of regression outputs.

    Returns:
        A compiled Keras model taking [X1, X2, X3] and trained with MAE loss;
        MSE, RMSE, MAE and MAPE are tracked as metrics.

    Raises:
        ValueError: If m_mode is neither "org" nor "emb".
    """
    # Fail early with a clear message; previously an unknown mode surfaced
    # later as an unbound-variable error on X1_merged.
    if m_mode not in ("org", "emb"):
        raise ValueError(f"m_mode must be 'org' or 'emb', got {m_mode!r}")

    # One input layer per model input.
    input_X1 = Input(shape=(input_dim_X1,)) # cis 1
    input_X2 = Input(shape=(input_dim_X2,)) # cis 2
    input_X3 = Input(shape=(input_dim_X3,)) # extra

    if m_mode == "org":
        X1_merged = concatenate([input_X3, input_X1])
        X2_merged = concatenate([input_X3, input_X2])
    else:
        # Embed the extra input and flatten it before prefixing the signals.
        # NOTE(review): Embedding looks up integer indices. If X3 carries a
        # fractional radius (e.g. 3.10-3.28) it will be cast to an integer and
        # every sample would hit the same embedding row - confirm the radius
        # is encoded as an integer id before relying on this branch.
        embedded = Embedding(input_dim=10000, output_dim=128, input_length=input_dim_X3)(input_X3)
        embedded = Flatten()(embedded)
        X1_merged = concatenate([embedded, input_X1])
        X2_merged = concatenate([embedded, input_X2])

    # Conv1D expects (steps, channels): add a trailing channel axis of size 1.
    X1_reshaped = Lambda(lambda x: K.expand_dims(x, axis=-1))(X1_merged)
    X2_reshaped = Lambda(lambda x: K.expand_dims(x, axis=-1))(X2_merged)

    # Separate (non-shared) towers for cis 1 and cis 2.
    x1 = _conv_tower(X1_reshaped)
    x2 = _conv_tower(X2_reshaped)

    # Merge both towers into the regression head.
    x = concatenate([x1, x2])
    output = Dense(output_dim)(x)

    model = tf.keras.models.Model(inputs=[input_X1, input_X2, input_X3], outputs=output)

    # Compile the model.
    opt = tf.keras.optimizers.legacy.Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=True)
    model.compile(optimizer=opt, loss='mae',
                  metrics=[tf.keras.metrics.MeanSquaredError(),tf.keras.metrics.RootMeanSquaredError(),
                           tf.keras.metrics.MeanAbsoluteError(),tf.keras.metrics.MeanAbsolutePercentageError()])

    return model

# Model Training

In [None]:
# A one-hot encoded radius is fed to the network directly ("org");
# otherwise the radius goes through the embedding branch ("emb").
m_mode = "org" if r_mode == "ohe" else "emb"

input_dim_X1 = tr_cis1.shape[1]
input_dim_X2 = tr_cis2.shape[1]
input_dim_X3 = tr_radius.shape[1]
output_dim = tr_shear.shape[1]

# Seed Python, NumPy and TensorFlow together right before the model is built.
# Seeding only Python's `random` leaves weight initialisation and batch
# shuffling unseeded, so every Restart & Run All trained a different model.
tf.keras.utils.set_random_seed(530)

model = get_model(m_mode, input_dim_X1, input_dim_X2, input_dim_X3, output_dim)

In [None]:
# Train the model.
# restore_best_weights=True: when early stopping triggers, roll the model back
# to the epoch with the lowest val_loss instead of keeping the weights from
# `patience` (20) epochs after it.
es = EarlyStopping(monitor='val_loss', patience=20, mode='min', restore_best_weights=True)
history = model.fit([tr_cis1, tr_cis2, tr_radius], tr_shear, epochs=100, batch_size=64,
                    validation_split=0.2, validation_batch_size=64,
                    verbose=1, callbacks=[es])  # callbacks is documented as a list

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100

In [None]:
# Plot a single metric from the training history
def vis(history,name) :
    """Draw one metric's training curve on the current axes.

    The validation curve ("val_<name>") is added when the history contains it.

    :param history: object whose ``history`` dict maps metric names to per-epoch values
    :param name: metric key to plot, e.g. "loss"
    """
    ax = plt.gca()
    ax.set_title(f"{name.upper()}")
    ax.set_xlabel('epochs')
    ax.set_ylabel(f"{name.lower()}")

    logs = history.history
    train_curve = logs.get(name)
    valid_curve = logs.get(f"val_{name}",None)

    # Epochs are numbered from 1 on the x axis.
    epoch_axis = range(1, len(train_curve)+1)
    ax.plot(epoch_axis, train_curve, 'b-', label=f'training {name}')
    if valid_curve is not None :
        ax.plot(epoch_axis, valid_curve, 'r:', label=f'validation {name}')

    ax.legend(loc='upper center', bbox_to_anchor=(0.05, 1.2) , fontsize=10 , ncol=1)

def plot_history(history) :
    """Plot every logged metric side by side, one subplot per metric.

    Training and validation entries of the same metric ("loss" / "val_loss")
    share a subplot.

    :param history: object whose ``history`` dict maps metric names to per-epoch values
    """
    # Strip only a leading "val_" (split("val_") would also cut names that
    # merely contain it), and de-duplicate with dict.fromkeys so the panels
    # keep the order in which the metrics were logged. A set() has no stable
    # order for strings, so the layout used to change between kernel sessions.
    key_value = list(dict.fromkeys(
        key[len("val_"):] if key.startswith("val_") else key
        for key in history.history.keys()
    ))
    plt.figure(figsize=(12, 4))
    for idx , key in enumerate(key_value) :
        plt.subplot(1, len(key_value), idx+1)
        vis(history, key)
    plt.tight_layout()
    plt.show()

plot_history(history)

# Model Test

In [None]:
# Predict on the held-out set; inputs go in the same order used for training
# (cis1, cis2, radius).
test_inputs = [te_cis1, te_cis2, te_radius]
y_pred = model.predict(test_inputs)

# Quick sanity check: first five predictions next to the first five true targets
# (both still in scaled units).
print(f"Predict Result\n{y_pred[:5]}\n")
print(f"Test Dataset\n{te_shear[:5]}")



# Evaluation

In [None]:
# Work on a sorted copy. Sorting `scale_list` in place would silently change
# the configuration that prepare_dataset reads if that cell is run again.
# NOTE(review): the descending order and its positional pairing with the
# target columns are kept from the original code; they are only correct if
# they mirror the divisor each column received in prepare_dataset - verify
# against the shear column order of the CSV.
eval_scales = np.asarray(sorted(scale_list, reverse=True), dtype=float)

# Target names in the order the columns actually have in te_shear / y_pred:
# prepare_dataset selects the shear columns in exactly this order.
shear_cols = ["1", "2", "5", "10", "50", "100", "150", "300", "1000"]

def unscale_values(y_list):
    """Undo the target scaling column by column.

    :param y_list: array-like of shape (n_samples, n_targets) or (n_samples,)
    :return: NumPy array of shape (n_samples, n_targets) where column j is
             multiplied by the j-th evaluation scale
    """
    values = np.asarray(y_list, dtype=float)
    if values.ndim == 1:
        values = values.reshape(-1, 1)
    # Broadcast one scale per COLUMN. Zipping the rows with the scale list
    # paired scales with samples and truncated the result to 9 rows.
    return values * eval_scales[:values.shape[1]]

un_y_pred = unscale_values(y_pred)
un_y_test = unscale_values(te_shear)

# Label only the targets that were actually predicted, so the frames can be
# built for a single-target model as well as for all nine targets.
col_list = shear_cols[:un_y_test.shape[1]]
y_real_df = pd.DataFrame(columns=col_list, data=un_y_test)
y_pred_df = pd.DataFrame(columns=col_list, data=un_y_pred)

In [None]:
def pearson_correlation_coefficient(X, Y):
    """
    Compute the Pearson correlation coefficient between two variables.

    Both inputs are flattened to 1-D first, so a column vector of shape (n, 1)
    and a flat vector of shape (n,) are compared element by element instead of
    being broadcast against each other.

    :param X: values of the first variable (any array-like)
    :param Y: values of the second variable (array-like with the same number of elements)
    :return: Pearson correlation coefficient; NaN when the inputs are empty or
             either variable has zero variance
    :raises ValueError: if X and Y do not contain the same number of elements
    """
    x = np.asarray(X, dtype=float).ravel()
    y = np.asarray(Y, dtype=float).ravel()
    if x.size != y.size:
        raise ValueError(
            f"X and Y must have the same number of elements, got {x.size} and {y.size}"
        )
    if x.size == 0:
        return np.nan

    # Deviations of each variable from its mean.
    deviation_X = x - x.mean()
    deviation_Y = y - y.mean()

    # Numerator: sum of the products of the paired deviations.
    numerator = np.sum(deviation_X * deviation_Y)

    # Denominator: square root of the product of the summed squared deviations.
    denominator = np.sqrt(np.sum(deviation_X ** 2) * np.sum(deviation_Y ** 2))

    # A constant variable has no defined correlation.
    if denominator == 0:
        return np.nan

    return numerator / denominator

In [None]:
# Per-target metric values, one entry per column of col_list.
mae_list, mse_list, rmse_list = [], [], []
mape_list, r2_list, pc_list = [], [], []

for target in col_list:
    real_v = list(y_real_df[target].values)
    pred_v = list(y_pred_df[target].values)

    # RMSE is derived from the MSE, so compute the MSE once.
    mse = mean_squared_error(real_v, pred_v)

    mae_list.append(mean_absolute_error(real_v, pred_v))
    mse_list.append(mse)
    rmse_list.append(np.sqrt(mse))
    mape_list.append(mean_absolute_percentage_error(real_v, pred_v))
    r2_list.append(r2_score(real_v, pred_v))
    pc_list.append(pearson_correlation_coefficient(real_v, pred_v))

# Summary table: one row per target, one column per metric.
ev_df = pd.DataFrame(
    {
        "MAE": mae_list,
        "MSE": mse_list,
        "RMSE": rmse_list,
        "MAPE": mape_list,
        "R2": r2_list,
        "Pearson": pc_list,
    },
    index=col_list,
)