# In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
import math
from keras.models import Sequential
from keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from keras.callbacks import EarlyStopping
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt
import shap
import os

# ---------------------------------------------------------------------------
# Data loading and preprocessing
# ---------------------------------------------------------------------------
# Read the raw table from disk.
data = pd.read_csv('PATH_to_cvs')

# Per the original author's note, E*, E** and E*** are placeholders for the
# outputs Eb, Ec and Eform. Only E*** is modelled here, so the other two
# output columns are discarded.
data_cleaned = data.drop(['E*', 'E**'], axis=1)

# Give the unnamed first column a readable name.
data_cleaned = data_cleaned.rename(columns={'Unnamed: 0': 'Label'})

# Target vector: the remaining output column.
y = data_cleaned['E***'].values

# Feature matrix: every column except the row label and the target.
X = data_cleaned.drop(['Label', 'E***'], axis=1).values

# Min-max scale each feature column to the [0, 1] range.
# NOTE: the scaler is fitted on every row of X, i.e. before any
# train/validation split takes place.
scaler = MinMaxScaler(feature_range=(0, 1))
X_scaled = scaler.fit(X).transform(X)

# Conv1D expects a trailing channel axis: (samples, features) ->
# (samples, features, 1).
X_reshaped = X_scaled[:, :, np.newaxis]

def _build_cnn(n_features):
    """Return a freshly initialised, compiled 1D-CNN regressor.

    Args:
        n_features: Length of the feature axis of one sample; the network
            takes inputs of shape ``(n_features, 1)``.

    Returns:
        A compiled ``Sequential`` model with one linear output unit, using
        the Adam optimizer and mean-squared-error loss.
    """
    model = Sequential()
    model.add(Conv1D(32, 3, input_shape=(n_features, 1), activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Dropout(0.2))
    model.add(Conv1D(64, 3, activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Dropout(0.2))
    model.add(Flatten())
    model.add(Dense(256, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(1, activation='linear'))
    model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mean_squared_error'])
    return model


# Perform 5-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
rmse_list, mae_list, r2_list = [], [], []

# Seeded generator used to carve an early-stopping hold-out out of each
# training fold, so the split is reproducible from run to run.
split_rng = np.random.default_rng(42)
early_stop_fraction = 0.1

# Split on the raw feature matrix X rather than the globally scaled
# X_reshaped: the fold indices are identical (same number of rows), but
# scaling can then be fitted per fold without seeing the test rows.
for fold, (train_index, val_index) in enumerate(kf.split(X), start=1):
    print(f"\nTraining Fold {fold}...")

    # Fit the scaler on this fold's training rows only; the test rows are
    # transformed with the training min/max so no test statistics leak in.
    fold_scaler = MinMaxScaler(feature_range=(0, 1)).fit(X[train_index])
    X_train = fold_scaler.transform(X[train_index])[..., np.newaxis]
    X_test = fold_scaler.transform(X[val_index])[..., np.newaxis]
    y_train, y_test = y[train_index], y[val_index]

    # Hold out part of the *training* fold for early stopping. Monitoring
    # the test fold (with restore_best_weights=True) would select the
    # checkpoint using the very data the scores are reported on.
    order = split_rng.permutation(len(train_index))
    n_hold = max(1, int(round(early_stop_fraction * len(order))))
    hold_pos, fit_pos = order[:n_hold], order[n_hold:]

    # A new model per fold so no weights carry over between folds.
    model = _build_cnn(X_train.shape[1])
    early_stopping = EarlyStopping(monitor='val_loss', patience=100, restore_best_weights=True)
    model.fit(
        X_train[fit_pos], y_train[fit_pos],
        epochs=300, batch_size=8,
        validation_data=(X_train[hold_pos], y_train[hold_pos]),
        callbacks=[early_stopping], verbose=0,
    )

    # predict() returns shape (n, 1); flatten to match the 1-D targets.
    # Training metrics cover the whole training fold, including the rows
    # held out for early stopping.
    y_train_pred = model.predict(X_train).ravel()
    y_test_pred = model.predict(X_test).ravel()

    # RMSE is computed as sqrt(MSE) because the `squared=False` argument of
    # mean_squared_error is deprecated and removed in newer scikit-learn.
    rmse_tr = np.sqrt(mean_squared_error(y_train, y_train_pred))
    rmse_te = np.sqrt(mean_squared_error(y_test, y_test_pred))
    mae_tr = mean_absolute_error(y_train, y_train_pred)
    mae_te = mean_absolute_error(y_test, y_test_pred)
    r2_tr = r2_score(y_train, y_train_pred)
    r2_te = r2_score(y_test, y_test_pred)

    print('RMSE (training) = %.3f' % rmse_tr)
    print('RMSE (test) = %.3f' % rmse_te)
    print('MAE (training) = %.3f' % mae_tr)
    print('MAE (test) = %.3f' % mae_te)
    print('R² (training) = %.3f' % r2_tr)
    print('R² (test) = %.3f' % r2_te)

    # Only the held-out test-fold scores feed the cross-validation summary.
    rmse_list.append(rmse_te)
    mae_list.append(mae_te)
    r2_list.append(r2_te)

# Mean and standard deviation of the test-fold scores across all folds.
print(f"\nCross-validation results ({kf.get_n_splits()}-fold):")
print(f"Average RMSE: {np.mean(rmse_list):.3f} ± {np.std(rmse_list):.3f}")
print(f"Average MAE:  {np.mean(mae_list):.3f} ± {np.std(mae_list):.3f}")
print(f"Average R²:   {np.mean(r2_list):.3f} ± {np.std(r2_list):.3f}")
