In [None]:
import ast
with open("selected_feature.txt", "r", encoding="utf-8") as feature_file:
    # Parse lines of the form "<name>: <python literal>" into a dict.
    # Lines that are empty after stripping whitespace are ignored.
    temp_features = {}
    for raw_line in feature_file:
        entry = raw_line.strip()
        if entry:
            # Split on the first colon only, so the literal part may itself
            # contain colons (e.g. a dict literal).
            name, literal = entry.split(":", 1)
            # literal_eval accepts Python literals only; it does not run code.
            temp_features[name.strip()] = ast.literal_eval(literal.strip())

# Feature lists, one per prediction target, taken from the parsed
# `temp_features` mapping (keyed by target column name).
BPS_features, MPS_features, FPS_features = (
    temp_features[target] for target in ('bps_pred', 'mps_pred', 'fps_pred')
)
HOMO_eV_features, LUMO_eV_features, c_v_features = (
    temp_features[target] for target in ('HOMO_eV', 'LUMO_eV', 'c_v')
)

# Target column names, grouped in two lists.
targets1 = ['HOMO_eV', 'LUMO_eV', 'c_v']
targets2 = ['bps_pred', 'mps_pred', 'fps_pred']

# Pickled model file for each model key (insertion order matters for the
# order of the loading messages).
model_paths = dict(
    bps_model='regression_Pipeline_bps_pred.pkl',
    mps_model='regression_Pipeline_mps_pred.pkl',
    fps_model='regression_Pipeline_fps_pred.pkl',
    HOMO_eV_model='regression_Pipeline_HOMO_eV.pkl',
    LUMO_eV_model='regression_Pipeline_LUMO_eV.pkl',
    c_v_model='regression_Pipeline_c_v.pkl',
)

# Base names of the input CSV files; the '.csv' suffix is appended where the
# files are read.
input_file1 = "QM9_129440_MLtraining"
input_file2 = "QM9_49762_MLtraining"

In [None]:
import pandas as pd
import numpy as np
import pickle
import os
import seaborn as sns
from sklearn.metrics import r2_score
import numpy as np
import matplotlib.pyplot as plt
# Load both input tables. Infinite values are converted to NaN first so that
# dropna() discards every row containing a non-finite or missing value.
df1 = pd.read_csv(f'{input_file1}.csv').replace([np.inf, -np.inf], np.nan).dropna()
df2 = pd.read_csv(f'{input_file2}.csv').replace([np.inf, -np.inf], np.nan).dropna()

# Load every model whose pickle file exists. A missing file is reported and
# skipped; code that uses `models` must check for the key.
# (This loop used to appear twice, unpickling every model two times.)
models = {}
for name, path in model_paths.items():
    if os.path.exists(path):
        print(f"Loading {name} from {path}")
        # NOTE(security): pickle.load can execute arbitrary code embedded in
        # the file. Only load model files that come from a trusted source.
        with open(path, 'rb') as f:
            models[name] = pickle.load(f)
    else:
        print(f"Warning: {path} does not exist")

# Model input matrices: the selected feature columns of each table.
# NOTE: each selection raises KeyError if any listed feature is not a column
# of the dataframe.
X_bps = df2[BPS_features]
X_mps = df2[MPS_features]
X_fps = df2[FPS_features]
X_HOMO_eV = df1[HOMO_eV_features]
X_LUMO_eV = df1[LUMO_eV_features]
X_c_v = df1[c_v_features]

# Make predictions for every model that was loaded successfully.
#
# Each entry describes one prediction task:
#   (key in `models`, label used in messages, dataframe, feature list, output column)
# Every task is handled independently. Previously the HOMO_eV / LUMO_eV / c_v
# branches were nested under the FPS-model check, so they were silently
# skipped whenever 'fps_model' was not loaded.
prediction_specs = [
    ('bps_model', 'BPS', df2, BPS_features, 'bps_predicted'),
    ('mps_model', 'MPS', df2, MPS_features, 'mps_predicted'),
    ('fps_model', 'FPS', df2, FPS_features, 'fps_predicted'),
    ('HOMO_eV_model', 'HOMO_eV', df1, HOMO_eV_features, 'HOMO_eV_predicted'),
    ('LUMO_eV_model', 'LUMO_eV', df1, LUMO_eV_features, 'LUMO_eV_predicted'),
    ('c_v_model', 'c_v', df1, c_v_features, 'c_v_predicted'),
]

for model_key, label, frame, feature_names, output_column in prediction_specs:
    if model_key not in models:
        # The model file was not found at load time; nothing to predict.
        continue

    # Check that every required feature exists in the dataframe.
    missing_features = [feat for feat in feature_names if feat not in frame.columns]
    if missing_features:
        print(f"Warning: Missing features for {label} prediction: {missing_features}")
        continue

    # Store the predictions as a new column of the same dataframe
    # (`frame` is df1 or df2 itself, not a copy).
    frame[output_column] = models[model_key].predict(frame[feature_names])
    print(f"{label} predictions added to dataframe")

def _plot_parity(ax, frame, truth_col, pred_col, xlabel, ylabel, title):
    """Draw one predicted-vs-ground-truth scatter panel and return its metrics.

    Parameters
    ----------
    ax : matplotlib Axes to draw on.
    frame : DataFrame holding both the ground-truth and the prediction column.
    truth_col : name of the ground-truth column (x axis).
    pred_col : name of the model-prediction column (y axis).
    xlabel, ylabel, title : text for the axis labels and the panel title.

    Returns
    -------
    (r2, mape) : the R2 score and the mean absolute percentage error (in %).
        Both are NaN when `pred_col` is not present in `frame`, e.g. because
        the corresponding model was not loaded.
    """
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.set_title(title)

    if pred_col not in frame.columns:
        # Report the gap on the panel instead of failing with a KeyError.
        print(f"Warning: column '{pred_col}' not found; skipping '{title}' plot")
        ax.text(0.5, 0.5, 'No predictions available',
                transform=ax.transAxes,
                horizontalalignment='center', verticalalignment='center')
        return np.nan, np.nan

    y_true = frame[truth_col]
    y_pred = frame[pred_col]

    ax.scatter(y_true, y_pred, alpha=0.5)

    # Diagonal y = x spanning the ground-truth range.
    lo, hi = y_true.min(), y_true.max()
    ax.plot([lo, hi], [lo, hi], 'r--', label='Perfect prediction')
    ax.legend()

    r2 = r2_score(y_true, y_pred)
    # NOTE: MAPE divides by the ground truth, so ground-truth values equal to
    # (or very close to) zero make it infinite (or very large).
    mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100

    # Anchor the annotation in THIS panel's axes coordinates (top-left corner).
    ax.text(
        0.05, 0.95,
        f'R2={r2:.3f}\nMAPE={mape:.2f}%',
        transform=ax.transAxes,
        verticalalignment='top'
    )
    return r2, mape


# Figure 1: one panel per target in df2 (BPS, MPS, FPS).
fig, axs = plt.subplots(1, 3, figsize=(18, 6))

r2_bps, mape_bps = _plot_parity(
    axs[0], df2, 'bps_pred', 'bps_predicted',
    'BPS Predicted (Ground Truth)', 'BPS Model Prediction', 'BPS Model Performance')
r2_mps, mape_mps = _plot_parity(
    axs[1], df2, 'mps_pred', 'mps_predicted',
    'MPS Predicted (Ground Truth)', 'MPS Model Prediction', 'MPS Model Performance')
r2_fps, mape_fps = _plot_parity(
    axs[2], df2, 'fps_pred', 'fps_predicted',
    'FPS Predicted (Ground Truth)', 'FPS Model Prediction', 'FPS Model Performance')

plt.tight_layout()
plt.show()

# Figure 2: one panel per target in df1 (c_v, HOMO_eV, LUMO_eV).
fig2, axs2 = plt.subplots(1, 3, figsize=(24, 6))

r2_c_v, mape_c_v = _plot_parity(
    axs2[0], df1, 'c_v', 'c_v_predicted',
    'c_v (Ground Truth)', 'c_v Model Prediction', 'c_v Model Performance')
r2_homo, mape_homo = _plot_parity(
    axs2[1], df1, 'HOMO_eV', 'HOMO_eV_predicted',
    'HOMO_eV (Ground Truth)', 'HOMO_eV Model Prediction', 'HOMO_eV Model Performance')
r2_lumo, mape_lumo = _plot_parity(
    axs2[2], df1, 'LUMO_eV', 'LUMO_eV_predicted',
    'LUMO_eV (Ground Truth)', 'LUMO_eV Model Prediction', 'LUMO_eV Model Performance')

plt.tight_layout()
plt.show()

Loading bps_model from regression_Pipeline_bps_pred.pkl
Loading mps_model from regression_Pipeline_mps_pred.pkl
Loading fps_model from regression_Pipeline_fps_pred.pkl
Loading mu_model from regression_Pipeline_mu.pkl
Loading HOMO_eV_model from regression_Pipeline_HOMO_eV.pkl
Loading LUMO_eV_model from regression_Pipeline_LUMO_eV.pkl
Loading c_v_model from regression_Pipeline_c_v.pkl
Loading bps_model from regression_Pipeline_bps_pred.pkl
Loading mps_model from regression_Pipeline_mps_pred.pkl
Loading fps_model from regression_Pipeline_fps_pred.pkl
Loading mu_model from regression_Pipeline_mu.pkl
Loading HOMO_eV_model from regression_Pipeline_HOMO_eV.pkl
Loading LUMO_eV_model from regression_Pipeline_LUMO_eV.pkl
Loading c_v_model from regression_Pipeline_c_v.pkl
BPS predictions added to dataframe
MPS predictions added to dataframe
FPS predictions added to dataframe
mu predictions added to dataframe
HOMO_eV predictions added to dataframe
LUMO_eV predictions added to dataframe
c_v predict