In [1]:
# Pickle files holding the regression pipelines, keyed by the name each
# loaded model is stored under.
model_paths = dict(
    bps_model='regression_Pipeline_BPS.pkl',
    mps_model='regression_Pipeline_MPS.pkl',
    fps_model='regression_Pipeline_FPS.pkl',
)

# Column names selected from the input table as model inputs, one list per
# model. The order of each list is kept exactly as given.
BPS_features = [
    '#Accept', '#Donor', '#O/#C', '#C', 'SLogP',
    '#Ether', '#Nitrile', 'apol', '#R=R', '#Ar-N',
    '#Ring', '#ArR', 'Vabc', '#Ester', '#AlHR',
    '#C=O', '#ArHR', '#Imine', '#Bran', 'Radius',
]
MPS_features = [
    '#Donor', '#Accept', 'SLogP', '#Ar-N', 'apol',
    '#R=R', '#O/#C', '#Ether', '#ArHR', 'Vabc',
    '#Bran', '#ArR', '#C', '#Imine', '#AlHR',
    '#Ring', '#SCR', '#SHR', '#Ketone', '#AlCR',
]
FPS_features = [
    '#Accept', 'SLogP', '#Donor', '#C', '#O/#C',
    '#Ether', 'apol', '#R=R', '#Nitrile', '#Ar-N',
    '#ArR', '#AlHR', '#Imine', '#ArHR', 'Vabc',
    '#Ring', '#AlCR', '#Ester', '#C=O', '#SCR',
]

# Base names of the input CSV files; the ".csv" extension is appended when
# the files are read.
input_file1 = 'QM9_129440_MLtraining'
input_file2 = 'QM9_49762_MLtraining'

In [2]:
import pandas as pd
import numpy as np
import pickle
import os
# Read the first table as-is.
df1 = pd.read_csv('{}.csv'.format(input_file1))

# Read the second table, convert +/-infinity to NaN, then drop every row
# that contains at least one NaN.
df2 = (
    pd.read_csv('{}.csv'.format(input_file2))
    .replace([np.inf, -np.inf], np.nan)
    .dropna()
)


In [3]:

# Load every pickled pipeline listed in `model_paths` exactly once. Files that
# are absent are reported and skipped so the remaining models can still run.
# SECURITY: pickle.load can execute arbitrary code embedded in the file, so
# only load .pkl files that come from a trusted source.
# NOTE(review): the saved output of this cell shows XGBoost / scikit-learn
# messages about pickles produced by an older library version; re-exporting
# the models with the library's native save format looks like the remedy --
# confirm against the versions in use.
models = {}
for name, path in model_paths.items():
    if not os.path.exists(path):
        print(f"Warning: {path} does not exist")
        continue
    print(f"Loading {name} from {path}")
    with open(path, 'rb') as f:
        models[name] = pickle.load(f)

# Feature columns for each target, keyed by the prefix shared by the
# `<prefix>_model` entry in `models` and the `<prefix>_predicted` column.
feature_sets = {'bps': BPS_features, 'mps': MPS_features, 'fps': FPS_features}

# Make predictions for every model that was loaded successfully.
feature_frames = {}
for target, features in feature_sets.items():
    label = target.upper()
    model_key = f'{target}_model'

    # Check that the features exist in the dataframe *before* selecting them,
    # so a missing column produces the warning below instead of a KeyError.
    missing_features = [feat for feat in features if feat not in df2.columns]
    if not missing_features:
        feature_frames[target] = df2[features]

    if model_key not in models:
        continue
    if missing_features:
        print(f"Warning: Missing features for {label} prediction: {missing_features}")
        continue

    df2[f'{target}_predicted'] = models[model_key].predict(feature_frames[target])
    print(f"{label} predictions added to dataframe")

# Keep the per-target feature frames available under their original names in
# case later cells use them; a name is None when its columns were missing.
X_bps = feature_frames.get('bps')
X_mps = feature_frames.get('mps')
X_fps = feature_frames.get('fps')

# Display the dataframe with predictions: for each target, show the existing
# `<target>_pred` column next to the freshly computed `<target>_predicted`
# column, but only when both are present in df2.
result_cols = []
for target in feature_sets:
    pair = [f'{target}_pred', f'{target}_predicted']
    if all(col in df2.columns for col in pair):
        result_cols.extend(pair)

df2[result_cols]


Loading bps_model from regression_Pipeline_BPS.pkl


configuration generated by an older version of XGBoost, please export the model by calling
`Booster.save_model` from that version first, then load it back in current version. See:

    https://xgboost.readthedocs.io/en/stable/tutorials/saving_model.html

for more details about differences between saving model and serializing.

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Loading mps_model from regression_Pipeline_MPS.pkl
Loading fps_model from regression_Pipeline_FPS.pkl
Loading bps_model from regression_Pipeline_BPS.pkl
Loading mps_model from regression_Pipeline_MPS.pkl


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Loading fps_model from regression_Pipeline_FPS.pkl
BPS predictions added to dataframe
MPS predictions added to dataframe
FPS predictions added to dataframe


Unnamed: 0,bps_pred,bps_predicted,mps_pred,mps_predicted,fps_pred,fps_predicted
0,204.875000,204.875061,140.500000,140.500031,211.125000,211.125061
1,197.818182,197.818542,113.659091,113.659088,192.136364,192.136353
2,283.450000,333.022980,159.840909,133.072784,211.340909,238.408707
3,367.000000,366.954041,186.238636,186.242340,272.931818,272.939636
4,420.863636,420.855347,216.170455,216.050797,314.840909,314.849396
...,...,...,...,...,...,...
49757,521.227273,511.935028,302.659091,334.035461,365.136364,362.891327
49758,536.818182,525.267456,334.500000,392.683777,381.704545,387.590363
49759,516.045455,515.572021,319.477273,326.749817,375.363636,372.665558
49760,549.090909,520.972778,349.613636,368.673492,407.636364,398.791138
