In [None]:
# Pickle files of the regression pipelines, keyed by the name under which each
# loaded model is later looked up.
model_paths = dict(
    bps_model='regression_Pipeline_BPS.pkl',
    mps_model='regression_Pipeline_MPS.pkl',
    fps_model='regression_Pipeline_FPS.pkl',
)

# Base names of the input datasets; the '.csv' extension is appended when they
# are read.
input_file1, input_file2 = "QM9_129440_MLtraining", "QM9_49762_MLtraining"

In [None]:
import pandas as pd
import numpy as np
import pickle
import os
# Load both datasets. Only df2 is cleaned below; df1 is not used further in
# this cell.
df1 = pd.read_csv(f'{input_file1}.csv')
df2 = pd.read_csv(f'{input_file2}.csv')

# Treat +/-inf as missing, then drop every row that has a missing value.
# A single replace -> dropna yields the same rows as the earlier
# dropna -> replace -> dropna sequence, because replace only ever adds NaNs.
df2 = df2.replace([np.inf, -np.inf], np.nan).dropna()

# Columns that must not be used as features; add other columns here if needed.
to_drop = ['canonical_smiles']
df2 = df2.drop(columns=to_drop)

# Drop columns with more than 40% missing values.
# NOTE(review): all rows containing NaN were already removed above, so no
# column can exceed the threshold at this point and this filter drops nothing.
# If it is meant to take effect it has to run before the row-wise dropna;
# the order is kept as-is here to avoid changing the resulting feature set.
missing_percent = df2.isnull().mean()
to_drop = missing_percent[missing_percent > 0.4].index
df2 = df2.drop(columns=to_drop)

# Remove low-variance features.
from sklearn.feature_selection import VarianceThreshold
selector = VarianceThreshold(threshold=0.08)  # adjust the threshold as needed
# The selector only sees the float64/int64 columns, so its support mask must
# be applied to that same column index. Applying it to df2.columns breaks as
# soon as df2 contains a column of any other dtype (mask length mismatch).
numeric_cols = df2.select_dtypes(include=['float64', 'int64']).columns
selector.fit(df2[numeric_cols])
low_variance_cols = numeric_cols[~selector.get_support()]
df2 = df2.drop(columns=low_variance_cols)
df2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 49712 entries, 0 to 49761
Data columns (total 31 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   mps_pred   49712 non-null  float64
 1   bps_pred   49712 non-null  float64
 2   fps_pred   49712 non-null  float64
 3   #C         49712 non-null  int64  
 4   #O         49712 non-null  int64  
 5   #R=R       49712 non-null  int64  
 6   #R#R       49712 non-null  int64  
 7   #Donor     49712 non-null  int64  
 8   #Accept    49712 non-null  int64  
 9   #Ring      49712 non-null  int64  
 10  #AlCR      49712 non-null  int64  
 11  #AlHR      49712 non-null  int64  
 12  #AlR       49712 non-null  int64  
 13  #ArHR      49712 non-null  int64  
 14  #ArR       49712 non-null  int64  
 15  #SCR       49712 non-null  int64  
 16  #SHR       49712 non-null  int64  
 17  #SR        49712 non-null  int64  
 18  #Bran      49712 non-null  int64  
 19  #C=O       49712 non-null  int64  
 20  #Ether     

In [4]:

# Load every pickled model whose file exists; missing files are reported and
# skipped so the remaining models can still be used.
# SECURITY: pickle.load can execute arbitrary code while deserializing. Only
# load model files that come from a trusted source.
models = {}
for name, path in model_paths.items():
    if os.path.exists(path):
        print(f"Loading {name} from {path}")
        with open(path, 'rb') as f:
            models[name] = pickle.load(f)
    else:
        print(f"Warning: {path} does not exist")

# Short names shared by the model keys ('<name>_model'), the target columns
# ('<name>_pred') and the prediction columns written below ('<name>_predicted').
targets = ('bps', 'mps', 'fps')

# Prepare features for prediction: exclude the target columns and any
# prediction columns left over from a previous run of this cell, so that
# re-running the cell does not feed earlier predictions back in as features.
non_feature_cols = [f'{target}_pred' for target in targets]
non_feature_cols += [f'{target}_predicted' for target in targets]
X = df2.drop(columns=non_feature_cols, errors='ignore')

# Make predictions with each model that was loaded successfully.
for target in targets:
    model_key = f'{target}_model'
    if model_key in models:
        df2[f'{target}_predicted'] = models[model_key].predict(X)
        print(f"{target.upper()} predictions added to dataframe")

# Display targets next to their predictions. Only columns that actually exist
# are selected, so a missing model file no longer causes a KeyError here.
comparison_cols = [
    column
    for target in targets
    for column in (f'{target}_pred', f'{target}_predicted')
    if column in df2.columns
]
df2[comparison_cols].head()

Loading bps_model from regression_Pipeline_BPS.pkl


configuration generated by an older version of XGBoost, please export the model by calling
`Booster.save_model` from that version first, then load it back in current version. See:

    https://xgboost.readthedocs.io/en/stable/tutorials/saving_model.html

for more details about differences between saving model and serializing.

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Loading mps_model from regression_Pipeline_MPS.pkl
Loading fps_model from regression_Pipeline_FPS.pkl
BPS predictions added to dataframe
MPS predictions added to dataframe
FPS predictions added to dataframe


Unnamed: 0,bps_pred,bps_predicted,mps_pred,mps_predicted,fps_pred,fps_predicted
0,204.875,204.875,140.5,140.5,211.125,211.125
1,197.818182,197.818054,113.659091,118.070007,192.136364,192.136276
2,283.45,312.723785,159.840909,166.039597,211.340909,234.262939
3,367.0,367.00235,186.238636,186.238922,272.931818,272.933289
4,420.863636,420.863556,216.170455,216.170563,314.840909,314.840912
