In [14]:
# import necessary libraries

import joblib, numpy as np, pandas as pd
from rdkit import Chem
from rdkit.Chem import MACCSkeys
from rdkit.DataStructs import ConvertToNumpyArray
from sklearn.feature_selection import VarianceThreshold


In [15]:
# 1) Restore the previously saved classifier (a Naive Bayes model)
# NOTE: joblib.load unpickles the file, so only load model files from a trusted source.
MODEL_FILE = "clf_NB_model.joblib"
clf = joblib.load(MODEL_FILE)

In [16]:
# 2) Read the original training data back in (first CSV column becomes the index)
TRAINING_CSV = "df_data.csv"
df_data = pd.read_csv(TRAINING_CSV, index_col=0)
print(df_data.shape)  # quick check that the expected rows/columns were loaded

(6791, 169)


In [17]:
# 3) Rebuild the variance-based feature selector from the original training data
# Everything from column position 2 onward is used as the fingerprint matrix
# (presumably the first two columns hold non-fingerprint data — confirm against df_data).
X_MACCS = df_data.iloc[:, 2:].to_numpy(dtype=int)
sel = VarianceThreshold(threshold=0.0)
sel.fit(X_MACCS)
mask = sel.get_support()  # one entry per input column, truthy where the column is kept
print(len(mask))

# Sanity check: if the classifier records how many features it expects, that
# number has to equal the number of columns the mask keeps.
n_model_feats = getattr(clf, "n_features_in_", None)
n_kept = mask.sum()
if n_model_feats is not None:
    if n_model_feats != n_kept:
        raise ValueError(f"Feature mismatch: model expects {n_model_feats}, mask has {mask.sum()}.")

167


In [18]:
# 4) Choose the SMILES string to predict and generate its MACCS keys
smiles = "CC(=O)OC1=CC=CC=C1C(=O)O"  # aspirin
mol = Chem.MolFromSmiles(smiles)
# MolFromSmiles signals an unparsable SMILES by returning None rather than raising.
# Stop here with a clear message instead of passing None on to the fingerprint call.
if mol is None:
    raise ValueError(f"Could not parse SMILES string: {smiles!r}")
fp = MACCSkeys.GenMACCSKeys(mol)

In [19]:
# 5) Turn the fingerprint into a 2D NumPy query array (one row, one column per bit)

n_bits = fp.GetNumBits()           # size the buffer from the fingerprint itself (typically 167 for RDKit MACCS)
arr = np.zeros(n_bits, dtype=int)
ConvertToNumpyArray(fp, arr)       # fills arr with the fingerprint's bit values
X_query = arr[np.newaxis, :]       # add a leading axis -> shape (1, n_bits)


In [20]:
# 6) Apply the same variance mask used in training
# `mask` comes from the VarianceThreshold selector rebuilt in step 3; indexing the
# column axis with it keeps only the selected columns, so the query row lines up
# with the feature count checked against the classifier in that step.
X_query_sel = X_query[:, mask]



In [21]:
# 7) Predict
prediction = clf.predict(X_query_sel)

# X_query_sel holds exactly one row, so `prediction` is an array with one label.
# Index that label explicitly: `if prediction == 0` relies on the truth value of
# an array, which raises as soon as more than one row is predicted.
if prediction[0] == 0:
    print("Molecule is inactive for human aromatase.")
else:
    print("Molecule is active for human aromatase. Active may be agonist or antagonist in this model.")

Molecule is inactive for human aromatase.
