In [8]:
import joblib
from rdkit import Chem
from rdkit.Chem import MACCSkeys
from rdkit.DataStructs import ConvertToNumpyArray
import numpy as np

# Restore the serialized pipeline (preprocessing + model stored as a single object).
# NOTE: joblib.load unpickles the file, and unpickling can execute arbitrary code --
# only load artifacts that come from a trusted source.
pipeline_file = "nb_maccs_pipeline.joblib"
pipe = joblib.load(pipeline_file)
print("Pipeline loaded successfully!")

Pipeline loaded successfully!


In [9]:
smiles = "C1=CC=C(C=C1)C(C2=CC=CC=C2)(C3=CC=CC=C3Cl)N4C=CN=C4" # CID = 2812 should be active
mol = Chem.MolFromSmiles(smiles)
# MolFromSmiles returns None (it does not raise) when the SMILES cannot be parsed;
# stop here with a readable message instead of failing inside the fingerprint call.
if mol is None:
    raise ValueError(f"RDKit could not parse SMILES: {smiles!r}")

# MACCS keys fingerprint of the query molecule, copied into a NumPy vector.
fp = MACCSkeys.GenMACCSKeys(mol)
arr = np.zeros((fp.GetNumBits(),), dtype=int)   # one slot per fingerprint bit (167 for MACCS keys)
ConvertToNumpyArray(fp, arr)

# scikit-learn expects 2D shape: (n_samples, n_features)
X_query = arr.reshape(1, -1)
X_query.shape

(1, 167)

In [13]:
# Score the single query row: per-class probabilities and the hard class label.
probability = pipe.predict_proba(X_query)
prediction = pipe.predict(X_query)

# Both results hold one row per sample; index 0 is the only molecule in X_query.
# NOTE(review): the "(Inactive, Active)" label assumes the probability columns are
# ordered inactive-first (e.g. pipe.classes_ == [0, 1]) -- confirm against the pipeline.
print("Predicted class:", prediction[0])
print("Probabilities (Inactive, Active):", probability[0])


Predicted class: 1
Probabilities (Inactive, Active): [0.01173477 0.98826523]


In [11]:
# Batch of query compounds; each entry is annotated with its CID and the expected label.
smiles_list = [
    "C1=CC=C(C(=C1)C2=NC(=NO2)C3=CC=NC=C3)Cl",  # CID 65758 should be active
    "C1=CC(=CC=C1C2=COC3=CC(=CC(=C3C2=O)O)O)O",  # CID 5280961 should be active
    "CN(C1CCN(CC1)C2=NC3=CC=CC=C3N2CC4=CC=C(C=C4)F)C5=NC=CC(=O)N5",  # CID 65906 should be INactive
]

# Build one MACCS bit vector per SMILES, in the same order as smiles_list.
fps = []
for position, s in enumerate(smiles_list):
    mol = Chem.MolFromSmiles(s)
    # MolFromSmiles returns None (it does not raise) for an unparsable SMILES;
    # report which entry failed instead of crashing inside the fingerprint call.
    if mol is None:
        raise ValueError(f"RDKit could not parse SMILES at index {position}: {s!r}")
    fp = MACCSkeys.GenMACCSKeys(mol)
    arr = np.zeros((fp.GetNumBits(),), dtype=int)
    ConvertToNumpyArray(fp, arr)
    fps.append(arr)

# Stack the per-molecule vectors into the 2D (n_samples, n_features) layout scikit-learn expects.
X_batch = np.array(fps)
preds = pipe.predict(X_batch)
print(preds)

[1 1 1]
