Notebook for setting up predictions on any of the ChEMBL models in the supplied repository

In [74]:
import numpy as np
import pandas as pd
import joblib
from rdkit import Chem
from rdkit.Chem import AllChem
from scripts.utils import rdkit_fpconvert_numpy, rdkit_get_physchem_descr

In [None]:
# EDIT THIS TO BE YOUR CHOSEN MODEL
# Model selection: target/pXC50 folder name and the model type (svm or xgb)
chosen_model, hpo_config = "DRD1_PIC50", "svm"
# Compounds to score: delimited text file with a smiles column, plus its separator
smi_file, delimiter = "./example_drd_drugs.txt", ","

In [80]:
def import_and_setup(
    chosen_model: str,
    hpo_config: str,
    smi_file: str,
    delimiter: str
):
    """ Load the chosen model, its scaler and the smiles input file

    :param chosen_model: Str combination of target and pxc50 type; also the name of
        the folder the model and scaler pickles are read from
    :param hpo_config: Str for model type: svm or xgb
    :param smi_file: Str path to the delimited text file of compounds
    :param delimiter: Str passed to pandas as the column separator
    :requirements smi_file:
    1) delimited text file with a header row (delimiter must match the file)
    2) a smiles column (header matched case-insensitively, surrounding
       whitespace ignored)
    :raises ValueError: if no smiles column is found in smi_file
    :return model.pkl, input dataframe, scaler.pkl
    """
    # NOTE(review): joblib.load unpickles the file, which can execute arbitrary
    # code - only point this at model/scaler files from a trusted source.
    chosen_model_path = f"./{chosen_model}/{chosen_model}_{hpo_config}.pkl"
    predictor_model = joblib.load(chosen_model_path)
    scaler = joblib.load(f"./{chosen_model}/scalar_{chosen_model}.pkl")

    pred_example = pd.read_csv(smi_file, sep=delimiter)
    # Normalise headers so "SMILES", " Smiles " etc. all resolve to "smiles"
    pred_example.columns = [str(col).strip().lower() for col in pred_example.columns]

    # Fail here with a clear message rather than later, deep inside the
    # descriptor pipeline, when the smiles column is looked up.
    if "smiles" not in pred_example.columns:
        raise ValueError(
            f"No 'smiles' column found in {smi_file!r}; "
            f"columns read: {list(pred_example.columns)}. "
            "Check the header row and the delimiter."
        )
    return predictor_model, pred_example, scaler


In [82]:
def prepare_pipeline(
    pred_example: pd.DataFrame,
    scaler
):
    """ Build the scaled feature matrix that the models predict from
    :param pred_example: Dataframe holding the input structures in its smiles column
    :param scaler: Scaler object whose transform() is applied to the combined features
    :return scaled array of morgan fingerprints followed by rdkit descriptors
    """
    # Standardise the input by round-tripping every structure through
    # RDKit's canonical smiles before building molecules from it
    canonical = list(map(Chem.CanonSmiles, pred_example.smiles))
    molecules = list(map(Chem.MolFromSmiles, canonical))

    # Radius-2 binary Morgan fingerprints, converted to a numpy array
    fingerprints = rdkit_fpconvert_numpy(
        [AllChem.GetMorganFingerprintAsBitVect(mol, 2) for mol in molecules]
    )

    # Descriptor columns are appended to the right of the fingerprint columns
    physchem = rdkit_get_physchem_descr(molecules)
    features = np.concatenate((fingerprints, physchem), axis=1)
    return scaler.transform(features)

In [83]:
def predict_smiles(
    descr: np.ndarray,
    pred_example: pd.DataFrame,
    predictor_model
):
    """ Make predictions on descriptors and merge with original dataframe
    :param descr: Array of descriptors for making predictions, one row per row
        of pred_example and in the same order
    :param pred_example: Original input dataframe for joining back predictions on index
    :param predictor_model: Prediction model.pkl; its predict() is called on descr
    :raises ValueError: (from pandas) if the number of predictions does not match
        the number of rows in pred_example
    :return Dataframe of combined predictions with original input dataframe;
        pred_class is the first column and pred_example's index is kept
    """
    predicted = predictor_model.predict(descr)
    # Predictions are positional, so label them with the input's own index.
    # A fresh 0..n-1 index would make the index merge below drop or misalign
    # rows whenever pred_example does not carry a default RangeIndex
    # (e.g. after filtering rows).
    pred_model_df = pd.DataFrame(
        predicted, columns=["pred_class"], index=pred_example.index
    )
    merged_pred_model = pd.merge(
        pred_model_df, pred_example, left_index=True, right_index=True
    )
    return merged_pred_model

Example Drug Compounds that hit DRD1 and DRD2

In [88]:
# Load the selected model, scaler and compounds, then featurise and predict
predictor_model, pred_example, scaler = import_and_setup(
    chosen_model=chosen_model,
    hpo_config=hpo_config,
    smi_file=smi_file,
    delimiter=delimiter,
)
descr = prepare_pipeline(pred_example=pred_example, scaler=scaler)
predictions = predict_smiles(
    descr=descr,
    pred_example=pred_example,
    predictor_model=predictor_model,
)

In [89]:
# Show the predicted class next to each input compound
predictions

Unnamed: 0,pred_class,smiles,id
0,0,[H][C@]12C[C@@H](C(=O)N(CCCN(C)C)C(=O)NCC)CN(C...,Cabergoline
1,1,CN(C)CCCN1c2ccccc2Sc2ccc(Cl)cc21,Chlorpromazine


In [68]:
# EDIT THIS TO BE YOUR CHOSEN MODEL
chosen_model = "DRD2_PIC50"
hpo_config = "svm"
smi_file = "./example_drd_drugs.txt"
delimiter = ","
predictor_model, pred_example, scaler = import_and_setup(chosen_model, hpo_config, smi_file, delimiter)
descr = prepare_pipeline(pred_example, scaler)
predictions = predict_smiles(descr, pred_example, predictor_model)

In [69]:
# Show the predicted class next to each input compound
predictions

Unnamed: 0,pred_class,smiles,id
0,1,[H][C@]12C[C@@H](C(=O)N(CCCN(C)C)C(=O)NCC)CN(C...,Cabergoline
1,1,CN(C)CCCN1c2ccccc2Sc2ccc(Cl)cc21,Chlorpromazine


Cabergoline is incorrectly predicted as inactive (pred_class 0) against DRD1, whereas both compounds are correctly predicted as active (pred_class 1) against DRD2.

Example drugs that hit HRH1, or are suspected to

In [70]:
# EDIT THIS TO BE YOUR CHOSEN MODEL
chosen_model = "HRH1_PIC50"
hpo_config = "xgb"
smi_file = "./example_histamine_receptor_drugs.txt"
delimiter = ","
predictor_model, pred_example, scaler = import_and_setup(chosen_model, hpo_config, smi_file, delimiter)
descr = prepare_pipeline(pred_example, scaler)
predictions = predict_smiles(descr, pred_example, predictor_model)

In [71]:
# Show the predicted class next to each input compound
predictions

Unnamed: 0,pred_class,smiles,id
0,1,[H][C@]12C[C@@H](C(=O)N(CCCN(C)C)C(=O)NCC)CN(C...,Cabergoline
1,1,CN(C)CCCN1c2ccccc2Sc2ccc(Cl)cc21,Chlorpromazine
2,1,Fc1ccc(cc1)Cn2c5ccccc5nc2NC4CCN(CCc3ccc(OC)cc3...,Astemiszole
3,1,Brc1ccc(cc1)C(c2ncccc2)CCN(C)C,Brompheniramine


All of the drugs appear to be correctly predicted for HRH1 antagonism

Investigate final list for CHRM1 activity

In [72]:
# EDIT THIS TO BE YOUR CHOSEN MODEL
chosen_model = "CHRM1_PIC50"
hpo_config = "xgb"
smi_file = "./example_drugs.txt"
delimiter = ","
predictor_model, pred_example, scaler = import_and_setup(chosen_model, hpo_config, smi_file, delimiter)
descr = prepare_pipeline(pred_example, scaler)
predictions = predict_smiles(descr, pred_example, predictor_model)

In [73]:
# Show the predicted class next to each input compound
predictions

Unnamed: 0,pred_class,smiles,id
0,1,[H][C@]12C[C@@H](C(=O)N(CCCN(C)C)C(=O)NCC)CN(C...,Cabergoline
1,1,CN(C)CCCN1c2ccccc2Sc2ccc(Cl)cc21,Chlorpromazine
2,0,Fc1ccc(cc1)Cn2c5ccccc5nc2NC4CCN(CCc3ccc(OC)cc3...,Astemiszole
3,0,Brc1ccc(cc1)C(c2ncccc2)CCN(C)C,Brompheniramine
4,0,O=C(O)/C=C\C(O)=O.O=C(O)/C=C\C(O)=O.CN1CC/C(CC...,Azatadine


The CHRM1 model does not appear to have worked as well: Azatadine and Brompheniramine were the suspected CHRM1 antagonists, yet both are predicted inactive (pred_class 0).