<a href="https://colab.research.google.com/github/francescopatane96/Bioactivity-prediction-with-ML/blob/main/M7finaledefinitivo100922.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install rdkit

In [None]:
!pip install lazypredict

In [4]:
!pip install git+https://github.com/volkamerlab/teachopencadd.git

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/volkamerlab/teachopencadd.git
  Cloning https://github.com/volkamerlab/teachopencadd.git to /tmp/pip-req-build-ta02aa58
  Running command git clone -q https://github.com/volkamerlab/teachopencadd.git /tmp/pip-req-build-ta02aa58
[31mERROR: Operation cancelled by user[0m
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/pip/_internal/cli/base_command.py", line 180, in _main
    status = self.run(options, args)
  File "/usr/local/lib/python3.7/dist-packages/pip/_internal/cli/req_command.py", line 199, in wrapper
    return func(self, options, args)
  File "/usr/local/lib/python3.7/dist-packages/pip/_internal/commands/install.py", line 319, in run
    reqs, check_supported_wheels=not options.target_dir
  File "/usr/local/lib/python3.7/dist-packages/pip/_internal/resolution/resolvelib/resolver.py", line 104, in resolve
    req

In [3]:
from pathlib import Path
import seaborn as sns
from warnings import filterwarnings
import time
import lazypredict
from lazypredict.Supervised import LazyRegressor
from lazypredict.Supervised import LazyClassifier

import pandas as pd
import numpy as np
from sklearn import svm, metrics, clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import auc, accuracy_score, recall_score
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt
from rdkit import Chem
from rdkit.Chem import MACCSkeys
from rdkit.Chem.AllChem import GetMorganFingerprintAsBitVect

from teachopencadd.utils import seed_everything

# Silence some expected warnings
filterwarnings("ignore")
# Fix seed for reproducible results
SEED = 22
seed_everything(SEED)



In [5]:
# Read data (Lipinski)
chembl_df = pd.read_csv(
    "IDH_compounds_lipinski.csv",
    index_col=0,
)

# Look at head
print("Shape of dataframe : ", chembl_df.shape)
chembl_df.head()


Shape of dataframe :  (1545, 11)


Unnamed: 0,molecule_chembl_id,IC50,units,smiles,pIC50,ROMol,molecular_weight,n_hba,n_hbd,logp,ro5_fulfilled
0,CHEMBL4279047,0.04,nM,C[C@@H](Nc1nc(N[C@H](C)C(F)(F)F)nc(-c2cccc(Cl)...,10.4,"<img data-content=""rdkit/molecule"" src=""data:i...",414.08,6,2,4.31,True
1,CHEMBL4278845,0.25,nM,C[C@H](Nc1nc(N[C@@H](C)C(F)(F)F)nc(-c2cccc(Cl)...,9.6,"<img data-content=""rdkit/molecule"" src=""data:i...",414.08,6,2,4.31,True
2,CHEMBL4283785,0.3,nM,CC(Nc1nc(NC(C)C(F)(F)F)nc(-c2cccc(Cl)n2)n1)C(F...,9.52,"<img data-content=""rdkit/molecule"" src=""data:i...",414.08,6,2,4.31,True
3,CHEMBL4280132,0.7,nM,FC1(F)CCC(Nc2nc(NC3CCC(F)(F)C3)nc(-c3cccc(Cl)n...,9.15,"<img data-content=""rdkit/molecule"" src=""data:i...",430.13,6,2,4.79,True
4,CHEMBL3909586,1.0,nM,C[C@H](Nc1nccc(N2C(=O)OC[C@@H]2[C@H](C)F)n1)c1...,9.0,"<img data-content=""rdkit/molecule"" src=""data:i...",446.17,7,1,4.46,True


In [6]:
# Feature for proving and Proving our data \\ NaN finder
def check_missing_values(dataframe):
    
    if dataframe.isnull().sum().sum() > 0:
        m_total = dataframe.isnull().sum().sort_values(ascending=False) 
        total = m_total[m_total > 0]

        m_percent = dataframe.isnull().mean().sort_values(ascending=False) 
        percent = m_percent[m_percent > 0] 

        missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
    
        print(f'Total and Percentage of NaN:\n {missing_data}')
    else: 
        print('No NaN found.')
        
        
check_missing_values(dataframe=chembl_df)

No NaN found.


In [8]:
# remove NaN
chembl_df = chembl_df.dropna()

In [9]:
chembl_df.shape

(1545, 11)

In [35]:
# Keep only the columns we want
chembl_df = chembl_df[["molecule_chembl_id", "smiles", "pIC50"]]
chembl_df.head()


Unnamed: 0,molecule_chembl_id,smiles,pIC50
0,CHEMBL4279047,C[C@@H](Nc1nc(N[C@H](C)C(F)(F)F)nc(-c2cccc(Cl)...,10.4
1,CHEMBL4278845,C[C@H](Nc1nc(N[C@@H](C)C(F)(F)F)nc(-c2cccc(Cl)...,9.6
2,CHEMBL4283785,CC(Nc1nc(NC(C)C(F)(F)F)nc(-c2cccc(Cl)n2)n1)C(F...,9.52
3,CHEMBL4280132,FC1(F)CCC(Nc2nc(NC3CCC(F)(F)C3)nc(-c3cccc(Cl)n...,9.15
4,CHEMBL3909586,C[C@H](Nc1nccc(N2C(=O)OC[C@@H]2[C@H](C)F)n1)c1...,9.0


In [11]:
# Add column for activity
chembl_df["active"] = np.zeros(len(chembl_df))

# Mark every molecule as active with an pIC50 of >= 6.3, 0 otherwise
chembl_df.loc[chembl_df[chembl_df.pIC50 >= 6.3].index, "active"] = 1.0

# NBVAL_CHECK_OUTPUT
print("Number of active compounds:", int(chembl_df.active.sum()))
print("Number of inactive compounds:", len(chembl_df) - int(chembl_df.active.sum()))

Number of active compounds: 978
Number of inactive compounds: 567


In [12]:
chembl_df.head()


Unnamed: 0,molecule_chembl_id,smiles,pIC50,active
0,CHEMBL4279047,C[C@@H](Nc1nc(N[C@H](C)C(F)(F)F)nc(-c2cccc(Cl)...,10.4,1.0
1,CHEMBL4278845,C[C@H](Nc1nc(N[C@@H](C)C(F)(F)F)nc(-c2cccc(Cl)...,9.6,1.0
2,CHEMBL4283785,CC(Nc1nc(NC(C)C(F)(F)F)nc(-c2cccc(Cl)n2)n1)C(F...,9.52,1.0
3,CHEMBL4280132,FC1(F)CCC(Nc2nc(NC3CCC(F)(F)C3)nc(-c3cccc(Cl)n...,9.15,1.0
4,CHEMBL3909586,C[C@H](Nc1nccc(N2C(=O)OC[C@@H]2[C@H](C)F)n1)c1...,9.0,1.0


In [13]:
def smiles_to_fp(smiles, method="maccs", n_bits=2048):
    """
    Encode a molecule from a SMILES string into a fingerprint.

    Parameters
    ----------
    smiles : str
        The SMILES string defining the molecule.

    method : str
        The type of fingerprint to use. Default is MACCS keys.

    n_bits : int
        The length of the fingerprint.

    Returns
    -------
    array
        The fingerprint array.

    """

    # convert smiles to RDKit mol object
    mol = Chem.MolFromSmiles(smiles)

    if method == "maccs":
        return np.array(MACCSkeys.GenMACCSKeys(mol))
    if method == "morgan2":
        return np.array(GetMorganFingerprintAsBitVect(mol, 2, nBits=n_bits))
    if method == "morgan3":
        return np.array(GetMorganFingerprintAsBitVect(mol, 3, nBits=n_bits))
    else:
        # NBVAL_CHECK_OUTPUT
        print(f"Warning: Wrong method specified: {method}. Default will be used instead.")
        return np.array(MACCSkeys.GenMACCSKeys(mol))

In [None]:
#da qui pezzo test

In [14]:
! wget https://github.com/gromdimon/features/raw/main/padel.sh
! wget https://github.com/gromdimon/features/raw/main/padel.zip

--2022-09-10 15:03:39--  https://github.com/gromdimon/features/raw/main/padel.sh
Resolving github.com (github.com)... 192.30.255.113
Connecting to github.com (github.com)|192.30.255.113|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://github.com/gromdimon/Pandas_features/raw/main/padel.sh [following]
--2022-09-10 15:03:39--  https://github.com/gromdimon/Pandas_features/raw/main/padel.sh
Reusing existing connection to github.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/gromdimon/Pandas_features/main/padel.sh [following]
--2022-09-10 15:03:39--  https://raw.githubusercontent.com/gromdimon/Pandas_features/main/padel.sh
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200

In [15]:
!unzip padel.zip

Archive:  padel.zip
   creating: PaDEL-Descriptor/
  inflating: __MACOSX/._PaDEL-Descriptor  
  inflating: PaDEL-Descriptor/MACCSFingerprinter.xml  
  inflating: __MACOSX/PaDEL-Descriptor/._MACCSFingerprinter.xml  
  inflating: PaDEL-Descriptor/AtomPairs2DFingerprinter.xml  
  inflating: __MACOSX/PaDEL-Descriptor/._AtomPairs2DFingerprinter.xml  
  inflating: PaDEL-Descriptor/EStateFingerprinter.xml  
  inflating: __MACOSX/PaDEL-Descriptor/._EStateFingerprinter.xml  
  inflating: PaDEL-Descriptor/Fingerprinter.xml  
  inflating: __MACOSX/PaDEL-Descriptor/._Fingerprinter.xml  
  inflating: PaDEL-Descriptor/.DS_Store  
  inflating: __MACOSX/PaDEL-Descriptor/._.DS_Store  
   creating: PaDEL-Descriptor/license/
  inflating: __MACOSX/PaDEL-Descriptor/._license  
  inflating: PaDEL-Descriptor/KlekotaRothFingerprintCount.xml  
  inflating: __MACOSX/PaDEL-Descriptor/._KlekotaRothFingerprintCount.xml  
  inflating: PaDEL-Descriptor/config  
  inflating: __MACOSX/PaDEL-Descriptor/._config  
  inf

In [7]:
selection = ['smiles', 'molecule_chembl_id']
act_selected = chembl_df[selection]
act_selected.to_csv('molecule.smi', sep='\t', index=False, header=False )

In [8]:
! cat molecule.smi | head -5
! cat molecule.smi | wc -l

C[C@@H](Nc1nc(N[C@H](C)C(F)(F)F)nc(-c2cccc(Cl)n2)n1)C(F)(F)F	CHEMBL4279047
C[C@H](Nc1nc(N[C@@H](C)C(F)(F)F)nc(-c2cccc(Cl)n2)n1)C(F)(F)F	CHEMBL4278845
CC(Nc1nc(NC(C)C(F)(F)F)nc(-c2cccc(Cl)n2)n1)C(F)(F)F	CHEMBL4283785
FC1(F)CCC(Nc2nc(NC3CCC(F)(F)C3)nc(-c3cccc(Cl)n3)n2)C1	CHEMBL4280132
C[C@H](Nc1nccc(N2C(=O)OC[C@@H]2[C@H](C)F)n1)c1cn(-c2ccc(C(F)F)cc2)cn1	CHEMBL3909586
1545


In [18]:
!cat padel.sh

java -Xms1G -Xmx1G -Djava.awt.headless=true -jar ./PaDEL-Descriptor/PaDEL-Descriptor.jar -removesalt -standardizenitro -fingerprints -descriptortypes ./PaDEL-Descriptor/PubchemFingerprinter.xml -dir ./ -file descriptors_output.csv


In [19]:
!bash padel.sh

Processing CHEMBL4279047 in molecule.smi (1/1545). 
Processing CHEMBL4278845 in molecule.smi (2/1545). 
Processing CHEMBL4280132 in molecule.smi (4/1545). Average speed: 3.45 s/mol.
Processing CHEMBL4283785 in molecule.smi (3/1545). Average speed: 3.42 s/mol.
Processing CHEMBL3909586 in molecule.smi (5/1545). Average speed: 1.30 s/mol.
Processing CHEMBL4277352 in molecule.smi (6/1545). Average speed: 1.10 s/mol.
Processing CHEMBL4529476 in molecule.smi (8/1545). Average speed: 0.83 s/mol.
Processing CHEMBL4463644 in molecule.smi (7/1545). Average speed: 0.98 s/mol.
Processing CHEMBL3979625 in molecule.smi (9/1545). Average speed: 0.81 s/mol.
Processing CHEMBL3980822 in molecule.smi (10/1545). Average speed: 0.74 s/mol.
Processing CHEMBL4215717 in molecule.smi (12/1545). Average speed: 0.64 s/mol.
Processing CHEMBL4436236 in molecule.smi (11/1545). Average speed: 0.70 s/mol.
Processing CHEMBL3962229 in molecule.smi (13/1545). Average speed: 0.63 s/mol.
Processing CHEMBL4170686 in molecu

In [20]:
!ls -l

total 46456
-rw-r--r-- 1 root root  2758352 Sep 10 15:08 descriptors_output.csv
-rw-r--r-- 1 root root 18897550 Sep 10 15:03 IDH_compounds_lipinski.csv
drwxr-xr-x 3 root root     4096 Sep 10 15:03 __MACOSX
-rw-r--r-- 1 root root   118166 Sep 10 15:03 molecule.smi
drwxrwxr-x 4 root root     4096 May 30  2020 PaDEL-Descriptor
-rw-r--r-- 1 root root      231 Sep 10 15:03 padel.sh
-rw-r--r-- 1 root root 25768637 Sep 10 15:03 padel.zip
drwxr-xr-x 1 root root     4096 Aug 31 13:47 sample_data


In [9]:
actx = pd.read_csv('descriptors_output.csv')
actx

Unnamed: 0,Name,PubchemFP0,PubchemFP1,PubchemFP2,PubchemFP3,PubchemFP4,PubchemFP5,PubchemFP6,PubchemFP7,PubchemFP8,...,PubchemFP871,PubchemFP872,PubchemFP873,PubchemFP874,PubchemFP875,PubchemFP876,PubchemFP877,PubchemFP878,PubchemFP879,PubchemFP880
0,CHEMBL4278845,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,CHEMBL4279047,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,CHEMBL4283785,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,CHEMBL4280132,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,CHEMBL3909586,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1540,CHEMBL4217951,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1541,CHEMBL4059985,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1542,CHEMBL4450414,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1543,CHEMBL2180737,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
#fine test

In [10]:
# Read data (Lipinski)
chembl_df = pd.read_csv(
    "IDH_compounds_lipinski.csv",
    index_col=0,
)

In [11]:
chembl_df

Unnamed: 0,molecule_chembl_id,IC50,units,smiles,pIC50,ROMol,molecular_weight,n_hba,n_hbd,logp,ro5_fulfilled
0,CHEMBL4279047,0.04,nM,C[C@@H](Nc1nc(N[C@H](C)C(F)(F)F)nc(-c2cccc(Cl)...,10.40,"<img data-content=""rdkit/molecule"" src=""data:i...",414.08,6,2,4.31,True
1,CHEMBL4278845,0.25,nM,C[C@H](Nc1nc(N[C@@H](C)C(F)(F)F)nc(-c2cccc(Cl)...,9.60,"<img data-content=""rdkit/molecule"" src=""data:i...",414.08,6,2,4.31,True
2,CHEMBL4283785,0.30,nM,CC(Nc1nc(NC(C)C(F)(F)F)nc(-c2cccc(Cl)n2)n1)C(F...,9.52,"<img data-content=""rdkit/molecule"" src=""data:i...",414.08,6,2,4.31,True
3,CHEMBL4280132,0.70,nM,FC1(F)CCC(Nc2nc(NC3CCC(F)(F)C3)nc(-c3cccc(Cl)n...,9.15,"<img data-content=""rdkit/molecule"" src=""data:i...",430.13,6,2,4.79,True
4,CHEMBL3909586,1.00,nM,C[C@H](Nc1nccc(N2C(=O)OC[C@@H]2[C@H](C)F)n1)c1...,9.00,"<img data-content=""rdkit/molecule"" src=""data:i...",446.17,7,1,4.46,True
...,...,...,...,...,...,...,...,...,...,...,...
1695,CHEMBL4059985,47400.00,nM,NS(=O)(=O)c1ccc(Cc2c(-c3ccccc3)nn(-c3nc(C(=O)O...,4.32,"<img data-content=""rdkit/molecule"" src=""data:i...",494.11,7,2,3.88,True
1696,CHEMBL4217951,50000.00,nM,COC(=O)C1C(=O)C2=C(CC1C)Nc1ccccc1NC2c1c[nH]c2c...,4.30,"<img data-content=""rdkit/molecule"" src=""data:i...",401.17,5,3,4.40,True
1697,CHEMBL4450414,51400.00,nM,COc1cccc(NCc2cc3cc(C)c(C)cc3[nH]c2=O)c1,4.29,"<img data-content=""rdkit/molecule"" src=""data:i...",308.15,3,2,3.77,True
1698,CHEMBL2180737,53100.00,nM,Cc1ccccc1C(C(=O)NC1CCNCC1)N(C(=O)Cc1cccs1)c1cc...,4.27,"<img data-content=""rdkit/molecule"" src=""data:i...",465.19,4,2,4.38,True


In [12]:
chembl_df = chembl_df[["ro5_fulfilled"]]
chembl_df.head()



Unnamed: 0,ro5_fulfilled
0,True
1,True
2,True
3,True
4,True


In [13]:
extracted_col = chembl_df["ro5_fulfilled"]
print("column to added from first dataframe to second:")
display(extracted_col)
  
actx = actx.join(extracted_col)
print("Second dataframe after adding column from first dataframe:")
display(actx)

column to added from first dataframe to second:


0       True
1       True
2       True
3       True
4       True
        ... 
1695    True
1696    True
1697    True
1698    True
1699    True
Name: ro5_fulfilled, Length: 1545, dtype: bool

Second dataframe after adding column from first dataframe:


Unnamed: 0,Name,PubchemFP0,PubchemFP1,PubchemFP2,PubchemFP3,PubchemFP4,PubchemFP5,PubchemFP6,PubchemFP7,PubchemFP8,...,PubchemFP872,PubchemFP873,PubchemFP874,PubchemFP875,PubchemFP876,PubchemFP877,PubchemFP878,PubchemFP879,PubchemFP880,ro5_fulfilled
0,CHEMBL4278845,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,True
1,CHEMBL4279047,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,True
2,CHEMBL4283785,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,True
3,CHEMBL4280132,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,True
4,CHEMBL3909586,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1540,CHEMBL4217951,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,True
1541,CHEMBL4059985,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,True
1542,CHEMBL4450414,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,True
1543,CHEMBL2180737,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,True


In [14]:
chembl_df = actx

In [15]:
def plot_roc_curves_for_models(models, test_x, test_y, save_png=False):
    """
    Helper function to plot customized roc curve.

    Parameters
    ----------
    models: dict
        Dictionary of pretrained machine learning models.
    test_x: list
        Molecular fingerprints for test set.
    test_y: list
        Associated activity labels for test set.
    save_png: bool
        Save image to disk (default = False)

    Returns
    -------
    fig:
        Figure.
    """

    fig, ax = plt.subplots()

    # Below for loop iterates through your models list
    for model in models:
        # Select the model
        ml_model = model["model"]
        # Prediction probability on test set
        test_prob = ml_model.predict_proba(test_x)[:, 1]
        # Prediction class on test set
        test_pred = ml_model.predict(test_x)
        # Compute False postive rate and True positive rate
        fpr, tpr, thresholds = metrics.roc_curve(test_y, test_prob)
        # Calculate Area under the curve to display on the plot
        auc = roc_auc_score(test_y, test_prob)
        # Plot the computed values
        ax.plot(fpr, tpr, label=(f"{model['label']} AUC area = {auc:.2f}"))

    # Custom settings for the plot
    ax.plot([0, 1], [0, 1], "r--")
    ax.set_xlabel("False Positive Rate")
    ax.set_ylabel("True Positive Rate")
    ax.set_title("Receiver Operating Characteristic")
    ax.legend(loc="lower right")
    # Save plot
    if save_png:
        fig.savefig(f"{DATA}/roc_auc", dpi=300, bbox_inches="tight", transparent=True)
    return fig

In [16]:
def model_performance(ml_model, test_x, test_y, verbose=True):
    """
    Helper function to calculate model performance

    Parameters
    ----------
    ml_model: sklearn model object
        The machine learning model to train.
    test_x: list
        Molecular fingerprints for test set.
    test_y: list
        Associated activity labels for test set.
    verbose: bool
        Print performance measure (default = True)

    Returns
    -------
    tuple:
        Accuracy, sensitivity, specificity, auc on test set.
    """

    # Prediction probability on test set
    test_prob = ml_model.predict_proba(test_x)[:, 1]

    # Prediction class on test set
    test_pred = ml_model.predict(test_x)

    # Performance of model on test set
    accuracy = accuracy_score(test_y, test_pred)
    sens = recall_score(test_y, test_pred)
    spec = recall_score(test_y, test_pred, pos_label=0)
    auc = roc_auc_score(test_y, test_prob)

    if verbose:
        # Print performance results
        # NBVAL_CHECK_OUTPUT        print(f"Accuracy: {accuracy:.2}")
        print(f"Sensitivity: {sens:.2f}")
        print(f"Specificity: {spec:.2f}")
        print(f"AUC: {auc:.2f}")

    return accuracy, sens, spec, auc

In [17]:
def model_training_and_validation(ml_model, name, splits, verbose=True):
    """
    Fit a machine learning model on a random train-test split of the data
    and return the performance measures.

    Parameters
    ----------
    ml_model: sklearn model object
        The machine learning model to train.
    name: str
        Name of machine learning algorithm: RF, SVM, ANN
    splits: list
        List of desciptor and label data: train_x, test_x, train_y, test_y.
    verbose: bool
        Print performance info (default = True)

    Returns
    -------
    tuple:
        Accuracy, sensitivity, specificity, auc on test set.

    """
    train_x, test_x, train_y, test_y = splits

    # Fit the model
    ml_model.fit(train_x, train_y)

    # Calculate model performance results
    accuracy, sens, spec, auc = model_performance(ml_model, test_x, test_y, verbose)

    return accuracy, sens, spec, auc

In [22]:
actx_final = actx.drop('Name', axis=1)
actx_final

Unnamed: 0,PubchemFP0,PubchemFP1,PubchemFP2,PubchemFP3,PubchemFP4,PubchemFP5,PubchemFP6,PubchemFP7,PubchemFP8,PubchemFP9,...,PubchemFP872,PubchemFP873,PubchemFP874,PubchemFP875,PubchemFP876,PubchemFP877,PubchemFP878,PubchemFP879,PubchemFP880,ro5_fulfilled
0,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,True
1,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,True
2,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,True
3,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,True
4,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1540,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,True
1541,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,True
1542,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,True
1543,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,True


In [24]:
actx_df = actx_final.drop('ro5_fulfilled', axis=1)
actx_df

Unnamed: 0,PubchemFP0,PubchemFP1,PubchemFP2,PubchemFP3,PubchemFP4,PubchemFP5,PubchemFP6,PubchemFP7,PubchemFP8,PubchemFP9,...,PubchemFP871,PubchemFP872,PubchemFP873,PubchemFP874,PubchemFP875,PubchemFP876,PubchemFP877,PubchemFP878,PubchemFP879,PubchemFP880
0,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1540,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1541,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1542,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1543,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [32]:
chembl_df['ro5_fulfilled']

0       True
1       True
2       True
3       True
4       True
        ... 
1540    True
1541    True
1542    True
1543    True
1544    True
Name: ro5_fulfilled, Length: 1545, dtype: object

In [33]:
# Read data (Lipinski)
chembl_df = pd.read_csv(
    "IDH_compounds_lipinski.csv",
    index_col=0,
)


In [39]:
X = actx_df
Y = chembl_df.pIC50

In [40]:
# Spliting data in 80\20 ratio
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.2, random_state=42)

In [41]:
# Seeing the data that was prepared
X_train.shape, Y_train.shape, X_test.shape, Y_test.shape

((1236, 881), (1236,), (309, 881), (309,))

In [42]:
# Defines and builds the lazyclassifier
reg = LazyRegressor(verbose=0,ignore_warnings=True, custom_metric=None)
models_train,predictions_train = reg.fit(X_train, X_train, Y_train, Y_train)

100%|██████████| 42/42 [02:37<00:00,  3.75s/it]


In [43]:
# Performance table of the training set (80% subset)
models_train

Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
DecisionTreeRegressor,0.65,0.9,0.3,0.16
ExtraTreeRegressor,0.65,0.9,0.3,0.18
ExtraTreesRegressor,0.65,0.9,0.3,4.8
GaussianProcessRegressor,0.65,0.9,0.3,2.6
XGBRegressor,0.5,0.86,0.36,3.26
RandomForestRegressor,0.49,0.85,0.36,3.6
BaggingRegressor,0.43,0.84,0.38,0.49
MLPRegressor,0.32,0.81,0.42,3.62
HistGradientBoostingRegressor,0.25,0.78,0.44,16.03
LGBMRegressor,0.25,0.78,0.44,0.45


In [44]:
# Checking the study on a test sample
reg = LazyRegressor(verbose=0,ignore_warnings=True, custom_metric=None)
models_test,predictions_test = reg.fit(X_train,X_test,Y_train,Y_test)

100%|██████████| 42/42 [02:24<00:00,  3.45s/it]


In [45]:
models_test

Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Lars,6.332124502954637e+35,-1.1780218636990285e+36,1.0214053203298788e+18,0.66
RANSACRegressor,3.4367828283938724e+23,-6.393755067109379e+23,752487756185.03,31.33
SGDRegressor,1.4448043153295612e+23,-2.6878989372851904e+23,487896730746.13,0.14
TransformedTargetRegressor,1.3567265284523298e+23,-2.5240399376726782e+23,472791381783.18,0.44
LinearRegression,1.3567265284523298e+23,-2.5240399376726782e+23,472791381783.18,0.48
KernelRidge,26.99,-47.34,6.54,0.2
GaussianProcessRegressor,18.85,-32.22,5.42,1.87
LinearSVR,1.59,-0.1,0.99,1.48
MLPRegressor,1.58,-0.09,0.98,3.79
PassiveAggressiveRegressor,1.56,-0.04,0.96,0.15


In [46]:
X_train = X_train.astype('int32')
Y_train = Y_train.astype('float64')

In [48]:
from sklearn.ensemble import RandomForestRegressor

In [50]:
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, Y_train)
r2 = model.score(X_test, Y_test)
r2

0.510691268495264

In [51]:
# Try data with test sample

Y_pred = model.predict(X_test)
print(Y_pred)

[4.58498254 8.43111923 5.2576164  5.83708014 6.41639852 5.85292287
 4.87983157 5.35062734 5.57683558 5.99596143 6.86435798 5.57791009
 5.25683436 7.16168644 6.49257128 6.17729433 7.13432406 6.06704579
 6.98649606 7.27280054 6.94950021 6.98233933 7.01257306 7.12892291
 6.64313311 7.01828239 5.94673311 5.80227749 5.82100922 6.67523118
 6.43446884 4.91450013 7.4509863  5.2757592  5.81382747 6.58818223
 5.81849624 6.26641821 6.85672579 6.17248209 7.25094766 7.58812827
 7.56636463 7.16241915 5.29946486 6.71494004 7.53686347 5.78848766
 5.57343583 6.25568844 6.57072576 7.36076997 6.6409613  6.28954117
 7.11551499 6.80046984 6.0326932  7.16767069 6.8656832  5.75041133
 6.68499748 6.56100672 6.57573867 6.99266775 7.21104179 7.24493807
 7.20727315 6.61625883 5.82188687 5.96705379 8.12693261 7.5261656
 5.65498773 5.53679191 7.87102143 7.86292441 5.7279558  7.25885999
 6.23235691 6.13042951 6.87547417 7.24548031 7.53686347 6.61958922
 7.3232724  5.25727645 6.99450049 7.10700222 6.15378118 6.92202

In [52]:
# Calculate the absolute errors

errors = abs(Y_pred - Y_test)
print('Mean absolute errors:', round(np.mean(errors), 2), 'degrees.')

Mean absolute errors: 0.49 degrees.


In [53]:
# Calculate percentage of errors
mape = 100 * (errors / Y_test)
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

Accuracy: 92.04 %.


Novel drugs prediction

In [None]:
df.to_csv('drugs.csv')

In [None]:
pred = model_RF.predict([df])
print("pIC50 prediction", pred)

In [None]:
drugs = pd.read_csv('drugs.csv')
pred = model_RF.predict([drugs])
print("pIC50 prediction", pred)

generate model as Pickle Object

In [58]:
import pickle

In [59]:
pickle.dump(model, open('IDH_model.pkl', 'wb'))