In [None]:
# General python libraries and packages
import pandas as pd
from tqdm.auto import tqdm
import warnings
warnings.filterwarnings('ignore')
import os

# Accessing Databases
from chembl_webresource_client.new_client import new_client

# Data Visualization
import matplotlib.pyplot as plt

# Processing chemical data
from rdkit.Chem import PandasTools, Draw

# Retrieve approved oral drugs from ChEMBL via API

**What is a ‘drug’ or a ‘clinical candidate drug’ in ChEMBL? And how does this differ from a compound?** <br>
https://chembl.gitbook.io/chembl-interface-documentation/frequently-asked-questions/drug-and-compound-questions

The **maximum phase of development** for the compound across all indications is assigned a category called 'max_phase' <br>
max_phase = 4: A marketed drug.  <br>
https://chembl.gitbook.io/chembl-interface-documentation/frequently-asked-questions/drug-and-compound-questions#what-is-max-phase <br>
By contrast, the 'max_phase_for_ind' field in the 'drug_indication' table in the downloadable ChEMBL database contains the maximum phase of development for the drug or clinical candidate drug for a specified indication. 


In [None]:
def get_oral_drugs_from_chembl():
    """Fetch approved, orally administered small-molecule drugs from ChEMBL.

    The ChEMBL ``molecule`` resource is queried with ``max_phase=4``,
    ``oral=True``, ``molecule_type='Small molecule'``, ``withdrawn_flag=False``,
    ``inorganic_flag=0`` and ``prodrug=0``; results are ordered by
    ``pref_name``. Requires network access. The number of rows kept is printed.

    Returns
    -------
    pandas.DataFrame
        The retrieved records with the ChEMBL field names renamed
        (``Drug_Name``, ``ChEMBL_Molecule_ID``, ``Year_of_First_Approval``,
        ``Indication``, ``NP``) and a ``SMILES`` column taken from the
        ``molecule_structures`` entry. Rows without a canonical SMILES are
        removed; the index is not reset.

    Raises
    ------
    KeyError
        If the response contains no ``molecule_structures`` field, e.g.
        because the query returned no records.
    """

    def _canonical_smiles(structures):
        # 'molecule_structures' is expected to be a dict of representations;
        # anything else (e.g. None) means there is no structure for the entry.
        if isinstance(structures, dict):
            return structures.get('canonical_smiles')
        return None

    # Create a new API client
    compounds_api = new_client.molecule

    # Filter for organic small molecules in clinical phase 4 (= approved drugs, not withdrawn)
    # that are orally administered and are not a prodrug
    cmpd_info = compounds_api.filter(
        max_phase=4, oral=True, molecule_type='Small molecule',
        withdrawn_flag=False, inorganic_flag=0, prodrug=0,
    ).only(
        'pref_name', 'molecule_chembl_id',
        'first_approval', 'indication_class',
        'molecule_structures', 'canonical_smiles', 'natural_product',
    ).order_by('pref_name')

    # Iterating the query set performs the (paginated) download; tqdm shows progress
    records = list(tqdm(cmpd_info))

    # Build the dataframe and switch to the column names used in this notebook.
    # NOTE(review): 'canonical_smiles' is read from inside 'molecule_structures'
    # below, so its rename entry is presumably a no-op; kept for compatibility.
    oral_drugs_df = pd.DataFrame.from_records(records).rename(
        columns={'pref_name'          : 'Drug_Name',
                 'molecule_chembl_id' : 'ChEMBL_Molecule_ID',
                 'first_approval'     : 'Year_of_First_Approval',
                 'indication_class'   : 'Indication',
                 'canonical_smiles'   : 'SMILES',
                 'natural_product'    : 'NP',
                 })

    # Fail with an actionable message instead of a bare column lookup error
    if 'molecule_structures' not in oral_drugs_df.columns:
        raise KeyError("'molecule_structures' is missing from the ChEMBL response "
                       "(did the query return any records?)")

    # Extract the SMILES from the dictionary of molecular representations
    oral_drugs_df['SMILES'] = oral_drugs_df['molecule_structures'].apply(_canonical_smiles)

    # Drop the raw structure dicts, then remove all molecules without a canonical SMILES.
    # `subset` is passed as a list because older pandas releases reject a scalar label.
    oral_drugs_df = (oral_drugs_df
                     .drop(columns='molecule_structures')
                     .dropna(axis=0, subset=['SMILES']))

    print(len(oral_drugs_df))

    return oral_drugs_df

# Run the ChEMBL query (needs network access) and keep the result as the
# notebook-wide working table.
oral_drugs_df = get_oral_drugs_from_chembl() 

# Preview the first row to check the returned columns
oral_drugs_df.head(1)

In [None]:
# Convert the drug names into title format (instead of all upper case).
# The name-based filter in the next cell relies on this casing.
oral_drugs_df['Drug_Name'] = oral_drugs_df['Drug_Name'].str.title()

In [None]:
# Manual inspection of the data flagged two entries for removal: UREA C 13 and UREA C 14.
# Plain UREA is also part of the data and is not affected by this filter.
# The names are matched in title case, i.e. after the Drug_Name column was title-cased.
excluded_drug_names = ['Urea C 13', 'Urea C 14']
is_excluded = oral_drugs_df['Drug_Name'].isin(excluded_drug_names)

# Keep everything else and renumber the rows from 0
oral_drugs_df = oral_drugs_df.loc[~is_excluded].reset_index(drop=True)
print(len(oral_drugs_df))

In [None]:
# Export the curated table to Data/ChEMBL_Oral_Drugs/ChEMBL_Oral_Drugs.xlsx.
# The output directory is created first so that the export also succeeds when
# the folder does not exist yet (to_excel does not create missing directories).
filepath = os.path.join('Data', 'ChEMBL_Oral_Drugs')
os.makedirs(filepath, exist_ok=True)
oral_drugs_df.to_excel(os.path.join(filepath, 'ChEMBL_Oral_Drugs.xlsx'), index=False)

In [None]:
# Add a molecule column '2D_Mol' to the table, built from the 'SMILES' column
# (RDKit PandasTools). This happens after the Excel export above, so the
# exported file does not contain this column.
PandasTools.AddMoleculeColumnToFrame(oral_drugs_df, smilesCol='SMILES', molCol='2D_Mol')
# Switch on image rendering for dataframes so the preview below shows the structures
PandasTools.RenderImagesInAllDataFrames(images=True)
oral_drugs_df.head()

Please note that drugs may appear multiple times in this DataFrame, <br>
depending on the protonation/charge state, the presence/type of a counter ion, alternative forms such as a hydrochloride salt, etc.  <br>
=> **Molecule Standardization and Preparation** in the next step!