# RDKit pandas support
This quick tutorial shows some of the ways you can use RDKit together with pandas.

@TAGS: #basics #pandas

In [None]:
from __future__ import print_function
%matplotlib inline

In [None]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import PandasTools
from rdkit.Chem.Draw import IPythonConsole

# Disable SVG rendering of molecules in the notebook,
# because GitHub does not render svg's embedded in notebooks
IPythonConsole.ipython_useSVG=False

Load the table of drugs (downloaded from [ChEMBL](https://www.ebi.ac.uk/chembl/))

In [None]:
# Read the tab-separated ChEMBL drug table into a DataFrame.
df = pd.read_csv(
    '../data/chembl_drugs.txt.gz',
    sep='\t',
)

In [None]:
# Show the column labels as plain strings.
list(map(str, df.columns))

In [None]:
# Number of rows in the table as loaded.
df.shape[0]

Keep only compounds that have SMILES, respect the rule of five (Ro5), and are on the market

In [None]:
# Build one boolean mask per criterion, then keep the rows that satisfy all three.
has_smiles = df['CANONICAL_SMILES'].notnull()    # compound has a SMILES string
passes_ro5 = df['RULE_OF_FIVE'] == 'Y'           # compound respects Ro5
is_marketed = df['DEVELOPMENT_PHASE'] == 4       # compound is on the market
df = df[has_smiles & passes_ro5 & is_marketed]

In [None]:
# Number of rows remaining after the filter.
df.shape[0]

The table has a CANONICAL_SMILES column whose SMILES strings we can convert to RDKit molecules (stored in a new column, named ROMol by default)

In [None]:
# Convert each SMILES string into an RDKit molecule, stored in the 'ROMol' column
# (the default column name, spelled out here because later cells refer to it).
PandasTools.AddMoleculeColumnToFrame(
    df,
    smilesCol='CANONICAL_SMILES',
    molCol='ROMol',
)

Remove rows where RDKit failed to generate a molecule from SMILES

In [None]:
# Keep only the rows whose 'ROMol' entry is not null.
df = df[df['ROMol'].notnull()]

Extract a name from SYNONYMS column by applying a row-wise operation

In [None]:
# Derive a short name from SYNONYMS: the text before the first '('.
# Only the SYNONYMS column is needed, so map over that column directly
# instead of a row-wise DataFrame.apply, which constructs a Series for every row.
# Entries that are not strings get None.
df['name'] = df['SYNONYMS'].map(
    lambda synonyms: synonyms.split('(')[0] if isinstance(synonyms, str) else None
)

Depict first 8 mols

In [None]:
# Draw the first eight rows as a grid, four molecules per row,
# using the 'name' column for the legends.
first_eight = df.head(8)
PandasTools.FrameToGridImage(first_eight, legendsCol='name', molsPerRow=4)

Calculate some descriptors and visualize distributions

In [None]:
from rdkit.Chem import Descriptors

In [None]:
# Apply each descriptor function to every molecule and store the results as new columns.
mols = df['ROMol']
df['MW'] = mols.map(Descriptors.MolWt)
df['logP'] = mols.map(Descriptors.MolLogP)

In [None]:
#doctest: IGNORE
# Histogram of the 'MW' column; the trailing ';' suppresses the text repr of the returned object.
df['MW'].hist();

In [None]:
#doctest: IGNORE
# Histogram of the 'logP' column; the trailing ';' suppresses the text repr of the returned object.
df['logP'].hist();

Do a substructure search on a dataframe

In [None]:
# SMARTS pattern: a nitrogen attached to an aromatic six-membered ring, para to a
# sulfonyl group S(=O)(=O) whose remaining substituent may be any atom.
sulfonyl_aniline_smarts = 'Nc1ccc(S(=O)(=O)-[*])cc1'
query = Chem.MolFromSmarts(sulfonyl_aniline_smarts)

In [None]:
#doctest: IGNORE
# Draw the query pattern itself as an image.
# NOTE(review): kekulize=False is presumably needed because a molecule built from
# SMARTS may not be kekulizable — confirm.
Chem.Draw.MolToImage(query, kekulize=False)

In [None]:
# Count the rows whose molecule matches the query.
substructure_hits = df[df['ROMol'] >= query]
len(substructure_hits)

By default, RDKit highlights the matched substructures

In [None]:
# Show the synonyms and structures of the first few matching rows,
# selecting rows and columns in a single .loc step.
df.loc[df['ROMol'] >= query, ['SYNONYMS', 'ROMol']].head()

Save the table as an SD file

In [None]:
# Write the table to an SD file, passing CHEMBL_ID as the id column
# and all of the table's columns as properties.
PandasTools.WriteSDF(
    df,
    '../data/approved_drugs.sdf',
    idName='CHEMBL_ID',
    properties=list(df.columns),
)

Tutorial author: Samo Turk, Jan. 2017