In [1]:
!pip install padelpy
import padelpy

Collecting padelpy
  Downloading padelpy-0.1.14-py2.py3-none-any.whl (20.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.9/20.9 MB[0m [31m44.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: padelpy
Successfully installed padelpy-0.1.14


In [2]:
import pandas as pd
import numpy as np
from padelpy import padeldescriptor, from_smiles

In [3]:
df = pd.read_excel('alcohols.xlsx')
df.head(10)

Unnamed: 0,Name,Smiles
0,2-phenoxyethanol,OCCOC1=CC=CC=C1
1,2-bromoethanol,BrCCO
2,2-fluoroethanol,FCCO
3,methanol,CO
4,benzylalcohol,OCC1=CC=CC=C1
5,ethanol,CCO
6,1-phenylethanol,OC(C)C1=CC=CC=C1
7,2-phenylethanol,OCCC1=CC=CC=C1
8,2-methoxyethanol,OCCOC
9,2-ethoxyethanol,OCCOCC


In [None]:
# Draw.MolsToGridImage(df['Mol'][:8],molsPerRow=4,subImgSize=(200,200),legends=[x for x in df['Name'][:8]])

## **Calculate molecular descriptors from padel**

 **Descriptor Info**
*   It will provide the following descriptors/fingerprint:
1.   2D - 1444
2.   3D - 431
3.   Fingerprints - PubChem fingerprint 881 bits


**Calculate molecular descriptors using "from_smiles" function**


> The "from_smiles" function accepts a SMILES string or list of SMILES strings as an argument, and returns a Python dictionary with descriptor/fingerprint names/values as keys/values respectively - if multiple SMILES strings are supplied, "from_smiles" returns a list of dictionaries.



In [4]:
descriptors = from_smiles('OCCOC1=CC=CC=C1', descriptors=True, fingerprints=False)
descriptors

{'nAcid': '0',
 'ALogP': '0.36539999999999995',
 'ALogp2': '0.13351715999999997',
 'AMR': '43.8916',
 'apol': '22.351929999999985',
 'naAromAtom': '6',
 'nAromBond': '6',
 'nAtom': '20',
 'nHeavyAtom': '10',
 'nH': '10',
 'nB': '0',
 'nC': '8',
 'nN': '0',
 'nO': '2',
 'nS': '0',
 'nP': '0',
 'nF': '0.0',
 'nCl': '0.0',
 'nBr': '0.0',
 'nI': '0.0',
 'nX': '0.0',
 'ATS0m': '1676.2096099999987',
 'ATS1m': '1711.4315979999994',
 'ATS2m': '2026.651218999999',
 'ATS3m': '1770.1954409999985',
 'ATS4m': '1173.253965999999',
 'ATS5m': '996.7358609999997',
 'ATS6m': '702.1525629999999',
 'ATS7m': '293.0819250000002',
 'ATS8m': '32.298336',
 'ATS0v': '4131.754212232656',
 'ATS1v': '4987.449996432906',
 'ATS2v': '6286.807577470098',
 'ATS3v': '5657.446432490819',
 'ATS4v': '4223.645982427685',
 'ATS5v': '3636.1018194274247',
 'ATS6v': '2703.7759026750555',
 'ATS7v': '1289.1098006267898',
 'ATS8v': '321.0852247197065',
 'ATS0e': '154.2122',
 'ATS1e': '156.41512',
 'ATS2e': '251.0052280000001',
 'A

In [5]:
descriptors = from_smiles(df['Smiles'][0])
descriptors

{'nAcid': '0',
 'ALogP': '0.36539999999999995',
 'ALogp2': '0.13351715999999997',
 'AMR': '43.8916',
 'apol': '22.351929999999985',
 'naAromAtom': '6',
 'nAromBond': '6',
 'nAtom': '20',
 'nHeavyAtom': '10',
 'nH': '10',
 'nB': '0',
 'nC': '8',
 'nN': '0',
 'nO': '2',
 'nS': '0',
 'nP': '0',
 'nF': '0.0',
 'nCl': '0.0',
 'nBr': '0.0',
 'nI': '0.0',
 'nX': '0.0',
 'ATS0m': '1676.2096099999987',
 'ATS1m': '1711.4315979999994',
 'ATS2m': '2026.651218999999',
 'ATS3m': '1770.1954409999985',
 'ATS4m': '1173.253965999999',
 'ATS5m': '996.7358609999997',
 'ATS6m': '702.1525629999999',
 'ATS7m': '293.0819250000002',
 'ATS8m': '32.298336',
 'ATS0v': '4131.754212232656',
 'ATS1v': '4987.449996432906',
 'ATS2v': '6286.807577470098',
 'ATS3v': '5657.446432490819',
 'ATS4v': '4223.645982427685',
 'ATS5v': '3636.1018194274247',
 'ATS6v': '2703.7759026750555',
 'ATS7v': '1289.1098006267898',
 'ATS8v': '321.0852247197065',
 'ATS0e': '154.2122',
 'ATS1e': '156.41512',
 'ATS2e': '251.0052280000001',
 'A

**Generate descriptors for the whole dataset**

In [6]:
df.head()

Unnamed: 0,Name,Smiles
0,2-phenoxyethanol,OCCOC1=CC=CC=C1
1,2-bromoethanol,BrCCO
2,2-fluoroethanol,FCCO
3,methanol,CO
4,benzylalcohol,OCC1=CC=CC=C1


In [7]:
# provide list of SMILES as an arguement and the output will be list of dictionaries
descriptors_df = from_smiles(df['Smiles'].tolist(), descriptors=True, fingerprints=False)
descriptors_df

[{'nAcid': '0',
  'ALogP': '0.36539999999999995',
  'ALogp2': '0.13351715999999997',
  'AMR': '43.8916',
  'apol': '22.351929999999985',
  'naAromAtom': '6',
  'nAromBond': '6',
  'nAtom': '20',
  'nHeavyAtom': '10',
  'nH': '10',
  'nB': '0',
  'nC': '8',
  'nN': '0',
  'nO': '2',
  'nS': '0',
  'nP': '0',
  'nF': '0.0',
  'nCl': '0.0',
  'nBr': '0.0',
  'nI': '0.0',
  'nX': '0.0',
  'ATS0m': '1676.2096099999987',
  'ATS1m': '1711.4315979999994',
  'ATS2m': '2026.651218999999',
  'ATS3m': '1770.1954409999985',
  'ATS4m': '1173.253965999999',
  'ATS5m': '996.7358609999997',
  'ATS6m': '702.1525629999999',
  'ATS7m': '293.0819250000002',
  'ATS8m': '32.298336',
  'ATS0v': '4131.754212232656',
  'ATS1v': '4987.449996432906',
  'ATS2v': '6286.807577470098',
  'ATS3v': '5657.446432490819',
  'ATS4v': '4223.645982427685',
  'ATS5v': '3636.1018194274247',
  'ATS6v': '2703.7759026750555',
  'ATS7v': '1289.1098006267898',
  'ATS8v': '321.0852247197065',
  'ATS0e': '154.2122',
  'ATS1e': '156.4

In [8]:
final_df = pd.concat([df, pd.DataFrame(descriptors_df)], axis=1)

In [9]:
final_df.head()

Unnamed: 0,Name,Smiles,nAcid,ALogP,ALogp2,AMR,apol,naAromAtom,nAromBond,nAtom,...,P1s,P2s,E1s,E2s,E3s,Ts,As,Vs,Ks,Ds
0,2-phenoxyethanol,OCCOC1=CC=CC=C1,0,0.3653999999999999,0.1335171599999999,43.8916,22.351929999999985,6,6,20,...,0.7875874213342919,0.1638065969410667,0.6050491779839479,0.3682736001175868,0.3416590413487128,8.015529193353178,11.259936632609534,22.50482595657066,0.6813811320014378,1.3149818194502476
1,2-bromoethanol,BrCCO,0,0.4517000000000001,0.20403289,20.5881,10.705965,0,0,9,...,0.5591812521433479,0.3016314872177597,0.3653893487530604,0.5810243673525842,0.3966029711679034,2.491072140643751,1.7901506609171909,4.6441233958754005,0.3387718782150219,1.343016687273548
2,2-fluoroethanol,FCCO,0,0.0426,0.00181476,12.8615,8.212965,0,0,9,...,0.5727488937172263,0.2907895788693193,0.6778366353173771,0.5241377056935893,0.3507892943321852,2.389575450755787,1.6238824112950196,4.323567717995904,0.3591233405758394,1.5527636353431518
3,methanol,CO,0,-0.3580000000000002,0.1281640000000001,8.2613,5.229172000000001,0,0,6,...,0.6003988191708051,0.2123549937028557,0.55278933374474,0.3378550439864066,0.29296875,1.3955041752662944,0.544663090640938,2.0050469477132946,0.4005982287562077,1.1836131277311466
4,benzylalcohol,OCC1=CC=CC=C1,0,0.7591000000000008,0.5762328100000011,37.0125,18.456343999999994,6,6,16,...,0.6473069930912803,0.2955970785857772,0.5925623835564244,0.4189967301946937,0.2075067066306033,5.259877887074605,6.783172963930538,13.632851098171754,0.4709604896369204,1.2190658203817215


**Calculate molecular descriptors using "padeldescriptor" function**



> Alternatively, you can have more control over PaDEL-Descriptor with the command-line wrapper function. Any combination of arguments supported by PaDEL-Descriptor can be accepted by the "padeldescriptor" function.



In [10]:
# create .smi file containing smiles
df['Smiles'].to_csv('alcohols.smi', index=None, header=None)

In [17]:
padeldescriptor(mol_dir='alcohols.smi', d_file='descriptors.csv', d_2d=False, d_3d=True, fingerprints=False, retainorder=True,
                convert3d=True, #detectaromaticity=True, removesalt=True, retain3d=True,
                # standardizenitro=True, standardizetautomers=True
                )

In [18]:
final_df = pd.read_csv("descriptors.csv")
final_df.head()

Unnamed: 0,Name,TDB1u,TDB2u,TDB3u,TDB4u,TDB5u,TDB6u,TDB7u,TDB8u,TDB9u,...,P1s,P2s,E1s,E2s,E3s,Ts,As,Vs,Ks,Ds
0,AUTOGEN_alcohols_1,1.233465,2.169707,2.931186,3.776197,4.525274,5.381329,6.374012,7.238476,7.56326,...,0.787587,0.163807,0.605049,0.368274,0.341659,8.015529,11.259937,22.504826,0.681381,1.314982
1,AUTOGEN_alcohols_2,1.2835,2.203537,2.813956,2.828235,0.0,0.0,0.0,0.0,0.0,...,0.559181,0.301631,0.365389,0.581024,0.396603,2.491072,1.790151,4.644123,0.338772,1.343017
2,AUTOGEN_alcohols_3,1.213875,2.092681,2.761575,2.730703,0.0,0.0,0.0,0.0,0.0,...,0.572749,0.29079,0.677837,0.524138,0.350789,2.389575,1.623882,4.323568,0.359123,1.552764
3,AUTOGEN_alcohols_4,1.1366,1.935542,2.458364,0.0,0.0,0.0,0.0,0.0,0.0,...,0.600399,0.212355,0.552789,0.337855,0.292969,1.395504,0.544663,2.005047,0.400598,1.183613
4,AUTOGEN_alcohols_5,1.228769,2.187856,2.995196,3.818374,4.610989,5.202201,5.349996,0.0,0.0,...,0.647307,0.295597,0.592562,0.418997,0.207507,5.259878,6.783173,13.632851,0.47096,1.219066


In [25]:
final_df.iloc[0, 1:].mean()

47.353080704776964

In [19]:
desc_2d_3d = pd.concat([df, final_df.drop("Name", axis=1)], axis=1)


In [20]:
desc_2d_3d.head()

Unnamed: 0,Name,Smiles,TDB1u,TDB2u,TDB3u,TDB4u,TDB5u,TDB6u,TDB7u,TDB8u,...,P1s,P2s,E1s,E2s,E3s,Ts,As,Vs,Ks,Ds
0,2-phenoxyethanol,OCCOC1=CC=CC=C1,1.233465,2.169707,2.931186,3.776197,4.525274,5.381329,6.374012,7.238476,...,0.787587,0.163807,0.605049,0.368274,0.341659,8.015529,11.259937,22.504826,0.681381,1.314982
1,2-bromoethanol,BrCCO,1.2835,2.203537,2.813956,2.828235,0.0,0.0,0.0,0.0,...,0.559181,0.301631,0.365389,0.581024,0.396603,2.491072,1.790151,4.644123,0.338772,1.343017
2,2-fluoroethanol,FCCO,1.213875,2.092681,2.761575,2.730703,0.0,0.0,0.0,0.0,...,0.572749,0.29079,0.677837,0.524138,0.350789,2.389575,1.623882,4.323568,0.359123,1.552764
3,methanol,CO,1.1366,1.935542,2.458364,0.0,0.0,0.0,0.0,0.0,...,0.600399,0.212355,0.552789,0.337855,0.292969,1.395504,0.544663,2.005047,0.400598,1.183613
4,benzylalcohol,OCC1=CC=CC=C1,1.228769,2.187856,2.995196,3.818374,4.610989,5.202201,5.349996,0.0,...,0.647307,0.295597,0.592562,0.418997,0.207507,5.259878,6.783173,13.632851,0.47096,1.219066


In [21]:
df.head()

Unnamed: 0,Name,Smiles
0,2-phenoxyethanol,OCCOC1=CC=CC=C1
1,2-bromoethanol,BrCCO
2,2-fluoroethanol,FCCO
3,methanol,CO
4,benzylalcohol,OCC1=CC=CC=C1


In [22]:
padeldescriptor(mol_dir='phenoxyethanol.sdf', d_file='phenoxyethanol.csv', d_2d=False, d_3d=True, fingerprints=False, retainorder=True,
                #convert3d=True, #detectaromaticity=True, removesalt=True, retain3d=True,
                # standardizenitro=True, standardizetautomers=True
                )

In [23]:
pe_df = pd.read_csv("phenoxyethanol.csv")
pe_df.head()

Unnamed: 0,Name,TDB1u,TDB2u,TDB3u,TDB4u,TDB5u,TDB6u,TDB7u,TDB8u,TDB9u,...,P1s,P2s,E1s,E2s,E3s,Ts,As,Vs,Ks,Ds
0,31236,1.244017,2.198937,2.98115,3.889752,4.652716,5.534275,6.537125,7.539452,8.229534,...,0.792948,0.168216,0.600563,0.400875,0.292884,8.417605,12.096166,23.603488,0.689422,1.294321


In [24]:
pe_df.iloc[:, 1:].mean(axis=1)

0    49.336343
dtype: float64