In [1]:
import rdkit #type: ignore
import pandas as pd  # type: ignore
 



# Read the processed bioactivity table from CSV and preview its first three rows.
df = pd.read_csv('bioactivity_processed.csv')
df.head(3)

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value
0,CHEMBL187579,Cc1noc(C)c1CN1C(=O)C(=O)c2cc(C#N)ccc21,7200.0
1,CHEMBL188487,O=C1C(=O)N(Cc2ccc(F)cc2Cl)c2ccc(I)cc21,9400.0
2,CHEMBL185698,O=C1C(=O)N(CC2COc3ccccc3O2)c2ccc(I)cc21,13500.0


In [2]:
import numpy as np # type: ignore
from rdkit import Chem
from rdkit.Chem import Descriptors, Lipinski
import pandas as pd

def lipinski(smiles):
    """Compute Lipinski descriptors for a collection of SMILES strings.

    Each entry is parsed with ``Chem.MolFromSmiles`` and four descriptors are
    calculated for the resulting molecule.

    Parameters
    ----------
    smiles : iterable of str
        SMILES strings, e.g. a list or a pandas Series (iterated by value).

    Returns
    -------
    pandas.DataFrame
        One row per input entry, in input order, with the columns
        ``MW`` (``Descriptors.MolWt``), ``logP`` (``Descriptors.MolLogP``),
        ``NumofDonors`` (``Lipinski.NumHDonors``) and ``NumofAcceptors``
        (``Lipinski.NumHAcceptors``). The frame carries a fresh 0..n-1 index,
        so row ``i`` always corresponds to the ``i``-th input entry.

    Notes
    -----
    Entries that are not strings (e.g. missing values) or that RDKit cannot
    parse (``MolFromSmiles`` returns ``None``) produce a row of NaN instead of
    aborting the whole calculation. This keeps the output the same length as
    the input, so it can still be joined column-wise with the source table.
    """
    columns = ['MW', 'logP', 'NumofDonors', 'NumofAcceptors']
    missing_row = [float('nan')] * len(columns)

    rows = []
    for entry in smiles:
        # Only strings can be parsed; anything else is treated as missing.
        mol = Chem.MolFromSmiles(entry) if isinstance(entry, str) else None
        if mol is None:
            # Placeholder row keeps positional alignment with the input.
            rows.append(list(missing_row))
            continue
        rows.append([
            Descriptors.MolWt(mol),
            Descriptors.MolLogP(mol),
            Lipinski.NumHDonors(mol),
            Lipinski.NumHAcceptors(mol),
        ])

    return pd.DataFrame(rows, columns=columns)

# Build the descriptor table from the canonical SMILES column of the loaded data.
df_lipinski = lipinski(df['canonical_smiles'])




In [3]:
# Display the descriptor table computed from the SMILES column.
df_lipinski

Unnamed: 0,MW,logP,NumofDonors,NumofAcceptors
0,281.271,1.89262,0,5
1,415.589,3.81320,0,2
2,421.190,2.66050,0,4
3,293.347,3.63080,0,3
4,338.344,3.53900,0,5
...,...,...,...,...
128,338.359,3.40102,0,5
129,296.366,3.44330,0,3
130,276.291,4.09564,0,3
131,278.307,3.29102,0,3


In [4]:
# Place the descriptor columns next to the bioactivity columns.
# Column-wise concat aligns rows on the index of the two frames.
df_combined = pd.concat([df, df_lipinski], axis='columns')

In [5]:
# Display the combined bioactivity + descriptor table.
df_combined

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,MW,logP,NumofDonors,NumofAcceptors
0,CHEMBL187579,Cc1noc(C)c1CN1C(=O)C(=O)c2cc(C#N)ccc21,7200.0,281.271,1.89262,0,5
1,CHEMBL188487,O=C1C(=O)N(Cc2ccc(F)cc2Cl)c2ccc(I)cc21,9400.0,415.589,3.81320,0,2
2,CHEMBL185698,O=C1C(=O)N(CC2COc3ccccc3O2)c2ccc(I)cc21,13500.0,421.190,2.66050,0,4
3,CHEMBL426082,O=C1C(=O)N(Cc2cc3ccccc3s2)c2ccccc21,13110.0,293.347,3.63080,0,3
4,CHEMBL187717,O=C1C(=O)N(Cc2cc3ccccc3s2)c2c1cccc2[N+](=O)[O-],2000.0,338.344,3.53900,0,5
...,...,...,...,...,...,...,...
128,CHEMBL2146517,COC(=O)[C@@]1(C)CCCc2c1ccc1c2C(=O)C(=O)c2c(C)c...,10600.0,338.359,3.40102,0,5
129,CHEMBL187460,C[C@H]1COC2=C1C(=O)C(=O)c1c2ccc2c1CCCC2(C)C,10100.0,296.366,3.44330,0,3
130,CHEMBL363535,Cc1coc2c1C(=O)C(=O)c1c-2ccc2c(C)cccc12,11500.0,276.291,4.09564,0,3
131,CHEMBL227075,Cc1cccc2c3c(ccc12)C1=C(C(=O)C3=O)[C@@H](C)CO1,10700.0,278.307,3.29102,0,3


In [6]:
# Summary statistics of the standard_value column.
df_combined['standard_value'].describe()

count        133.000000
mean       85967.130075
std       158897.319181
min           50.000000
25%        10100.000000
50%        17500.000000
75%        70000.000000
max      1000000.000000
Name: standard_value, dtype: float64

In [1]:
# Chemical space analysis via Lipinski descriptors

import seaborn as sns
import matplotlib.pyplot as plt 
sns.set(style='ticks')


plt.figure(figsize=(5.5,5.5))

sns.countplot(x='bio'