In [1]:
!pip install numpy==1.23



In [2]:
import numpy
# Display the imported version to confirm which numpy build the kernel is using.
numpy.__version__

'1.23.0'

In [3]:
!pip install rdkit==2023.09.3
!pip install mordred



In [4]:
import rdkit
from rdkit import Chem
from rdkit.Chem import Draw, PandasTools, AllChem

import mordred
from mordred import Calculator, descriptors

import pandas as pd
from tqdm import tqdm

In [5]:
!pip install session-info



In [6]:
import session_info
# Display session_info's report for this kernel session.
session_info.show()

In [7]:
!wget https://github.com/codetodiscovery/3d-descriptors-mordred/raw/12b803ff9bc82320f048cea209459aae52a58849/alcohols.xlsx

--2024-07-17 08:00:01--  https://github.com/codetodiscovery/3d-descriptors-mordred/raw/12b803ff9bc82320f048cea209459aae52a58849/alcohols.xlsx
Resolving github.com (github.com)... 140.82.116.3
Connecting to github.com (github.com)|140.82.116.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/codetodiscovery/3d-descriptors-mordred/12b803ff9bc82320f048cea209459aae52a58849/alcohols.xlsx [following]
--2024-07-17 08:00:01--  https://raw.githubusercontent.com/codetodiscovery/3d-descriptors-mordred/12b803ff9bc82320f048cea209459aae52a58849/alcohols.xlsx
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10444 (10K) [application/octet-stream]
Saving to: ‘alcohols.xlsx.1’


2024-07-17 08:00:01 (21.8 MB/s) - ‘

In [8]:
# Load the spreadsheet fetched by the wget cell into a DataFrame.
df = pd.read_excel('alcohols.xlsx')

In [9]:
# Preview the first rows of the loaded table.
df.head()

Unnamed: 0,Name,Smiles
0,2-phenoxyethanol,OCCOC1=CC=CC=C1
1,2-bromoethanol,BrCCO
2,2-fluoroethanol,FCCO
3,methanol,CO
4,benzylalcohol,OCC1=CC=CC=C1


In [10]:
# Summarise the columns: names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    100 non-null    object
 1   Smiles  100 non-null    object
dtypes: object(2)
memory usage: 1.7+ KB


## **Generate molecular objects from SMILES with hydrogens attached**

In [11]:
# Parse every SMILES string into an RDKit molecule and add explicit hydrogens.
mol_list = []
invalid_smiles = []

for smile in df['Smiles']:
  mol = Chem.MolFromSmiles(smile)
  if mol is None:
    # MolFromSmiles reports a parse failure by returning None rather than
    # raising; collect the offenders so the error below names all of them.
    invalid_smiles.append(smile)
    continue
  mol = Chem.AddHs(mol)
  mol_list.append(mol)

if invalid_smiles:
  raise ValueError(f"Could not parse these SMILES strings: {invalid_smiles}")

# assign() replaces an existing 'Mol' column, so re-running this cell does not
# append a second 'Mol' column the way pd.concat(..., axis=1) would.
df = df.assign(Mol=mol_list)


In [12]:
# Preview the table again to check the molecule column.
df.head()

Unnamed: 0,Name,Smiles,Mol
0,2-phenoxyethanol,OCCOC1=CC=CC=C1,<rdkit.Chem.rdchem.Mol object at 0x7b61f13a9620>
1,2-bromoethanol,BrCCO,<rdkit.Chem.rdchem.Mol object at 0x7b61f13a9700>
2,2-fluoroethanol,FCCO,<rdkit.Chem.rdchem.Mol object at 0x7b61f13a97e0>
3,methanol,CO,<rdkit.Chem.rdchem.Mol object at 0x7b61f13a9850>
4,benzylalcohol,OCC1=CC=CC=C1,<rdkit.Chem.rdchem.Mol object at 0x7b61f13a9540>



## **Create calculator object to calculate descriptors**

In [13]:
# Creating a descriptor calculator with all descriptors.
# ignore_3D=False asks mordred not to skip its 3D descriptors.
# NOTE(review): those presumably need molecules that carry 3D coordinates — confirm
# against the mordred documentation.
calc = Calculator(descriptors, ignore_3D=False)

## **Calculate 2D/3D descriptors for the whole dataset**

In [14]:
# Function to generate 3D coordinates using RDKit
def generate_3d_coordinates(molecule, random_seed=42):
    """Embed a 3D conformer into ``molecule`` in place, then run MMFF on it.

    Args:
        molecule: RDKit molecule to embed; it is modified in place.
        random_seed: Seed forwarded to ``AllChem.EmbedMolecule`` so that
            repeated runs start from the same embedded geometry.

    Returns:
        None. The coordinates are stored on ``molecule`` itself.

    Raises:
        ValueError: If ``AllChem.EmbedMolecule`` reports failure by returning
            a negative conformer id.
    """
    conformer_id = AllChem.EmbedMolecule(molecule, randomSeed=random_seed)
    if conformer_id < 0:
        # Fail here with a clear message instead of letting the optimisation
        # step trip over a molecule that has no conformer.
        raise ValueError(
            "3D embedding failed: EmbedMolecule returned "
            f"{conformer_id}, so the molecule has no conformer to optimise"
        )

    # The optimiser's status code is not inspected (same as before).
    AllChem.MMFFOptimizeMolecule(molecule)

# Iterate through molecules: embed each one in 3D, then compute its descriptors.
d = []
for mol in tqdm(df['Mol']):
  generate_3d_coordinates(mol)

  # Calculate descriptors
  result = calc(mol)
  d.append(result)

# Take the column names from the calculator instead of from the last loop
# result, so this step does not depend on a leaked loop variable and still
# works when df has no rows.
descriptor_names = [str(desc) for desc in calc.descriptors]

# Reuse df's index so the axis=1 concat pairs every descriptor row with its own
# molecule even if df no longer has a default RangeIndex.
descriptor_df = pd.DataFrame(d, columns=descriptor_names, index=df.index)
final_df = pd.concat([df[['Name', 'Smiles']], descriptor_df], axis=1)

100%|██████████| 100/100 [00:21<00:00,  4.63it/s]


In [15]:
# Preview the combined table: Name and Smiles followed by the descriptor columns.
final_df.head()

Unnamed: 0,Name,Smiles,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,...,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2
0,2-phenoxyethanol,OCCOC1=CC=CC=C1,7.071068,6.54776,0,0,12.932143,2.154341,4.308683,12.932143,...,8.438366,38.130322,138.06808,6.903404,133,9,42.0,44.0,3.111111,2.5
1,2-bromoethanol,BrCCO,2.12132,2.3401,0,0,4.472136,1.618034,3.236068,4.472136,...,5.509388,22.328143,123.952377,13.772486,10,1,10.0,8.0,2.5,1.25
2,2-fluoroethanol,FCCO,2.12132,2.3401,0,0,4.472136,1.618034,3.236068,4.472136,...,5.509388,22.328143,64.032443,7.114716,10,1,10.0,8.0,2.5,1.25
3,methanol,CO,0.0,0.0,0,0,2.0,1.0,2.0,2.0,...,1.098612,7.493061,32.026215,5.337702,1,0,2.0,1.0,2.0,1.0
4,benzylalcohol,OCC1=CC=CC=C1,5.656854,5.42766,0,0,10.424292,2.135779,4.271558,10.424292,...,8.298291,35.247635,108.057515,6.753595,64,7,34.0,36.0,2.611111,2.0
