In [1]:
import csv, os

import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors

# Directory holding the .mol files and the id/property index file.
root_dir = '../mol_finger'
id_prop_file = os.path.join(root_dir, 'id_prop.csv')

# Load the whole index into memory as a list of rows (each row is a list of
# strings; nothing is converted to a numeric type here).
# newline='' is the mode the csv module documents for files handed to
# csv.reader, so that line endings inside quoted fields are parsed correctly.
with open(id_prop_file, newline='') as f:
    reader = csv.reader(f)
    # csv.reader yields an empty list for a blank line; drop those so a stray
    # trailing newline in the file does not produce an unusable row.
    id_prop_data = [row for row in reader if row]

In [2]:
# Every descriptor RDKit registers in Descriptors.descList; only the name
# (first element of each entry) is needed to configure the calculator.
descriptor_names = [entry[0] for entry in Descriptors.descList]

calculator = MoleculeDescriptors.MolecularDescriptorCalculator(descriptor_names)

# Parallel lists: all_file_desc[k] holds the descriptor values for the molecule
# whose target value is all_file_target[k].
all_file_desc = []
all_file_target = []
num_data = len(id_prop_data)
for mol_id, target in id_prop_data:
    mol_file = os.path.join(root_dir, mol_id + '.mol')
    mol = Chem.MolFromMolFile(mol_file)

    # MolFromMolFile signals a file it cannot parse by returning None rather
    # than raising. Stop here with the offending path instead of passing None
    # to the calculator, which would not yield meaningful descriptor values.
    if mol is None:
        raise ValueError(
            'RDKit could not parse molecule file {!r} (id {!r})'.format(mol_file, mol_id)
        )

    descriptors = calculator.CalcDescriptors(mol)

    all_file_desc.append(descriptors)
    all_file_target.append(target)

In [3]:
# Assemble the feature table: one row per entry of all_file_desc, one column
# per descriptor name, with the target values appended as the last column.
df = pd.DataFrame(data=all_file_desc, columns=descriptor_names).assign(
    target=all_file_target
)
df

Unnamed: 0,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,NumRadicalElectrons,...,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea,target
0,3.694444,3.694444,0.523148,0.523148,0.357952,76.143,64.047,76.098951,32,0,...,0,0,0,0,0,0,0,0,0,0
1,2.405417,2.405417,1.355000,1.355000,0.414571,142.246,124.102,142.145901,58,0,...,0,0,0,0,0,0,0,0,0,0
2,2.000000,2.000000,2.000000,2.000000,0.353059,46.093,38.029,46.065126,20,0,...,0,0,0,0,0,0,0,0,0,0
3,3.818102,3.818102,0.802546,0.802546,0.470034,118.224,100.080,118.145901,50,0,...,0,0,0,0,0,0,0,0,0,0
4,3.770833,3.770833,0.250000,0.250000,0.419525,74.147,62.051,74.096426,32,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
139,3.153426,3.153426,1.095463,1.095463,0.591655,158.204,148.124,158.083301,58,0,...,0,0,0,0,0,0,0,0,0,0
140,3.025950,3.025950,1.095926,1.095926,0.686507,186.258,172.146,186.114601,70,0,...,0,0,0,0,0,0,0,0,0,0
141,2.992061,2.992061,1.191280,1.191280,0.676166,184.242,172.146,184.098951,68,0,...,0,0,0,0,0,0,0,0,0,0
142,2.423611,2.423611,1.375000,1.375000,0.445190,100.185,86.073,100.112076,42,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Persist the descriptor/target table; index=False keeps the row index out of
# the written file.
df.to_csv(path_or_buf='Data_AllFea.csv', index=False)