In [3]:
from lazyqsar.descriptors.chemeleon import ChemeleonDescriptor
from lazyqsar.descriptors.morgan import MorganFingerprint
from lazyqsar.descriptors.rdkit_descriptors import RDKitDescriptor
import pandas as pd
import tqdm
import sys
import os

In [4]:
# Define root directory
# root = os.path.dirname(os.path.abspath(__file__))
root = "."
sys.path.append(os.path.join(root, "..", "src"))
from default import CONFIGPATH

# List of pathogens to process
pathogens = ["Acinetobacter baumannii", "Candida albicans", "Campylobacter", "Escherichia coli", "Enterococcus faecium", "Enterobacter",
             "Helicobacter pylori", "Klebsiella pneumoniae", "Mycobacterium tuberculosis", "Neisseria gonorrhoeae", "Pseudomonas aeruginosa",
             "Plasmodium falciparum", "Staphylococcus aureus", "Schistosoma mansoni", "Streptococcus pneumoniae"][8:9]

def get_pathogen_code(pathogen):
    return str(pathogen.split()[0][0] + pathogen.split()[1]).lower() if len(pathogen.split()) > 1 else pathogen.lower()

# Create output directory
OUTPUT = os.path.join(root, "..", "output")

In [5]:
for pathogen in pathogens:

    # Loading pathogen data
    pathogen_code = get_pathogen_code(pathogen)
    print(f"Loading ChEMBL preprocessed data for {pathogen_code}...")
    ChEMBL = pd.read_csv(os.path.join(OUTPUT, pathogen_code, f"{pathogen_code}_ChEMBL_data.csv"), low_memory=False)
    print(f"Number of activities for {pathogen_code}: {len(ChEMBL)}")
    print(f"Number of compounds for {pathogen_code}: {len(set(ChEMBL['compound_chembl_id']))}")


Loading ChEMBL preprocessed data for mtuberculosis...
Number of activities for mtuberculosis: 714221
Number of compounds for mtuberculosis: 132378


In [6]:
# Create output directory
os.makedirs(os.path.join(OUTPUT, pathogen_code, "descriptors"), exist_ok=True)

In [9]:
# ChEMBL ID to SMILES
ChEMBL_id_to_SMILES = {i:j for i,j in zip(ChEMBL['compound_chembl_id'], ChEMBL['canonical_smiles'])}
ids = sorted(ChEMBL_id_to_SMILES)
SMILES = [[i, ChEMBL_id_to_SMILES[i]] for i in ids]

# Calculate Morgan
X_Morgan = MorganFingerprint().transform([i[1] for i in SMILES])

# Calculate rdkit
X_Rdkit = RDKitDescriptor().transform([i[1] for i in SMILES])

Transforming Morgan descriptors in chunks of 1000: 100%|██████████| 2/2 [00:28<00:00, 14.49s/it]


In [11]:
X_Morgan.shape, X_Rdkit.shape

((132378, 2048), (132378, 217))

In [12]:
SMILES

[['CHEMBL10', 'C[S+]([O-])c1ccc(-c2nc(-c3ccc(F)cc3)c(-c3ccncc3)[nH]2)cc1'],
 ['CHEMBL100129', 'O=[N+]([O-])c1c[nH]cc1-c1cccc(Cl)c1Cl'],
 ['CHEMBL100294', 'OCc1cn(Cc2ccccc2Cl)cc1-c1ccc(Cl)cc1'],
 ['CHEMBL100567', 'CCOC(=O)c1cn(Cc2ccccc2)cc1-c1ccc(OC)cc1'],
 ['CHEMBL100633', 'O=C(Nc1ccc(Cl)c(Cl)c1)c1cc(Cl)ccc1O'],
 ['CHEMBL100974', 'O=[N+]([O-])c1c[nH]cc1-c1ccccc1Cl'],
 ['CHEMBL101165', 'Cc1ccnc(-c2cc(C)ccn2)c1'],
 ['CHEMBL101253', 'Clc1ccc(Nc2nnc(Cc3ccncc3)c3ccccc23)cc1'],
 ['CHEMBL101298', 'O=[N+]([O-])c1c[nH]cc1-c1ccc(Cl)cc1'],
 ['CHEMBL101307', 'NC(=O)c1c[nH]cc1-c1ccc(Cl)cc1'],
 ['CHEMBL101543', 'COc1ccc(-c2cn(C)cc2[N+](=O)[O-])c(Cl)c1'],
 ['CHEMBL101815', 'O=[N+]([O-])c1cc2cccnc2c2ncccc12'],
 ['CHEMBL101892', 'NC(=O)c1cn([C@@H]2O[C@H](CO)[C@@H](O)[C@H]2O)c2ncnc(N)c12'],
 ['CHEMBL102083', 'CCOC(=O)c1cn(C)cc1-c1ccc(OC)cc1'],
 ['CHEMBL102425', 'Cc1cn([C@H]2C[C@H](O)[C@@H](CN=[N+]=[N-])O2)c(=O)[nH]c1=O'],
 ['CHEMBL102443', 'O=C(CBr)c1ccccn1'],
 ['CHEMBL102454', 'C[C@@H]1CC[C@H](n2cc(Br)