In [9]:
from lazyqsar.descriptors.chemeleon import ChemeleonDescriptor
from lazyqsar.descriptors.morgan import MorganFingerprint
from lazyqsar.descriptors.rdkit_descriptors import RDKitDescriptor
import pandas as pd
import numpy as np
import h5py
import tqdm
import sys
import os

In [2]:
# Project root. In a script this would be os.path.dirname(os.path.abspath(__file__));
# here it is ".", so every path below is relative to the current working directory.
root = "."

# Make the project's src/ folder importable, then pull the config path from it
sys.path.append(os.path.join(root, "..", "src"))
from default import CONFIGPATH

# Pathogens to process. The trailing [8:9] slice keeps only the entry at index 8
# ("Mycobacterium tuberculosis"); drop the slice to process the whole list.
pathogens = [
    "Acinetobacter baumannii",
    "Candida albicans",
    "Campylobacter",
    "Escherichia coli",
    "Enterococcus faecium",
    "Enterobacter",
    "Helicobacter pylori",
    "Klebsiella pneumoniae",
    "Mycobacterium tuberculosis",
    "Neisseria gonorrhoeae",
    "Pseudomonas aeruginosa",
    "Plasmodium falciparum",
    "Staphylococcus aureus",
    "Schistosoma mansoni",
    "Streptococcus pneumoniae",
][8:9]

def get_pathogen_code(pathogen):
    """Build the short lowercase code used to name a pathogen's folder and files.

    Names with two or more whitespace-separated words become the first letter
    of the first word followed by the whole second word, e.g.
    "Mycobacterium tuberculosis" -> "mtuberculosis"; any further words are
    ignored. Single-word names are lowercased, e.g.
    "Campylobacter" -> "campylobacter".

    Args:
        pathogen (str): Pathogen name.

    Returns:
        str: Lowercase code without surrounding whitespace; "" if the name
        contains no words.
    """
    # Split once; str.split() with no arguments also discards leading and
    # trailing whitespace, so it can never leak into the resulting code.
    words = pathogen.split()
    if len(words) > 1:
        return (words[0][0] + words[1]).lower()
    return words[0].lower() if words else ""

# Root of the output tree; per-pathogen files are read from and written to OUTPUT/<pathogen_code>/.
# NOTE: this only builds the path - the directory itself is not created here.
OUTPUT = os.path.join(root, "..", "output")

In [3]:
# NOTE: pathogen_code, SMILES and X_Morgan are overwritten on every iteration,
# and the saving cell further down runs after this loop has finished, so only
# the results of the last pathogen in `pathogens` are written to disk.
for pathogen in pathogens:

    # Read the preprocessed ChEMBL activity table for this pathogen
    pathogen_code = get_pathogen_code(pathogen)
    print(f"Loading ChEMBL preprocessed data for {pathogen_code}...")
    ChEMBL = pd.read_csv(
        os.path.join(OUTPUT, pathogen_code, f"{pathogen_code}_ChEMBL_data.csv"),
        low_memory=False,
    )
    print(f"Number of activities for {pathogen_code}: {len(ChEMBL)}")
    print(f"Number of compounds for {pathogen_code}: {len(set(ChEMBL['compound_chembl_id']))}")

    # # Create output directory
    # os.makedirs(os.path.join(OUTPUT, pathogen_code, "descriptors"), exist_ok=True)

    # Map every compound ID to a SMILES string (for an ID that appears on several
    # rows the last row wins), then list [ID, SMILES] pairs ordered by ID
    ChEMBL_id_to_SMILES = dict(zip(ChEMBL['compound_chembl_id'], ChEMBL['canonical_smiles']))
    ids = sorted(ChEMBL_id_to_SMILES.keys())
    SMILES = [[chembl_id, ChEMBL_id_to_SMILES[chembl_id]] for chembl_id in ids]

    # Morgan fingerprints for the SMILES column of the pairs above
    # (presumably one output row per input SMILES, in the same order - TODO confirm)
    print("Calculating Morgan Fingerprints...")
    X_Morgan = MorganFingerprint().transform([smiles for _, smiles in SMILES])

    # # Calculate rdkit
    # print("Calculating rdkit descriptors...")
    # X_rdkit = RDKitDescriptor().transform([i[1] for i in SMILES])


Loading ChEMBL preprocessed data for mtuberculosis...
Number of activities for mtuberculosis: 714221
Number of compounds for mtuberculosis: 132378
Calculating Morgan Fingerprints...


Transforming Morgan descriptors in chunks of 1000: 100%|██████████| 2/2 [00:28<00:00, 14.45s/it]


In [11]:
print("Saving results to H5 file...")

# Destination file inside the pathogen's output folder, and the UTF-8 string
# dtype used to store the [ID, SMILES] pairs
h5_path = os.path.join(OUTPUT, pathogen_code, "descriptors.h5")
dt = h5py.string_dtype(encoding='utf-8')

# Mode "w" creates the file, replacing any existing descriptors.h5
with h5py.File(h5_path, "w") as f:
    f.create_dataset("SMILES", data=SMILES, dtype=dt)
    # Stored as int8 to keep the file small
    # NOTE(review): assumes every fingerprint value fits in int8 (<= 127) - confirm
    f.create_dataset("X_Morgan", data=X_Morgan.astype(np.int8))
    # f.create_dataset("X_rdkit", data=X_rdkit)

Saving results to H5 file...


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int8)