In [3]:
# Use a conda env with ollama installed
# Run this code on a GPU machine
from collections import Counter
from zipfile import ZipFile, ZIP_DEFLATED
from tqdm import tqdm
import pandas as pd
import numpy as np
# import ollama
import pickle
import sys
import os

import subprocess
# Best-effort GPU diagnostic: dump whatever `nvidia-smi` reports to stderr.
# A failure here is only reported, never raised, so the notebook keeps running.
sys.stderr.write("=== Python GPU Check ===\n\n")
try:
    result = subprocess.run(['nvidia-smi'], capture_output=True, text=True)
    sys.stderr.write(result.stdout)
    if result.returncode != 0:
        # The tool ran but reported a failure: surface its own error output
        # and exit status instead of silently dropping them.
        sys.stderr.write(result.stderr)
        sys.stderr.write(f"nvidia-smi exited with code {result.returncode}\n\n")
except Exception as e:
    # Deliberately broad (diagnostic only); e.g. FileNotFoundError when the
    # `nvidia-smi` executable is not on PATH.
    sys.stderr.write(f"nvidia-smi failed: {e}\n\n")


# Define root directory (relative to the current working directory)
# Script variant, kept for reference:
# root = os.path.dirname(os.path.abspath(__file__))
root = "."
# sys.path.append(os.path.join(root, "..", "..", "src"))
src_dir = os.path.join(root, "..", "src")
# Guard the append so re-running this cell does not pile up duplicate entries
if src_dir not in sys.path:
    sys.path.append(src_dir)
from default import CONFIGPATH

# Read the assay and document tables from the "chembl_activities" folder under CONFIGPATH
chembl_activities_dir = os.path.join(CONFIGPATH, "chembl_activities")
assays = pd.read_csv(os.path.join(chembl_activities_dir, "assays.csv"), low_memory=False)
docs = pd.read_csv(os.path.join(chembl_activities_dir, "docs.csv"), low_memory=False)

# Single-letter assay type code -> human-readable label
assay_type_map = {
    "F": "Functional",
    "B": "Binding",
    "T": "Toxicity",
    "A": "ADME",
    "P": "Physicochemical",
    "U": "Uncategorized",
}

# Full list of pathogens (kept under its own name so it is not lost when the
# working subset below is selected)
ALL_PATHOGENS = ["Acinetobacter baumannii", "Candida albicans", "Campylobacter", "Escherichia coli", "Enterococcus faecium", "Enterobacter",
                 "Helicobacter pylori", "Klebsiella pneumoniae", "Mycobacterium tuberculosis", "Neisseria gonorrhoeae", "Pseudomonas aeruginosa",
                 "Plasmodium falciparum", "Staphylococcus aureus", "Schistosoma mansoni", "Streptococcus pneumoniae"]

# Subset of pathogens actually processed by this notebook; use
# `pathogens = list(ALL_PATHOGENS)` to process every pathogen instead.
pathogens = ["Acinetobacter baumannii", "Mycobacterium tuberculosis", "Klebsiella pneumoniae"]

def get_pathogen_code(pathogen):
    """Build the short lowercase code used to name a pathogen's files and folders.

    For names with two or more whitespace-separated words, the code is the
    first letter of the first word followed by the second word
    (e.g. "Acinetobacter baumannii" -> "abaumannii"); any further words are
    ignored. A single-word name is returned lowercased
    (e.g. "Campylobacter" -> "campylobacter").

    Surrounding whitespace is ignored in both cases, so the code never
    contains spaces. An empty or whitespace-only name yields "".

    Args:
        pathogen: Pathogen name as a string.

    Returns:
        The lowercase pathogen code.
    """
    words = pathogen.split()
    if len(words) > 1:
        return (words[0][0] + words[1]).lower()
    # Zero or one word: joining gives "" or the lone word, without stray spaces
    return "".join(words).lower()

=== Python GPU Check ===

nvidia-smi failed: [Errno 2] No such file or directory: 'nvidia-smi'



In [2]:
# Loading ChEMBL preprocessed data
# Only the row count and the 'target_type' tally are needed here, so read just
# that one column instead of materialising every column of the full table.
print("Loading ChEMBL preprocessed data...")
ChEMBL = pd.read_csv(os.path.join(root, "..", "config", "chembl_processed", "activities_preprocessed.csv"),
                     usecols=["target_type"], low_memory=False)
print(f"Original size: {len(ChEMBL)}")
print(Counter(ChEMBL['target_type']))
# Release the frame right away; it is not used after this summary
del ChEMBL

Loading ChEMBL preprocessed data...
Original size: 24040987
Counter({'SINGLE PROTEIN': 10124041, 'CELL-LINE': 5167209, 'ORGANISM': 4590514, 'UNCHECKED': 2224642, 'ADMET': 494325, 'PROTEIN COMPLEX': 319734, 'NON-MOLECULAR': 281755, 'NO TARGET': 212946, 'PROTEIN-PROTEIN INTERACTION': 128634, 'PROTEIN FAMILY': 116877, 'TISSUE': 100632, 'NUCLEIC-ACID': 95662, 'PROTEIN COMPLEX GROUP': 49036, 'UNKNOWN': 32534, 'SUBCELLULAR': 29082, 'SELECTIVITY GROUP': 27389, 'PHENOTYPE': 26130, '3D CELL CULTURE': 13075, 'CHIMERIC PROTEIN': 3459, 'LIPID': 1243, 'MACROMOLECULE': 951, 'PROTEIN NUCLEIC-ACID COMPLEX': 585, 'SMALL MOLECULE': 512, 'OLIGOSACCHARIDE': 20})


In [10]:
for pathogen in pathogens:

    print(f"Processing pathogen: {pathogen}...")
    pathogen_code = get_pathogen_code(pathogen)

    # Every per-pathogen file lives under ../output/<pathogen_code>
    pathogen_dir = os.path.join(root, "..", "output", pathogen_code)

    # Make sure the summaries folder exists
    PATH_TO_OUTPUT = os.path.join(pathogen_dir, "summaries")
    os.makedirs(PATH_TO_OUTPUT, exist_ok=True)

    # Raw assay table for this pathogen
    assays_raw_path = os.path.join(pathogen_dir, "assays_raw.csv")
    ASSAYS = pd.read_csv(assays_raw_path)

    # Raw bioactivity records for this pathogen
    activities_path = os.path.join(pathogen_dir, f"{pathogen_code}_ChEMBL_raw_data.csv.gz")
    ChEMBL_PATHOGEN = pd.read_csv(activities_path, low_memory=False)
    print(f"Number of activities: {len(ChEMBL_PATHOGEN)}")
    unique_compounds = set(ChEMBL_PATHOGEN['compound_chembl_id'])
    print(f"Number of compounds: {len(unique_compounds)}")

    # Stop after the first pathogen: only one iteration is executed
    break

Processing pathogen: Acinetobacter baumannii...
Number of activities: 45869
Number of compounds: 32898
