In [1]:
# This script needs to be run with a GPU machine available
from collections import Counter
from pydantic import BaseModel
from zipfile import ZipFile, ZIP_DEFLATED
from tqdm import tqdm
import pandas as pd
import numpy as np
import shutil
import ollama
import json
import sys
import os

import subprocess
# Best-effort GPU check: print the nvidia-smi report if the tool can be run.
# A failure here is reported but never stops the notebook.
print("=== Python GPU Check ===\n\n")
try:
    result = subprocess.run(['nvidia-smi'], capture_output=True, text=True)
    print(result.stdout)
    # nvidia-smi can be installed and still fail (e.g. no usable driver); in that
    # case stdout is empty and the reason is only available on stderr.
    if result.returncode != 0:
        print(f"nvidia-smi exited with code {result.returncode}: {result.stderr.strip()}\n\n")
except Exception as e:
    # Typically raised when the nvidia-smi executable is not on PATH.
    print(f"nvidia-smi failed: {e}\n\n")

# Project root. The script form derives it from __file__; inside the notebook
# the current working directory is used instead.
# root = os.path.dirname(os.path.abspath(__file__))
root = "."
src_dir = os.path.join(root, "..", "src")
sys.path.append(src_dir)  # makes the project-local `default` module importable
from default import DATAPATH, CONFIGPATH

# Pathogen under study. The script form reads the code from the command line;
# here it is fixed.
# pathogen_code = sys.argv[1]
pathogen_code = 'mtuberculosis'
pathogens_csv = os.path.join(CONFIGPATH, 'pathogens.csv')
df = pd.read_csv(pathogens_csv)
row = df.loc[df["code"] == pathogen_code]
if row.empty:
    # No row of pathogens.csv carries this code: stop right away.
    raise SystemExit(f"Unknown code: {pathogen_code}")
first_match = row.iloc[0]
pathogen = first_match["pathogen"]

# ChEMBL assay and document tables, both read from the chembl_activities folder
chembl_dir = os.path.join(DATAPATH, "chembl_activities")
assays = pd.read_csv(os.path.join(chembl_dir, "assays.csv"), low_memory=False)
docs = pd.read_csv(os.path.join(chembl_dir, "docs.csv"), low_memory=False)

# One-letter assay type code -> readable label
assay_type_map = {
    "F": "Functional",
    "B": "Binding",
    "T": "Toxicity",
    "A": "ADME",
    "P": "Physicochemical",
    "U": "Uncategorized",
}

=== Python GPU Check ===


Tue Jan 20 15:04:51 2026       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.274.02             Driver Version: 581.04       CUDA Version: 13.0     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 4090        On  | 00000000:01:00.0 Off |                  Off |
|  0%   30C    P8               7W / 450W |    456MiB / 24564MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                         

In [6]:
# Shape of the JSON object requested from the LLM for one assay. Its JSON schema
# (Parameters.model_json_schema()) is passed to ollama as the `format` argument.
# The field meanings below summarise the "Field definitions" section of the prompt.
# NOTE: documented with comments rather than a class docstring on purpose, since
# pydantic copies a model's docstring into the generated schema ("description"),
# which would change what is sent to the model.
class Parameters(BaseModel):
    # Biological species explicitly stated in the annotations (no strain identifiers)
    organism_curated: str
    # Curated target type; the prompt uses "DISCARDED" for assays to be dropped
    target_type_curated: str
    # Target name explicitly stated in the annotations, "" when absent
    target_name_curated: str
    # Target ChEMBL ID if present in the annotations, "" otherwise
    target_chembl_id_curated: str
    # Biological strain name (e.g. H37Rv); culture-collection identifiers are excluded
    strain: str
    # ATCC identifier formatted as "ATCC <number>", "" when not stated
    atcc_id: str
    # Explicitly stated mutations in one-letter form (e.g. "S450L")
    mutations: list[str]
    # Drugs for which resistance is explicitly stated
    known_drug_resistances: list[str]
    # Growth / culture medium explicitly stated
    media: str

In [15]:
# Column names: the three fields that key an assay entry, followed by the nine
# parameter fields extracted by the LLM (same names as the Parameters model).
_KEY_COLS = ["assay_id", "activity_type", "unit"]
_PARAMETER_COLS = [
    "organism_curated",
    "target_type_curated",
    "target_name_curated",
    "target_chembl_id_curated",
    "strain",
    "atcc_id",
    "mutations",
    "known_drug_resistances",
    "media",
]
COLS = _KEY_COLS + _PARAMETER_COLS

In [None]:
# Creating output directory (the per-assay JSON files are meant to be written here;
# the writer itself is still commented out further down)
print(f"Processing pathogen: {pathogen}...")
PATH_TO_OUTPUT = os.path.join(root, "..", "output", pathogen_code, "assay_parameters")
os.makedirs(PATH_TO_OUTPUT, exist_ok=True)

# Loading assay data
ASSAYS = pd.read_csv(os.path.join(root, "..", "output", pathogen_code, "assays_cleaned.csv"))

# For each assay: gather its annotations and build the extraction prompt.
# iterrows() is a generator, so tqdm needs the total to show real progress.
for idx, ASSAY in tqdm(ASSAYS.iterrows(), total=len(ASSAYS)):

    # Rows of the ChEMBL assays table for this assay (filtered once, reused below)
    assay_rows = assays[assays['chembl_id'] == ASSAY.assay_id]
    if assay_rows.empty:
        raise IndexError(f"Assay {ASSAY.assay_id} not found in assays.csv")

    # Getting doc_id and the matching rows of the documents table
    doc_id = assay_rows['doc_id'].tolist()[0]
    doc_rows = docs[docs['doc_id'] == doc_id]
    if doc_rows.empty:
        raise IndexError(f"Document {doc_id} (assay {ASSAY.assay_id}) not found in docs.csv")

    # Label -> value pairs, in the order they are listed in the prompt
    annotations = {
        "Assay ChEMBL ID": ASSAY.assay_id,
        "Assay type": assay_type_map[ASSAY.assay_type],
        "Assay organism": ASSAY.assay_organism,
        "Assay description": assay_rows['description'].tolist()[0],
        "Assay strain": assay_rows['assay_strain'].tolist()[0],
        "Assay category": assay_rows['assay_category'].tolist()[0],
        "Assay test type": assay_rows['assay_test_type'].tolist()[0],
        "Assay cell type": assay_rows['assay_cell_type'].tolist()[0],
        "Document title": doc_rows['title'].tolist()[0],
        "Document abstract": doc_rows['abstract'].tolist()[0],
        "Document journal": doc_rows['journal'].tolist()[0],
        "Document PubMed ID": doc_rows['pubmed_id'].tolist()[0],
        "Document DOI": doc_rows['doi'].tolist()[0],
        "Target type": ASSAY.target_type,
        "Target organism": ASSAY.target_organism,
        "Activity type": ASSAY.activity_type,
        "Unit": ASSAY.unit,
        "Number of activities": ASSAY.activities,
        "Number of activities with nan value": ASSAY.nan_values,
        "Number of compounds": ASSAY.cpds,
        # No trailing colon in the label: the separator is added when rendering
        "Direction of biological activity": ASSAY.direction,
        "Number of compounds being active/inactive according to text comments": ASSAY.act_flag + ASSAY.inact_flag,
    }

    # Render as a bullet list, one "- label: value" entry per line
    result = "\n\t".join(f"- {label}: {value!s}" for label, value in annotations.items())

    PROMPT = f"""
    You are an information extraction assistant specialized in analyzing biochemical data from ChEMBL associated to pathogens of global health concern. 
    Below, you will find a set of assay annotations from a single ChEMBL assay under study. 

    Your job is to return ONLY a JSON object with these keys:
    - organism_curated (string)
    - target_type_curated (string)
    - target_name_curated (string)
    - target_chembl_id_curated (string)
    - strain (string)
    - atcc_id (string)
    - mutations (array of strings)
    - known_drug_resistances (array of strings)
    - media (string)

    Return EXACTLY this JSON structure (fill values, keep keys unchanged):

    {{
    "organism_curated": "",
    "target_type_curated": "",
    "target_name_curated": "",
    "target_chembl_id_curated": "",
    "strain": "",
    "atcc_id": "",
    "mutations": [],
    "known_drug_resistances": [],
    "media": ""
    }}

    Rules:

    1. Only extract information explicitly stated in the provided assay annotations. Do NOT infer or use external background knowledge.
    2. If a field is missing or not stated, use "" for strings and [] for arrays. Do not use null.
    3. Use double quotes. Output must be valid JSON. Do not include any other keys or any extra text before or after the JSON. Do not use markdown fences.
    4. Do not output placeholders such as "unknown", "not stated", "not specified", or "N/A". Use "" or [] as specified.

    Field definitions:

    - "organism_curated": refers to the biological species explicitly stated (e.g., "Mycobacterium tuberculosis", "Homo sapiens"). 
    Do NOT include strain identifiers. Do NOT infer the species from a cell line name.
    - "target_type_curated": refers to target type. Set to "DISCARDED" if (a) the annotations explicitly indicate the assay is not a bioactivity assay (e.g., transcriptomics), 
    or (b) the annotations do not explicitly provide enough target evidence to assign a target type (i.e., no explicit target name/identifier and no explicit organism/cell line 
    assayed entity). Otherwise follow the mapping rules below.

        if target_type == UNCHECKED --> target_type_curated: SINGLE PROTEIN, ORGANISM or DISCARDED
        if target_type == NON-MOLECULAR --> target_type_curated: ORGANISM or DISCARDED
        if target_type != UNCHECKED AND target_type != NON-MOLECULAR --> target_type_curated: [same as target_type] OR DISCARDED

    - "target_name_curated": the target name explicitly stated in the annotations (e.g., "Mycobacterium tuberculosis" or "Lysine--tRNA ligase"). 
    If none is explicitly stated, leave it empty. If the assay target is a protein/complex, do NOT set target_name_curated to the organism name.
    Do NOT use target type labels as target names.
    - "target_chembl_id_curated": if provided in assay annotations, the target ChEMBL ID (e.g., CHEMBL360 for Mycobacterium tuberculosis or 
    CHEMBL3301561 for Lysine--tRNA ligase from Plasmodium falciparum). Otherwise leave it empty.
    - "strain": refers only to biological strain names (e.g., H37Rv, K12, PAO1). Do NOT include culture collection/catalog identifiers (e.g, ATCC, DSM or NCTC 
    related identifiers or catalog numbers).
    - "atcc_id": refers to the specific ATCC (American Type Culture Collection) identifier, if explicitly stated. Otherwise, leave it empty. 
    Extract only the numeric part and format exactly as "ATCC <number>". 
    - "mutations": include ONLY explicit mutations or variants stated in the text. Mutation format MUST be: one-letter amino acid + position (integer) + 
    one-letter amino acid (e.g., "S450L"). If the text uses a longer form (e.g., Ser450Leu) and the conversion is explicit/unambiguous, convert it to the one-letter format.
    - "known_drug_resistances": list drugs for which resistance is explicitly stated (e.g., ["rifampicin", "isoniazid"]). Do NOT infer resistance from mutations. 
    Only include resistances explicitly stated.
    - "media": refers to the growth or culture medium explicitly stated (e.g., Middlebrook 7H9 broth, Lowenstein–Jensen, etc.).

    Important considerations:

    - If assay_type is BINDING and target_type is UNCHECKED, target_type_curated can only be set to DISCARDED or SINGLE PROTEIN, not ORGANISM.

        Assay annotations:

    {result}"""

    # Prototype mode: stop after the first assay so the prompt can be inspected
    # and sent to the model in the next cell.
    break

Processing pathogen: Mycobacterium tuberculosis...


0it [00:00, ?it/s]

0it [00:00, ?it/s]


In [13]:
# JSON schema of the expected answer; passed as `format` so that the reply is
# requested in exactly this structure.
schema = Parameters.model_json_schema()
response = ollama.chat(
    model='gpt-oss:20b',
    messages=[{'role': 'user', 'content': PROMPT}],
    options={'temperature': 0, 'num_ctx': 12288},
    keep_alive="1h",
    format=schema,
)

# Parse the reply (raises json.JSONDecodeError if it is not valid JSON)
js = json.loads(response.message['content'])

# Some validation: the key set is taken from the model itself so it cannot drift
# from the schema, then the value types are checked against the model as well
# (pydantic raises a ValidationError on e.g. null or a non-list "mutations").
expected = set(Parameters.model_fields)
assert set(js.keys()) == expected, f"Unexpected keys: {set(js.keys())}"
Parameters.model_validate(js)

# # Write to a JSON file
# out_path = os.path.join(PATH_TO_OUTPUT, "_".join([ASSAY.assay_id, str(ASSAY.activity_type), str(ASSAY.unit)]) + "_parameters.json")
# with open(out_path, "w") as outfile:
#     json.dump(js, outfile, indent=2)

In [14]:
# Display the parsed parameters returned for the assay prompted above
js

{'organism_curated': 'Mycobacterium tuberculosis',
 'target_type_curated': 'ORGANISM',
 'target_name_curated': '',
 'target_chembl_id_curated': '',
 'strain': '',
 'atcc_id': '',
 'mutations': [],
 'known_drug_resistances': [],
 'media': 'DPPC, cholesterol, tyloxapol based media'}

In [None]:
# # Compress all JSON files in a ZIP file
# zip_path = os.path.join(root, "..", "output", pathogen_code, "assay_parameters.zip")
# with ZipFile(zip_path, "w", compression=ZIP_DEFLATED) as zipf:
#     for fname in os.listdir(PATH_TO_OUTPUT):
#         if fname.endswith(".json"):
#             full_path = os.path.join(PATH_TO_OUTPUT, fname)
#             # store inside zip without the full directory path
#             zipf.write(full_path, arcname=fname)

# # Remove the whole directory after zipping
# shutil.rmtree(PATH_TO_OUTPUT)