In [1]:
import json
import os
import sys
from collections import Counter
from zipfile import ZipFile, ZIP_DEFLATED

import numpy as np
import ollama
import pandas as pd
from pydantic import BaseModel
from tqdm import tqdm

# Define root directory
# `root` is relative, so every path built from it resolves against the current
# working directory. The commented-out line below is the script-style variant.
# root = os.path.dirname(os.path.abspath(__file__))
root = "."
# Make ../src importable; the `default` import on the next line relies on this
# (presumably `default.py` lives in ../src -- confirm).
sys.path.append(os.path.join(root, "..", "src"))
from default import CONFIGPATH

# Load assays and docs information
# Both tables are read from CONFIGPATH/chembl_activities. Further down, `assays` is
# filtered by `chembl_id` and `docs` by `doc_id` to annotate each assay.
assays = pd.read_csv(os.path.join(CONFIGPATH, "chembl_activities", "assays.csv"), low_memory=False)
docs = pd.read_csv(os.path.join(CONFIGPATH, "chembl_activities", "docs.csv"), low_memory=False)
# Single-letter assay type code -> human-readable label.
assay_type_map = {"F": "Functional", "B": "Binding", "T": "Toxicity", "A": "ADME", "P": "Physicochemical", "U": "Uncategorized"}

# List of pathogens
# NOTE: the 15-item list below is immediately overwritten by the 3-item list that
# follows it, so only those three pathogens are iterated by later cells.
pathogens = ["Acinetobacter baumannii", "Candida albicans", "Campylobacter", "Escherichia coli", "Enterococcus faecium", "Enterobacter",
             "Helicobacter pylori", "Klebsiella pneumoniae", "Mycobacterium tuberculosis", "Neisseria gonorrhoeae", "Pseudomonas aeruginosa",
             "Plasmodium falciparum", "Staphylococcus aureus", "Schistosoma mansoni", "Streptococcus pneumoniae"]
pathogens = ["Acinetobacter baumannii", "Mycobacterium tuberculosis", "Klebsiella pneumoniae"]

def get_pathogen_code(pathogen):
    """Build a short lowercase code for a pathogen name.

    A name with at least two whitespace-separated words becomes the first letter
    of the first word followed by the whole second word, lowercased
    (e.g. "Escherichia coli" -> "ecoli"); further words are ignored.
    Any other name is returned lowercased, otherwise unchanged.
    """
    words = pathogen.split()
    if len(words) <= 1:
        return pathogen.lower()
    genus, species = words[0], words[1]
    return str(genus[0] + species).lower()

In [2]:
# Sanity check on the preprocessed ChEMBL activities table: print the number of
# rows and the distribution of `target_type`.
# Both figures only need the `target_type` column, so only that column is parsed
# (usecols). The table has ~24M rows; reading every column just to count rows
# wastes a large amount of memory and time. The printed output is unchanged.
print("Loading ChEMBL preprocessed data...")
ChEMBL = pd.read_csv(
    os.path.join(root, "..", "config", "chembl_processed", "activities_preprocessed.csv"),
    usecols=["target_type"],
    low_memory=False,
)
print(f"Original size: {len(ChEMBL)}")
print(Counter(ChEMBL['target_type']))
# Free the frame right away; later cells do not use it.
del ChEMBL

Loading ChEMBL preprocessed data...
Original size: 24040987
Counter({'SINGLE PROTEIN': 10124041, 'CELL-LINE': 5167209, 'ORGANISM': 4590514, 'UNCHECKED': 2224642, 'ADMET': 494325, 'PROTEIN COMPLEX': 319734, 'NON-MOLECULAR': 281755, 'NO TARGET': 212946, 'PROTEIN-PROTEIN INTERACTION': 128634, 'PROTEIN FAMILY': 116877, 'TISSUE': 100632, 'NUCLEIC-ACID': 95662, 'PROTEIN COMPLEX GROUP': 49036, 'UNKNOWN': 32534, 'SUBCELLULAR': 29082, 'SELECTIVITY GROUP': 27389, 'PHENOTYPE': 26130, '3D CELL CULTURE': 13075, 'CHIMERIC PROTEIN': 3459, 'LIPID': 1243, 'MACROMOLECULE': 951, 'PROTEIN NUCLEIC-ACID COMPLEX': 585, 'SMALL MOLECULE': 512, 'OLIGOSACCHARIDE': 20})


In [3]:
def _first_value(rows, column):
    """Return the first value of `column` in the already-filtered frame `rows`."""
    return rows[column].tolist()[0]


# Build the plain-text annotation block (`result`) for an assay.
# The names `ASSAY`, `result` and `PATH_TO_OUTPUT` stay in the notebook namespace
# after this cell finishes.
for pathogen in pathogens:

    # Creating output directory
    print(f"Processing pathogen: {pathogen}...")
    pathogen_code = get_pathogen_code(pathogen)
    PATH_TO_OUTPUT = os.path.join(root, "..", "output", pathogen_code, "parameters")
    os.makedirs(PATH_TO_OUTPUT, exist_ok=True)

    # Loading assay data; only assays with more than 50 compounds are kept
    ASSAYS = pd.read_csv(os.path.join(root, "..", "output", pathogen_code, "assays_cleaned.csv"))
    ASSAYS = ASSAYS[ASSAYS['cpds'] > 50].reset_index(drop=True)

    # For each assay
    for idx, ASSAY in ASSAYS.iterrows():

        # Filter the ChEMBL assay table once instead of once per extracted field
        assay_rows = assays[assays['chembl_id'] == ASSAY.assay_id]
        if assay_rows.empty:
            raise IndexError(f"Assay {ASSAY.assay_id} not found in the ChEMBL assays table")

        # Getting doc_id, then filter the document table once as well
        doc_id = _first_value(assay_rows, 'doc_id')
        doc_rows = docs[docs['doc_id'] == doc_id]
        if doc_rows.empty:
            raise IndexError(f"Document {doc_id} (assay {ASSAY.assay_id}) not found in the ChEMBL docs table")

        # Insertion order defines the order of the lines in `result`
        annotations = {
            "Assay ChEMBL ID": ASSAY.assay_id,
            "Assay type": assay_type_map[ASSAY.assay_type],
            "Assay organism": ASSAY.assay_organism,
            "Assay description": _first_value(assay_rows, 'description'),
            "Assay strain": _first_value(assay_rows, 'assay_strain'),
            "Assay category": _first_value(assay_rows, 'assay_category'),
            "Assay test type": _first_value(assay_rows, 'assay_test_type'),
            "Assay cell type": _first_value(assay_rows, 'assay_cell_type'),
            "Document title": _first_value(doc_rows, 'title'),
            "Document abstract": _first_value(doc_rows, 'abstract'),
            "Document journal": _first_value(doc_rows, 'journal'),
            "Document PubMed ID": _first_value(doc_rows, 'pubmed_id'),
            "Document DOI": _first_value(doc_rows, 'doi'),
            "Target type": ASSAY.target_type,
            "Target organism": ASSAY.target_organism,
            "Activity type": ASSAY.activity_type,
            "Unit": ASSAY.unit,
            "Number of activities": ASSAY.activities,
            "Number of activities with nan value": ASSAY.nan_values,
            "Number of compounds": ASSAY.cpds,
            # The label used to end with ":", which rendered as "::" once joined below
            "Direction of biological activity": ASSAY.direction,
            "Number of compounds being active/inactive according to activity_comment": ASSAY.activity_comment_counts,
            "Number of compounds being active/inactive according to standard_text": ASSAY.standard_text_count,
        }

        # One "label: value" line per annotation
        result = "\n".join(f"{label}: {value!s}" for label, value in annotations.items())

        # Stop after the first assay ...
        break
    # ... of the first pathogen, so exactly one assay is processed per run
    break

Processing pathogen: Acinetobacter baumannii...


In [None]:
# Extract structured assay parameters with a local LLM served by Ollama and store
# them as JSON in the per-pathogen output directory.
#
# Inputs taken from the notebook namespace (left behind by the assay loop cell):
#   result          -- "label: value" annotation text for the assay
#   ASSAY           -- the assay row that `result` was built from
#   PATH_TO_OUTPUT  -- directory the JSON file is written to
input_data = result
assay_id = ASSAY.assay_id
# NOTE(review): this stores the raw assay type code; use
# assay_type_map[ASSAY.assay_type] if the readable label is wanted -- confirm.
assay_type = ASSAY.assay_type
act_type = ASSAY.activity_type
unit = ASSAY.unit

# The key list in the prompt must match the fields of `Parameters` below: the
# schema is passed to Ollama as the response format and the key check further down
# rejects anything else. `target_type` used to be requested here although the
# schema does not define it; it was dropped from the prompt so the three agree.
PROMPT = f"""
You are an information extraction assistant specialized in analyzing biochemical data.

Return ONLY a JSON object with these keys:
- organism (string)
- strain (string)
- mutations (array of strings)
- known_drug_resistances (array of strings)
- media (string)

    Rules:

- If a field is missing or not stated: use "" for strings and [] for arrays.
- Do not include any other keys or any extra text before or after the JSON.
- Do not use markdown fences.
- "Mutations" should include specific genetic variants or engineered changes if mentioned; otherwise [].
- "Known drug resistances" should list drug resistances of the strain used in the assay; if only general mentions exist, use [].
- "Media" refers to the growth or culture medium (e.g., Middlebrook 7H9 broth, Lowenstein–Jensen, etc.).

All available assay annotations are enumerated below:
{input_data}

"""


class Parameters(BaseModel):
    """Fields the model has to return; its JSON schema is sent as the response format."""
    organism: str
    strain: str
    mutations: list[str]
    known_drug_resistances: list[str]
    media: str


schema = Parameters.model_json_schema()

response = ollama.chat(
    model='gpt-oss:20b',
    messages=[{'role': 'user', 'content': PROMPT}],
    options={'temperature': 0, 'num_ctx': 12288},
    keep_alive="1h",
    format=schema,
)

# Parse JSON safely
js = json.loads(response.message['content'])

# Some validation: the returned keys must be exactly the schema fields
expected = set(Parameters.model_fields)
assert set(js.keys()) == expected, f"Unexpected keys: {set(js.keys())}"

# Add metadata
js["assay_id"] = assay_id
js["assay_type"] = assay_type
js["activity_type"] = act_type
js["unit"] = unit

# Write to a JSON file
# str() keeps the join from failing when a part is not a string (e.g. a missing unit
# read from CSV as NaN); for string parts the file name is unchanged.
file_stem = "_".join(str(part) for part in (assay_id, act_type, unit))
out_path = os.path.join(PATH_TO_OUTPUT, file_stem + "_parameters.json")
with open(out_path, "w", encoding="utf-8") as outfile:
    json.dump(js, outfile, indent=2)