In [7]:
# Use a conda env with ollama installed
# Run this code on a GPU machine
from collections import Counter
from zipfile import ZipFile, ZIP_DEFLATED
from tqdm import tqdm
import pandas as pd
import numpy as np
import random
import ollama
import pickle
import sys
import os

alpha = 2

# Define root directory
# root = os.path.dirname(os.path.abspath(__file__))
root = "/aloy/home/acomajuncosa/Ersilia/chembl-antimicrobial-tasks/scripts/08_get_assay_descriptions"
sys.path.append(os.path.join(root, "..", "..", "src"))
from default import CONFIGPATH

# Load pickle
ASSAYS_SUBSET = pickle.load(open(os.path.join(root, "..", "..", "tmp", "assays.pkl"), "rb"))[alpha]

SYSTEM = f"""
You are a ChEMBL biodata curator. Your task is to write a complete, accurate and standardized description of a given biological assay.

Formatting instructions:
- The description must be structured into three paragraphs (enumerated below), each of 80-120 words.
- Each paragraph must begin with a bold markdown title in the exact format:
  **1. Assay description** \newline
  **2. Outcome interpretation** \newline
  **3. Results and insights** \newline
- The assay description paragraph must explain the objective of the assay, the experimental system, and methodology. Specify the biological target and target type, \
    the pathogen under study (and the corresponding strain, if available), the assay format (e.g., cell-based, binding), the detection method, and any relevant experimental \
    conditions (e.g., temperature, compound concentration).
- The outcome interpretation paragraph must describe how assay outputs are measured and interpreted. Specify how results relate to biological activity or target modulation, \
    controls, reference compounds, signal thresholds and normalization steps. Identify the direction of the biological activity: (-1) if lower values lead to higher activity \
    e.g., IC50 or percentage of survival; (+1) if higher values result in higher activity e.g., percentage of growth inhibition or percentage of effect; (0) if it’s \
    inconclusive or not trivial, e.g., clearance or solubility). Notice that artefacts may eventually appear e.g., data with percentages of growth inhibition < 0% or > 100%. \
    Consider those artefacts as noise.  
- The results and insights paragraph must summarize typical activity ranges, notable behaviors (e.g., agonists, inhibitors), data quality and curation notes that support \
    integration and reproducibility. Highlight meaningful observations from the distribution of activity data. It must be coherent with the outcome interpretation paragraph \
    (i.e. the direction of the biological activity). Additionally, elaborate on the chemical diversity of the assay compounds, basing your description on the number of observed \
    clusters at different ECFP4 Tanimoto similarity cut-offs (e.g., if 10 compounds lead to 10 clusters at a 0.3 ECFP4 Tanimoto similarity cut-off, the assay is \
    chemically diverse. On the contrary, if 10 compounds lead to a single cluster at a ECFP4 0.85 Tanimoto similarity cut-off, the set is probably a chemical series). \
    Finally, you will be provided 30 sampled smiles maximum (top10, bottom10 and 10 randomly selected) from the assay with their corresponding activity values: try to identify \
    common scaffolds or functional groups related with antimicrobial activity (e.g., quinolones or sulfonamides), particularly among active compounds (notice \
    that actives will have lower values if direction is (-1) or higher if direction is (+1)). Interpret the results accordingly.  
- Separate paragraphs with a single blank line.
- Use only standard ASCII spacing for all numbers, units, and symbols.
- Insert commas in numbers when necessary (e.g., 1,000; 100,000).
- Do not insert non-breaking spaces, narrow spaces, or special typographic characters.
- Do not use tables or hidden formatting.
- Use scientific and formal language.
- Avoid speculation, informal expressions or fabricated information.
- If any relevant data is missing (reported as ‘nan’), state “not reported” rather than inventing details.
- Do NOT write Q&A, lists, bullets, or add any heading besides the three above.
- Think as much as needed.

"""

In [None]:
# Load assays and docs information
assays = pd.read_csv(os.path.join(CONFIGPATH, "chembl_activities", "assays.csv"), low_memory=False)
docs = pd.read_csv(os.path.join(CONFIGPATH, "chembl_activities", "docs.csv"), low_memory=False)
assay_type_map = {"F": "Functional", "B": "Binding", "T": "Toxicity", "A": "ADME", "P": "Physicochemical", "U": "Uncategorized"}

def get_pathogen_code(pathogen):
    return str(pathogen.split()[0][0] + pathogen.split()[1]).lower() if len(pathogen.split()) > 1 else pathogen.lower()

# For each assay
for ASSAY in ASSAYS_SUBSET:

    # Get data
    assay_id, assay_type, assay_organism, doc_chembl_id, target_type, target_chembl_id, target_organism, activity_type, unit, activities, nan_values, cpds = ASSAY
    doc_id = assays[assays['chembl_id'] == assay_id]['doc_id'].tolist()[0]
    pathogen_code = get_pathogen_code(assay_organism)

    # Get clusters info and path to output
    CLUSTERS_INFO = pd.read_csv(os.path.join(root, "..", "..", "output", pathogen_code, 'assays_clusters.csv'))
    PATH_TO_OUTPUT = os.path.join(root, "..", "..", "output", pathogen_code)

    # Loading ChEMBL data for that pathogen
    print(f"Loading ChEMBL preprocessed data for {pathogen_code}...")
    ChEMBL = pd.read_csv(os.path.join(root, "..", "..", "output", pathogen_code, f"{pathogen_code}_ChEMBL_data.csv"), low_memory=False)
    print(f"Number of activities for {pathogen_code}: {len(ChEMBL)}")
    print(f"Number of compounds for {pathogen_code}: {len(set(ChEMBL['compound_chembl_id']))}")

    # Getting ChEMBL bioactivities, compounds and clusters
    if type(unit) == str:
        assay_activities = ChEMBL[(ChEMBL['assay_chembl_id'] == assay_id) & (ChEMBL['activity_type'] == activity_type) & (ChEMBL['unit'] == unit)]["value"].astype(float).tolist()
        compounds = ChEMBL[(ChEMBL['assay_chembl_id'] == assay_id) & (ChEMBL['activity_type'] == activity_type) & (ChEMBL['unit'] == unit)]['canonical_smiles'].tolist()
        relations = ChEMBL[(ChEMBL['assay_chembl_id'] == assay_id) & (ChEMBL['activity_type'] == activity_type) & (ChEMBL['unit'] == unit)]['relation'].tolist()
        clusters = CLUSTERS_INFO[(CLUSTERS_INFO['assay_id'] == assay_id) & (CLUSTERS_INFO['activity_type'] == activity_type) & (CLUSTERS_INFO['unit'] == unit)]
    else:
        assay_activities = ChEMBL[(ChEMBL['assay_chembl_id'] == assay_id) & (ChEMBL['activity_type'] == activity_type) & (ChEMBL['unit'].isna())]["value"].astype(float).tolist()
        compounds = ChEMBL[(ChEMBL['assay_chembl_id'] == assay_id) & (ChEMBL['activity_type'] == activity_type) & (ChEMBL['unit'].isna())]['canonical_smiles'].tolist()
        relations = ChEMBL[(ChEMBL['assay_chembl_id'] == assay_id) & (ChEMBL['activity_type'] == activity_type) & (ChEMBL['unit'].isna())]['relation'].tolist()
        clusters = CLUSTERS_INFO[(CLUSTERS_INFO['assay_id'] == assay_id) & (CLUSTERS_INFO['activity_type'] == activity_type) & (CLUSTERS_INFO['unit'].isna())]

    # Get cluster data
    clusters = clusters[["clusters_0.3", "clusters_0.6", "clusters_0.85"]]
    assert len(clusters) == 1
    clusters = clusters.values[0]

    # Clean unit string
    if type(unit) == str:
        unit = unit.replace('/', 'FwdS')
        unit = unit.replace(" ", "__")

    # Get compounds
    compounds_notnans = [f"{i} --> {activity_type} {j} {k} {unit}" for i,j,k in zip(compounds, relations, assay_activities) if np.isnan(k) == False]
    compounds_notnans = sorted(compounds_notnans, key=lambda x: float(x.split()[-2]))[::-1]
    compounds_nans = [f"{i} --> {activity_type} {j} {k} {unit}" for i,j,k in zip(compounds, relations, assay_activities) if np.isnan(k) == True]
    COMPOUNDS = []
    if len(compounds_notnans) >= 30:
        COMPOUNDS.extend(compounds_notnans[:10] + random.sample(compounds_notnans[10:-10], 10) + compounds_notnans[-10:])
    elif len(compounds_notnans) + len(compounds_nans) > 30:
        COMPOUNDS.extend(compounds_notnans + random.sample(compounds_nans, 30 - len(compounds_notnans)))
    else:
        COMPOUNDS.extend(compounds_notnans + compounds_nans)  

    # Getting activities that are nans
    assay_activities_nans = [i for i in assay_activities if np.isnan(i)]
    assay_activities = [i for i in assay_activities if np.isnan(i) == False]
    if len(assay_activities) == 0:
        p1, p25, mean, median, p75, p99 = [np.nan] * 6
    else:
        p1 = round(np.percentile(assay_activities, 1), 3)
        p25 = round(np.percentile(assay_activities, 25), 3)
        mean = round(np.mean(assay_activities), 3)
        median = round(np.percentile(assay_activities, 50), 3)
        p75 = round(np.percentile(assay_activities, 75), 3)
        p99 = round(np.percentile(assay_activities, 99), 3)

        result = {
            "Assay ChEMBL ID": assay_id,
            "Assay type": assay_type_map[assay_type],
            "Assay organism": assay_organism,
            "Assay description": assays[assays['chembl_id'] == assay_id]['description'].tolist()[0],
            "Assay strain": assays[assays['chembl_id'] == assay_id]['assay_strain'].tolist()[0],
            "Assay category": assays[assays['chembl_id'] == assay_id]['assay_category'].tolist()[0],
            "Assay test type": assays[assays['chembl_id'] == assay_id]['assay_test_type'].tolist()[0],
            "Assay cell type": assays[assays['chembl_id'] == assay_id]['assay_cell_type'].tolist()[0],
            "Document title": docs[docs['doc_id'] == doc_chembl_id]['title'].tolist()[0],
            "Document abstract": docs[docs['doc_id'] == doc_chembl_id]['abstract'].tolist()[0],
            "Document journal": docs[docs['doc_id'] == doc_chembl_id]['journal'].tolist()[0],
            "Document PubMed ID": docs[docs['doc_id'] == doc_chembl_id]['pubmed_id'].tolist()[0],
            "Document DOI": docs[docs['doc_id'] == doc_chembl_id]['doi'].tolist()[0],
            "Target type": target_type,
            "Target organism": target_organism,
            "Activity type": activity_type,
            "Unit": unit,
            "Number of activities": len(assay_activities),
            "Number of activities with nan value": len(assay_activities_nans),
            "Number of compounds": cpds,
            "Activity stats": {
                "Percentile 1": p1,
                "Percentile 25": p25,
                "Mean": mean,
                "Median": median,
                "Percentile 75": p75,
                "Percentile 99": p99},
            "Relation stats": dict(Counter(relations)),
            "Number of compound clusters at a ECFP4 Tanimoto similarity cut-off of 0.3": clusters[0],
            "Number of compound clusters at a ECFP4 Tanimoto similarity cut-off of 0.6": clusters[1],
            "Number of compound clusters at a ECFP4 Tanimoto similarity cut-off of 0.85:": clusters[2],
            "Example smiles": "\n" + "\n".join(COMPOUNDS)
        }

        result = "\n".join([i + ": " + str(result[i]) for i in result])
        USER = f"""Below you will find enumerated annotations from the assay under study.\n\n{result}\n\nUsing the information provided, return a standardized description for the assay."""

        # Print data
        with open(os.path.join(PATH_TO_OUTPUT, "descriptions", f"{assay_id}_{activity_type}_{unit}_input.txt"), "w") as f:
            f.write(USER)
        
        # Non streaming call
        response = ollama.generate(model='gpt-oss:20b', prompt=SYSTEM + USER, stream=False, think=True)

        # Print response
        with open(os.path.join(PATH_TO_OUTPUT, "descriptions", f"{assay_id}_{activity_type}_{unit}_output.txt"), "w") as f:
            f.write(response.response)

        # Create a zip that bundles both generated files for this assay
        base = f"{assay_id}_{activity_type}_{unit}"
        in_path = os.path.join(PATH_TO_OUTPUT, "descriptions", f"{base}_input.txt")
        out_path = os.path.join(PATH_TO_OUTPUT, "descriptions", f"{base}_output.txt")
        zip_path = os.path.join(PATH_TO_OUTPUT, "descriptions", f"{base}.zip")

        with ZipFile(zip_path, "w", compression=ZIP_DEFLATED, compresslevel=9) as zf:
            zf.write(in_path, arcname=f"{base}_input.txt")
            zf.write(out_path, arcname=f"{base}_output.txt")

        # Remove text files
        os.remove(in_path)
        os.remove(out_path)

        print(f"✓ Completed {assay_id} - {activity_type} - {unit}")

Loading ChEMBL preprocessed data for abaumannii...
Number of activities for abaumannii: 45869
Number of compounds for abaumannii: 32898


IndexError: list index out of range

In [9]:
docs

Unnamed: 0,doc_id,journal,year,volume,issue,first_page,last_page,pubmed_id,doi,chembl_id,title,doc_type,authors,abstract,patent_id,ridx,src_id,chembl_release_id,contact
0,-1,,,,,,,,,CHEMBL1158643,Unpublished dataset,DATASET,,,,CLD0,0,7,
1,1,J Med Chem,2004.0,47,1,1,9,14695813.0,10.1021/jm030283g,CHEMBL1139451,The discovery of ezetimibe: a view from outsid...,PUBLICATION,Clader JW.,,,CLD0,1,1,
2,2,J Med Chem,2004.0,47,1,10,13,14695814.0,10.1021/jm034189b,CHEMBL1148466,Self-association of okadaic acid upon complexa...,PUBLICATION,"Daranas AH, Fernández JJ, Morales EQ, Norte M,...",Okadaic acid (OA) is a toxin responsible for d...,,CLD0,1,1,
3,3,J Med Chem,2004.0,47,1,101,109,14695824.0,10.1021/jm030287l,CHEMBL1139452,Synthesis and structure-activity relationships...,PUBLICATION,"Cho H, Murakami K, Nakanishi H, Fujisawa A, Is...",A variety of novel heterocyclic compounds havi...,,CLD0,1,1,
4,4,J Med Chem,2004.0,47,1,110,122,14695825.0,10.1021/jm030933g,CHEMBL1139453,Potent inhibitors of the Plasmodium falciparum...,PUBLICATION,"Ersmark K, Feierberg I, Bjelic S, Hamelink E, ...",The hemoglobin-degrading aspartic proteases pl...,,CLD0,1,1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99137,134529,,2021.0,,,,,,,CHEMBL5728415,Heteroaryl-substituted sulfonamide compounds a...,PATENT,,,US-10981905-B2,US10981905,37,36,
99138,134530,,2021.0,,,,,,,CHEMBL5728416,Substituted pyrrolopyridine JAK inhibitors and...,PATENT,,,US-10981906-B2,US10981906,37,36,
99139,134531,,2021.0,,,,,,,CHEMBL5728417,Biaryl kinase inhibitors,PATENT,,,US-10981910-B2,US10981910,37,36,
99140,134532,,2021.0,,,,,,,CHEMBL5728418,Imidazopyrrolopyridine as inhibitors of the JA...,PATENT,,,US-10981911-B2,US10981911,37,36,
