In [1]:
# Use a conda env with ollama installed
# Run this code on a GPU machine
from collections import Counter
from zipfile import ZipFile, ZIP_DEFLATED
from tqdm import tqdm
import pandas as pd
import numpy as np
import zipfile
import random
import ollama
import sys
import os

In [2]:
# Define root directory
# root = os.path.dirname(os.path.abspath(__file__))  # script variant; __file__ is not defined in a notebook kernel
root = "."

# Make <root>/../src importable (presumably where the `default` module lives).
# The membership check keeps sys.path free of duplicate entries when this cell is re-run.
_src_dir = os.path.join(root, "..", "src")
if _src_dir not in sys.path:
    sys.path.append(_src_dir)
from default import CONFIGPATH

# List of pathogens to process
pathogens = ["Mycobacterium tuberculosis"]

def get_pathogen_code(pathogen):
    """Build the short lowercase code used for a pathogen's output directory.

    Names made of at least two whitespace-separated words are abbreviated to the
    first letter of the first word followed by the second word, e.g.
    "Mycobacterium tuberculosis" -> "mtuberculosis". Any further words are ignored.
    Names with fewer than two words are returned unchanged apart from lowercasing.
    """
    words = pathogen.split()

    # Nothing to abbreviate: fall back to the lowercased name as given
    if len(words) <= 1:
        return pathogen.lower()

    first_word, second_word = words[0], words[1]
    return (first_word[0] + second_word).lower()

In [18]:
# Settings for the extraction run
MAX_ASSAYS = 100           # only the first MAX_ASSAYS rows of assays.csv are processed
LLM_MODEL = 'gpt-oss:20b'  # ollama model used for the extraction
N_FIELDS = 5               # Organism, Strain, Mutations, Known drug resistances, Media

# For every pathogen: read its assay table, ask the LLM to extract the assay
# parameters from each assay's annotation text, and write one small CSV per assay.
for pathogen in pathogens:

    print(f"Processing pathogen: {pathogen}")
    pathogen_code = get_pathogen_code(pathogen)
    pathogen_dir = os.path.join(root, "..", "output", pathogen_code)

    # Get assay data
    ASSAYS = pd.read_csv(os.path.join(pathogen_dir, "assays.csv"), low_memory=False).iloc[:MAX_ASSAYS]

    # Create output directory
    PATH_TO_OUTPUT = os.path.join(pathogen_dir, "parameters")
    os.makedirs(PATH_TO_OUTPUT, exist_ok=True)

    # Assays whose LLM answer could not be parsed; reported once the pathogen is done
    failed_assays = []

    # For each assay
    for assay_id, assay_type, act_type, unit in zip(ASSAYS["assay_id"], ASSAYS["assay_type"], ASSAYS["activity_type"], ASSAYS["unit"]):

        # Non-string units (presumably NaN from empty CSV cells) are mapped to 'nan';
        # string units are rewritten so they can be used inside a file name
        if not isinstance(unit, str):
            unit = 'nan'
        else:
            unit = unit.replace('/', 'FwdS').replace(" ", "__")

        # Shared stem of the input archive, its member file and the output file
        assay_key = "_".join([assay_id, act_type, unit])

        print(f'Assay: {"_".join([assay_id, assay_type, act_type, unit])}')

        # Reading input data file from previous step
        with zipfile.ZipFile(os.path.join(pathogen_dir, "descriptions", assay_key + ".zip"), 'r') as zip_ref:
            with zip_ref.open(assay_key + "_input.txt") as f:
                input_data = f.read().decode('utf-8')

        # NOTE: the medium example previously contained a mis-decoded en dash
        # ("Lowensteinâ€“Jensen"); it is now spelled with a plain ASCII hyphen.
        PROMPT = f"""
        You are an information extraction assistant specialized in analyzing biochemical data.
        Read the assay annotations and return a single CSV line with the following 5 columns, in this exact order and separated by commas:
        - Organism
        - Strain
        - Mutations
        - Known drug resistances
        - Media

        Rules:
        - If any field is missing or not stated, leave it blank.
        - "Mutations" should include specific genetic variants or engineered changes if mentioned; otherwise leave blank.
        - "Known drug resistances": list drug resistances of the strain used in the assay; if only general mentions exist, leave blank.
        - "Media" refers to the growth or culture medium (e.g., Middlebrook 7H9 broth, Lowenstein-Jensen, etc.).
        - Output exactly one line, no header, no extra text, no quotes, no trailing commas and, specially, no tabs nor commas within individual columns.
        - The final output must have exactly 5 columns in the specified order and, therefore, EXACTLY 4 commas.

        All available assay annotations are enumerated below:

        {input_data}
        """

        # Non streaming call
        response = ollama.generate(model=LLM_MODEL, prompt=PROMPT, stream=False, think=True)
        result = response.response.strip().split(",")

        # Check number of columns in response.
        # A malformed answer only invalidates this assay, so record it and move on
        # instead of abandoning every remaining assay of the pathogen.
        if len(result) != N_FIELDS:
            print(f"Error: Expected 5 columns but got {len(result)}. Response was: {response.response}")
            failed_assays.append(assay_key)
            continue

        # Add extra info
        result = [assay_id, assay_type, act_type, unit] + result

        # Save result (explicit UTF-8 to match the decoding of the input annotations)
        with open(os.path.join(PATH_TO_OUTPUT, assay_key) + "_parameters.csv", "w", encoding="utf-8") as outfile:
            outfile.write(",".join([str(i) for i in result]))

    # Summarise the assays that need a re-run or manual inspection
    if failed_assays:
        print(f"{len(failed_assays)} assay(s) could not be parsed for {pathogen}: {', '.join(failed_assays)}")

Processing pathogen: Mycobacterium tuberculosis
Assay: CHEMBL4649948_F_PERCENTEFFECT_%
Assay: CHEMBL4649949_F_PERCENTEFFECT_%
Assay: CHEMBL4649971_F_PERCENTEFFECT_%
Assay: CHEMBL4649972_F_PERCENTEFFECT_%
Assay: CHEMBL4649941_F_PERCENTEFFECT_%
Assay: CHEMBL4649965_F_PERCENTEFFECT_%
Assay: CHEMBL4649957_F_PERCENTEFFECT_%
Assay: CHEMBL4649961_F_PERCENTEFFECT_%
Assay: CHEMBL4649947_F_PERCENTEFFECT_%
Assay: CHEMBL4649949_F_IC50_umol.L-1
Assay: CHEMBL4649948_F_IC50_umol.L-1
Assay: CHEMBL1794349_F_AC50_umol.L-1
Assay: CHEMBL1794426_F_EC50_umol.L-1
Assay: CHEMBL1794324_F_AC50_umol.L-1
Assay: CHEMBL2098495_F_MIC90_umol.L-1
Assay: CHEMBL4649972_F_IC50_umol.L-1
Assay: CHEMBL4649957_F_IC50_umol.L-1
Assay: CHEMBL4649941_F_IC50_umol.L-1
Assay: CHEMBL2094261_F_INHIBITION_%
Assay: CHEMBL2094262_F_INHIBITION_%
Assay: CHEMBL2114816_F_AC50_umol.L-1
Assay: CHEMBL4649965_F_IC50_umol.L-1
Assay: CHEMBL2114860_F_AC50_umol.L-1
Assay: CHEMBL4649971_F_IC50_umol.L-1
Assay: CHEMBL2098496_F_MIC90_umol.L-1
Assay: CH

In [17]:
# Debugging aid: display the raw model output held in `response` (assigned by the extraction loop)
response.response

'Mycobacterium tuberculosis,,,DPPC cholesterol tyloxapol based media'

In [11]:
# Debugging aid: display the raw model output held in `response` (assigned by the extraction loop)
response.response

'Mycobacterium tuberculosis,,,7H9 glucose tyloxapol based media'