In [1]:
# Standard library
import ast
import json
import os
import pickle
import re
import time
from datetime import datetime
from typing import Any, List, Tuple, Union
# Third-party packages
import numpy as np
import pandas as pd
import requests
# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
# Document processing
from docx import Document
from docx.enum.style import WD_STYLE_TYPE
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.shared import Pt, RGBColor
# API clients
from anthropic import Anthropic
# Bioinformatics
import biomni
from Bio.Blast import NCBIWWW, NCBIXML
from Bio.Seq import Seq
from biomni.tool.database import query_geo
from biomni.utils import parse_hpo_obo
#!pip install python-docx reportlab pillow matplotlib pandas

In [2]:
# Read credentials from a local .env file so that no secret is hardcoded in the notebook.
from dotenv import load_dotenv
import os
load_dotenv() # Loads the variables defined in the .env file into the process environment
ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY") # Looked up by name; os.getenv returns None when the variable is unset

In [4]:
# Build the Biomni A1 agent used by the cells below.
# The recorded output of this cell shows that construction checks for (and downloads)
# missing data lake files.
# NOTE(review): `path` is presumably the local directory for those files — confirm against the biomni docs.
from biomni.agent import A1
agent = A1(path='./data', llm='claude-sonnet-4-20250514')

Checking and downloading missing data lake files...


In [43]:
# Run the agent on a free-text task; the result is kept in `llm_output` and is later
# handed to llm_output_to_docx() to build the Word report.
# NOTE(review): "CRSIPR" in the prompt is a typo for "CRISPR". It is left untouched here because
# the recorded output below was produced with this exact prompt; fix it when re-running the cell.
llm_output=agent.go("Create CRSIPR genelist that regulate type I interferon")

Using prompt-based retrieval with the agent's LLM

Create CRSIPR genelist that regulate type I interferon

I'll help you create a CRISPR gene list that regulates type I interferon. Let me break this down into a systematic approach to identify genes involved in type I interferon regulation and then design guide RNAs for them.

## Plan

1. [ ] Search literature and databases for genes that regulate type I interferon signaling
2. [ ] Query pathway databases to identify key genes in type I interferon pathways
3. [ ] Compile a comprehensive list of type I interferon regulatory genes
4. [ ] Design sgRNAs for the identified genes using CRISPR design tools
5. [ ] Organize the results into a structured gene list with guide RNAs

Let me start by gathering information about type I interferon regulatory genes:

<execute>
# First, let's search for type I interferon regulatory genes in literature and databases
from biomni.tool.literature import query_pubmed
from biomni.tool.database import query_keg

In [45]:
# ================================================================
# BIOMNI AGENT OUTPUT LOGGER - CLEAN DOCUMENT GENERATION
# Handles biomni agent outputs with proper formatting and cleanup
# ================================================================
## parse and create word document
# ---------- Regexes ----------
# ANSI colour/style escape sequences of the form ESC [ <digits and ';'> m.
# normalize_input() removes every match from the text.
AnsiRegex = re.compile(r"\x1b\[[0-9;]*m")

# Whole-line banners such as "===== Human Message =====" / "===== Ai Message ====="
# (case-insensitive, any number of '=' on each side). normalize_input() drops lines
# that match any of these after stripping surrounding whitespace.
BANNER_REGEXES = [
    re.compile(r"^=+\s*Human Message\s*=+\s*$", re.I),
    re.compile(r"^=+\s*Ai Message\s*=+\s*$", re.I),
]

# One complete <execute>, <observation> or <solution> element, tags included.
# (?s) lets '.' span newlines; the non-greedy '.*?' stops at the first matching closing tag.
# segment_blocks() uses this to split the text into typed blocks.
TAG_PATTERN = re.compile(
    r"(?s)(<execute>.*?</execute>|<observation>.*?</observation>|<solution>.*?</solution>)"
)

# ---------- Parsing helpers ----------
def _try_parse_literal_list(text: str):
    """
    Try to parse a string that looks like a Python list/tuple of strings.
    Returns parsed list on success; otherwise returns the original string.
    """
    try:
        obj = ast.literal_eval(text)
        if isinstance(obj, (list, tuple)) and all(isinstance(x, str) for x in obj):
            return list(obj)
    except Exception:
        pass
    return text


def _flatten_parts(obj) -> List[str]:
    """Recursively flatten lists/tuples into a list of strings."""
    if isinstance(obj, (list, tuple)):
        out: List[str] = []
        for x in obj:
            out.extend(_flatten_parts(x))
        return out
    if isinstance(obj, str):
        return [obj]
    return [str(obj)]


def normalize_input(llm_output: Union[str, List[str], Tuple[str, ...]]) -> str:
    """
    Turn raw agent output into a single cleaned string.

    Supported inputs:
      - a plain string,
      - a (possibly nested) list/tuple of strings,
      - a string that is the Python-literal representation of such a list/tuple.
    Anything else is converted with str().

    The pieces are joined with blank lines, ANSI colour codes are removed,
    "==== Human/Ai Message ====" banner lines are dropped, and the result is
    stripped of leading/trailing whitespace.
    """
    if isinstance(llm_output, (list, tuple)):
        pieces = _flatten_parts(llm_output)
    elif isinstance(llm_output, str):
        candidate = _try_parse_literal_list(llm_output)
        # The helper hands back the original string when it is not a list literal.
        pieces = (
            _flatten_parts(candidate)
            if isinstance(candidate, (list, tuple))
            else [llm_output]
        )
    else:
        pieces = [str(llm_output)]

    # Blank line between pieces keeps them visually separate; then drop colour codes.
    merged = AnsiRegex.sub("", "\n\n".join(pieces))

    # Filter out banner lines (matched against the whitespace-stripped line).
    kept_lines = [
        line
        for line in merged.splitlines()
        if not any(banner.match(line.strip()) for banner in BANNER_REGEXES)
    ]
    return "\n".join(kept_lines).strip()


def segment_blocks(text: str) -> List[Tuple[str, str]]:
    """
    Cut the text into an ordered list of (kind, content) pairs.

    kind is one of "plain", "execute", "observation", "solution": tagged
    regions come from TAG_PATTERN, everything between them is "plain".
    Content is whitespace-stripped; empty plain stretches are omitted.
    Exact duplicate (kind, content) pairs are removed, keeping the first
    occurrence and the original order.
    """
    tag_kinds = ("execute", "observation", "solution")
    found: List[Tuple[str, str]] = []
    cursor = 0

    for match in TAG_PATTERN.finditer(text):
        # Free text between the previous tag (or the start) and this tag.
        leading = text[cursor:match.start()].strip()
        if leading:
            found.append(("plain", leading))

        tagged = match.group(0)
        for kind in tag_kinds:
            opener, closer = f"<{kind}>", f"</{kind}>"
            if tagged.startswith(opener):
                # Slice off the opening and closing tags, keep the inner text.
                found.append((kind, tagged[len(opener):-len(closer)].strip()))
                break

        cursor = match.end()

    # Whatever follows the last tag.
    trailing = text[cursor:].strip()
    if trailing:
        found.append(("plain", trailing))

    # dict keys keep first-insertion order, which gives an order-preserving dedup.
    return list(dict.fromkeys(found))


# ---------- Word formatting helpers ----------
def ensure_code_style(document: Document, style_name="Code"):
    """
    Make sure the document has a paragraph style for code blocks.

    If no style called `style_name` exists yet, one is added with
    Courier New, 10 pt, dark grey (RGB 50, 50, 50). An existing style is
    left untouched. Returns `style_name` in both cases.
    """
    if style_name not in document.styles:
        code_font = document.styles.add_style(style_name, WD_STYLE_TYPE.PARAGRAPH).font
        code_font.name = "Courier New"
        code_font.size = Pt(10)
        code_font.color.rgb = RGBColor(50, 50, 50)
    return style_name


def add_title(document: Document, title: str):
    """
    Append the report header: a bold 18 pt left-aligned title paragraph,
    followed by an italic 9 pt "Generated on <date time>" line using the
    current local time.
    """
    heading = document.add_paragraph()
    heading_run = heading.add_run(title)
    heading_run.bold = True
    heading_run.font.size = Pt(18)
    heading.alignment = WD_ALIGN_PARAGRAPH.LEFT

    timestamp = datetime.now().strftime("Generated on %Y-%m-%d %H:%M")
    stamp_run = document.add_paragraph().add_run(timestamp)
    stamp_run.italic = True
    stamp_run.font.size = Pt(9)


def _write_inline_markdown(run_adder, text: str):
    """
    Minimal inline markdown: **bold** and *italic*.
    run_adder is a function(text, bold=False, italic=False).
    """
    # Handle bold segments first: **...**
    parts = re.split(r"(\*\*.*?\*\*)", text)
    for part in parts:
        if part.startswith("**") and part.endswith("**"):
            inner = part[2:-2]
            ital_parts = re.split(r"(\*.*?\*)", inner)
            for ip in ital_parts:
                if ip.startswith("*") and ip.endswith("*"):
                    run_adder(ip[1:-1], bold=True, italic=True)
                else:
                    if ip:
                        run_adder(ip, bold=True, italic=False)
        else:
            ital_parts = re.split(r"(\*.*?\*)", part)
            for ip in ital_parts:
                if ip.startswith("*") and ip.endswith("*"):
                    run_adder(ip[1:-1], bold=False, italic=True)
                else:
                    if ip:
                        run_adder(ip, bold=False, italic=False)


def add_markdownish_block(document: Document, text: str):
    """
    Render a chunk of lightly marked-up text, one paragraph per input line.

    Recognised line types, checked in this order:
      - blank line            -> empty paragraph
      - "### ", "## ", "# "   -> bold run at 13 / 15 / 17 pt (no inline markdown)
      - "- [ ]" / "- [x]" (also "*", and X or ✓ as the mark)
                              -> paragraph starting with ☐ or ☑
      - "-", "*", "•", "–" bullet    -> "List Bullet" paragraph
      - "1." style numbering  -> "List Number" paragraph
      - anything else         -> normal paragraph
    Except for headings, the line text goes through _write_inline_markdown
    for **bold** / *italic* handling.
    """
    heading_sizes = (("### ", 13), ("## ", 15), ("# ", 17))
    list_styles = (
        (r"^\s*[-*•–]\s+", "List Bullet"),
        (r"^\s*\d+\.\s+", "List Number"),
    )

    def emit_inline(paragraph, markup):
        # Adapter: lets the inline-markdown writer append formatted runs to `paragraph`.
        def add_run(text, bold=False, italic=False):
            rr = paragraph.add_run(text)
            rr.bold = bold
            rr.italic = italic
        _write_inline_markdown(add_run, markup)

    for raw in text.splitlines():
        line = raw.rstrip()
        if not line:
            document.add_paragraph("")
            continue

        # Headings: longest prefix is tested first.
        heading = next(
            ((prefix, size) for prefix, size in heading_sizes if line.startswith(prefix)),
            None,
        )
        if heading is not None:
            prefix, size = heading
            head_run = document.add_paragraph().add_run(line[len(prefix):].strip())
            head_run.bold = True
            head_run.font.size = Pt(size)
            continue

        # Checkboxes must be tested before bullets: they share the leading "-" / "*".
        checkbox = re.match(r"^\s*[-*]\s*\[( |x|X|✓)\]\s*(.*)$", line)
        if checkbox:
            box = "☑" if checkbox.group(1).strip() != "" else "☐"
            emit_inline(document.add_paragraph(f"{box} "), checkbox.group(2))
            continue

        # Bulleted and numbered list items: strip the marker, use the list style.
        list_hit = next(
            ((m, style) for m, style in
             ((re.match(pattern, line), style) for pattern, style in list_styles)
             if m),
            None,
        )
        if list_hit is not None:
            marker, style = list_hit
            emit_inline(document.add_paragraph(style=style), line[marker.end():])
            continue

        # Fallback: ordinary paragraph.
        emit_inline(document.add_paragraph(), line)


def add_code_block(document: Document, code: str, style_name="Code"):
    """Append `code` as one paragraph (a single run) using the `style_name` paragraph style."""
    document.add_paragraph(style=style_name).add_run(code)


def add_observation_block(document: Document, text: str):
    """Append a bold "Observation" label followed by `text` rendered as a code block."""
    label = document.add_paragraph().add_run("Observation")
    label.bold = True
    add_code_block(document, text)


# ---------- Main API ----------
def llm_output_to_docx(
    llm_output: Union[str, List[str], Tuple[str, ...]],
    out_path: str = "LLM_Report.docx",
    title: str = "LLM Analysis Report",
):
    """
    Build a readable .docx report from raw LLM/agent logs.

    The input is cleaned with normalize_input(), split into typed blocks with
    segment_blocks(), and each non-empty block is written in order:
      - "execute"     -> "Executed Code" label + code block
      - "observation" -> "Observation" label + code block
      - "solution"    -> "Solution" label + markdown-ish text
      - anything else (including "plain") -> markdown-ish text
    An empty paragraph separates consecutive blocks.

    Args:
        llm_output: string, (nested) list/tuple of strings, or its repr.
        out_path: where the document is saved.
        title: title shown at the top of the report.

    Returns:
        `out_path`, after the file has been saved.
    """
    cleaned = normalize_input(llm_output)
    blocks = segment_blocks(cleaned)

    report = Document()
    ensure_code_style(report, "Code")
    add_title(report, title)

    def add_label(caption):
        # Bold one-line caption that introduces a tagged block.
        report.add_paragraph().add_run(caption).bold = True

    for kind, content in blocks:
        if not content.strip():
            continue

        if kind == "observation":
            add_observation_block(report, content)
        elif kind == "execute":
            add_label("Executed Code")
            add_code_block(report, content, "Code")
        elif kind == "solution":
            add_label("Solution")
            add_markdownish_block(report, content)
        else:
            # "plain" text and any unexpected kind are rendered as markdown-ish prose.
            add_markdownish_block(report, content)

        # Spacer paragraph after every written block.
        report.add_paragraph("")

    report.save(out_path)
    return out_path


# ---------- Minimal runnable example ----------
#if __name__ == "__main__":
    # This mimics your nested structure: ( [list_of_strs], "another big string" )
#    example_llm_output = llm_output
#
#    out = llm_output_to_docx(
#        example_llm_output,
#       out_path="LLM_Report.docx",
#        title="LLM Output Formatting Demo",
#    )
#    print(f"Wrote: {out}")


In [46]:
# Output file for the report; passed as out_path below (a bare file name, so it is
# resolved against the current working directory).
report_name="LLM_Report1.docx"
# Convert the agent result captured in `llm_output` into a Word document.
# llm_output_to_docx() returns the path it saved to.
doc_path = llm_output_to_docx(
    llm_output,
    out_path=report_name,
    title="Test LLM Report"
)

# Confirm where the file was written.
print("✅ Word file created:", doc_path)

✅ Word file created: LLM_Report1.docx
