# 09_binding_pocket_analysis

Notebook UI for UPO homolog pocket analysis using an LLM with binding, alignment, and optional reaction inputs.

## Python Path Setup
Ensure project-root imports work whether Jupyter starts from repo root or `notebooks/`.

In [1]:
from pathlib import Path
import os
import sys

# Locate the repository root: when the kernel's working directory is the
# notebooks/ folder the root is its parent, otherwise the working directory
# itself is treated as the root.
cwd = Path.cwd().resolve()
repo_root = cwd
if cwd.name == "notebooks":
    repo_root = cwd.parent
src_root = repo_root / "src"

# Put both locations at the front of sys.path, skipping entries that are
# already present. The root is inserted first, so an existing src/ directory
# ends up ahead of it in the search order.
if str(repo_root) not in sys.path:
    sys.path.insert(0, str(repo_root))
if src_root.exists() and str(src_root) not in sys.path:
    sys.path.insert(0, str(src_root))

## Imports
Load helper functions for table loading, LLM analysis, output export, and thread persistence.

In [2]:
import importlib
import agentic_protein_design.steps.binding_pocket as bp
bp = importlib.reload(bp)
from project_config.local_api_keys import OPENAI_API_KEY
from agentic_protein_design.core.thread_context import build_thread_context_text
from agentic_protein_design.core import apply_notebook_markdown_style, resolve_input_path

# Bind the helpers from the freshly reloaded `bp` module to short notebook-level
# names so the cells below always use the reloaded implementations.

# Data root, thread handling and table loading
setup_data_root = bp.setup_data_root
get_step_processed_dir = bp.get_step_processed_dir
init_thread = bp.init_thread
load_input_tables = bp.load_input_tables
default_user_inputs = bp.default_user_inputs

# Heuristic profiling and LLM analysis / design stages
analyze_pocket_profiles = bp.analyze_pocket_profiles
build_prompt_with_context = bp.build_prompt_with_context
generate_llm_pocket_analysis = bp.generate_llm_pocket_analysis
run_llm_pocket_analysis_stages = bp.run_llm_pocket_analysis_stages
generate_llm_mutation_design_proposal = bp.generate_llm_mutation_design_proposal
prompt_3 = bp.prompt_3

# Output export and thread persistence
save_binding_outputs = bp.save_binding_outputs
save_llm_analysis = bp.save_llm_analysis
save_mutation_design_proposal = bp.save_mutation_design_proposal
persist_thread_update = bp.persist_thread_update

# Notebook markdown styling (font size in px, line height).
apply_notebook_markdown_style(font_size_px=14, line_height=1.4)


## API Key Setup
Load the OpenAI key from `project_config/local_api_keys.py` into environment variables for LLM calls.

In [3]:
# Export the key only when it is non-empty and differs from the template
# placeholder, so the placeholder text is never written into the environment.
if OPENAI_API_KEY and OPENAI_API_KEY != "REPLACE_WITH_YOUR_OPENAI_API_KEY":
    os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

# Cell output: True when the variable exists in the environment, whether it was
# set above or was already exported before this cell ran. This only checks
# presence; it does not validate the key.
"OPENAI_API_KEY" in os.environ

True

## User Inputs
Edit all run parameters here (single place): dataset root, thread selection, analysis options, model, and input paths.

In [9]:
# Key selecting the data root; per the note in input_paths below it indexes
# project_config.variables.address_dict.
root_key = "examples"
# Thread key passed to init_thread. The trailing "None" records the alternative
# value (presumably None starts a new thread; confirm against init_thread).
existing_thread_key = "binding_pocket_llm_analysis_59353c876ab140688b1c239a15aac24e"  # None

# Analysis options. The whole dict is handed to the LLM stage helpers and to
# persist_thread_update in later cells.
user_inputs = {
    # Passed to analyze_pocket_profiles; the commented list shows an explicit
    # selection as the alternative to None.
    "selected_positions": None, # [100, 103, 104, 107, 141, 222],
    # (protein_a, protein_b) pairs to compare; the alternative value is None.
    "pairwise_comparisons":  [("CviUPO", "ET096")], # None
    "focus_question": (
        "Identify per-protein structural interpretations and cross-homolog patterns "
        "that could explain activity/property differences."
    ),
    # Free-text requirements read by the mutation-design proposal cell.
    "design_requirements": (
        "Backbone: ET096. Goal: improve peroxygenative mono-oxidation selectivity on S82 "
        "while retaining useful activity and limiting over-oxidation to Di-Ox. "
        "Prioritize conservative, mechanistically justified mutations and a first-round panel <= 12 variants."
    ),
    "literature_context_thread_key": "literature_review_d762a72ec7f04bec9b66ccd3aac21b91",  # Optional: literature-review thread key
    # Free-text description of the reaction data; the recorded Stage 1 prompt
    # shows this text under "REACTION CONTEXT (OPTIONAL)".
    "reaction_data_description": (
        "- Veratryl alcohol: peroxygenative\n"
        "- Naphthalene: peroxygenative\n"
        "- NBD: peroxygenative\n"
        "- ABTS: peroxidative\n"
        "- S82: mixed; Mono-Ox ~ peroxygenation-biased, Di-Ox ~ peroxidation-biased\n"
        "Use ratios (e.g. Mono-Ox : Di-Ox) to infer peroxygenation vs peroxidation balance."
    ),
    # When False, the load cell leaves reaction_data_csv as None.
    "use_reaction_data": True,
    # LLM settings. NOTE(review): presumably consumed inside the bp helpers;
    # only llm_model is read directly in this notebook.
    "llm_model": "gpt-5.2",
    "llm_temperature": 0.2,
    "llm_max_rows_per_table": 300,
}

input_paths = {
    # Paths are relative to the data root from project_config.variables.address_dict[root_key].
    "binding_csv": "pdb/bindingpocket_analysis.csv",
    "alignment_csv": "pdb/reps_ali_withDist_FILT.csv",
    # Only resolved when user_inputs["use_reaction_data"] is truthy.
    "reaction_data_csv": "pdb/substrate_reaction_data.csv",
}

# Optional: reset analysis options from helper defaults
# user_inputs = default_user_inputs()

## Setup Runtime Context
Initialize data directories and active chat thread from the values above.

In [10]:
# Resolve the data root and its directory mapping for the chosen root_key.
data_root, resolved_dirs = setup_data_root(root_key)
# Directory that receives this step's exported outputs.
step_processed_dir = get_step_processed_dir(resolved_dirs)
# Open the chat thread selected by existing_thread_key; threads_preview is kept
# as a notebook-level name but is not used by the later cells shown here.
thread, threads_preview = init_thread(root_key, existing_thread_key)
thread_id = thread["thread_id"]
# Cell output: the resolved locations and the active thread id, for a quick check.
data_root, step_processed_dir, thread_id


(PosixPath('/Users/charmainechia/Documents/projects/agentic-protein-design/examples'),
 PosixPath('/Users/charmainechia/Documents/projects/agentic-protein-design/examples/processed/09_binding_pocket_analysis'),
 '59353c876ab140688b1c239a15aac24e')

## Load Input Tables
Load descriptor and alignment tables, and optional reaction data, from `input_paths`.

In [11]:
# Resolve the two required tables against the data root.
binding_csv = resolve_input_path(data_root, input_paths["binding_csv"])
alignment_csv = resolve_input_path(data_root, input_paths["alignment_csv"])

# Reaction data is optional: it is resolved only when enabled in user_inputs and
# a non-blank path is configured. The entry is normalised through str() first so
# that disabling it with None (or supplying a Path object) no longer raises
# AttributeError on .strip().
reaction_data_csv = None
if user_inputs.get("use_reaction_data", False) and str(input_paths.get("reaction_data_csv") or "").strip():
    reaction_data_csv = resolve_input_path(data_root, input_paths["reaction_data_csv"])

# reaction_data_csv may still be None here; the display line below guards for a
# missing reaction table accordingly.
pocket, ali, reaction_df = load_input_tables(binding_csv, alignment_csv, reaction_data_csv)
# Cell output: resolved paths plus a three-row preview of the loaded tables.
binding_csv, alignment_csv, reaction_data_csv, pocket.head(3), (None if reaction_df is None else reaction_df.head(3))


(PosixPath('/Users/charmainechia/Documents/projects/agentic-protein-design/examples/pdb/bindingpocket_analysis.csv'),
 PosixPath('/Users/charmainechia/Documents/projects/agentic-protein-design/examples/pdb/reps_ali_withDist_FILT.csv'),
 PosixPath('/Users/charmainechia/Documents/projects/agentic-protein-design/examples/pdb/substrate_reaction_data.csv'),
    Unnamed: 0                    struct_name                  struct_name.1  \
 0           0                ET096_S82_glide                ET096_S82_glide   
 1           1               CviUPO_S82_glide               CviUPO_S82_glide   
 2           2  CviUPO-F88L+T158A_S82_chai1_0  CviUPO-F88L+T158A_S82_chai1_0   
 
                    struct_name.2  num_pocket_res_ali  num_pocket_res<6  \
 0                ET096_S82_glide                  38                12   
 1               CviUPO_S82_glide                  39                13   
 2  CviUPO-F88L+T158A_S82_chai1_0                  33                10   
 
    reactive_center_d

## Structured Exports
Generate heuristic comparative tables and export CSVs to `processed/`.

In [12]:
# Positions to restrict the analysis to; None is the configured value in this run.
selected_positions = user_inputs["selected_positions"]
# Build the heuristic interpretation table and the cross-protein pattern summary.
interp_df, pattern_summary = analyze_pocket_profiles(pocket, ali, selected_positions)
# Export both results under the step's processed directory. The returned values
# are recorded later by persist_thread_update.
out_interp, out_patterns = save_binding_outputs(interp_df, pattern_summary, step_processed_dir)


## LLM Pocket Analysis
Run the two-stage LLM analysis on the input tables (Stage 1: per-protein pocket phenotypes; Stage 2: residue-level drivers), then save the combined markdown output.

Prerequisite: set `OPENAI_API_KEY` in `project_config/local_api_keys.py`.

In [13]:
# Run the two-stage LLM analysis. The returned mapping holds the Stage 2 text
# (reused by the mutation-design cell) and the combined Stage 1 + Stage 2 report.
stage_outputs = run_llm_pocket_analysis_stages(pocket, ali, reaction_df, user_inputs)
prompt_2_output = stage_outputs["prompt_2_output"]
llm_analysis = stage_outputs["combined_analysis"]

# Save the combined report under the step's processed directory.
out_llm = save_llm_analysis(llm_analysis, step_processed_dir)

# Prompt 3 defaults (overwritten in the next cell). Defining them here keeps the
# final "Save Thread Update" cell runnable even if the proposal cell is skipped.
mutation_design_text = ""
out_mutation_design = None
literature_context_thread_key = None

# Show where the combined analysis was saved.
print(out_llm)


### Binding Pocket Analysis - Stage 1

<details><summary>Prompt</summary>

```text

Analyse the uploaded inputs for a set of proteins to interpret how binding-pocket structure relates to catalytic activity and selectivity. 
Consider how both the proximal (<6 Å from docked ligand) and distal (up to ~11 Å from binding pocket centroid) residues affect the binding pocket environment.

INPUTS
- binding_pocket_table: extracted binding-pocket properties (per protein), calculated separately over proximal and distal residue sets where available.
- pocket_alignment_table: filtered residue alignment of pocket-proximal positions.
- reaction_data (optional): enzyme activity data on substrates.

OBJECTIVE
For each protein, integrate structural descriptors with (optional) reaction data to infer mechanistic behavior and classify pocket phenotypes.

TASKS

1) For each protein:
   - Generate a punchy tagline.
   - Provide a concise 5-6 bullet summary addressing:
        (i) proximal electrostatics  
        (ii) proximal sterics  
        (iii) distal electrostatics  
        (iv) distal sterics / outer pocket size  
        (v) overall synthesis of pocket phenotype, integrating structural properties with catalytic implications:
            - Interpret how geometry and chemistry influence productive (peroxygenative) vs competing (peroxidative) pathways.
            - If reaction_data is provided, use it to support structure–function relationships.

   Use the following column groups:

   PROXIMAL ELECTROSTATICS
   - charged_fraction (proximal), polar_fraction (proximal)
   - kd_weighted (proximal), hw_weighted (proximal)
   - median_dist_res_to_ligand_reactive_center

   PROXIMAL STERICS
   - mean_volume (proximal), weighted_mean_volume (proximal)
   - volume_variance (proximal)
   - small_residue_frac (proximal), bulky_residue_frac (proximal)
   - median_min_dist_res_to_ligand
   - reactive_center_distance
   - num_pocket_res_lt6

   DISTAL ELECTROSTATICS
   - charged_fraction (distal), polar_fraction (distal)
   - kd_weighted (distal), hw_weighted (distal)

   DISTAL STERICS / OUTER POCKET SIZE
   - mean_dist_to_centroid
   - mean_min_dist_to_centroid
   - mean_dist_backbone_to_centroid
   - mean_volume (distal)
   - volume_variance (distal)
   - small_residue_frac (distal), bulky_residue_frac (distal)
   - num_pocket_res_ali

   If proximal/distal suffixes are not explicitly present, infer proximal/distal groupings from context and state your assumption briefly.

2) Comparative analysis requirements (do BOTH):
   A) Intra-protein variant analysis (MANDATORY when variants are present):
   - Detect proteins that share the same base protein identity but differ by variant/mutation labels.
   - For each such protein family, explicitly compare each variant against its WT/reference form (if WT/reference is present).
   - If WT is not explicitly labeled, infer the closest reference sequence in that family and state the assumption.
   - For each variant-vs-reference comparison, report which structural dimensions changed:
        (i) proximal electrostatics
        (ii) proximal sterics
        (iii) distal electrostatics
        (iv) distal sterics / outer pocket size
   - Provide a mechanistic rationale linking those differences to functional shifts.

   B) User-requested pairwise comparisons:
   - Pairwise comparisons requested: CviUPO vs ET096
   - Perform each requested pairwise comparison in addition to section A.
   - Explicitly contrast which structural dimensions changed (prox electrostatics, prox sterics, distal electrostatics, distal sterics).
   - Provide a mechanistic rationale for functional shifts.

3) Distill cross-protein trends or clusters (“pocket phenotypes”):
   - Identify recurring structural archetypes (e.g., tight/polar pose-locking vs open/hydrophobic permissive).
   - Link clusters to turnover vs selectivity trade-offs.

OUTPUT STYLE
- Clear, human-interpretable, mechanistically grounded.
- Emphasize intuition over raw numbers.
- Keep summaries compact and comparative.


REACTION CONTEXT (OPTIONAL)
- Veratryl alcohol: peroxygenative
- Naphthalene: peroxygenative
- NBD: peroxygenative
- ABTS: peroxidative
- S82: mixed; Mono-Ox ~ peroxygenation-biased, Di-Ox ~ peroxidation-biased
Use ratios (e.g. Mono-Ox : Di-Ox) to infer peroxygenation vs peroxidation balance.

REACTION_DATA_STATUS: provided. Rows=5, Columns=['UPO Homologs', 'Veratryl alcohol (peroxygenation)', 'Naphthalene (peroxygenation)', 'NBD (peroxygenation)', 'ABTS (peroxidation)', 'S82 Mono-Ox %yield', 'S82 Di-Ox % Yield', 'S82 Total % Yield', 'S82 Mono:Di Ratio']
```
</details>

#### Response

(Stage 1 output included in compact combined view below.)

### Binding Pocket Analysis - Stage 2

<details><summary>Prompt</summary>

```text

You are given:
1) pocket_alignment_table: filtered alignment of variable residues located within <6 Å of the ligand in at least one structure.
2) structural_summary_text: prior analysis summarizing proximal/distal sterics, electrostatics, and pocket phenotypes for each protein.

TASK

Use the alignment table together with the structural_summary_text to:

1) Identify specific residue positions that likely drive differences in electrostatics and /or sterics. For each key variable position:
   - Describe residue identities across proteins.
   - Classify substitutions as steric (small↔bulky), electrostatic (neutral↔charged), or polarity shifts.
   - Predict mechanistic consequences (e.g., tighter cage, increased radical escape, altered substrate orientation).
   - Specifically contrast the effect of point mutations in variants of the same base sequence. 
     Explain how the mutations modify the previously identified pocket environment and its chemistry. 

2) Provide a short ranked list of:
   - High-confidence mechanistic driver residues
   - Secondary modulators
   - Likely neutral/background mutations

GUIDELINES
- Use sequence numbering from each protein (not alignment index).
- Explicitly tie residue-level effects back to the structural phenotypes described earlier.
- Emphasize causal mechanistic reasoning over descriptive comparison.
- Keep the output structured and concise.

The goal is to move from global pocket phenotype to residue-level mechanistic hypotheses.

```
</details>

#### Response

(Stage 2 output included in compact combined view below.)

### Combined Pocket Analysis

Output(layout=Layout(border_bottom='1px solid #e0e0e0', border_left='1px solid #e0e0e0', border_right='1px sol…

/Users/charmainechia/Documents/projects/agentic-protein-design/examples/processed/09_binding_pocket_analysis/binding_pocket_llm_analysis.md


## LLM Backbone Engineering Proposal
Use Stage-2 residue-level drivers plus optional literature-thread context to propose mutation designs under user requirements.

In [None]:
# Normalise the optional free-text inputs. `or ""` maps a None value to an empty
# string before str() is applied; a bare str(None) would produce the literal
# text "None", which would then be sent as the design requirements or used as a
# truthy thread key.
design_requirements = str(user_inputs.get("design_requirements") or "").strip()
literature_context_thread_key = str(user_inputs.get("literature_context_thread_key") or "").strip() or None

# Collect text from the referenced literature-review thread. The key may be None
# here; on_missing="warn" is the mode requested for a missing thread.
context_result = build_thread_context_text(
    literature_context_thread_key,
    include_referenced_files=True,
    max_chars_per_file=20000,
    on_missing="warn",
)
# Same None guard as above, so an absent context becomes "" instead of "None".
literature_context = str(context_result.get("context_text") or "")
literature_context_bundle = context_result.get("context_bundle")

# Ask the LLM for a mutation-design proposal from the Stage 2 residue-level
# output, the design requirements and the literature context.
mutation_design_text = generate_llm_mutation_design_proposal(
    prompt_2_output=prompt_2_output,
    design_requirements=design_requirements,
    user_inputs=user_inputs,
    literature_context=literature_context,
)
# Save the proposal under the step's processed directory; the returned value is
# the cell output.
out_mutation_design = save_mutation_design_proposal(mutation_design_text, step_processed_dir)
out_mutation_design


## Save Thread Update
Run this final cell to append run metadata and prompt context to `chats/<llm_process_tag>_<thread_id>.json`.

In [None]:
# Record this run on the active thread: the inputs used, the exported outputs,
# and the LLM texts produced above. mutation_design_path, mutation_design_text
# and literature_context_thread_key keep the defaults set in the LLM analysis
# cell (None / "" / None) if the proposal cell was not run.
persist_thread_update(
    root_key=root_key,
    thread_id=thread_id,
    user_inputs=user_inputs,
    input_paths=input_paths,
    selected_positions=selected_positions,
    reaction_df=reaction_df,
    out_interp=out_interp,
    out_patterns=out_patterns,
    llm_analysis_path=out_llm,
    llm_analysis_text=llm_analysis,
    mutation_design_path=out_mutation_design,
    mutation_design_text=mutation_design_text,
    literature_context_thread_key=literature_context_thread_key,
    llm_model=str(user_inputs.get("llm_model", "")),
)