<a href="https://colab.research.google.com/github/espickle1/sequence-cleaning/blob/main/chimerax_color_demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ChimeraX Color Script Generator

Upload per-residue entropy (or any scalar) values, sequences, and metadata.
This notebook generates a `.cxc` ChimeraX color script **per sequence** and
downloads them to your machine.

## 0. Setup – Clone repo & install dependencies

In [None]:
import os, subprocess

# Name of the checkout directory that `git clone` creates.
repo_dir = "sequence-cleaning"

# After a first run the kernel's working directory IS the checkout, so a
# relative lookup of `repo_dir` would fail there and trigger a nested clone.
# Detect that case first so re-running this cell is a no-op.
already_in_repo = os.path.basename(os.getcwd()) == repo_dir

if not already_in_repo:
    if not os.path.isdir(repo_dir):
        subprocess.run(
            ["git", "clone", "https://github.com/espickle1/sequence-cleaning.git"],
            check=True,  # raise if the clone fails instead of continuing silently
        )
    # Work from inside the checkout so the `analysis` package is importable.
    os.chdir(repo_dir)

print(f"Working directory: {os.getcwd()}")

In [2]:
import numpy as np
import pandas as pd
from google.colab import files
from analysis.chimerax_color_lib import generate_chimerax_script, write_chimerax_script

## 1. Upload files

Upload three CSV files:
- **Metadata file** – must contain `sequence_id` and `name` columns.
- **Sequences file** – must contain `sequence_id` and `sequence` columns.
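- **Entropy file** – per-residue entropy values, in one of two layouts: a wide table with a `sequence_id` column plus one or more value columns (one row per sequence), **or** a per-residue table (e.g. `residue_position, entropy, ...`) that applies to a single sequence. For the per-residue layout, the `sequence_id` is inferred from the file name (e.g. `entropy_per_residue_{seq_id}.csv`) and can be overridden in step 2.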


In [None]:
# Ask for the metadata CSV through Colab's upload widget. The first key of the
# returned mapping is used as the path to read the file back from disk.
print("Upload the METADATA file (.csv):")
meta_upload = files.upload()
if not meta_upload:
    # An empty mapping (e.g. the dialog was cancelled) would otherwise surface
    # as an opaque IndexError on the filename lookup below.
    raise ValueError("No metadata file was uploaded; re-run this cell and choose a .csv file.")
meta_filename = next(iter(meta_upload))
df_metadata = pd.read_csv(meta_filename)
print(f"Loaded {meta_filename}: {df_metadata.shape}")
print(f"Columns: {list(df_metadata.columns)}")
display(df_metadata)

In [None]:
# Ask for the sequences CSV through Colab's upload widget. The first key of the
# returned mapping is used as the path to read the file back from disk.
print("Upload the SEQUENCES file (.csv):")
seq_upload = files.upload()
if not seq_upload:
    # An empty mapping (e.g. the dialog was cancelled) would otherwise surface
    # as an opaque IndexError on the filename lookup below.
    raise ValueError("No sequences file was uploaded; re-run this cell and choose a .csv file.")
seq_filename = next(iter(seq_upload))
df_sequences = pd.read_csv(seq_filename)
print(f"Loaded {seq_filename}: {df_sequences.shape}")
print(f"Columns: {list(df_sequences.columns)}")
display(df_sequences)

In [None]:
# Ask for the entropy CSV through Colab's upload widget. The saved file name is
# kept in `entropy_filename` because the merge step may parse a sequence_id
# out of it.
print("Upload the ENTROPY file (.csv):")
entropy_upload = files.upload()
if not entropy_upload:
    # An empty mapping (e.g. the dialog was cancelled) would otherwise surface
    # as an opaque IndexError on the filename lookup below.
    raise ValueError("No entropy file was uploaded; re-run this cell and choose a .csv file.")
entropy_filename = next(iter(entropy_upload))
df_entropy = pd.read_csv(entropy_filename)
print(f"Loaded {entropy_filename}: {df_entropy.shape}")
print(f"Columns: {list(df_entropy.columns)}")
df_entropy.head()

## 2. Merge files on `sequence_id`

In [None]:
# Detect entropy file format and build a unified structure
#
# Format A: Wide table with sequence_id + value columns (one row per sequence)
# Format B: Per-residue table (residue_position, entropy, ...) for a single sequence
#           In this case sequence_id is extracted from the filename.

# Helpers for Format B's file-name parsing. Imported in this cell so it runs on
# a fresh kernel without relying on imports made by later cells.
import re
from pathlib import Path

if "sequence_id" in df_entropy.columns:
    # Format A -- entropy file already has sequence_id
    available_ids = df_entropy["sequence_id"].unique().tolist()
    print(f"Available sequence_ids ({len(available_ids)}):")
    for sid in available_ids:
        print(f"  - {sid}")

    print("\nEnter sequence_id(s) to process (comma-separated), or 'all' for all:")
    selection = input("sequence_id(s) [all]: ").strip() or "all"

    if selection.lower() == "all":
        selected_ids = available_ids
    else:
        # input() always yields text, while read_csv may have parsed the IDs as
        # numbers. Match on the string form and map back to the stored value so
        # numeric IDs can be selected too.
        id_by_text = {str(sid): sid for sid in available_ids}
        requested_ids = [s.strip() for s in selection.split(",") if s.strip()]
        invalid = [s for s in requested_ids if s not in id_by_text]
        if invalid:
            print(f"Warning: unknown sequence_id(s) ignored: {invalid}")
        selected_ids = [id_by_text[s] for s in requested_ids if s in id_by_text]

    if not selected_ids:
        raise ValueError("No valid sequence_id selected.")

    print(f"\nSelected {len(selected_ids)} sequence(s): {selected_ids}")

    # Filter entropy data to selected IDs
    df_entropy_filtered = df_entropy[df_entropy["sequence_id"].isin(selected_ids)]

    merge_cols = ["sequence_id", "sequence"]
    if "sequence" in df_sequences.columns:
        df_merged = df_entropy_filtered.merge(
            df_sequences[merge_cols], on="sequence_id", how="inner"
        )
        # The inner join drops IDs that have no row in the sequences file;
        # report them so they do not disappear silently.
        merged_ids = set(df_merged["sequence_id"])
        dropped_ids = [sid for sid in selected_ids if sid not in merged_ids]
        if dropped_ids:
            print(f"Warning: no sequence found for sequence_id(s), skipped: {dropped_ids}")
    else:
        df_merged = df_entropy_filtered.copy()

    if "name" in df_metadata.columns:
        # Left join: rows without a metadata match keep a missing `name`.
        df_merged = df_merged.merge(
            df_metadata[["sequence_id", "name"]], on="sequence_id", how="left"
        )

    # Every entropy column other than the key is treated as a value column.
    value_columns = [
        c for c in df_entropy.columns if c != "sequence_id"
    ]
    print(f"Format A detected (wide table). Value columns: {value_columns}")
    print(f"Merged rows: {len(df_merged)}")
    display(df_merged.head())
    ENTROPY_FORMAT = "wide"

else:
    # Format B -- per-residue file without sequence_id
    # Try to extract sequence_id from the entropy filename
    # Pipeline exports files named like: entropy_per_residue_{seq_id}.csv
    match = re.search(r"entropy_per_residue_(.+)\.csv", entropy_filename)
    if match:
        inferred_seq_id = match.group(1)
    else:
        # Fall back to the file name without its extension.
        inferred_seq_id = Path(entropy_filename).stem

    print(f"Format B detected (per-residue). Inferred sequence_id: {inferred_seq_id}")
    override = input(f"Use this sequence_id? (press Enter to accept, or type a new one): ").strip()
    if override:
        inferred_seq_id = override
        print(f"Using sequence_id: {inferred_seq_id}")

    # Determine which column holds the values
    if "entropy" in df_entropy.columns:
        value_col = "entropy"
    else:
        # Use the first numeric column that isn't residue_position
        numeric_cols = [
            c for c in df_entropy.select_dtypes(include="number").columns
            if c != "residue_position"
        ]
        value_col = numeric_cols[0] if numeric_cols else df_entropy.columns[-1]

    print(f"Using value column: '{value_col}'")
    print(f"Residues: {len(df_entropy)}")

    # Look up the name from metadata
    label = inferred_seq_id
    if "name" in df_metadata.columns and "sequence_id" in df_metadata.columns:
        # The inferred ID is always text; compare against the string form of
        # the metadata key so a numerically-parsed column still matches.
        name_match = df_metadata.loc[
            df_metadata["sequence_id"].astype(str) == str(inferred_seq_id), "name"
        ]
        if len(name_match):
            label = name_match.iloc[0]

    # Store for the generation step
    df_merged = None
    per_residue_info = {
        "seq_id": inferred_seq_id,
        "label": label,
        "values": df_entropy[value_col].values.astype(float),
    }
    value_columns = None
    ENTROPY_FORMAT = "per_residue"
    display(df_entropy.head())

## 3. Configure color mapping

Adjust these parameters as needed before generating the scripts.

In [None]:
# --- Interactive Configuration ---
# Every prompt falls back to the default shown in [brackets] when left blank.

def _ask(prompt):
    """Return the user's reply to ``prompt`` with surrounding whitespace removed."""
    return input(prompt).strip()


_YES_ANSWERS = ("y", "yes")
_NO_ANSWERS = ("n", "no")

if ENTROPY_FORMAT == "wide" and value_columns:
    print(f"Value columns: {value_columns}")

CMAP_NAME = _ask("Colormap name [Greys]: ") or "Greys"

print("Transform methods: none, quantile, power, standard, robust")
TRANSFORM_METHOD = _ask("Transform method [none]: ") or "none"

# Colour mapping is on unless the user explicitly declines.
COLOR = _ask("Enable color mapping? (y/n) [y]: ").lower() not in _NO_ANSWERS

# The invert question is only asked when colour mapping is enabled.
if COLOR:
    COLOR_INVERT = _ask("Invert colormap? (y/n) [n]: ").lower() in _YES_ANSWERS
else:
    COLOR_INVERT = False

# Transparency mapping is off unless the user explicitly opts in.
TRANSPARENCY = _ask("Enable transparency mapping? (y/n) [n]: ").lower() in _YES_ANSWERS

if TRANSPARENCY:
    TRANSPARENCY_INVERT = _ask("Invert transparency? (y/n) [n]: ").lower() in _YES_ANSWERS
else:
    TRANSPARENCY_INVERT = False

MODEL = int(_ask("Model ID [1]: ") or "1")

CHAIN = _ask("Chain ID (e.g. A, B — leave blank for none): ")

# Echo the chosen settings back for confirmation.
print(f"\nColormap: {CMAP_NAME}, Transform: {TRANSFORM_METHOD}")
print(f"Color: {COLOR} (invert={COLOR_INVERT}), Transparency: {TRANSPARENCY} (invert={TRANSPARENCY_INVERT})")
model_summary = f"Model: #{MODEL}"
if CHAIN:
    model_summary += f"/{CHAIN}"
print(model_summary)

## 4. Generate `.cxc` scripts (one per sequence)

In [None]:
import os
from pathlib import Path

# Folder that receives every generated script; creating it is a no-op when it
# already exists.
output_dir = "cxc_output"
Path(output_dir).mkdir(parents=True, exist_ok=True)

# Paths of the scripts written by this cell, in creation order.
generated_files = list()

def _make_cxc(values, label, seq_id):
    """Generate a .cxc file for one sequence and return the output path.

    Args:
        values: Scalar values for one sequence, passed unchanged to
            ``generate_chimerax_script``.
        label: Display name used as the first part of the file name.
        seq_id: Sequence identifier used as the second part of the file name.

    Returns:
        The path the script was written to:
        ``<output_dir>/<label>_<seq_id>.cxc`` with spaces and ``/`` in either
        part replaced by ``_``.
    """
    def _safe(part):
        # Applied to BOTH name parts: a "/" in either one would otherwise point
        # into a sub-directory of output_dir that does not exist.
        return str(part).replace(" ", "_").replace("/", "_")

    script = generate_chimerax_script(
        values,
        cmap_name=CMAP_NAME,
        transform_method=TRANSFORM_METHOD,
        color=COLOR,
        color_invert=COLOR_INVERT,
        transparency=TRANSPARENCY,
        transparency_invert=TRANSPARENCY_INVERT,
        model=MODEL,
        chain=CHAIN,
    )
    out_path = os.path.join(output_dir, f"{_safe(label)}_{_safe(seq_id)}.cxc")
    write_chimerax_script(script, out_path)
    return out_path

if ENTROPY_FORMAT == "wide":
    # One row per sequence. Iterating over records (rather than iterrows)
    # keeps each column's own dtype, so an integer sequence_id is not turned
    # into a float when every other column is numeric.
    for record in df_merged.to_dict("records"):
        seq_id = record["sequence_id"]
        name = record.get("name")
        # A left-merge miss leaves NaN in "name", and NaN is truthy, so a plain
        # `or` fallback would label the file "nan". Test for it explicitly.
        label = seq_id if (pd.isna(name) or not name) else name
        values = np.asarray([record[c] for c in value_columns], dtype=float)

        out_path = _make_cxc(values, label, seq_id)
        generated_files.append(out_path)
        print(f"  Created: {out_path}")
else:
    # Per-residue format -- single sequence
    out_path = _make_cxc(
        per_residue_info["values"],
        per_residue_info["label"],
        per_residue_info["seq_id"],
    )
    generated_files.append(out_path)
    print(f"  Created: {out_path}")

print(f"\nGenerated {len(generated_files)} .cxc file(s).")

## 5. Download `.cxc` files

In [19]:
# Hand each generated script to the browser, one download call per file.
for cxc_path in generated_files:
    files.download(cxc_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>