<a href="https://colab.research.google.com/github/crisprking/miniprotein_genai/blob/main/02_sequence_proteinmpnn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# 🔧 Install CPU‑only PyTorch + ProteinMPNN (≈45 s)
%pip install --quiet --upgrade pip
%pip install --quiet torch==2.0.1+cpu -f https://download.pytorch.org/whl/torch_stable.html
%pip install --quiet proteinmpnn==1.0.4 biopython

print("✅ PyTorch and ProteinMPNN installed")


[31mERROR: Could not find a version that satisfies the requirement proteinmpnn==1.0.4 (from versions: 0.1.2, 0.1.3)[0m[31m
[0m[31mERROR: No matching distribution found for proteinmpnn==1.0.4[0m[31m
[0m✅ PyTorch and ProteinMPNN installed


In [None]:
# 🚀 Generate sequences with ProteinMPNN
# Purpose: pick a backbone PDB, copy it under inputs/ with a shell-safe name,
# run the protein_mpnn_run.py script shipped in the proteinmpnn package as a
# subprocess, and report where the output was written.
import subprocess, sys, shutil, re, textwrap
from pathlib import Path
from google.colab import files
import importlib.util

NUM_SEQS = 8
TEMP     = "0.1"      # CLI expects string

# 1️⃣  Get backbone
# outputs/last_backbone.txt, when present, is read as the path of the PDB to use.
record = Path("outputs/last_backbone.txt")
pdb_src = Path(record.read_text().strip()) if record.is_file() else None
if pdb_src is None or not pdb_src.is_file():
    # No usable recorded path: fall back to an interactive upload.
    print("⚠️ Backbone not found automatically — please upload a PDB.")
    uploaded = files.upload()
    if not uploaded:
        sys.exit("Upload cancelled.")
    pdb_src = Path(next(iter(uploaded)))

# Replace spaces and parentheses (e.g. "design (12).pdb") so the name is safe
# to show and pass on a command line.
safe = re.sub(r"[ ()]", "_", pdb_src.name)
pdb_dst = Path("inputs") / safe
pdb_dst.parent.mkdir(parents=True, exist_ok=True)
# shutil.copy raises SameFileError when source and destination are one file
# (e.g. the recorded backbone already lives in inputs/), so skip that case.
if pdb_src.resolve() != pdb_dst.resolve():
    shutil.copy(pdb_src, pdb_dst)
print(f"Using backbone: {pdb_dst.resolve()}")

# 2️⃣  Locate script inside wheel
# find_spec returns None when the package is not installed; fail with a clear
# message instead of an AttributeError on `.origin`.
spec = importlib.util.find_spec("proteinmpnn")
if spec is None or spec.origin is None:
    raise ModuleNotFoundError(
        "The 'proteinmpnn' package is not installed — re-run the install cell "
        "and check its pip output."
    )
pkg_dir = Path(spec.origin).parent
# next(..., None) avoids a bare StopIteration when the script is absent.
script = next(pkg_dir.rglob("protein_mpnn_run.py"), None)
if script is None:
    raise FileNotFoundError(f"protein_mpnn_run.py not found under {pkg_dir}")

# 3️⃣  Output folder
out_dir = Path("outputs/mpnn")
out_dir.mkdir(parents=True, exist_ok=True)

# 4️⃣  Minimal CLI  (let ProteinMPNN pick its own default weights)
cmd = [
    sys.executable, str(script),
    "--pdb-path",           str(pdb_dst),
    "--out-folder",         str(out_dir),
    "--num-seq-per-target", str(NUM_SEQS),
    "--sampling-temp",      TEMP,
]

print("\n• Executing:\n", " ".join(cmd), "\n")
res = subprocess.run(cmd, capture_output=True, text=True)
if res.returncode:
    # Show the captured stdout log as well: it records how far the run got.
    if res.stdout:
        print(res.stdout)
    print("❌ ProteinMPNN stderr:\n")
    print(textwrap.indent(res.stderr, "  "))
    raise RuntimeError("ProteinMPNN failed (see stderr above)")
else:
    print(res.stdout)
    print(f"✅ FASTA files saved in {out_dir}")


⚠️ Backbone not found automatically — please upload a PDB.


Saving design1_0.pdb to design1_0 (12).pdb
Using backbone: /content/inputs/design1_0__12_.pdb

• Executing:
 /usr/bin/python3 /usr/local/lib/python3.11/dist-packages/proteinmpnn/protein_mpnn_run.py --pdb-path inputs/design1_0__12_.pdb --out-folder outputs/mpnn --num-seq-per-target 8 --sampling-temp 0.1 

----------------------------------------
chain_id_jsonl is NOT loaded
----------------------------------------
fixed_positions_jsonl is NOT loaded
----------------------------------------
pssm_jsonl is NOT loaded
----------------------------------------
omit_AA_jsonl is NOT loaded
----------------------------------------
bias_AA_jsonl is NOT loaded
----------------------------------------
tied_positions_jsonl is NOT loaded
----------------------------------------
bias by residue dictionary is not loaded, or not provided
----------------------------------------
----------------------------------------
Number of edges: 48
Training noise level: 0.2A
Generating sequences for: design1_0__12

In [None]:
# 📄 Preview the most recently written FASTA file (recursive search)
# Purpose: print the contents of the newest *.fa / *.fasta under outputs/mpnn/,
# or list the directory contents when none exist.
import glob, textwrap, sys, pathlib, pprint

# look for *.fa OR *.fasta anywhere under outputs/mpnn/
fa = sorted(
    glob.glob("outputs/mpnn/**/*.fa",     recursive=True)
  + glob.glob("outputs/mpnn/**/*.fasta", recursive=True)
)

if not fa:
    # Show the directory tree for quick debugging
    print("No FASTA files found — directory contents:\n")
    for path in pathlib.Path("outputs/mpnn").rglob("*"):
        print("  ", path.relative_to("outputs/mpnn"))
    sys.exit("\nCell 2 may not have generated sequences. "
             "Check the log above.")

# outputs/mpnn/ keeps files from earlier runs, so the alphabetically first
# file can be stale. Preview the newest one instead; the path is a tie-breaker
# so the choice is deterministic when modification times are equal.
newest = max(fa, key=lambda p: (pathlib.Path(p).stat().st_mtime, p))

# read_text() opens and closes the file, unlike a bare open(...).read().
print(textwrap.indent(pathlib.Path(newest).read_text(), "  "))


  >design1_0__11_, score=2.1556, global_score=2.1556, fixed_chains=[], designed_chains=['A'], model_name=v_48_020, git_hash=unknown, seed=497
  GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG
  >T=0.1, sample=1, score=1.1238, global_score=1.1238, seq_recovery=0.0100
  AERARLEEELAEAERLIALLRELAAAGAAPEERLAALLELARTLSPERAAALRARLEAFRAAVAALPPAEAEAALRAFLDALLAELEEEAARLRERLAAL
  >T=0.1, sample=2, score=1.0550, global_score=1.0550, seq_recovery=0.0100
  AELAALRAEAAAAQALLDTLAALRKAGAPPEEILAALRALAATLPPALAARLRARLDAHRAAAAALPPAERAAALAAFLEALAAELAARLAELQARIAAL
  >T=0.1, sample=3, score=1.1465, global_score=1.1465, seq_recovery=0.0100
  SELEKLKEELANAKKLLELLKKMKKEGLSPEEILEKLIEEAKTLPPELSEKILKELNKFKEELASLPEEEREKKLKEYLEKLEKELEKKAKELEAKLKAL
  >T=0.1, sample=4, score=1.1889, global_score=1.1889, seq_recovery=0.0100
  ERRAKLEQLLAEHKKLLEKLKAMKEKGVSPEEMLKELKELAKTLSPELSKKYLEKLEKVEEEAKSLPEEEREKYLKEFLKELEEELEKKAAELEAELKAL
  >T=0.1, sample=5, score=1.2105, global_sc