# SSTR2 Virtual Screening Pipeline Demo

**SSTR2 (Somatostatin Receptor Type 2)** 타겟에 대한 가상 스크리닝 파이프라인 데모입니다.

## Pipeline Overview

```
AlphaFold3 Complex (SSTR2 + Somatostatin-14)
        │
        ├─ Step 0: Binding Pocket Analysis (Biopython)
        │
        ├─ Arm 1: Small Molecule Screening
        │    └─ MolMIM (CMA-ES) → DiffDock Docking
        │
        ├─ Arm 2: Peptide Variant Analysis
        │    └─ Somatostatin Alanine Scanning → FlexPepDock
        │
        └─ Arm 3: De Novo Binder Design
             └─ RFdiffusion → ProteinMPNN → ESMFold
```

### NVIDIA NIM APIs Used
| API | Purpose |
|-----|--------|
| **MolMIM** | Small molecule generation & optimization |
| **DiffDock** | Molecular docking |
| **RFdiffusion** | De novo peptide backbone design |
| **ProteinMPNN** | Inverse folding (backbone → sequence) |
| **ESMFold** | Sequence → structure prediction |

> **Note**: API 호출이 필요한 셀은 `[API]` 라벨이 붙어 있습니다.  
> API 키 없이도 기존 결과 분석 셀은 실행 가능합니다.

---
## 0. Setup & Environment Check

In [None]:
import sys
import json
import os
from pathlib import Path
from collections import Counter

import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

# Plot style. This must be applied BEFORE the font override below: a style
# sheet can set font.* rcParams itself (the seaborn sheets do), so calling
# plt.style.use() afterwards would silently undo the Korean font selection.
# The seaborn sheets were renamed to "seaborn-v0_8-*" in matplotlib 3.6, so
# the legacy name is tried as a fallback for older installations.
for _style_name in ("seaborn-v0_8-whitegrid", "seaborn-whitegrid"):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break
else:
    print("seaborn whitegrid style not available — using matplotlib defaults")
mpl.rcParams.update({"figure.dpi": 120, "figure.figsize": (10, 5)})

# Korean-capable font, needed because several figure titles contain Hangul.
# Platforms without an entry keep matplotlib's default font.
_KOREAN_FONTS = {"darwin": "AppleGothic", "win32": "Malgun Gothic"}
_korean_font = _KOREAN_FONTS.get(sys.platform)
if _korean_font:
    mpl.rc("font", family=_korean_font)
# Render minus signs with ASCII '-' so they do not depend on the font
# providing the Unicode minus glyph.
mpl.rc("axes", unicode_minus=False)

# Project root: the parent of the current working directory, i.e. the
# notebook is expected to be run from a sub-directory of the repository.
REPO_ROOT = Path(".").resolve().parent
# Make project-local packages importable from the notebook.
sys.path.insert(0, str(REPO_ROOT))

print(f"Python:     {sys.version}")
print(f"Repo root:  {REPO_ROOT}")
print(f"Platform:   {sys.platform}")

In [None]:
# Dependency check: try to import each package and report its version
# (or "OK" when the module exposes no __version__ attribute).
deps = {
    "numpy": "np",
    "matplotlib": "mpl",
    "requests": None,
    "Bio": None,
}
for pkg in deps:
    try:
        mod = __import__(pkg)
    except ImportError:
        print(f"  ✗ {pkg:15s} NOT INSTALLED")
    else:
        ver = getattr(mod, "__version__", "OK")
        print(f"  ✓ {pkg:15s} {ver}")

# API key lookup: environment variables take precedence; otherwise the first
# existing key file under the repository root is read.
api_key = os.getenv("NGC_CLI_API_KEY") or os.getenv("NVIDIA_API_KEY")
if not api_key:
    key_files = (REPO_ROOT / "molmim.key", REPO_ROOT / "ngc.key")
    first_key_file = next((kf for kf in key_files if kf.exists()), None)
    if first_key_file is not None:
        api_key = first_key_file.read_text().strip()

# A key only counts as configured when it carries the "nvapi-" prefix.
HAS_API_KEY = bool(api_key and api_key.startswith("nvapi-"))
if HAS_API_KEY:
    # Only the last six characters are echoed.
    key_status = "✓ 설정됨 (***" + api_key[-6:] + ")"
else:
    key_status = "✗ 미설정 — API 호출 셀은 스킵됩니다"
print(f"\n  API Key:  {key_status}")

---
## 1. AlphaFold3 Complex Analysis

AlphaFold3 Server에서 예측한 **SSTR2 + Somatostatin-14** 복합체 구조를 분석합니다.

In [None]:
# Load the AlphaFold3 summary-confidence JSON files (indices 0-4).
# Files that do not exist are skipped, so the list may hold fewer than five
# entries; the printed "#" column is then the position in the list, not
# necessarily the index in the file name.
conf_dir = REPO_ROOT / "data" / "fold_test1"
summary_paths = [
    conf_dir / f"fold_test1_summary_confidences_{i}.json" for i in range(5)
]
confidences = [
    json.loads(path.read_text()) for path in summary_paths if path.exists()
]

print(f"모델 수: {len(confidences)}개\n")
print(f"{'Model':>7s} {'Ranking':>8s} {'pTM':>6s} {'ipTM':>6s} {'Disorder':>10s}")
print("-" * 45)
for i, c in enumerate(confidences):
    row = (
        f"  #{i:4d}  {c['ranking_score']:8.3f} {c['ptm']:6.3f} "
        f"{c['iptm']:6.3f} {c['fraction_disordered']:10.2%}"
    )
    print(row)

In [None]:
# Compare AlphaFold3 model confidences in three panels: ranking score per
# model, pTM vs ipTM per model, and the chain-pair PAE matrix of the model
# with the highest ranking score.
if not confidences:
    # The loading cell skips missing JSON files, so the list can be empty;
    # np.argmax raises on an empty sequence, so bail out with a message
    # instead of a traceback.
    print("AlphaFold3 confidence 파일이 없어 시각화를 건너뜁니다.")
else:
    fig, axes = plt.subplots(1, 3, figsize=(14, 4))

    models = list(range(len(confidences)))
    colors = plt.cm.viridis(np.linspace(0.2, 0.8, len(confidences)))

    # Panel 1: ranking score, annotated above each bar.
    ax = axes[0]
    scores = [c["ranking_score"] for c in confidences]
    bars = ax.bar(models, scores, color=colors, edgecolor="white", linewidth=0.8)
    ax.set_title("Ranking Score", fontweight="bold")
    ax.set_xlabel("Model")
    ax.set_ylabel("Score")
    ax.set_ylim(0, 1)
    for b, s in zip(bars, scores):
        ax.text(b.get_x() + b.get_width() / 2, s + 0.02, f"{s:.2f}",
                ha="center", fontsize=9)

    # Panel 2: pTM and ipTM as grouped bars.
    ax = axes[1]
    x = np.arange(len(models))
    w = 0.35
    ax.bar(x - w / 2, [c["ptm"] for c in confidences], w, label="pTM",
           color="#4C72B0", edgecolor="white")
    ax.bar(x + w / 2, [c["iptm"] for c in confidences], w, label="ipTM",
           color="#DD8452", edgecolor="white")
    ax.set_title("pTM vs ipTM", fontweight="bold")
    ax.set_xlabel("Model")
    ax.set_ylabel("Score")
    ax.set_ylim(0, 1)
    ax.legend()

    # Panel 3: chain-pair minimum PAE of the best-ranked model.
    # NOTE(review): tick labels assume exactly two chains ordered
    # (SST-14, SSTR2) — confirm against the AlphaFold3 job definition.
    ax = axes[2]
    best_idx = int(np.argmax(scores))
    pae = np.array(confidences[best_idx]["chain_pair_pae_min"])
    im = ax.imshow(pae, cmap="RdYlGn_r", vmin=0, vmax=8)
    ax.set_title(f"Chain-pair PAE (Model {best_idx})", fontweight="bold")
    ax.set_xticks([0, 1])
    ax.set_yticks([0, 1])
    ax.set_xticklabels(["SST-14", "SSTR2"])
    ax.set_yticklabels(["SST-14", "SSTR2"])
    # ndenumerate yields (row, col); text() takes (x=col, y=row).
    for (j, i), val in np.ndenumerate(pae):
        ax.text(i, j, f"{val:.1f}", ha="center", va="center",
                fontsize=11, fontweight="bold")
    plt.colorbar(im, ax=ax, label="PAE (Å)", shrink=0.8)

    plt.tight_layout()
    # y > 1 places the figure title above the already laid-out axes.
    plt.suptitle("AlphaFold3: SSTR2 + Somatostatin-14 복합체", fontweight="bold", y=1.02)
    plt.show()

---
## 2. Binding Pocket Analysis

AlphaFold3 복합체에서 Somatostatin(Chain A) 기준 **5Å 이내** SSTR2(Chain B) 잔기를 추출합니다.

In [None]:
from Bio.PDB import PDBParser

# Parse model 0 of the predicted complex and report per-chain residue counts
# and the total atom count.
pdb_path = REPO_ROOT / "data" / "fold_test1" / "fold_test1_model_0.pdb"
parser = PDBParser(QUIET=True)
structure = parser.get_structure("complex", str(pdb_path))
model = structure[0]

chain_a = model["A"]  # Somatostatin-14
chain_b = model["B"]  # SSTR2

n_a = sum(1 for _ in chain_a.get_residues())
n_b = sum(1 for _ in chain_b.get_residues())
n_atoms = sum(1 for _ in structure.get_atoms())

summary_lines = [
    f"PDB: {pdb_path.name}",
    f"  Chain A (Somatostatin-14): {n_a} residues",
    f"  Chain B (SSTR2):           {n_b} residues",
    f"  Total atoms:               {n_atoms}",
]
for line in summary_lines:
    print(line)

In [None]:
# Load the precomputed binding-pocket description and list its residues.
pocket_json = REPO_ROOT / "results" / "sstr2_docking" / "binding_pocket.json"
pocket = json.loads(pocket_json.read_text())

n_pocket = pocket["num_pocket_residues"]
cutoff = pocket["cutoff_angstrom"]
print(f"바인딩 포켓 잔기: {n_pocket}개 (cutoff: {cutoff}Å)")
print(f"RFdiffusion contigs: {pocket['rfdiffusion']['contigs']}")
print("\nPocket residues:")
for r in pocket["pocket_residues"]:
    print(f"  {r['chain']}{r['resid']:4d} {r['resname']}")

In [None]:
# Distribution of residue types in the pocket: collect the three-letter
# residue name of every pocket residue and count occurrences per name.
resnames = [r["resname"] for r in pocket["pocket_residues"]]
counts = Counter(resnames)

# Amino-acid property classes, keyed by class name; values are the
# three-letter residue names belonging to that class.
aa_props = {
    "hydrophobic": ["ALA", "VAL", "ILE", "LEU", "MET", "PHE", "TRP", "PRO"],
    "polar": ["SER", "THR", "ASN", "GLN", "TYR", "CYS"],
    "charged+": ["LYS", "ARG", "HIS"],
    "charged-": ["ASP", "GLU"],
    "special": ["GLY"],
}
# Plot colour for each property class (same keys as aa_props).
prop_colors = {
    "hydrophobic": "#E07B54",
    "polar": "#4C9A2A",
    "charged+": "#4169E1",
    "charged-": "#DC143C",
    "special": "#888888",
}


def get_prop(resname):
    """Return the property class of a three-letter residue name.

    The first class in ``aa_props`` that lists ``resname`` wins; names that
    appear in no class fall back to ``"special"``.
    """
    matching = (prop for prop, members in aa_props.items() if resname in members)
    return next(matching, "special")

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: horizontal bars, most frequent residue type at the top,
# coloured by property class.
sorted_counts = counts.most_common()
names, vals = zip(*sorted_counts)
bar_colors = [prop_colors[get_prop(n)] for n in names]
y_positions = range(len(names))
ax1.barh(y_positions, vals, color=bar_colors, edgecolor="white", linewidth=0.5)
ax1.set_yticks(y_positions)
ax1.set_yticklabels(names)
ax1.invert_yaxis()
ax1.set_xlabel("Count")
ax1.set_title("SSTR2 Binding Pocket Residue Types", fontweight="bold")
for i, v in enumerate(vals):
    # Count label just to the right of each bar.
    ax1.text(v + 0.1, i, str(v), va="center", fontsize=9)

# Right panel: share of each property class.
prop_counts = Counter(map(get_prop, resnames))
labels = list(prop_counts)
sizes = [prop_counts[label] for label in labels]
pie_colors = [prop_colors[label] for label in labels]
wedge_labels = [f"{label}\n({size})" for label, size in zip(labels, sizes)]
ax2.pie(sizes, labels=wedge_labels,
        colors=pie_colors, autopct="%1.0f%%", startangle=90,
        textprops={"fontsize": 10})
ax2.set_title("Pocket Residue Properties", fontweight="bold")

plt.tight_layout()
plt.show()

# NOTE(review): the sentence below is printed regardless of the actual
# hydrophobic fraction — confirm it matches the data before quoting it.
n_hydrophobic = prop_counts.get("hydrophobic", 0)
print(f"\n→ 소수성 잔기가 {n_hydrophobic}/{len(resnames)}개로 "
      f"pocket이 주로 hydrophobic interaction 기반임을 시사")

---
## 3. MolMIM: Molecule Generation & Optimization `[API]`

NVIDIA MolMIM API로 시드 분자에서 새 분자를 생성하고 물성(QED)을 최적화합니다.

In [None]:
# Initialise the MolMIM client. Without a usable API key `molmim` is set to
# None, which the [API] cells below use as their skip signal.
if not HAS_API_KEY:
    molmim = None
    print("⚠ API 키 미설정 — 기존 결과만 표시합니다.")
else:
    from bionemo.molmim_client import MolMIMClient
    molmim = MolMIMClient()
    print(f"MolMIM Client: {molmim.base_url}")
    # Show only the tail of the key.
    print(f"API Key: ***{molmim.api_key[-6:]}")

In [None]:
# [API] Generate QED-optimised molecules (CMA-ES) from a few simple seeds.
seeds = {
    "Ethanol":    "CCO",
    "Aspirin":    "CC(=O)Oc1ccccc1C(=O)O",
    "Caffeine":   "CN1C=NC2=C1C(=O)N(C(=O)N2C)C",
    "Coumarin":   "c1ccc2c(c1)cc(=O)oc2",
}

# Results per seed name; seeds whose request failed are simply absent.
gen_results = {}

if not molmim:
    print("⚠ API 키 미설정 — 스킵")
else:
    # Settings shared by every generate() call in this cell.
    generation_settings = dict(
        num_molecules=5, algorithm="CMA-ES",
        property_name="QED", min_similarity=0.3,
        particles=10, iterations=3,
    )
    for name, smi in seeds.items():
        print(f"\n--- {name} ({smi}) ---")
        try:
            mols = molmim.generate(smi=smi, **generation_settings)
            gen_results[name] = mols
            for i, mol in enumerate(mols, 1):
                # The SMILES is read from "sample", falling back to "smiles".
                s = mol.get("sample", mol.get("smiles", "?"))
                print(f"  {i}. {s:50s}  QED={mol.get('score', '?')}")
        except Exception as e:
            # Best effort: report the failure and continue with the next seed.
            print(f"  Error: {e}")

In [None]:
# Analyse a previously saved multi-round optimisation result
# (runs without an API key).
opt_file = REPO_ROOT / "bionemo" / "result_optimization_20260209_170244.json"
if not opt_file.exists():
    print("기존 최적화 결과 파일이 없습니다.")
else:
    opt_data = json.loads(opt_file.read_text())
    best_overall = opt_data["best_overall"]
    print("Multi-round QED Optimization Result")
    print(f"  Seed:       {opt_data['seed']}")
    print(f"  Target:     {opt_data['property']} maximize")
    print(f"  Rounds:     {opt_data['rounds']}")
    print(f"  Best QED:   {best_overall['score']:.4f}")
    print(f"  Best SMILES: {best_overall['sample']}")

    # Best score per round, plotted as an annotated line.
    round_results = opt_data["round_results"]
    rounds = [entry["round"] for entry in round_results]
    best_scores = [entry["best"]["score"] for entry in round_results]

    fig, ax = plt.subplots(figsize=(8, 4))
    ax.plot(rounds, best_scores, "o-", color="#4C72B0", linewidth=2, markersize=10)
    for r, s in zip(rounds, best_scores):
        ax.annotate(f"{s:.3f}", (r, s), textcoords="offset points",
                    xytext=(0, 12), ha="center", fontsize=10, fontweight="bold")
    ax.set_xlabel("Round", fontsize=12)
    ax.set_ylabel("QED Score", fontsize=12)
    ax.set_title("Multi-round CMA-ES QED Optimization", fontweight="bold", fontsize=13)
    ax.set_ylim(0, 1)
    ax.set_xticks(rounds)
    # Reference line at QED = 0.9, labelled as the drug-like threshold.
    ax.axhline(y=0.9, color="green", linestyle="--", alpha=0.5, label="Drug-like threshold")
    ax.legend()
    plt.tight_layout()
    plt.show()

---
## 4. Arm 1: Small Molecule Screening `[API]`

PubChem 검증된 SSTR2 리간드 시드에서 MolMIM으로 후보를 생성하고, DiffDock으로 도킹합니다.

### Seed Molecules (PubChem Verified)
| Name | PubChem CID | Description |
|------|------------|-------------|
| Paltusotine | 134168328 | FDA-approved oral SSTR2 agonist |
| L-054522 | 15965425 | Merck non-peptide SSTR2 agonist |
| Pasireotide | 9941444 | Multi-SST receptor agonist |
| Octreotide | 448601 | Cyclic somatostatin analogue |

In [None]:
# SSTR2 seed molecules (described in the notebook as PubChem-verified).
# Keys are "<name>\n(CID <id>)" so they can double as two-line plot labels;
# code that needs only the name takes the part before the newline.
# Values are SMILES strings, split across adjacent literals for readability
# (Python concatenates them into a single string).
SEED_MOLECULES = {
    "Paltusotine\n(CID 134168328)": (
        "C1CN(CCC1N)C2=C3C=C(C=CC3=NC=C2C4=CC(=CC(=C4)F)F)"
        "C5=CC=CC(=C5O)C#N"
    ),
    "L-054522\n(CID 15965425)": (
        "CC(C1=CNC2=CC=CC=C21)C(C(=O)NC(CCCCN)C(=O)OC(C)(C)C)"
        "NC(=O)N3CCC(CC3)N4C5=CC=CC=C5NC4=O"
    ),
    "Pasireotide\n(CID 9941444)": (
        "C1C(CN2C1C(=O)NC(C(=O)NC(C(=O)NC(C(=O)NC(C(=O)NC(C2=O)"
        "CC3=CC=CC=C3)CC4=CC=C(C=C4)OCC5=CC=CC=C5)CCCCN)"
        "CC6=CNC7=CC=CC=C76)C8=CC=CC=C8)OC(=O)NCCN"
    ),
    "Octreotide\n(CID 448601)": (
        "CC(C1C(=O)NC(CSSCC(C(=O)NC(C(=O)NC(C(=O)NC(C(=O)N1)"
        "CCCCN)CC2=CNC3=CC=CC=C32)CC4=CC=CC=C4)NC(=O)C(CC5=CC=CC=C5)N)"
        "C(=O)NC(CO)C(C)O)O"
    ),
}

# Print each seed's name, a SMILES preview truncated to 70 characters, and
# the full SMILES length.
print("SSTR2 Seed Molecules:")
for name, smi in SEED_MOLECULES.items():
    # chr(10) is "\n"; it avoids a backslash inside the f-string expression,
    # which Python versions before 3.12 reject.
    print(f"\n  {name.split(chr(10))[0]}")
    print(f"    SMILES: {smi[:70]}{'...' if len(smi) > 70 else ''}")
    print(f"    Length: {len(smi)} chars")

In [None]:
# [API] Arm 1: generate candidates from each SSTR2 seed with MolMIM and rank
# them by QED.
# NOTE(review): a DiffDockClient is created here but no docking call is made
# in this cell — confirm whether the docking step lives elsewhere.
if not molmim:
    print("⚠ API 키 미설정 — 스킵. 아래 셀에서 기존 결과를 분석합니다.")
else:
    from bionemo.diffdock_client import DiffDockClient
    diffdock = DiffDockClient()

    # Settings shared by every generate() call in this cell.
    arm1_settings = dict(
        num_molecules=5, algorithm="CMA-ES",
        property_name="QED", min_similarity=0.3,
        particles=10, iterations=3,
    )

    arm1_candidates = []
    for name, smi in SEED_MOLECULES.items():
        # Keys look like "<name>\n(CID ...)"; keep only the name.
        short_name = name.split("\n")[0]
        print(f"\n[MolMIM] {short_name} 기반 후보 생성...")
        try:
            mols = molmim.generate(smi=smi, **arm1_settings)
            for mol in mols:
                candidate = {
                    "seed": short_name,
                    "smiles": mol.get("sample", mol.get("smiles", "")),
                    "qed": mol.get("score", 0),
                }
                arm1_candidates.append(candidate)
            print(f"  -> {len(mols)}개 생성")
        except Exception as e:
            # Best effort: report the failure and continue with the next seed.
            print(f"  -> Error: {e}")

    print(f"\n총 후보: {len(arm1_candidates)}개")
    if arm1_candidates:
        # Highest QED first.
        arm1_candidates.sort(key=lambda x: x["qed"], reverse=True)
        print("\nTop 5 (QED):")
        for c in arm1_candidates[:5]:
            print(f"  {c['seed']:15s} QED={c['qed']:.3f}  {c['smiles'][:50]}")

---
## 5. Arm 2: Somatostatin Peptide Variant Analysis

Somatostatin-14 야생형의 **Alanine scanning** 및 **강화 변이체**를 분석합니다.

```
Somatostatin-14:  A  G  C  K  N  F  F  W  K  T  F  T  S  C
                  1  2  3  4  5  6  7  8  9 10 11 12 13 14
                                    ^^^^^^^^^^
                                    Pharmacophore: Phe7-Trp8-Lys9-Thr10
```

In [None]:
# Load the saved Arm 2 results and list every variant with its mutations.
arm2_file = (
    REPO_ROOT / "results" / "sstr2_docking" / "arm2_flexpep"
    / "arm2_results_20260209_235308.json"
)
arm2 = json.loads(arm2_file.read_text())

WILDTYPE = arm2["wildtype"]
pyrosetta_status = "사용 가능" if arm2["pyrosetta_available"] else "미설치 (서열 분석만)"
print(f"Wildtype: {WILDTYPE}")
print(f"Variants: {arm2['num_variants']}개")
print(f"PyRosetta: {pyrosetta_status}")

print(f"\n{'Name':>20s}  {'Sequence':>16s}  Mutations")
print("-" * 60)
for r in arm2["results"]:
    # An empty mutation list is shown as "(wildtype)".
    muts = ", ".join(r["mutations"]) if r["mutations"] else "(wildtype)"
    # Variants whose name contains "enhanced" get a star marker.
    marker = " ★" if "enhanced" in r["name"] else ""
    print(f"{r['name']:>20s}  {r['sequence']:>16s}  {muts}{marker}")

In [None]:
# Variant sequence heat map: one row per variant, one column per wildtype
# position; mutated positions and the highlighted pharmacophore region get
# distinct colours.
# NOTE(review): the highlighted set spans F6-F11, wider than the
# Phe7-Trp8-Lys9-Thr10 motif named in the section text — confirm intent.
pharmacophore = {5, 6, 7, 8, 9, 10}  # 0-indexed: F6, F7, W8, K9, T10, F11

# Integer cell codes, one per colour of the 3-entry colormap below.
# The earlier float encoding (0 / 0.3 / 1.0 with vmin=0, vmax=1) put 0.3 in
# the first of three equal-width bins, so pharmacophore cells were drawn in
# the background colour instead of blue.
CELL_PLAIN, CELL_PHARMACOPHORE, CELL_MUTATION = 0, 1, 2

fig, ax = plt.subplots(figsize=(14, 7))

variants = arm2["results"]
n_variants = len(variants)
n_res = len(WILDTYPE)

matrix = np.full((n_variants, n_res), CELL_PLAIN, dtype=int)
labels = [v["name"] for v in variants]
for i, v in enumerate(variants):
    seq = v["sequence"]
    # Variants may be shorter or longer than the wildtype; only the
    # overlapping positions are compared and drawn.
    for j in range(min(len(seq), n_res)):
        if seq[j] != WILDTYPE[j]:
            matrix[i, j] = CELL_MUTATION  # mutation takes precedence
        elif j in pharmacophore:
            matrix[i, j] = CELL_PHARMACOPHORE

from matplotlib.colors import ListedColormap
cmap = ListedColormap(["#F0F0F0", "#B8D4E3", "#E07B54"])

# vmin/vmax at -0.5/2.5 centre each integer code in its own colour bin.
im = ax.imshow(matrix, cmap=cmap, aspect="auto", vmin=-0.5, vmax=2.5)

# Overlay the residue letters: white on mutation cells, bold on any
# highlighted cell.
for i, v in enumerate(variants):
    seq = v["sequence"]
    for j in range(min(len(seq), n_res)):
        cell = matrix[i, j]
        color = "white" if cell == CELL_MUTATION else "black"
        fontweight = "bold" if cell != CELL_PLAIN else "normal"
        ax.text(j, i, seq[j], ha="center", va="center",
                fontsize=9, fontweight=fontweight, color=color)

ax.set_xticks(range(n_res))
# Tick label: wildtype residue above its 1-based position.
ax.set_xticklabels([f"{WILDTYPE[i]}\n{i+1}" for i in range(n_res)], fontsize=8)
ax.set_yticks(range(n_variants))
ax.set_yticklabels(labels, fontsize=9)
ax.set_xlabel("Residue Position (Wildtype / Position)", fontsize=11)
ax.set_title("Somatostatin-14 Variant Library\n(orange = mutation, blue = pharmacophore region)",
             fontweight="bold", fontsize=12)

plt.tight_layout()
plt.show()

---
## 6. Arm 3: De Novo Peptide Binder Design — Results

RFdiffusion → ProteinMPNN → ESMFold 3단계 파이프라인으로 설계한 de novo 펩타이드 바인더 분석입니다.

In [None]:
# Load the saved Arm 3 results and tabulate every design.
arm3_file = (
    REPO_ROOT / "results" / "sstr2_docking" / "arm3_denovo"
    / "arm3_final_20260210_000106.json"
)
arm3 = json.loads(arm3_file.read_text())

print(f"Pipeline: {arm3['pipeline']}")
print(f"  Backbones designed: {arm3['total_backbones']}")
print(f"  Sequences designed: {arm3['total_designed']}")
print(f"  ESMFold verified:   {arm3['verified']}")
print(f"\n{'Label':>15s}  {'Sequence':>25s}  {'pLDDT':>7s}  {'Len':>4s}")
print("-" * 60)
for d in arm3["designs"]:
    # Label combines the backbone index (zero-padded) and sequence index.
    label = f"bb{d['backbone_idx']:02d}_seq{d['seq_idx']}"
    seq, plddt = d["binder_sequence"], d["plddt"]
    print(f"{label:>15s}  {seq:>25s}  {plddt:7.1f}  {len(seq):4d}")

In [None]:
# Arm 3 pLDDT overview: per-design bars (coloured by backbone) and a
# per-backbone box plot, followed by summary statistics.
designs = arm3["designs"]
plddts = [d["plddt"] for d in designs]
labels_arm3 = [f"bb{d['backbone_idx']:02d}_s{d['seq_idx']}" for d in designs]
bb_ids = [d["backbone_idx"] for d in designs]

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# One colour per backbone, shared by both panels.
unique_bb = sorted(set(bb_ids))
bb_colors = plt.cm.Set2(np.linspace(0, 0.8, len(unique_bb)))
bb_cmap = dict(zip(unique_bb, bb_colors))
bar_colors = [bb_cmap[b] for b in bb_ids]

bars = ax1.bar(range(len(plddts)), plddts, color=bar_colors, edgecolor="white", linewidth=0.8)
ax1.axhline(y=70, color="green", linestyle="--", alpha=0.7, label="pLDDT ≥ 70 (confident)")
ax1.axhline(y=50, color="orange", linestyle="--", alpha=0.7, label="pLDDT ≥ 50 (low conf.)")
ax1.set_xticks(range(len(labels_arm3)))
ax1.set_xticklabels(labels_arm3, rotation=45, ha="right", fontsize=8)
ax1.set_ylabel("pLDDT")
ax1.set_title("ESMFold pLDDT per Design", fontweight="bold")
ax1.legend(fontsize=8)
ax1.set_ylim(0, 100)

# Box plot per backbone. Groups are built in sorted backbone order so that
# box i corresponds to unique_bb[i]; grouping in order of first appearance
# could pair a box with another backbone's colour whenever the designs are
# not already sorted by backbone index.
bb_data = {f"Backbone {bb}": [] for bb in unique_bb}
for d in designs:
    bb_data[f"Backbone {d['backbone_idx']}"].append(d["plddt"])

bp = ax2.boxplot(list(bb_data.values()), patch_artist=True,
                 widths=0.5, showmeans=True)
# Tick labels are set on the axis instead of via boxplot's `labels=` keyword,
# which is deprecated in recent matplotlib releases.
ax2.set_xticklabels(list(bb_data.keys()))
for patch, bb in zip(bp["boxes"], unique_bb):
    patch.set_facecolor(bb_cmap[bb])
ax2.axhline(y=70, color="green", linestyle="--", alpha=0.7)
ax2.set_ylabel("pLDDT")
ax2.set_title("pLDDT Distribution per Backbone", fontweight="bold")
ax2.set_ylim(0, 100)

plt.tight_layout()
plt.show()

# Summary statistics.
high_conf = [d for d in designs if d["plddt"] >= 70]
print(f"\npLDDT ≥ 70: {len(high_conf)}/{len(designs)}개")
print(f"Mean pLDDT: {np.mean(plddts):.1f} ± {np.std(plddts):.1f}")
best = max(designs, key=lambda x: x["plddt"])
print(f"Best: bb{best['backbone_idx']:02d}_seq{best['seq_idx']} "
      f"= {best['binder_sequence']} (pLDDT={best['plddt']:.1f})")

In [None]:
# Amino-acid composition and length distribution of the designed peptides.
all_seqs = [d["binder_sequence"] for d in designs]
all_aa = "".join(all_seqs)
aa_counts = Counter(all_aa)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# Residue frequency, most common first.
sorted_aa = aa_counts.most_common()
aas, cnts = zip(*sorted_aa)

# One-letter residue -> colour, built from per-colour residue groups.
_aa_color_groups = {
    "#E07B54": "AVILMFWP",
    "#4C9A2A": "STNQYC",
    "#4169E1": "KRH",
    "#DC143C": "DE",
    "#888888": "G",
}
aa_type_colors = {
    aa: color for color, group in _aa_color_groups.items() for aa in group
}
# Letters outside the table are drawn in gray.
bar_c = [aa_type_colors.get(a, "gray") for a in aas]

ax1.bar(aas, cnts, color=bar_c, edgecolor="white")
ax1.set_xlabel("Amino Acid")
ax1.set_ylabel("Count")
ax1.set_title("Designed Peptide AA Composition", fontweight="bold")

# Length histogram with one unit-wide bin per integer length.
lengths = [len(s) for s in all_seqs]
shortest, longest = min(lengths), max(lengths)
ax2.hist(lengths, bins=range(shortest, longest + 2), color="#4C72B0",
         edgecolor="white", align="left")
ax2.set_xlabel("Peptide Length")
ax2.set_ylabel("Count")
ax2.set_title("Designed Peptide Length Distribution", fontweight="bold")

plt.tight_layout()
plt.show()

top5 = ", ".join(f"{a}({c})" for a, c in sorted_aa[:5])
print(f"\nLength range: {shortest}-{longest} aa")
print(f"Top 5 AA: {top5}")

---
## 7. Pipeline Summary

3-Arm 가상 스크리닝 전체 결과를 요약합니다.

In [None]:
# Whole-pipeline summary: header, target/pocket facts, one boxed section per
# arm, and the list of next steps.
print("=" * 65)
print("   SSTR2 Virtual Screening Pipeline — Summary")
print("=" * 65)

# Inner width of the summary box (characters between the vertical borders).
_BOX_WIDTH = 54


def _box_row(text):
    """Return one box line with `text` left-aligned and padded to the border."""
    return f"  │{text:<{_BOX_WIDTH}}│"


# Best pLDDT, tolerating an empty design list.
_best_plddt = max((d["plddt"] for d in arm3["designs"]), default=None)
_best_plddt_text = "n/a" if _best_plddt is None else f"{_best_plddt:.1f}"

# (section title, detail lines) per arm. Every row is padded at print time,
# so the right border stays aligned whatever the interpolated values are.
_arm_sections = [
    ("Arm 1: Small Molecule Screening", [
        f"Seeds: {len(SEED_MOLECULES)} PubChem-verified SSTR2 ligands",
        "Method: MolMIM (CMA-ES QED) → DiffDock",
        "Status: API pipeline ready",
    ]),
    ("Arm 2: Peptide Variant Analysis", [
        f"Wildtype: {WILDTYPE}",
        f"Variants: {arm2['num_variants']} (Ala scan + enhanced)",
        "Method: FlexPepDock (PyRosetta required)",
        "Status: Sequence analysis complete",
    ]),
    ("Arm 3: De Novo Binder Design", [
        f"Backbones: {arm3['total_backbones']} (RFdiffusion)",
        f"Sequences: {arm3['total_designed']} (ProteinMPNN)",
        f"Verified:  {arm3['verified']} (ESMFold pLDDT check)",
        f"Best pLDDT: {_best_plddt_text}",
    ]),
]

print()
print("  Target:     SSTR2 (Somatostatin Receptor Type 2)")
print("  Complex:    AlphaFold3 — SSTR2 + Somatostatin-14")
# The cutoff comes from the pocket data rather than a hard-coded value.
print(f"  Pocket:     {pocket['num_pocket_residues']} residues "
      f"({pocket['cutoff_angstrom']}Å cutoff)")
print()

print("  ┌" + "─" * _BOX_WIDTH + "┐")
for _idx, (_title, _details) in enumerate(_arm_sections):
    if _idx:
        # Separator between consecutive arm sections.
        print("  ├" + "─" * _BOX_WIDTH + "┤")
    print(_box_row(f" {_title}"))
    for _detail in _details:
        print(_box_row(f"   {_detail}"))
print("  └" + "─" * _BOX_WIDTH + "┘")
print()

print("Next Steps:")
print("  1. Arm 1 — DiffDock 도킹 결과 confidence 기반 랭킹")
print("  2. Arm 2 — PyRosetta FlexPepDock 에너지 스코어링")
print("  3. Arm 3 — Top 디자인 AlphaFold3 복합체 재예측")
print("  4. Cross-arm 히트 우선순위화 및 실험 후보 선정")

In [None]:
# Final figure: a three-panel dashboard, one panel per pipeline arm.
fig, axes = plt.subplots(1, 3, figsize=(16, 5))
arm1_ax, arm2_ax, arm3_ax = axes

# Arm 1: SMILES length of each seed molecule (a rough proxy for complexity).
ax = arm1_ax
seed_names = [key.split("\n")[0] for key in SEED_MOLECULES]
seed_lens = [len(smiles) for smiles in SEED_MOLECULES.values()]
ax.barh(seed_names, seed_lens, color="#4C72B0", edgecolor="white")
ax.set_xlabel("SMILES Length")
ax.set_title("Arm 1: Seed Molecules", fontweight="bold")
for i, v in enumerate(seed_lens):
    ax.text(v + 1, i, str(v), va="center", fontsize=9)


def _variant_category(variant_name):
    """Map a variant name to its dashboard category (checked in this order)."""
    if variant_name == "wildtype":
        return "Wildtype"
    if "enhanced" in variant_name:
        return "Enhanced"
    if "octreotide" in variant_name:
        return "Octreotide"
    # Everything else is counted as an alanine-scan variant.
    return "Ala-scan"


# Arm 2: number of variants per category.
ax = arm2_ax
categories = {"Wildtype": 0, "Ala-scan": 0, "Enhanced": 0, "Octreotide": 0}
for r in arm2["results"]:
    categories[_variant_category(r["name"])] += 1

cat_colors = ["#2ECC71", "#E74C3C", "#3498DB", "#F39C12"]
cat_labels = [f"{k}\n({v})" for k, v in categories.items()]
ax.pie(categories.values(), labels=cat_labels,
       colors=cat_colors, autopct="%1.0f%%", startangle=90,
       textprops={"fontsize": 9})
ax.set_title("Arm 2: Variant Types", fontweight="bold")

# Arm 3: pLDDT histogram (uses `plddts` from the Arm 3 pLDDT cell).
ax = arm3_ax
ax.hist(plddts, bins=8, color="#E07B54", edgecolor="white", alpha=0.8)
ax.axvline(x=70, color="green", linestyle="--", linewidth=2, label="Confident (70)")
ax.set_xlabel("pLDDT")
ax.set_ylabel("Count")
ax.set_title("Arm 3: ESMFold pLDDT", fontweight="bold")
ax.legend(fontsize=8)

plt.suptitle("SSTR2 Virtual Screening — 3-Arm Pipeline Overview",
             fontweight="bold", fontsize=14, y=1.02)
plt.tight_layout()
plt.show()