In [32]:
from Bio import SeqIO
record = SeqIO.read("./3iwl/rcsb_pdb_3IWL.fasta", "fasta")
cys_positions = [i+1 for i, aa in enumerate(str(record.seq)) if aa == "C"]
print(cys_positions)


[12, 15, 41]


In [None]:
%%bash
rm -rf library/*

In [53]:
from itertools import combinations, product
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
from pathlib import Path
import random, gzip

# ───────────────────────────  USER SETTINGS  ────────────────────────────
fasta_in     = "./3iwl/rcsb_pdb_3IWL.fasta"
design_range = range(5, 22)          # inclusive (5–21)
catalytic    = {12, 15}              # MUST stay C
variants     = "random"              # "single", "double", "full", "random"
lib_size     = 200000                # only used for "random"
library_path = Path("library/library.fasta.gz")   # .gz → compressed, .fasta → plain
# ────────────────────────────────────────────────────────────────────────

record  = SeqIO.read(fasta_in, "fasta")
wt_seq  = str(record.seq)

# positions we’re allowed to mutate
positions = [p for p in design_range if p not in catalytic]
choices   = {pos: ("K", "R") for pos in positions}

# ── build mut_lists according to the library type ───────────────────────
mut_lists = []

if variants == "single":
    mut_lists = [[(p, aa)] for p in positions for aa in choices[p]]

elif variants == "double":
    mut_lists = [[(p1, a1), (p2, a2)]
                 for (p1, p2) in combinations(positions, 2)
                 for (a1, a2) in product(choices[p1], choices[p2])]

elif variants == "full":
    per_site = [(*choices[p], None) for p in positions]          # None = keep WT
    for aa_tuple in product(*per_site):
        mut = [(p, aa) for p, aa in zip(positions, aa_tuple) if aa is not None]
        if mut:                                                  # drop all-WT case
            mut_lists.append(mut)

elif variants == "random":
    seen, rng = set(), random.Random(42)
    while len(mut_lists) < lib_size:
        k   = rng.randint(1, len(positions))                     # # sites to mutate
        pos = rng.sample(positions, k)
        mut = [(p, rng.choice(choices[p])) for p in pos]
        sig = tuple(sorted(mut))
        if sig not in seen:
            seen.add(sig)
            mut_lists.append(mut)

else:
    raise ValueError("variants must be one of: single, double, full, random")

# ── assemble SeqRecord objects ──────────────────────────────────────────
base_id = record.id.split("|", 1)[0]           # remove any upstream pipes
records = []

# optional: include wild-type first
records.append(
    SeqRecord(Seq(wt_seq),
              id="WT|WT|WT",         # three fields separated by “|”
              description="wild-type")
)


for i, muts in enumerate(mut_lists, 1):
    seq_list = list(wt_seq)
    tags     = []
    for pos, aa in muts:
        idx = pos - 1
        tags.append(f"{wt_seq[idx]}{pos}{aa}")
        seq_list[idx] = aa

    var_seq  = "".join(seq_list)
    tag_str  = "-".join(tags)
    var_id   = f"{base_id}|VAR{i:05d}|{tag_str}"
    records.append(SeqRecord(Seq(var_seq), id=var_id, description=""))

print(f"Built {len(records):,} total sequences (WT + variants)")

# ── write them all at once ──────────────────────────────────────────────
library_path.parent.mkdir(parents=True, exist_ok=True)

if library_path.suffix == ".gz":
    with gzip.open(library_path, "wt") as handle:
        SeqIO.write(records, handle, "fasta")
else:
    with library_path.open("wt") as handle:
        SeqIO.write(records, handle, "fasta")

size_mb = library_path.stat().st_size / 1_000_000
print(f"Wrote {len(records):,} sequences → {library_path}  ({size_mb:.2f} MB)")


Built 200,001 total sequences (WT + variants)
Wrote 200,001 sequences → library/library.fasta.gz  (4.07 MB)


In [6]:
%%bash
git clone https://github.com/wendao/sbPCR

Cloning into 'sbPCR'...


note that you need `libsvm` installed for the next steps.

In [54]:
%%bash
rm -rf sbPCR_run && mkdir sbPCR_run
gunzip -c library/library.fasta.gz > library/library.fasta

cd sbPCR_run
python ../sbPCR/scripts/get_align.py  ../library/library.fasta
python ../sbPCR/scripts/get_feature.py
svm-predict  formated_input  ../sbPCR/models/train_v1.model  predict


+ rm -rf sbPCR_run
+ mkdir sbPCR_run
+ gunzip -c library/library.fasta.gz
+ cd sbPCR_run
+ python ../sbPCR/scripts/get_align.py ../library/library.fasta
+ python ../sbPCR/scripts/get_feature.py
+ svm-predict formated_input ../sbPCR/models/train_v1.model predict


Accuracy = 88.2122% (529276/600003) (classification)


In [67]:
import pandas as pd, pathlib

wd = pathlib.Path("sbPCR_run")

# 1 ─ read sbPCR outputs (label only) ──────────────────────────────────
align = pd.read_csv(
    wd / "align",  sep=r"\s+",  header=None,
    names=["variant", "pos", "window"]
)
pred  = pd.read_csv(
    wd / "predict", sep=r"\s+", header=None,
    names=["label"], dtype={"label": int}
)

data = pd.concat([align, pred], axis=1)

# 2 ─ is WT reactive at both catalytic sites? ──────────────────────────
wt_rows = data[data.variant.str.startswith("WT") & data.pos.isin([12, 15])]
wt_reactive = (len(wt_rows) == 2) and (wt_rows["label"] == 1).all()
print("WT reactive at both cysteines? ", wt_reactive)

# 3 ─ variants with *both* sites reactive ──────────────────────────────
both_hits = (
    data.query("pos in [12, 15] and label == 1")
        .groupby("variant")
        .size()                          # count reactive catalytic sites
        .loc[lambda s: s == 2]           # keep only if both sites reactive
        .index
)

n_hits = len(both_hits)          # integer
pct    = n_hits / lib_size * 100 # percentage

print(f"{n_hits}/{lib_size} = {pct:.2f}% variants have both Cys 12 and Cys 15 predicted reactive")

WT reactive at both cysteines?  False
20446/200000 = 10.22% variants have both Cys 12 and Cys 15 predicted reactive
