In [4]:
from Bio import SeqIO
record = SeqIO.read("./3iwl/rcsb_pdb_3IWL.fasta", "fasta")
cys_positions = [i+1 for i, aa in enumerate(str(record.seq)) if aa == "C"]
print(cys_positions)


[12, 15, 41]


In [5]:
%%bash
rm -rf library/*

In [20]:
from itertools import combinations, product
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
import random, os, pathlib

# ───────────────────────────  USER SETTINGS  ────────────────────────────
fasta_in     = "./3iwl/rcsb_pdb_3IWL.fasta"
design_range = range(5, 22)        # inclusive (5-21)
catalytic    = {12, 15}            # MUST stay C
variants     = "random"            # "single", "double", "full", "random"
lib_size     = 2000                # only used for "random"
out_dir      = pathlib.Path("library/3iwl")  # one file per variant
out_dir.mkdir(parents=True, exist_ok=True)
# ────────────────────────────────────────────────────────────────────────

record   = SeqIO.read(fasta_in, "fasta")
wt_seq   = str(record.seq)

# positions eligible for mutation
positions = [p for p in design_range if p not in catalytic]

# allowed substitutions at every mutable position
choices = {pos: ("K", "R") for pos in positions}

# -----------------------------------------------------------------------
# build the list of mutation sets (mut_lists) according to library type
# -----------------------------------------------------------------------
mut_lists = []

if variants == "single":
    mut_lists = [[(p, aa)]                   # one tuple  (pos, newAA)
                 for p in positions
                 for aa in choices[p]]

elif variants == "double":
    mut_lists = [[(p1, a1), (p2, a2)]        # two independent positions
                 for (p1, p2) in combinations(positions, 2)
                 for (a1, a2) in product(choices[p1], choices[p2])]

elif variants == "full":
    # Cartesian product of {original, K, R} at every position
    #   – keep original by inserting None
    #   – skip the all-None case (identical to WT)
    per_site = [(*choices[p], None) for p in positions]
    for aa_tuple in product(*per_site):
        mut = [(p, aa) for p, aa in zip(positions, aa_tuple) if aa is not None]
        if mut:                          # drop WT
            mut_lists.append(mut)

elif variants == "random":
    seen = set()
    rng = random.Random(42)
    while len(mut_lists) < lib_size:
        k   = rng.randint(1, len(positions))          # how many sites to mutate
        pos = rng.sample(positions, k)
        mut = [(p, rng.choice(choices[p])) for p in pos]
        key = tuple(sorted(mut))
        if key not in seen:
            seen.add(key)
            mut_lists.append(mut)

else:
    raise ValueError("variants must be one of: single, double, full, random")

# -----------------------------------------------------------------------
# write one FASTA per variant  (header:  protein|VARxxxxx|T11K-G14R …)
# -----------------------------------------------------------------------
base_id = record.id.split("|", 1)[0]          # strip any upstream pipes
written = 0

records = []
for i, muts in enumerate(mut_lists, 1):
    seq_list = list(wt_seq)
    tags     = []

    for pos, aa in muts:
        idx = pos - 1
        tags.append(f"{wt_seq[idx]}{pos}{aa}")
        seq_list[idx] = aa

    var_seq = "".join(seq_list)
    tag_str = "-".join(tags) or "WT"            # ensure non-empty for WT case
    var_id  = f"{base_id}|VAR{i:05d}|{tag_str}"

    records.append(SeqRecord(Seq(var_seq), id=var_id, description=""))

print(f"Built {len(records):,} variant sequences in memory")

# ─────────────── write *once* to a single FASTA (gz optional) ───────────
library_path = Path("library.fasta.gz")         # ← change to .fasta if no gzip
with open(library_path, "wt") as handle:   # use open(...) for plain text
    SeqIO.write(records, handle, "fasta")

size_mb = library_path.stat().st_size / 1_000_000
print(f"Wrote {len(records):,} variants → {library_path}  ({size_mb:.2f} MB)")

Built 2,000 variant sequences in memory
Wrote 2,000 variants → library.fasta.gz  (0.25 MB)


In [6]:
%%bash
git clone https://github.com/wendao/sbPCR

Cloning into 'sbPCR'...


In [9]:
%%bash
nix-shell -p libsvm

In [21]:
from pathlib import Path
from Bio import SeqIO

src_dir   = Path('library/3iwl')
out_file  = Path('library/library.fasta')

# aggregate all variant files
records = []
for f in src_dir.glob('*.fasta'):
    records.extend(SeqIO.parse(f, 'fasta'))

_ = SeqIO.write(records, out_file, 'fasta')
print(f"Wrote {len(records):,} records to {out_file}  "
      f"({out_file.stat().st_size/1e6:.2f} MB)")
print("example header:", records[0].id)


Wrote 2,000 records to library/library.fasta  (0.25 MB)
example header: 3IWL_1|VAR00981|G13K-S7R-E5K-V19R-R21K-G14R-D9R


In [22]:
%%bash
set -eux            # -e stop on error, -u undefined var, -x echo commands

# 0. fresh scratch
rm -rf sbPCR_run && mkdir sbPCR_run
cd sbPCR_run

# 1. window extraction
python ../sbPCR/scripts/get_align.py  ../library/library.fasta


+ rm -rf sbPCR_run
+ mkdir sbPCR_run
+ cd sbPCR_run
+ python ../sbPCR/scripts/get_align.py ../library/library.fasta


In [23]:
%%bash
set -eux
cd sbPCR_run

python ../sbPCR/scripts/get_feature.py


+ cd sbPCR_run
+ python ../sbPCR/scripts/get_feature.py


In [24]:
%%bash
set -eux
cd sbPCR_run

svm-predict \
    formated_input \
    ../sbPCR/models/train_v1.model \
    predict


+ cd sbPCR_run
+ svm-predict formated_input ../sbPCR/models/train_v1.model predict


Accuracy = 89.5% (5370/6000) (classification)


In [25]:
import pandas as pd, pathlib
wd = pathlib.Path('sbPCR_run')

align = pd.read_csv(wd/'align',   sep=' ', header=None,
                    names=['variant','pos','window'])
pred = pd.read_csv(wd/'predict', sep=r'\s+', engine='python',
                   header=None, names=['label','score'])


results = pd.concat([align, pred], axis=1)

hits = (results
        .query('pos in [12, 15] and label == 1')
        .sort_values('score', ascending=False))

print(f"{len(hits)} variants predicted reactive at Cys 12 or 15")
hits.head()


630 variants predicted reactive at Cys 12 or 15


Unnamed: 0,variant,pos,window,label,score
16,VAR00055,15,------MPKHRFRRRRRCKRCKKKRRRVLNKLGGVKYDIDL,1,
22,VAR00043,15,------MPKHEFSVKMKCGGCKEAVKRVLNKLGGVKYDIDL,1,
33,VAR01537,12,---------MPKHRRRKDRRCRGCRKRKSRVLNKLGGVKYD,1,
34,VAR01537,15,------MPKHRRRKDRRCRGCRKRKSRVLNKLGGVKYDIDL,1,
39,VAR00035,12,---------MPKHRRSKRKRCRGCRKRVKRVLNKLGGVKYD,1,
