In [1]:
#!pip install numpy
#!pip install Bio
import numpy as np
import pandas as pd
from Bio.Seq import Seq
from dataclasses import dataclass
from typing import Dict, List, Tuple, Optional

In [2]:
#Import BsaI data
bsaI_empirical = pd.read_csv('bsaI_empirical.csv')
bsaI_empirical.index = bsaI_empirical['Overhang']
bsaI_empirical = bsaI_empirical.drop(columns=['Overhang'])
bsaI_empirical = bsaI_empirical + 1
bsaI_empirical

Unnamed: 0_level_0,AAAA,AAAC,AAAG,AAAT,AACA,AACC,AACG,AACT,AAGA,AAGC,...,TTCG,TTCT,TTGA,TTGC,TTGG,TTGT,TTTA,TTTC,TTTG,TTTT
Overhang,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TTTT,636,9,41,17,3,1,1,1,8,1,...,1,1,1,1,1,1,1,1,1,1
GTTT,4,477,5,46,1,21,1,2,1,16,...,1,1,1,1,1,1,1,1,1,1
CTTT,2,2,597,3,1,1,19,1,1,1,...,1,1,1,1,1,1,1,1,1,1
ATTT,9,5,2,643,1,1,1,7,1,2,...,1,1,1,1,1,1,1,1,1,1
TGTT,1,1,1,1,494,17,65,57,3,1,...,1,1,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ACAA,1,1,1,1,1,1,1,1,1,1,...,1,1,11,3,8,480,1,1,1,1
TAAA,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,362,1,11,4
GAAA,1,1,1,1,1,1,1,1,1,1,...,1,1,1,3,1,1,6,716,2,20
CAAA,1,1,1,1,1,1,1,1,1,1,...,5,1,1,1,1,1,4,1,486,1


In [3]:
# 1) Tiny helper that works with either str or Seq
def rc(s: str) -> str:
    return str(Seq(str(s)).reverse_complement())

# 2) Build blacklist 
manual_blacklist = [
    "CGCC","GCGG","GGCG","CGCG","CCGC","GGCC","CGGC","GGTG","CACC","CGCT","GCGC","GTCG","GCCG","AGCG",
    "CACG","GTGG","GGCT","GCTG","CGAC","GTTG","GTGC","GACG","AGCC","GGGC","ACGC","GCAC","CGTC",
    "CAGC","GGGG","GCGT","CGCA","GGTC","TGCG","GATG","GCCC","CGAG"
]

overhang_blacklist = manual_blacklist.copy()

# Ensure consistent casing
bsaI_empirical.index = bsaI_empirical.index.map(lambda x: str(x).upper())
bsaI_empirical.columns = bsaI_empirical.columns.map(lambda x: str(x).upper())

for codon in bsaI_empirical.index:
    codon_str = str(codon).upper()
    rc_str = rc(codon_str)

    # drop palindromes
    if codon_str == rc_str:
        overhang_blacklist.append(codon_str)
        continue

    # Look up efficiency
    if (codon_str in bsaI_empirical.index) and (rc_str in bsaI_empirical.columns):
        eff = bsaI_empirical.at[codon_str, rc_str]
        if eff < 300:
            overhang_blacklist.append(codon_str)
    else:
        print(f"Warning: no data for {codon_str} → {rc_str}")

# Deduplicate + sort
overhang_blacklist = sorted(set(overhang_blacklist))
print("Final overhang blacklist:", overhang_blacklist)

Final overhang blacklist: ['AATT', 'ACGC', 'ACGT', 'AGCC', 'AGCG', 'AGCT', 'ATAT', 'CAAC', 'CACC', 'CACG', 'CAGC', 'CATG', 'CCAC', 'CCCC', 'CCGC', 'CCGG', 'CGAC', 'CGAG', 'CGCA', 'CGCC', 'CGCG', 'CGCT', 'CGGC', 'CGTC', 'CTAG', 'GACG', 'GATC', 'GATG', 'GCAC', 'GCCC', 'GCCG', 'GCGC', 'GCGG', 'GCGT', 'GCTG', 'GGCC', 'GGCG', 'GGCT', 'GGGC', 'GGGG', 'GGTC', 'GGTG', 'GTAC', 'GTCG', 'GTGC', 'GTGG', 'GTTG', 'TATA', 'TCGA', 'TGCA', 'TGCG', 'TTAA']


In [4]:
def _score_breaks_with_fixed(gene, internal_breaks, empirical, overhang_blacklist, fixed_breaks):
    all_breaks = sorted(set(internal_breaks) | set(fixed_breaks))
    fwd = [str(gene[b:b+4]) for b in all_breaks]
    rc  = [str(gene[b:b+4].reverse_complement()) for b in all_breaks]
    all_ov = fwd + rc
    if len(set(all_ov)) != len(all_ov): return 0.0
    if set(all_ov) & set(overhang_blacklist): return 0.0
    try:
        sub = empirical.loc[all_ov, all_ov].copy()
    except KeyError:
        return 0.0
    rs = sub.sum(axis=1)
    if (rs == 0).any(): return 0.0
    sub = sub.div(rs, axis=0)
    score = 1.0
    for f, r in zip(fwd, rc):
        score *= float(sub.loc[f, r]) if (f in sub.index and r in sub.columns) else 0.0
        if score == 0.0: break
    return score

def _frag_lengths(internal_breaks, last_bp):
    bounds = [0] + sorted(internal_breaks) + [last_bp]
    return [int((bounds[i+1] - bounds[i]) + 4) for i in range(len(bounds)-1)]

def design_optimal_breakpoints_variable(
    gene,
    size_specs,                 # (min,max) or list of (min_i,max_i) per fragment
    empirical,                  # pd.DataFrame (4-mer x 4-mer)
    overhang_blacklist=(),
    slack=6,
    max_coord_descent_iters=2,
    force_end_overhangs=True,
    print_fragments=True,
    print_overhangs_and_score=True
):
    """
    Optimize Golden-Gate breakpoints with per-fragment size constraints.
    - Ends (0 and len-4) are fixed overhangs for scoring/uniqueness (not standalone fragments).
    - Fragment size = (right_boundary - left_boundary) + 4 (includes right 4-nt overhang).
    - size_specs: (min,max) for all fragments OR list of (min_i,max_i) per fragment (which fixes N).
    """
    if isinstance(gene, str):
        gene = Seq(gene)
    L = len(gene)
    if L < 8:
        raise ValueError("Sequence too short for 4-nt overhangs.")
    last_bp = L - 4

    # Normalize size_specs
    if isinstance(size_specs, tuple) and len(size_specs) == 2:
        global_min, global_max = size_specs
        if global_min < 8 or global_max < global_min:
            raise ValueError("Invalid global size_specs.")
        per_ranges = None
    elif isinstance(size_specs, (list, tuple)) and all(len(t) == 2 for t in size_specs):
        per_ranges = list(size_specs)
        for (mn, mx) in per_ranges:
            if mn < 8 or mx < mn:
                raise ValueError("Invalid per-fragment size_specs.")
    else:
        raise ValueError("size_specs must be (min,max) or list of (min,max).")

    fixed_breaks = {0, last_bp} if force_end_overhangs else set()

    # Decide N and initial internal breaks
    if per_ranges is None:
        # choose N under global bounds by average length
        N_candidates = []
        for N in range(1, max(2, int(np.ceil((L+4)/global_min)) + 1)):
            avg_len = (L + 4) / N
            if global_min <= avg_len <= global_max:
                N_candidates.append((N, abs(avg_len - (global_min+global_max)/2.0)))
        if N_candidates:
            N = sorted(N_candidates, key=lambda x: (x[1], x[0]))[0][0]
        else:
            N = int(np.clip(np.round((L+4)/((global_min+global_max)/2.0)), 1, max(1, int(np.ceil((L+4)/global_min)))))
        per_ranges = [size_specs] * N
    else:
        N = len(per_ranges)

    # Initial internal breaks based on desired mean lengths
    if N == 1:
        internal_breaks = []
    else:
        means = np.array([(mn + mx) / 2.0 for (mn, mx) in per_ranges], dtype=float)
        deltas = np.maximum(1.0, means - 4.0)  # subtract the +4 overhang accounting
        scale = last_bp / deltas.sum()
        deltas *= scale
        cum = np.cumsum(deltas[:-1])  # internal boundaries
        internal = [int(round(x)) for x in cum]
        # clamp strictly inside (1..last_bp-1) and strictly increasing
        internal = [min(max(1, b), last_bp-1) for b in internal]
        internal.sort()
        for i in range(1, len(internal)):
            if internal[i] <= internal[i-1]:
                internal[i] = min(last_bp-1, internal[i-1] + 1)
        internal_breaks = internal

    # Constraint helper
    def within_bounds(brks):
        lens = _frag_lengths(brks, last_bp)
        if len(lens) != N: return False
        return all(mn <= l <= mx for l, (mn, mx) in zip(lens, per_ranges))

    # Greedy nudge to satisfy per-fragment ranges
    tries = 0
    while (not within_bounds(internal_breaks)) and tries < 4 * max(1, N):
        lens = _frag_lengths(internal_breaks, last_bp)
        # choose fragment with largest violation
        def viol(l, rng):
            mn, mx = rng
            return (mn - l) if l < mn else ((l - mx) if l > mx else 0)
        v = [abs(viol(l, r)) for l, r in zip(lens, per_ranges)]
        idx = int(np.argmax(v))
        if v[idx] == 0: break
        # boundaries: [0] + internal + [last_bp]
        if 0 < idx < len(internal_breaks)+1:
            brk_idx = idx - 1
            step = -1 if lens[idx] > per_ranges[idx][1] else 1  # shrink or grow
            newpos = internal_breaks[brk_idx] + step
            newpos = min(max(1, newpos), last_bp-1)
            internal_breaks[brk_idx] = newpos
            # keep strictly increasing
            if brk_idx > 0 and internal_breaks[brk_idx] <= internal_breaks[brk_idx-1]:
                internal_breaks[brk_idx] = min(last_bp-1, internal_breaks[brk_idx-1] + 1)
            if brk_idx < len(internal_breaks)-1 and internal_breaks[brk_idx] >= internal_breaks[brk_idx+1]:
                internal_breaks[brk_idx] = max(1, internal_breaks[brk_idx+1] - 1)
        elif idx == 0 and internal_breaks:
            internal_breaks[0] = min(last_bp-1, internal_breaks[0] - 1 if lens[0] > per_ranges[0][1] else internal_breaks[0] + 1)
        elif idx == len(lens)-1 and internal_breaks:
            internal_breaks[-1] = max(1, internal_breaks[-1] + 1 if lens[-1] < per_ranges[-1][0] else internal_breaks[-1] - 1)
        tries += 1

    # Score & refine (coordinate descent)
    best = internal_breaks[:]
    best_score = _score_breaks_with_fixed(gene, best, empirical, overhang_blacklist, fixed_breaks) \
                 if (best or fixed_breaks) else 1.0
    for _ in range(max_coord_descent_iters):
        improved = False
        for i in range(len(best)):
            base = best[i]
            lo = max(1, base - slack)
            hi = min(last_bp-1, base + slack)
            local_pos, local_score = base, best_score
            for pos in range(lo, hi+1):
                trial = best[:]
                trial[i] = pos
                trial.sort()
                if (i>0 and trial[i] <= trial[i-1]) or (i < len(trial)-1 and trial[i] >= trial[i+1]):
                    continue
                if not within_bounds(trial):
                    continue
                s = _score_breaks_with_fixed(gene, trial, empirical, overhang_blacklist, fixed_breaks)
                if s > local_score:
                    local_score, local_pos = s, pos
            if local_pos != base:
                best[i] = local_pos
                best_score = local_score
                improved = True
        if not improved:
            break

    # Build fragments (boundaries [0] + best + [last_bp])
    bounds = [0] + best + [last_bp]
    fragments, frag_lengths = [], []
    for i in range(len(bounds)-1):
        frag_seq = gene[bounds[i] : bounds[i+1] + 4]  # include +4 overhang
        fragments.append(str(frag_seq))
        frag_lengths.append(len(frag_seq))
        if print_fragments:
            mn, mx = per_ranges[i]
            print(f"Fragment {i+1} ({len(frag_seq)} bp; target {mn}–{mx}): {frag_seq}")

    # Overhangs (start, internal, end)
    all_breaks_for_scoring = sorted(set(best) | fixed_breaks)
    overhangs_all = [str(gene[b:b+4]) for b in all_breaks_for_scoring]

    if print_overhangs_and_score:
        print("\nOverhangs (start, internal…, end):")
        for j, b in enumerate(all_breaks_for_scoring, start=1):
            print(f"  Break {j} @ {b}: {gene[b:b+4]}  (RC: {gene[b:b+4].reverse_complement()})")
        print(f"\nOverall fidelity score: {best_score:.6g}\n")

    return {
        "internal_breaks": best,
        "all_breaks_for_scoring": all_breaks_for_scoring,
        "fragment_lengths": frag_lengths,
        "n_fragments": len(fragments),
        "fragments": fragments,
        "overhangs_all": overhangs_all,
        "score": float(best_score),
        "per_fragment_ranges": per_ranges,
    }


In [59]:
ranges = [(1500, 1740), (1500, 1740), (1500, 1740), (1500, 1780), (1300, 1780)]
gene=Seq("ACGACACCTGCTTCaGCCACCATGATTCCTGCCAGATTTGCCGGGGTGCTGCTTGCTCTGGCCCTCATTTTGCCAGGGACCCTTTGTGCAGAAGGAACTCGCGGCAGGTCATCCACGGCCCGATGCAGCCTTTTCGGAAGTGACTTCGTCAACACCTTTGATGGGAGCATGTACAGCTTTGCGGGATACTGCAGTTACCTCCTGGCAGGGGGCTGCCAGAAACGCTCCTTCTCGATTATTGGGGACTTCCAGAATGGCAAGAGAGTGAGCCTCTCCGTGTATCTTGGGGAATTTTTTGACATCCATTTGTTTGTCAATGGTACCGTGACACAGGGGGACCAAAGAGTCTCCATGCCCTATGCCTCCAAAGGGCTGTATCTAGAAACTGAGGCTGGGTACTACAAGCTGTCCGGTGAGGCCTATGGCTTTGTGGCCAGGATCGATGGCAGCGGCAACTTTCAAGTCCTGCTGTCAGACAGATACTTCAACAAGACCTGCGGGCTGTGTGGCAACTTTAACATCTTTGCTGAAGATGACTTTATGACCCAAGAAGGGACCTTGACCTCGGACCCTTATGACTTTGCCAACTCATGGGCTCTGAGCAGTGGAGAACAGTGGTGTGAACGGGCATCTCCTCCCAGCAGCTCATGCAACATCTCCTCTGGGGAAATGCAGAAGGGCCTGTGGGAGCAGTGCCAGCTTCTGAAGAGCACCTCGGTGTTTGCCCGCTGCCACCCTCTGGTGGACCCCGAGCCTTTTGTGGCCCTGTGTGAGAAGACTTTGTGTGAGTGTGCTGGGGGGCTGGAGTGCGCCTGCCCTGCCCTCCTGGAGTACGCCCGGACCTGTGCCCAGGAGGGAATGGTGCTGTACGGCTGGACCGACCACAGCGCGTGCAGCCCAGTGTGCCCTGCTGGTATGGAGTATAGGCAGTGTGTGTCCCCTTGCGCCAGGACCTGCCAGAGCCTGCACATCAATGAAATGTGTCAGGAGCGATGCGTGGATGGCTGCAGCTGCCCTGAGGGACAGCTCCTGGATGAAGGCCTCTGCGTGGAGAGCACCGAGTGTCCCTGCGTGCATTCCGGAAAGCGCTACCCTCCCGGCACCTCCCTCTCTCGAGACTGCAACACaTGCATTTGCCGAAACAGCCAGTGGATCTGCAGCAATGAAGAATGTCCAGGGGAGTGCCTTGTCACAGGTCAATCACACTTCAAGAGCTTTGACAACAGATACTTCACCTTCAGTGGGATCTGCCAGTACCTGCTGGCCCGGGATTGCCAGGACCACTCCTTCTCCATTGTCATTGAGACTGTCCAGTGTGCTGATGACCGCGACGCTGTGTGCACCCGCTCCGTCACCGTCCGGCTGCCTGGCCTGCACAACAGCCTTGTGAAACTGAAGCATGGGGCAGGAGTTGCCATGGATGGCCAGGACGTCCAGCTCCCCCTCCTGAAAGGTGACCTCCGCATCCAGCATACAGTGACGGCCTCCGTGCGCCTCAGCTACGGGGAGGACCTGCAGATGGACTGGGATGGCCGCGGGAGGCTGCTGGTGAAGCTGTCCCCCGTCTATGCCGGGAAGACCTGCGGCCTGTGTGGGAATTACAATGGCAACCAGGGCGACGACTTCCTTACCCCCTCTGGGCTGGCGGAGCCCCGGGTGGAGGACTTCGGGAACGCCTGGAAGCTGCACGGGGACTGCCAGGACCTGCAGAAGCAGCACAGCGATCCCTGCGCCCTCAACCCGCGCATGACCAGGTTCTCCGAGGAGGCGTGCGCGGTCCTGACGTCCCCCACATTCGAGGCCTGCCATCGTGCCGTCAGCCCGCTGCCCTACCTGCGGAACTGCCGCTACGACGTGTGCTCCTGCTCGGACGGCCGCGAGTGCCTGTGCGGCGCCCTGGCCAGCTATGCCGCGGCCTGCGCGGGGAGAGGCGTGCGCGTCGCGTGGCGCGAGCCAGGCCGCTGTGAGCTGAACTGCCCGAAAGGCCAGGTGTACCTGCAGTGCGGGACCCCCTGCAACCTGACCTGCCGCTCTCTCTCTTACCCGGATGAGGAATGCAATGAGGCCTGCCTGGAGGGCTGCTTCTGCCCCCCAGGGCTCTACATGGATGAGAGGGGGGACTGCGTGCCCAAGGCCCAGTGCCCCTGTTACTATGACGGTGAGATCTTCCAGCCAGAAGACATCTTCTCAGACCATCACACCATGTGCTACTGTGAGGATGGCTTCATGCACTGTACCATGAGTGGAGTCCCCGGAAGCTTGCTGCCTGACGCTGTCCTCAGCAGTCCCCTGTCTCATCGCAGCAAAAGGAGCCTATCCTGTCGGCCCCCCATGGTCAAGCTGGTGTGTCCCGCTGACAACCTGCGGGCTGAAGGGCTCGAGTGTACCAAAACGTGCCAGAACTATGACCTGGAGTGCATGAGCATGGGCTGTGTCTCTGGCTGCCTCTGCCCCCCGGGCATGGTCCGGCATGAGAACAGATGTGTGGCCCTGGAAAGGTGTCCCTGCTTCCATCAGGGCAAGGAGTATGCCCCTGGAGAAACAGTGAAGATTGGCTGCAACACTTGTGTCTGTCAGGACCGGAAGTGGAACTGCACAGACCATGTGTGTGATGCCACGTGCTCCACGATCGGCATGGCCCACTACCTCACCTTCGACGGGCTCAAATACCTGTTCCCCGGGGAGTGCCAGTACGTTCTGGTGCAGGATTACTGCGGCAGTAACCCTGGGACCTTTCGGATCCTAGTGGGGAATAAGGGATGCAGCCACCCCTCAGTGAAATGCAAGAAACGGGTCACCATCCTGGTGGAGGGAGGAGAGATTGAGCTGTTTGACGGGGAGGTGAATGTGAAGAGGCCCATGAAGGATGAGACTCACTTTGAGGTGGTGGAGTCTGGCCGGTACATCATTCTGCTGCTGGGCAAAGCCCTCTCCGTGGTCTGGGACCGCCACCTGAGCATCTCCGTGGTCCTGAAGCAGACATACCAGGAGAAAGTGTGTGGCCTGTGTGGGAATTTTGATGGCATCCAGAACAATGACCTCACCAGCAGCAACCTCCAAGTGGAGGAAGACCCTGTGGACTTTGGGAACTCCTGGAAAGTGAGCTCGCAGTGTGCTGACACCAGAAAAGTGCCTCTGGACTCATCCCCTGCCACaTGCCATAACAACATCATGAAGCAGACGATGGTGGATTCCTCCTGTAGAATCCTTACCAGTGACGTCTTCCAGGACTGCAACAAGCTGGTGGACCCCGAGCCATATCTGGATGTCTGCATTTACGACACaTGCTCCTGTGAGTCCATTGGGGACTGCGCCTGCTTCTGCGACACCATTGCTGCCTATGCCCACGTGTGTGCCCAGCATGGCAAGGTGGTGACCTGGAGGACGGCCACATTGTGCCCCCAGAGCTGCGAGGAGAGGAATCTCCGGGAGAACGGGTATGAGTGTGAGTGGCGCTATAACAGCTGTGCACCcGCCTGTCAAGTCACGTGTCAGCACCCTGAGCCACTGGCCTGCCCTGTGCAGTGTGTGGAGGGCTGCCATGCCCACTGCCCTCCAGGGAAAATCCTGGATGAGCTTTTGCAGACCTGCGTTGACCCTGAAGACTGTCCAGTGTGTGAGGTGGCTGGCCGGCGTTTTGCCTCAGGAAAGAAAGTCACCTTGAATCCCAGTGACCCTGAGCACTGCCAGATTTGCCACTGTGATGTTGTCAACCTCACCTGTGAAGCCTGCCAGGAGCCGGGAGGCCTGGTGGTGCCTCCCACAGATGCCCCGGTGAGCCCCACCACTCTGTATGTGGAGGACATCTCGGAACCGCCGTTGCACGATTTCTACTGCAGCAGGCTACTGGACCTGGTCTTCCTGCTGGATGGCTCCTCCAGGCTGTCCGAGGCTGAGTTTGAAGTGCTGAAGGCCTTTGTGGTGGACATGATGGAGCGGCTGCGCATCTCCCAGAAGTGGGTCCGCGTGGCCGTGGTGGAGTACCACGACGGCTCCCACGCCTACATCGGGCTCAAGGACCGGAAGCGACCGTCAGAGCTGCGGCGCATTGCCAGCCAGGTGAAGTATGCGGGCAGCCAGGTGGCCTCCACCAGCGAGGTCTTGAAATACACACTGTTCCAAATCTTCAGCAAGATCGACCGCCCTGAAGCCTCCCGCATCACCCTGCTCCTGATGGCCAGCCAGGAGCCCCAACGGATGTCCCGGAACTTTGTCCGCTACGTCCAGGGCCTGAAGAAGAAGAAGGTCATTGTGATCCCGGTGGGCATTGGGCCCCATGCCAACCTCAAGCAGATCCGCCTCATCGAGAAGCAGGCCCCTGAGAACAAGGCCTTCGTGCTGAGCAGTGTGGATGAGCTGGAGCAGCAAAGGGACGAGATCGTTAGCTACCTCTGTGACCTTGCCCCTGAAGCCCCTCCTCCTACTCTGCCCCCCGACATGGCACAAGTCACTGTGGGCCCGGGGCTCTTGGGGGTTTCGACCCTGGGGCCCAAGAGGAACTCCATGGTTCTGGATGTGGCGTTCGTCCTGGAAGGATCGGACAAAATTGGTGAAGCCGACTTCAACAGGAGCAAGGAGTTCATGGAGGAGGTGATTCAGCGGATGGATGTGGGCCAGGACAGCATCCACGTCACGGTGCTGCAGTACTCCTACATGGTGACTGTGGAGTACCCCTTCAGCGAGGCACAGTCCAAAGGGGACATCCTGCAGCGGGTGCGAGAGATCCGCTACCAGGGCGGCAACAGGACCAACACTGGGCTGGCCCTGCGGTACCTCTCTGACCACAGCTTCTTGGTCAGCCAGGGTGACCGGGAGCAGGCGCCCAACCTGGTCTACATGGTCACCGGAAATCCTGCCTCTGATGAGATCAAGAGGCTGCCTGGAGACATCCAGGTGGTGCCCATTGGAGTGGGCCCTAATGCCAACGTGCAGGAGCTGGAGAGGATTGGCTGGCCCAATGCCCCTATCCTCATCCAGGACTTTGAGACGCTCCCCCGAGAGGCTCCTGACCTGGTGCTGCAGAGGTGCTGCTCCGGAGAGGGGCTGCAGATCCCCACCCTCTCCCCTGCACCTGACTGCAGCCAGCCCCTGGACGTGATCCTTCTCCTGGATGGCTCCTCCAGTTTCCCAGCTTCTTATTTTGATGAAATGAAGAGTTTCGCCAAGGCTTTCATTTCAAAAGCCAATATAGGGCCTCGTCTCACTCAGGTGTCAGTGCTGCAGTATGGAAGCATCACCACCATTGACGTGCCATGGAACGTGGTCCCGGAGAAAGCCCATTTGCTGAGCCTTGTGGACGTCATGCAGCGGGAGGGAGGCCCCAGCCAAATCGGGGATGCCTTGGGCTTTGCTGTGCGATACTTGACTTCAGAAATGCATGGTGCCAGGCCGGGAGCCTCAAAGGCGGTGGTCATCCTGGTCACGGACGTCTCTGTGGATTCAGTGGATGCAGCAGCTGATGCCGCCAGGTCCAACAGAGTGACAGTGTTCCCTATTGGAATTGGAGATCGCTACGATGCAGCCCAGCTACGGATCTTGGCAGGCCCAGCAGGCGACTCCAACGTGGTGAAGCTCCAGCGAATCGAAGACCTCCCTACCATGGTCACCTTGGGCAATTCCTTCCTCCACAAACTGTGCTCTGGATTTGTTAGGATTTGCATGGATGAGGATGGGAATGAGAAGAGGCCCGGGGACGTCTGGACCTTGCCAGACCAGTGCCACACCGTGACTTGCCAGCCAGATGGCCAGACCTTGCTGAAGAGTCATCGGGTCAACTGTGACCGGGGGCTGAGGCCTTCGTGCCCTAACAGCCAGTCCCCTGTTAAAGTGGAAGAaACCTGTGGCTGCCGCTGGACCTGCCCCTGCGTGTGCACAGGCAGCTCCACTCGGCACATCGTGACCTTTGATGGGCAGAATTTCAAGCTGACTGGCAGCTGTTCTTATGTCCTATTTCAAAACAAGGAGCAGGACCTGGAGGTGATTCTCCATAATGGTGCCTGCAGCCCTGGAGCAAGGCAGGGCTGCATGAAATCCATCGAGGTGAAGCACAGTGCCCTCTCCGTCGAGCTGCACAGTGACATGGAGGTGACGGTGAATGGGAGACTGGTgTCTGTTCCTTACGTGGGTGGGAACATGGAAGTCAACGTTTATGGTGCCATCATGCATGAGGTCAGATTCAATCACCTTGGTCACATCTTCACATTCACTCCACAAAACAATGAGTTCCAACTGCAGCTCAGCCCCAAGACTTTTGCTTCAAAGACGTATGGTCTGTGTGGGATCTGTGATGAGAACGGAGCCAATGACTTCATGCTGAGGGATGGCACAGTCACCACAGACTGGAAAACACTTGTTCAGGAATGGACTGTGCAGCGGCCAGGGCAGACGTGCCAGCCCATCCTGGAGGAGCAGTGTCTTGTCCCCGACAGCTCCCACTGCCAGGTCCTCCTCTTACCACTGTTTGCTGAATGCCACAAGGTCCTGGCTCCAGCCACATTCTATGCCATCTGCCAGCAGGACAGTTGCCACCAGGAGCAAGTGTGTGAGGTGATCGCCTCTTATGCCCACCTCTGTCGGACCAACGGGGTCTGCGTTGACTGGAGGACACCTGATTTCTGTGCTATGTCATGCCCACCATCTCTGGTCTACAACCACTGTGAGCATGGCTGTCCCCGGCACTGTGATGGCAACGTGAGCTCCTGTGGGGACCATCCCTCCGAAGGCTGTTTCTGCCCTCCAGATAAAGTCATGTTGGAAGGCAGCTGTGTCCCTGAAGAGGCCTGCACTCAGTGCATTGGTGAGGATGGAGTCCAGCACCAGTTCCTGGAAGCCTGGGTCCCGGACCACCAGCCCTGTCAGATCTGCACATGCCTCAGCGGGCGGAAGGTCAACTGCACAACGCAGCCCTGCCCCACGGCCAAAGCTCCCACGTGTGGCCTGTGTGAAGTAGCCCGCCTCCGCCAGAATGCAGACCAGTGCTGCCCCGAGTATGAGTGTGTGTGTGACCCAGTGAGCTGTGACCTGCCCCCAGTGCCTCACTGTGAACGTGGCCTCCAGCCCACACTGACCAACCCTGGCGAGTGCAGACCCAACTTCACaTGCGCCTGCAGGAAGGAGGAGTGCAAAAGAGTGTCCCCACCCTCCTGCCCCCCGCACCGTTTGCCCACCCTTCGGAAGACCCAGTGCTGTGATGAGTATGAGTGTGCCTGCAACTGTGTCAACTCCACAGTGAGCTGTCCCCTTGGGTACTTGGCCTCAACTGCCACCAATGACTGTGGCTGTACCACAACCACaTGCCTTCCCGACAAGGTGTGTGTCCACCGAAGCACCATCTACCCTGTGGGCCAGTTCTGGGAGGAGGGCTGCGATGTGTGCACaTGCACCGACATGGAGGATGCCGTGATGGGCCTCCGCGTGGCCCAGTGCTCCCAGAAGCCCTGTGAGGACAGCTGTCGGTCGGGCTTCACTTACGTTCTGCATGAAGGCGAGTGCTGTGGAAGGTGCCTGCCATCTGCCTGTGAGGTGGTGACTGGCTCACCGCGGGGGGACTCCCAGTCTTCCTGGAAGAGTGTCGGCTCCCAGTGGGCCTCCCCGGAGAACCCCTGCCTCATCAATGAGTGTGTCCGAGTGAAGGAGGAGGTCTTTATACAACAAAGGAACGTCTCCTGCCCCCAGCTGGAGGTCCCTGTCTGCCCCTCGGGCTTTCAGCTGAGCTGTAAGACCTCAGCGTGCTGCCCAAGCTGTCGCTGTGAGCGCATGGAGGCCTGCATGCTCAATGGCACTGTCATTGGGCCCGGGAAGACTGTGATGATCGATGTGTGCACGACCTGCCGCTGCATGGTGCAGGTcGGGGTCATCTCTGGATTCAAGCTGGAGTGCAGGAAGACCACaTGCAACCCCTGCCCCCTGGGTTACAAGGAAGAAAATAACACAGGTGAATGTTGTGGGAGATGTTTGCCTACGGCTTGCACCATTCAGCTAAGAGGAGGACAGATCATGACACTGAAGCGTGATGAGACGCTCCAGGATGGCTGTGATACTCACTTCTGCAAGGTCAATGAGAGAGGAGAGTACTTCTGGGAGAAGAGGGTCACAGGCTGCCCACCCTTTGATGAACACAAGTGTCTGGCTGAGGGAGGTAAAATTATGAAAATTCCAGGCACaTGCTGTGACACATGTGAGGAGCCTGAGTGCAACGACATCACTGCCAGGCTGCAGTATGTCAAGGTGGGAAGCTGTAAGTCTGAAGTAGAGGTGGATATCCACTACTGCCAGGGCAAATGTGCCAGCAAAGCCATGTACTCCATTGACATCAACGATGTGCAGGACCAGTGCTCCTGCTGCTCTCCGACACGGACGGAGCCCATGCAGGTcGCCCTGCACTGCACCAATGGCTCTGTTGTGTACCATGAGGTTCTCAATGCCATGGAGTGCAAATGCTCCCCCAGGAAGTGCAGCAAGTGACAGGCATGGCAGGTGaatg".upper())  
res = design_optimal_breakpoints_variable(
    gene=gene,
    size_specs=ranges,                  # per-fragment constraints
    empirical=bsaI_empirical,
    overhang_blacklist=overhang_blacklist,
    slack=30,
    max_coord_descent_iters=30,
    print_fragments=True,
    print_overhangs_and_score=True
)

Fragment 1 (1696 bp; target 1500–1740): ACGACACCTGCTTCAGCCACCATGATTCCTGCCAGATTTGCCGGGGTGCTGCTTGCTCTGGCCCTCATTTTGCCAGGGACCCTTTGTGCAGAAGGAACTCGCGGCAGGTCATCCACGGCCCGATGCAGCCTTTTCGGAAGTGACTTCGTCAACACCTTTGATGGGAGCATGTACAGCTTTGCGGGATACTGCAGTTACCTCCTGGCAGGGGGCTGCCAGAAACGCTCCTTCTCGATTATTGGGGACTTCCAGAATGGCAAGAGAGTGAGCCTCTCCGTGTATCTTGGGGAATTTTTTGACATCCATTTGTTTGTCAATGGTACCGTGACACAGGGGGACCAAAGAGTCTCCATGCCCTATGCCTCCAAAGGGCTGTATCTAGAAACTGAGGCTGGGTACTACAAGCTGTCCGGTGAGGCCTATGGCTTTGTGGCCAGGATCGATGGCAGCGGCAACTTTCAAGTCCTGCTGTCAGACAGATACTTCAACAAGACCTGCGGGCTGTGTGGCAACTTTAACATCTTTGCTGAAGATGACTTTATGACCCAAGAAGGGACCTTGACCTCGGACCCTTATGACTTTGCCAACTCATGGGCTCTGAGCAGTGGAGAACAGTGGTGTGAACGGGCATCTCCTCCCAGCAGCTCATGCAACATCTCCTCTGGGGAAATGCAGAAGGGCCTGTGGGAGCAGTGCCAGCTTCTGAAGAGCACCTCGGTGTTTGCCCGCTGCCACCCTCTGGTGGACCCCGAGCCTTTTGTGGCCCTGTGTGAGAAGACTTTGTGTGAGTGTGCTGGGGGGCTGGAGTGCGCCTGCCCTGCCCTCCTGGAGTACGCCCGGACCTGTGCCCAGGAGGGAATGGTGCTGTACGGCTGGACCGACCACAGCGCGTGCAGCCCAGTGTGCCCTGCTGGTATGGAGTATAGGCAGTGTGTGTCCCCTTGCGCCAGGACCTGCCAG

In [10]:
try:
    from Bio.Seq import Seq
    SeqType = Union[str, Seq]
except Exception:
    Seq = str  # fall back if Biopython isn't around
    SeqType = str

def _to_seq(x: SeqType) -> SeqType:
    return Seq(x) if isinstance(x, str) and 'Seq' in globals() and Seq is not str else x

def add_manual_flanks(
    res: dict,
    left: Optional[SeqType] = None,
    right: Optional[SeqType] = None,
    per_fragment: Optional[Dict[int, Dict[str, SeqType]]] = None,
    inplace: bool = False,
    output_key: str = "fragments_flanked"
) -> dict:
    """
    Add flanking sequences to each fragment in res['fragments'].

    Parameters
    ----------
    res : dict
        Must contain 'fragments': List[str|Seq]
    left : str|Seq or None
        Global left flank to prepend to every fragment (default: none)
    right : str|Seq or None
        Global right flank to append to every fragment (default: none)
    per_fragment : dict or None
        Optional per-fragment overrides like:
            {
              0: {"left": "AAAA", "right": "TTTT"},
              2: {"left": "GG",   "right": ""}  # empty/right-only allowed
            }
        Any key can be omitted; e.g., {"left": "..."} only.
    inplace : bool
        If True, modifies and returns the same dict; else returns a shallow copy.
    output_key : str
        Key name to store the flanked fragments.

    Returns
    -------
    dict
        res with:
          - res[output_key]: new list of flanked fragments
          - res['flanks_used']: list of dicts with left/right used per fragment
          - res['total_added_bp']: total length added across all fragments
    """
    if "fragments" not in res:
        raise KeyError("res must contain a 'fragments' key (list of sequences).")

    frags: List[SeqType] = res["fragments"]
    n = len(frags)
    if n == 0:
        out = res if inplace else dict(res)
        out[output_key] = []
        out["flanks_used"] = []
        out["total_added_bp"] = 0
        return out

    Lg = _to_seq(left) if left else _to_seq("")
    Rg = _to_seq(right) if right else _to_seq("")
    per_fragment = per_fragment or {}

    new_frags: List[SeqType] = []
    flanks_used: List[Dict[str, str]] = []
    total_added = 0

    for i, f in enumerate(frags):
        fi = _to_seq(f)
        Li = _to_seq(per_fragment.get(i, {}).get("left", Lg))
        Ri = _to_seq(per_fragment.get(i, {}).get("right", Rg))

        # Combine
        flanked = (Li or _to_seq("")) + fi + (Ri or _to_seq(""))
        new_frags.append(flanked)

        # Record keeping (store as plain strings for readability)
        Li_s = str(Li) if Li is not None else ""
        Ri_s = str(Ri) if Ri is not None else ""
        flanks_used.append({"index": i, "left": Li_s, "right": Ri_s})
        total_added += len(Li_s) + len(Ri_s)

    out = res if inplace else dict(res)
    out[output_key] = new_frags
    out["flanks_used"] = flanks_used
    out["total_added_bp"] = total_added
    return out

In [52]:
res2 = add_manual_flanks(
    res,
    left="CGTCTCA",           # prepend to each fragment
    right="AGAGACG",          # append to each fragment
    inplace=False
)
def print_fragments(res, key="fragments_flanked"):
    """
    Pretty-print the fragments from a res dict.
    """
    if key not in res:
        raise KeyError(f"'{key}' not found in res. Available keys: {list(res.keys())}")

    print(f"\nPrinting {len(res[key])} fragments from '{key}':\n")
    for i, frag in enumerate(res[key], start=1):
        print(f"Fragment {i:>2}: {frag}")
print_fragments(res2)


Printing 2 fragments from 'fragments_flanked':

Fragment  1: CGTCTCAGGAGGTTAATGTGGCTCTGGTTCTGGGTGGCCTTTTGCTCACATGTCCTGCAGGCAGCTGCGCGCTCGCTCGCTCACTGAGGCCGCCCGGGCGTCGGGCGACCTTTGGTCGCCCGGCCTCAGTGAGCGAGCGAGCGCGCAGAGAGGGAGTGGCCAACTCCATCACTAGGGGTTCCTGCGGCCGCACGCGTGGTTTGTACCGTACACCACTGAGACCGCGGTGGTTGACCAGACAAACCCTGATTTTGTAGGTAACCACGTGTGGACCGAGCGGCCGCAGGAACCCCTAGTGATGGAGTTGGCCACTCCCTCTCTGCGCGCTCGCTCGCTCACTGAGGCCGGGCGACCAAAGGTCGCCCGACGCCCGGGCTTTGCCCGGGCGGCCTCAGTGAGCGAGCGAGCGCGCAGCTGCCTGCAGGGGCCGTTACATAACTTACGGTAAATGGCCCGCCTGGCTGACCGCCCAACGACCCCCGCCCATTGACGTCAATAATGACGTATGTTCCCATAGTAACGCCAATAGGGACTTTCCATTGACGTCAATGGGTGGAGTATTTACGGTAAACTGCCCACTTGGCAGTACATCAAGTGTATCATATGCCAAGTACGCCCCCTATTGACGTCAATGACGGTAAATGGCCCGCCTGGCATTATGCCCAGTACATGACCTTATGGGACTTTCCTACTTGGCAGTACATCTACGTATTAGTCATCGCTATTACCATGGTGATGCGGTTTTGGCAGTACATCAATGGGCGTGGATAGCGGTTTGACTCACGGGGATTTAGAGACG
Fragment  2: CGTCTCAATTTCCAAGTCTCCACCCCATTGACGTCAATGGGAGTTTGTTTTGGCACCAAAATCAACGGGACTTTCCAAAATGTCGTAACAACTCCGCCCCATTGACGCAAATGGGCGGTAGGCGT