# Design primers
Design the actual primers.

Import Python modules:

In [None]:
import os
import re

import Bio.Seq
import Bio.SeqIO
from Bio.SeqUtils import MeltingTemp

import numpy

import pandas as pd

import yaml

Read configuration:

In [None]:
# When the notebook is run interactively from a subdirectory the config file
# is not in the working directory, so step up one level before opening it.
config_in_cwd = os.path.isfile("config.yaml")
if not config_in_cwd:
    os.chdir("../")

with open("config.yaml") as config_handle:
    config = yaml.safe_load(config_handle)

Get the primer specs:

In [None]:
# Copy the primer-design settings out of the config into plain variables.
# Each list of offsets is iterated over below, and every offset in it is
# passed to `design_primer` as its `offset` argument.
targeted_mutations_offsets = config["targeted_mutations_offsets"]
saturated_sites_offsets = config["saturated_sites_offsets"]
# Passed to `design_primer` as `min_tm`, `min_length` and `max_length`.
primer_min_tm = config["primer_min_tm"]
primer_min_length = config["primer_min_length"]
primer_max_length = config["primer_max_length"]

# Echo the settings; the f-string `=` specifier prints each as `name=value`.
print(
    f"{targeted_mutations_offsets=}\n"
    f"{saturated_sites_offsets=}\n"
    f"{primer_min_tm=}\n"
    f"{primer_min_length=}\n"
    f"{primer_max_length=}\n"
)

Read the gene and get the coordinate where the sequence starts:

In [None]:
# The FASTA record must consist of one upper-case run (the gene), optionally
# flanked on either side by lower-case sequence.
extended_record = Bio.SeqIO.read(config["extended_gene"], "fasta")
extended_gene = str(extended_record.seq)

m = re.fullmatch("[acgt]*(?P<gene>[ACGT]+)[acgt]*", extended_gene)
assert m, "not single upper case gene"

# The gene itself, and the 0-based index in `extended_gene` where it begins.
gene = m.group("gene")
gene_start = m.start("gene")

Read targeted mutations and sites to saturate:

In [None]:
# Table of individual mutations to make; the primer-design cell below reads
# its `sequential_site` and `mutant_aa` columns.
targeted_mutations = pd.read_csv(config["targeted_mutations"])
print(f"{len(targeted_mutations)=}")

# Table of sites to saturate; the primer-design cell below reads its
# `sequential_site` column.
saturated_sites = pd.read_csv(config["saturated_sites"])
print(f"{len(saturated_sites)=}")

Get the most preferred human codon for each amino acid:

In [None]:
# After sorting by ascending frequency, the last row in each amino-acid group
# is that amino acid's most frequent codon.
codon_freqs = pd.read_csv(config["human_codon_freqs"])
top_codon_per_aa = codon_freqs.sort_values("frequency").groupby("aa").last()
aa_to_codon = top_codon_per_aa["codon"].to_dict()

# A deletion is written as a gap "codon"; `design_primer` strips the "-"
# characters from the primer it returns.
aa_to_codon["-"] = "---"

Define function to design primers:

In [None]:
def design_primer(
    residue,
    mutant_codon,
    min_tm,
    min_length,
    max_length,
    offset,
    extended_gene,
    gene_start,
):
    """Design a primer that replaces one codon of the gene with `mutant_codon`.

    The primer is `mutant_codon` flanked on both sides by sequence copied from
    `extended_gene`. Starting from the initial flank lengths, the primer is
    lengthened one nucleotide at a time, alternating between the downstream
    and upstream flank (downstream first), until its melting temperature
    reaches `min_tm` or its length reaches `max_length`.

    Parameters
    ----------
    residue : int
        1-based number of the codon within the gene.
    mutant_codon : str
        Exactly three characters placed where the codon was. Any "-"
        characters are removed from the returned primer, so "---" yields a
        primer with the codon deleted.
    min_tm : float
        Extension stops once `MeltingTemp.Tm_NN(primer, strict=False)` is at
        least this value.
    min_length : int
        Sets the initial flank lengths (see the note in the body about how
        `offset` changes the starting length).
    max_length : int
        The primer is never extended beyond this length. The length is
        measured with the codon counted as three characters, i.e. before any
        "-" characters are removed.
    offset : int
        Added to the initial upstream flank length and subtracted from the
        initial downstream flank length.
    extended_gene : str
        The gene together with its flanking sequence.
    gene_start : int
        0-based index in `extended_gene` of the first nucleotide of the gene.

    Returns
    -------
    str
        The primer in the same orientation as `extended_gene`.

    Raises
    ------
    AssertionError
        If `mutant_codon` is not three characters long, or if
        `extended_gene` does not extend far enough upstream or downstream
        of the codon for the requested flanks.
    """
    assert len(mutant_codon) == 3
    codon_start = 3 * (residue - 1) + gene_start
    codon_end = codon_start + 3

    # Initial flank lengths.
    # NOTE(review): `offset` is already included in `upstream_length` and is
    # subtracted again here, so the starting primer length is
    # `min_length - offset` rather than `min_length`. Left unchanged because
    # altering it would change every primer designed at a non-zero offset;
    # confirm whether this is intended.
    upstream_length = (min_length - 3) // 2 + offset
    downstream_length = min_length - 3 - upstream_length - offset

    def seq(up_length, down_length):
        """Return the primer with the given flank lengths."""
        i_start = codon_start - up_length
        assert i_start >= 0, "insufficient upstream sequence"
        i_end = codon_end + down_length
        assert i_end <= len(extended_gene), "insufficient downstream sequence"
        return (
            extended_gene[i_start:codon_start]
            + mutant_codon
            + extended_gene[codon_end:i_end]
        )

    s = seq(upstream_length, downstream_length)
    down_first = True
    # Only extend while still strictly shorter than `max_length`, so the
    # primer can never end up longer than `max_length`. The length test comes
    # first so the melting temperature is not computed when no extension is
    # possible anyway.
    while (
        (upstream_length + downstream_length + 3) < max_length
        and MeltingTemp.Tm_NN(s, strict=False) < min_tm
    ):
        if down_first:
            downstream_length += 1
        else:
            upstream_length += 1
        down_first = not down_first
        s = seq(upstream_length, downstream_length)
    return s.replace("-", "")

Design the targeted primers at each offset:

In [None]:
def _targeted_primer(row, signed_offset):
    """Forward-strand primer for the mutation in `row` at `signed_offset`."""
    return design_primer(
        residue=row["sequential_site"],
        mutant_codon=aa_to_codon[row["mutant_aa"]],
        min_tm=primer_min_tm,
        min_length=primer_min_length,
        max_length=primer_max_length,
        offset=signed_offset,
        extended_gene=extended_gene,
        gene_start=gene_start,
    )


def _reverse_complement(primer):
    """Reverse complement of `primer` as a plain string."""
    return str(Bio.Seq.Seq(primer).reverse_complement())


for offset in targeted_mutations_offsets:
    # Forward primers are designed at the offset as given.
    forward_column = f"primers_forward_offset{offset}"
    targeted_mutations[forward_column] = targeted_mutations.apply(
        lambda row: _targeted_primer(row, offset),
        axis=1,
    )

    # Reverse primers are designed at the negated offset and then reverse
    # complemented.
    reverse_column = f"primers_reverse_offset{offset}"
    reverse_template = targeted_mutations.apply(
        lambda row: _targeted_primer(row, -offset),
        axis=1,
    )
    targeted_mutations[reverse_column] = reverse_template.map(_reverse_complement)

print(f"Writing {config['targeted_mutations_w_oligos']}")
targeted_mutations.to_csv(config["targeted_mutations_w_oligos"], index=False)

Design saturated primers at each offset:

In [None]:
def _saturation_primers(row, signed_offset):
    """Return the [NNC, NNG] forward-strand primers for the site in `row`."""
    return [
        design_primer(
            residue=row["sequential_site"],
            mutant_codon=degenerate_codon,
            min_tm=primer_min_tm,
            min_length=primer_min_length,
            max_length=primer_max_length,
            offset=signed_offset,
            extended_gene=extended_gene,
            gene_start=gene_start,
        )
        for degenerate_codon in ("NNC", "NNG")
    ]


for offset in saturated_sites_offsets:

    # Each cell of the new column holds a two-element list: the NNC primer
    # followed by the NNG primer.
    saturated_sites[
        f"primers_forward_offset{offset}"
    ] = saturated_sites.apply(
        lambda row: _saturation_primers(row, offset),
        axis=1,
    )

    # Reverse primers are designed at the negated offset before being reverse
    # complemented, matching how the targeted-mutation reverse primers are
    # designed. Previously the un-negated offset was used here, which shifted
    # the codon the opposite way within the reverse oligo and made each
    # reverse primer the exact complement of its forward primer.
    saturated_sites[
        f"primers_reverse_offset{offset}"
    ] = saturated_sites.apply(
        lambda row: [
            str(Bio.Seq.Seq(primer).reverse_complement())
            for primer in _saturation_primers(row, -offset)
        ],
        axis=1,
    )

print(f"Writing {config['saturated_sites_w_oligos']}")
saturated_sites.to_csv(config["saturated_sites_w_oligos"], index=False)

Make [oPool](https://www.idtdna.com/pages/products/custom-dna-rna/dna-oligos/custom-dna-oligos/opools-oligo-pools) order sheets:

In [None]:
prefix = config["opool_prefix"]

# (primer table, pool type label, whether each cell holds a list of primers)
primer_tables = (
    (targeted_mutations, "targeted_mutations", False),
    (saturated_sites, "saturated_sites", True),
)

dfs = []
pool_stats = []
for df, name, explode in primer_tables:
    for orientation in ("forward", "reverse"):
        pool_name = f"{prefix}{name}_{orientation}_oPool"
        column_prefix = f"primers_{orientation}"

        # Gather the primers from every offset column of this orientation.
        primers = []
        for c in df.columns:
            if not c.startswith(column_prefix):
                continue
            column = df.explode(c)[c] if explode else df[c]
            primers.extend(column.tolist())
        dfs.append(pd.DataFrame({"Pool name": pool_name, "Sequence": primers}))

        # Every N in an oligo stands for four possible nucleotides, so an
        # oligo with k of them counts as 4**k distinct sequences.
        n_unique = 0
        for p in primers:
            assert re.fullmatch("[ACGTNacgtn]+", p), p
            n_degenerate = p.count("N") + p.count("n")
            n_unique += 4**n_degenerate

        pool_stats.append((pool_name, name, len(primers), n_unique))

opools = pd.concat(dfs)

opools.to_csv(config["opools"], index=False)

Get overall pool statistics:

In [None]:
# Turn the per-pool tuples collected above into a table, one row per pool.
pool_stats_columns = ["Pool name", "pool type", "n_oligos", "unique_sequences"]
pool_stats = pd.DataFrame(pool_stats, columns=pool_stats_columns)

pool_stats.to_csv(config["opool_stats"], index=False)

# Bare expression so the notebook displays the table.
pool_stats