# Make oligo sequences for DMS library
Script written by Brendan Larsen

In [None]:
import pandas as pd
from Bio import AlignIO
from Bio.Seq import Seq
from Bio.Data import CodonTable
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO

In [None]:
# Standard genetic code: every DNA codon mapped to its one-letter amino acid
# ('*' marks a stop codon). Written as whitespace-separated "codon:aa" tokens,
# one row per pair of leading bases, and parsed into a dict in listed order.
full_codon_table = dict(
    token.split(':')
    for token in """
        ATA:I ATC:I ATT:I ATG:M
        ACA:T ACC:T ACG:T ACT:T
        AAC:N AAT:N AAA:K AAG:K
        AGC:S AGT:S AGA:R AGG:R
        CTA:L CTC:L CTG:L CTT:L
        CCA:P CCC:P CCG:P CCT:P
        CAC:H CAT:H CAA:Q CAG:Q
        CGA:R CGC:R CGG:R CGT:R
        GTA:V GTC:V GTG:V GTT:V
        GCA:A GCC:A GCG:A GCT:A
        GAC:D GAT:D GAA:E GAG:E
        GGA:G GGC:G GGG:G GGT:G
        TCA:S TCC:S TCG:S TCT:S
        TTC:F TTT:F TTA:L TTG:L
        TAC:Y TAT:Y TAA:* TAG:*
        TGC:C TGT:C TGA:* TGG:W
    """.split()
)

In [None]:
## Codon chosen to encode each amino acid (and the stop, '*') in the mutant oligos.
## Choices follow human codon frequencies:
## https://www.genscript.com/tools/codon-frequency-table
## Listed as (amino acid, codon) pairs and inverted into a codon -> amino acid
## dict; the listing order is the order in which mutants are generated.

codon_table = {
    codon: amino_acid
    for amino_acid, codon in (
        ('A', 'GCC'),
        ('C', 'TGC'),
        ('D', 'GAT'),  # GAT rather than GAC, because GAC can create BsmBI sites
        ('E', 'GAG'),
        ('F', 'TTC'),
        ('G', 'GGC'),
        ('H', 'CAC'),
        ('I', 'ATC'),
        ('K', 'AAG'),
        ('L', 'CTG'),
        ('M', 'ATG'),
        ('N', 'AAC'),
        ('P', 'CCC'),
        ('Q', 'CAG'),
        ('R', 'CGG'),
        ('S', 'AGC'),
        ('T', 'ACC'),
        ('V', 'GTG'),
        ('W', 'TGG'),
        ('Y', 'TAC'),
        ('*', 'TAA'),
    )
}

In [None]:
def generate_mutations(
    input_file,
    output_file,
    fwd_primer='',
    fwd_spacer='',
    fwd_overhang='',
    fwd_additional='',
    rev_additional='',
    rev_overhang='',
    rev_spacer='',
    rev_primer='',
    primer_buffer='',
    include_stop_codons=False,
    stop_codons_every_other=False,
    id_prefix='Pool1_Mutation',
):
    """
    Write a FASTA file of oligos covering every single-codon substitution of an amplicon.

    Each oligo is assembled 5'->3' as:

        primer_buffer + fwd_primer + CGTCTC + fwd_spacer + fwd_overhang + fwd_additional
        + <amplicon, with one codon replaced>
        + rev_additional + rev_overhang + rev_spacer + GAGACG + rev_primer + primer_buffer

    The first record ('WT_seq') is the unmutated amplicon in the same flanking
    context. For each complete codon of the amplicon, one record is written
    per entry of the module-level ``codon_table`` whose amino acid differs
    from the wild-type amino acid (looked up in ``full_codon_table``).
    Mutant records are named ``{id_prefix}_{codon number, 1-based}_{amino acid}``.

    The amplicon is read in frame from its first base; if its length is not a
    multiple of three, the trailing one or two bases are never mutated but are
    still included in every oligo. The sequence is upper-cased on reading so
    that lower-case (soft-masked) FASTA input matches the codon tables.

    Parameters:
    - input_file (str): Path to the input FASTA file containing the sequence to mutate.
      Only the first record is used.
    - output_file (str): Path to save the output FASTA file with mutations.
    - fwd_primer (str, optional): Forward primer sequence.
    - fwd_spacer (str, optional): Forward spacer sequence (placed after the forward BsmBI site).
    - fwd_overhang (str, optional): Forward overhang sequence.
    - fwd_additional (str, optional): Additional forward sequence placed directly before the amplicon.
    - rev_additional (str, optional): Additional reverse sequence placed directly after the amplicon.
    - rev_overhang (str, optional): Reverse overhang sequence.
    - rev_spacer (str, optional): Reverse spacer sequence (placed before the reverse BsmBI site).
    - rev_primer (str, optional): Reverse primer sequence.
    - primer_buffer (str, optional): Sequence added to both the 5' and 3' end of every oligo.
    - include_stop_codons (bool, optional): Whether to include stop codons in mutations. Default is False.
    - stop_codons_every_other (bool, optional): If True (and include_stop_codons is True),
      only include stop codons at every other codon, starting with the first. Default is False.
    - id_prefix (str, optional): Prefix of each mutant record id. Default is 'Pool1_Mutation'.

    Returns:
    - None. The records are written to ``output_file`` and a summary is printed.

    Raises:
    - KeyError: if a codon of the amplicon is not in ``full_codon_table``
      (for example, it contains a character other than A/C/G/T).
    """

    # Read input amplicon (first record of the FASTA file); upper-case it so
    # the codon lookups below do not fail on lower-case input.
    seq = Seq(str(AlignIO.read(input_file, 'fasta')[0].seq).upper())

    # Golden Gate recognition sites
    bsmbi_fwd = 'CGTCTC'
    bsmbi_rev = 'GAGACG'

    # The flanking context is identical for every oligo, so build it once.
    upstream = (primer_buffer + fwd_primer + bsmbi_fwd + fwd_spacer
                + fwd_overhang + fwd_additional)
    downstream = (rev_additional + rev_overhang + rev_spacer
                  + bsmbi_rev + rev_primer + primer_buffer)

    # Wild-type sequence goes first. description="" keeps the FASTA header as
    # just the id; without it the header would read "WT_seq <unknown description>".
    mutations = [SeqRecord(upstream + seq + downstream, id='WT_seq', description="")]

    # Loop through each complete codon and make all mutations listed in codon_table
    num_codons = len(seq) // 3
    for codon_index in range(num_codons):
        codon_start = codon_index * 3
        codon_end = codon_start + 3
        original_aa = full_codon_table[str(seq[codon_start:codon_end])]

        # Stop codons are optional, and can be limited to even 0-based codon
        # indices (i.e. the 1st, 3rd, 5th, ... codon).
        stop_allowed_here = include_stop_codons and (
            not stop_codons_every_other or codon_index % 2 == 0
        )

        for mutated_codon, amino_acid in codon_table.items():
            # Skip if the mutated amino acid is the same as the original amino acid
            if amino_acid == original_aa:
                continue
            if amino_acid == "*" and not stop_allowed_here:
                continue

            mutated_seq = (upstream
                           + seq[:codon_start] + Seq(mutated_codon) + seq[codon_end:]
                           + downstream)
            mutations.append(SeqRecord(
                mutated_seq,
                id=f"{id_prefix}_{codon_index+1}_{amino_acid}",
                description="",
            ))

    # Save the mutations as a FASTA file
    SeqIO.write(mutations, output_file, "fasta")
    print(f"Mutations saved to {output_file}")
    print(f"Total mutations generated: {len(mutations) - 1}")  # Subtract 1 to exclude the WT sequence


In [None]:
# Amplicon 1: build the oligo pool from amplicons/pool1.fasta.
# Stop codons are requested, restricted to every other codon position.
generate_mutations(
    input_file='amplicons/pool1.fasta',
    output_file='output/amplicon1_oPools.fasta',

    # stop-codon policy
    include_stop_codons=True,
    stop_codons_every_other=True,

    # 5' flank (the BsmBI site is inserted between primer and spacer)
    fwd_primer='CCAGGCAGCATGTTGTAGTG',
    fwd_spacer='A',
    fwd_overhang='TCCT',
    fwd_additional='G',

    # 3' flank (the BsmBI site is inserted between spacer and primer)
    rev_additional='GTG',
    rev_overhang='GGCG',
    rev_spacer='A',
    rev_primer='GTACGAGCCGATGGAATCAG',

    # added to both ends of every oligo
    primer_buffer='TA',
)

In [None]:
# Amplicon 2: build the oligo pool from amplicons/pool2.fasta.
# Stop-codon options are left at their defaults (no stop codons).
generate_mutations(
    input_file='amplicons/pool2.fasta',
    output_file='output/amplicon2_oPools.fasta',

    # 5' flank (the BsmBI site is inserted between primer and spacer)
    fwd_primer='TGGAGAGACTACGCACTGAC',
    fwd_spacer='A',
    fwd_overhang='CACG',
    fwd_additional='AC',

    # 3' flank (the BsmBI site is inserted between spacer and primer)
    rev_additional='AT',
    rev_overhang='CAAC',
    rev_spacer='T',
    rev_primer='GCACAGTGTACGAGATGGTC',

    # added to both ends of every oligo
    primer_buffer='ATA',
)

In [None]:
# Amplicon 3: build the oligo pool from amplicons/pool3.fasta.
# Stop-codon options are left at their defaults (no stop codons).
generate_mutations(
    input_file='amplicons/pool3.fasta',
    output_file='output/amplicon3_oPools.fasta',

    # 5' flank (the BsmBI site is inserted between primer and spacer)
    fwd_primer='TCAAGCACTGCACTCAAGTC',
    fwd_spacer='A',
    fwd_overhang='GGAC',
    fwd_additional='',

    # 3' flank (the BsmBI site is inserted between spacer and primer)
    rev_additional='G',
    rev_overhang='ATCT',
    rev_spacer='C',
    rev_primer='CCTCACTCGTCACTACCATG',

    # added to both ends of every oligo
    primer_buffer='AA',
)

In [None]:
# Amplicon 4: build the oligo pool from amplicons/pool4.fasta.
# Stop-codon options are left at their defaults (no stop codons).
generate_mutations(
    input_file='amplicons/pool4.fasta',
    output_file='output/amplicon4_oPools.fasta',

    # 5' flank (the BsmBI site is inserted between primer and spacer)
    fwd_primer='TCATCCATCCAGTACGCCAG',
    fwd_spacer='A',
    fwd_overhang='ACCG',
    fwd_additional='AG',

    # 3' flank (the BsmBI site is inserted between spacer and primer)
    rev_additional='A',
    rev_overhang='TCGG',
    rev_spacer='A',
    rev_primer='CAGCAGCATACGTCGGTAAC',

    # added to both ends of every oligo
    primer_buffer='TTA',
)

In [None]:
# Amplicon 5: build the oligo pool from amplicons/pool5.fasta.
# Stop-codon options are left at their defaults (no stop codons).
generate_mutations(
    input_file='amplicons/pool5.fasta',
    output_file='output/amplicon5_oPools.fasta',

    # 5' flank (the BsmBI site is inserted between primer and spacer)
    fwd_primer='CGAGTCCAGTTACGGAGATG',
    fwd_spacer='C',
    fwd_overhang='TATC',
    fwd_additional='',

    # 3' flank (the BsmBI site is inserted between spacer and primer)
    rev_additional='GG',
    rev_overhang='CGAA',
    rev_spacer='T',
    rev_primer='CTGTCTTGCGTCCATCCATG',

    # no extra padding at the oligo ends for this pool
    primer_buffer='',
)

In [None]:
# Amplicon 6: build the oligo pool from amplicons/pool6.fasta.
# Stop-codon options are left at their defaults (no stop codons).
generate_mutations(
    input_file='amplicons/pool6.fasta',
    output_file='output/amplicon6_oPools.fasta',

    # 5' flank (the BsmBI site is inserted between primer and spacer)
    fwd_primer='AGGTTAGACAGGCTGACGTG',
    fwd_spacer='A',
    fwd_overhang='CTCA',
    fwd_additional='G',

    # 3' flank (the BsmBI site is inserted between spacer and primer)
    rev_additional='G',
    rev_overhang='ACAC',
    rev_spacer='T',
    rev_primer='GCTGTCATCACCTCAACTGC',

    # no extra padding at the oligo ends for this pool
    primer_buffer='',
)