In [5]:
#!pip install biopython
#!pip install numpy
#!pip install pandas
#!pip install scipy
#!pip install openpyxl
#!pip install primer3
import numpy as np
import matplotlib.pyplot as plt
from Bio.Seq import Seq
from Bio import SeqIO
import pandas as pd
import itertools
import re
from scipy.spatial import distance
import json
from Bio.SeqUtils import MeltingTemp as mt
import random
import pprint
import string
import primer3


In [6]:
#Import primers
orthogonal_F = pd.read_excel('./RR_orthogonalFv1_plate.xlsx')
orthogonal_R = pd.read_excel('./RR_orthogonalRv1_plate.xlsx')


In [7]:
#Isolate 20-nt variable primer binding region and only use that to align to sequences
orthogonal_F['PrimerEnd'] = orthogonal_F.Sequence.apply(lambda x: x[-20:])
orthogonal_R['PrimerEnd'] = orthogonal_R.Sequence.apply(lambda x: x[-20:])


In [8]:
# Exclude poor primers determined empirically
bad_primers_F = ["A9", "C3", "D3", "E3", "G4", "G6", "H4"]
bad_primers_R = ["A9", "C6", "D3", "D7", "E10", "E3", "E6", "H4"]
red_primers = ["A8","A9","C3","C4","C6","C8","C9","H2","D3","D4","D8","D9","E3","E6","F3","F4","G3","G4","G6","H3","H4","H5","H7"]

orthogonal_F['Exclude'] = orthogonal_F['Well Position'].map(lambda x: True if x in bad_primers_F else False)
orthogonal_R['Exclude'] = orthogonal_R['Well Position'].map(lambda x: True if x in bad_primers_R else False)

orthogonal_F['Worse'] = orthogonal_F['Well Position'].map(lambda x: True if x in red_primers else False)
orthogonal_R['Worse'] = orthogonal_R['Well Position'].map(lambda x: True if x in red_primers else False)


In [9]:
#Check for BsaI sites and remove any primers with them
bsaI_seq = Seq('GGTCTC')
orthogonal_F['BsaI_Site_Present'] = orthogonal_F.Sequence.apply(lambda x: ( (str(bsaI_seq) in x) | (str(bsaI_seq.reverse_complement()) in x) ))
orthogonal_R['BsaI_Site_Present'] = orthogonal_R.Sequence.apply(lambda x: ( (str(bsaI_seq) in x) | (str(bsaI_seq.reverse_complement()) in x) ))


In [10]:
# Adapted from Willow Coyote-Maestas' DIMPLE paper, cite doi: 10.1186/s13059-023-02880-6
def check_nonspecific(primer, fragment, Tm_verb = 20, Tm_rem = 28, primer3_shift = 4, verbose=True):
    non = []
    # Forward
    for i in range(len(fragment) - len(primer)):  # Scan each position
        match = [
            primer[j].lower() == fragment[i + j].lower() for j in range(len(primer))
        ]
        first = 10
        for k in range(len(match) - 3):
            if (match[k] and match[k + 1] and match[k + 3]) or (
                match[k] and match[k + 1] and match[k + 2]
            ):
                first = k
                break
        if (
            sum(match[first:]) > len(primer[first:]) * 0.8
            and sum(match[first:]) > 6
            and match[-1]
        ):  # string compare - sum of matched nt is greater than 80%
            try:
                melt = mt.Tm_NN(
                    primer[first:],
                    c_seq=fragment[i + first : i + len(primer)].complement(),
                    nn_table=mt.DNA_NN2,
                    de_table=mt.DNA_DE1,
                    imm_table=mt.DNA_IMM1,
                )
                if verbose == True:
                    if melt > Tm_verb:
                        print("Found non-specific match at " + str(i + 1) + "bp:")
                        print(" match:" + fragment[i : i + len(primer)])
                        print("primer:" + primer + " Tm:" + str(round(melt, 1)))
                if melt > Tm_rem:
                    non.append(True)
            except ValueError as valerr:
                # use primer3 instead, as mt.DNA_NN2 table does not have enough information to compute Tm
                result = primer3.calcHeterodimer(str(primer[first:]), 
                                                 str(fragment[i + first : i + len(primer)].complement())
                                                )
                melt = result.tm
                if verbose == True:
                    if melt > Tm_verb-primer3_shift:
                        print("Found non-specific match using Primer3 at " + str(i + 1) + "bp:")
                        print(" match:" + fragment[i : i + len(primer)])
                        print("primer:" + primer + " Tm:" + str(round(melt, 1)))
                if melt > Tm_rem-primer3_shift:
                    non.append(True)
                    
    # Reverse
    fragment = fragment.reverse_complement()
    for i in range(len(fragment) - len(primer)):
        match = [
            primer[j].lower() == fragment[i + j].lower() for j in range(len(primer))
        ]
        first = 10
        for k in range(0, len(match) - 3, 1):
            if match[k] and match[k + 1] and match[k + 3]:
                first = k
                break
        if (
            sum(match[first:]) > len(primer[first:]) * 0.8
            and sum(match[first:]) > 6
            and match[-1]
        ):  # string compare - sum of matched nt is greater than 80%
            try:
                melt = mt.Tm_NN(
                    primer[first:],
                    c_seq=fragment[i + first : i + len(primer)].complement(),
                    nn_table=mt.DNA_NN2,
                    de_table=mt.DNA_DE1,
                    imm_table=mt.DNA_IMM1,
                )
                if verbose == True:
                    if melt > Tm_verb:
                        print("Found non-specific match at " + str(i + 1) + "bp:")
                        print(" match:" + fragment[i : i + len(primer)])
                        print("primer:" + primer + " Tm:" + str(melt))
                if melt > Tm_rem:
                    non.append(True)
            except ValueError as valerr:
                # use primer3 instead, as mt.DNA_NN2 table does not have enough information to compute Tm
                result = primer3.calcHeterodimer(str(primer[first:]), 
                                                 str(fragment[i + first : i + len(primer)].complement())
                                                )
                melt = result.tm
                if verbose == True:
                    if melt > Tm_verb-primer3_shift:
                        print("Found non-specific match using Primer3 at " + str(i + 1) + "bp:")
                        print(" match:" + fragment[i : i + len(primer)])
                        print("primer:" + primer + " Tm:" + str(round(melt, 1)))
                if melt > Tm_rem-primer3_shift:
                    non.append(True)
    return sum(non)


In [11]:
# Use refseq MANE annotations to find gene cDNA sequences
refseq_MANE_summary_filepath = './MANE.GRCh38.v1.3.summary.txt'
refseq_MANE_filepath = "./MANE.GRCh38.v1.3.refseq_rna.gbff"
refseq_MANE_summary = pd.read_csv(refseq_MANE_summary_filepath, sep='\t')
refseq_MANE_stream = SeqIO.parse(refseq_MANE_filepath, "genbank")

# genes for assay
genes_of_interest = \
['ARF1',
 'RHEB',
 'RAP1A',
 'NRAS',
 'CDC42',
 'RAC1',
 'RHOA',
 'RAB1A',
 'RALA',
 'RAN',
 'RIT1',
 'EPM2A',
 'NHLRC1',
 'TANGO2',
 'GATM',
 'SMS',
 'SUFU',
 'ETV6',
 'RUNX1',
 'GATA2',
 'CEBPA',
 'TRIO',
 'GRIN2A',
 'GRIA3',
 'SLC6A8',
 'RAB7A',
 'RAB11B',
 'RAB18',
 'RAB23',
 'RAB27A',
 'AKT1',
 'FOXP3',
 'STK11',
 'PRKAG2',
 'BBS1',
 'SMAD4',
 'STAT3',
 'BCL10',
 'HPS1',
 'PIK3CA', 
]

# find correct transcripts
genes_to_MANE_transcripts = refseq_MANE_summary.\
    query('MANE_status == "MANE Select"').\
    query('symbol in @genes_of_interest')

# find refseq
gene_transcript_dict = {}
for rec in refseq_MANE_stream:
    if rec.id in genes_to_MANE_transcripts['RefSeq_nuc'].values:
        print('Processing ' + rec.description)
        for feature in rec.features:
            if feature.type == "CDS":
                gene_transcript_dict[rec.id] = [rec.description,
                                                str(feature.location.extract(rec).seq)]
genes_to_MANE_transcripts = genes_to_MANE_transcripts.\
    merge(pd.DataFrame.from_dict(gene_transcript_dict, orient='index', columns=['Description', 'CDS_Seq']),
             left_on='RefSeq_nuc', right_index=True)


Processing Homo sapiens HPS1 biogenesis of lysosomal organelles complex 3 subunit 1 (HPS1), transcript variant 1, mRNA
Processing Homo sapiens serine/threonine kinase 11 (STK11), transcript variant 1, mRNA
Processing Homo sapiens glutamate ionotropic receptor NMDA type subunit 2A (GRIN2A), transcript variant 1, mRNA
Processing Homo sapiens AKT serine/threonine kinase 1 (AKT1), transcript variant 4, mRNA
Processing Homo sapiens glycine amidinotransferase (GATM), transcript variant 1, mRNA; nuclear gene for mitochondrial product
Processing Homo sapiens ADP ribosylation factor 1 (ARF1), transcript variant 4, mRNA
Processing Homo sapiens ras homolog family member A (RHOA), transcript variant 1, mRNA
Processing Homo sapiens RUNX family transcription factor 1 (RUNX1), transcript variant 1, mRNA
Processing Homo sapiens cell division cycle 42 (CDC42), transcript variant 1, mRNA
Processing Homo sapiens ETS variant transcription factor 6 (ETV6), transcript variant 1, mRNA
Processing Homo sapiens

In [12]:
for gene in genes_of_interest:
    print(gene)
    gene_cds_seq = Seq(genes_to_MANE_transcripts[genes_to_MANE_transcripts['symbol']==gene]['CDS_Seq'].values[0].upper())
    print('CDS: ' + str(gene_cds_seq))
    print('AA: ' + str(gene_cds_seq.translate()))


ARF1
CDS: ATGGGGAACATCTTCGCCAACCTCTTCAAGGGCCTTTTTGGCAAAAAAGAAATGCGCATCCTCATGGTGGGCCTGGATGCTGCAGGGAAGACCACGATCCTCTACAAGCTTAAGCTGGGTGAGATCGTGACCACCATTCCCACCATAGGCTTCAACGTGGAAACCGTGGAGTACAAGAACATCAGCTTCACTGTGTGGGACGTGGGTGGCCAGGACAAGATCCGGCCCCTGTGGCGCCACTACTTCCAGAACACACAAGGCCTGATCTTCGTGGTGGACAGCAATGACAGAGAGCGTGTGAACGAGGCCCGTGAGGAGCTCATGAGGATGCTGGCCGAGGACGAGCTCCGGGATGCTGTCCTCCTGGTGTTCGCCAACAAGCAGGACCTCCCCAACGCCATGAATGCGGCCGAGATCACAGACAAGCTGGGGCTGCACTCACTACGCCACAGGAACTGGTACATTCAGGCCACCTGCGCCACCAGCGGCGACGGGCTCTATGAAGGACTGGACTGGCTGTCCAATCAGCTCCGGAACCAGAAGTGA
AA: MGNIFANLFKGLFGKKEMRILMVGLDAAGKTTILYKLKLGEIVTTIPTIGFNVETVEYKNISFTVWDVGGQDKIRPLWRHYFQNTQGLIFVVDSNDRERVNEAREELMRMLAEDELRDAVLLVFANKQDLPNAMNAAEITDKLGLHSLRHRNWYIQATCATSGDGLYEGLDWLSNQLRNQK*
RHEB
CDS: ATGCCGCAGTCCAAGTCCCGGAAGATCGCGATCCTGGGCTACCGGTCTGTGGGGAAATCCTCATTGACGATTCAATTTGTTGAAGGCCAATTTGTGGACTCCTACGATCCAACCATAGAAAACACTTTTACAAAGTTGATCACAGTAAATGGACAAGAATATCATCTTCAACTTGTAGACACAGCCGGGCAAGATGAATATTCTATCTTTCCTCAGACATACTCCATAGATATTAATGGCTATATT

In [13]:
# Define genes
# Lib2 proteins:
# ctags:
# ARF1 (ctag)
ARF1 = Seq('ATGGGGAACATCTTCGCCAACCTCTTCAAGGGCCTTTTTGGCAAAAAAGAAATGCGCATCCTCATGGTGGGCCTGGATGCTGCAGGGAAAACCACGATCCTCTACAAGCTTAAGCTGGGTGAGATCGTGACCACCATTCCCACCATAGGCTTCAACGTGGAAACCGTGGAGTACAAGAACATCAGCTTCACTGTGTGGGACGTGGGTGGCCAGGACAAGATCCGGCCCCTGTGGCGCCACTACTTCCAGAACACACAAGGCCTGATCTTCGTGGTGGACAGCAATGACAGAGAGCGTGTGAACGAGGCCCGTGAGGAGCTCATGAGGATGCTGGCCGAGGACGAGCTCCGGGATGCTGTCCTCCTGGTGTTCGCCAACAAGCAGGACCTCCCCAACGCCATGAATGCGGCCGAGATCACAGACAAGCTGGGGCTGCACTCACTACGCCACAGGAACTGGTACATTCAGGCCACCTGTGCCACCAGCGGCGACGGGCTCTATGAAGGACTGGACTGGCTGTCCAATCAGCTCCGGAACCAGAAG'.upper())

# EPM2A (ctag)
EPM2A = Seq('ATGCGCTTCCGCTTTGGGGTGGTGGTGCCACCCGCCGTGGCCGGCGCCCGGCCGGAGCTGCTGGTGGTGGGGTCGCGGCCCGAGCTGGGGCGTTGGGAACCAAGAGGAGCTGTACGACTTCGTCCTGCTGGGACTGCTGCTGGTGACGGGGCCCTGGCCCTGCAGGAGCCGGGCCTGTGGCTCGGGGAGGTGGAGCTGGCGGCCGAGGAGGCGGCGCAGGACGGGGCGGAGCCGGGCCGCGTGGACACGTTCTGGTACAAGTTCCTGAAGCGGGAGCCGGGAGGAGAGCTCTCCTGGGAAGGCAATGGACCTCATCATGACCGTTGCTGTACTTACAATGAAAACAACTTGGTGGATGGTGTGTATTGTCTCCCAATAGGACACTGGATTGAGGCCACTGGGCACACCAATGAAATGAAGCACACAACAGACTTCTATTTTAATATTGCAGGCCACCAAGCCATGCATTATTCAAGAATTCTACCAAATATCTGGCTGGGTAGCTGCCCTCGTCAGGTGGAACATGTAACCATCAAACTGAAGCATGAATTGGGGATTACAGCTGTAATGAATTTCCAGACTGAATGGGATATTGTACAGAATTCCTCAGGCTGTAACCGCTACCCAGAGCCCATGACTCCAGACACTATGATTAAACTATATAGGGAAGAAGGCTTGGCCTACATCTGGATGCCAACACCAGATATGAGCACCGAAGGCCGAGTACAGATGCTGCCCCAGGCGGTGTGCCTGCTGCATGCGCTGCTGGAGAAGGGACACATCGTGTACGTGCACTGCAACGCTGGGGTGGGCCGCTCCACCGCGGCTGTCTGCGGCTGGCTCCAGTATGTGATGGGCTGGAATCTGAGGAAGGTGCAGTATTTCCTCATGGCCAAGAGGCCGGCTGTCTACATTGACGAAGAGGCCTTGGCCCGGGCACAAGAAGATTTTTTCCAGAAATTTGGGAAGGTTCGTTCTTCTGTGTGTAGCCTG'.upper())

# TANGO2 (ctag)
TANGO2 = Seq('ATGTGCATCATCTTCTTTAAGTTTGATCCTCGCCCTGTTTCCAAAAACGCGTACAGGCTCATCTTGGCAGCCAACAGGGATGAATTCTACAGCCGACCCTCCAAGTTAGCTGACTTCTGGGGGAACAACAACGAGATCCTCAGTGGGCTGGACATGGAGGAAGGCAAGGAAGGAGGCACATGGCTGGGCATCAGCACACGTGGCAAGCTGGCAGCACTCACCAACTACCTGCAGCCGCAGCTGGACTGGCAGGCCCGAGGGCGAGGTGAACTTGTCACCCACTTTCTGACCACTGACGTGGACAGCTTGTCCTACCTGAAGAAGGTGTCTATGGAGGGCCATCTGTACAATGGCTTCAACCTCATAGCAGCCGACCTGAGCACAGCAAAGGGAGATGTCATTTGCTACTATGGGAACCGAGGGGAGCCTGATCCTATCGTTTTGACGCCAGGCACCTACGGGCTGAGCAACGCGCTGCTGGAGACTCCCTGGAGGAAGCTGTGCTTTGGGAAGCAGCTGTTCCTGGAGGCTGTGGAACGGAGCCAGGCGCTGCCCAAGGATGTGCTCATCGCCAGCCTCCTGGATGTGCTCAACAATGAAGAGGCGCAGCTGCCAGACCCGGCCATCGAGGACCAGGGTGGGGAGTACGTGCAGCCCATGCTGAGCAAGTACGCGGCTGTGTGCGTGCGCTGCCCTGGCTACGGCACCAGAACCAACACTATCATCCTGGTAGATGCGGACGGCCACGTGACCTTCACTGAGCGTAGCATGATGGACAAGGACCTCTCCCACTGGGAAACCAGAACCTATGAGTTCACACTGCAGAGC'.upper())

# GATM (ctag)
#GATM = Seq('ATGTTGAGAGTAAGATGCCTCAGAGGTGGTTCCAGAGGTGCAGAGGCCGTACATTACATCGGATCTCGGCTTGGACGAACCTTGACAGGATGGGTGCAGCGAACTTTCCAGAGCACCCAGGCAGCTACGGCTTCCTCCCGGAACTCCTGTGCAGCTGACGACAAAGCCACTGAGCCTCTGCCCAAGGACTGCCCTGTCTCTTCTTACAACGAATGGGACCCCTTAGAGGAAGTGATAGTGGGCAGAGCAGAAAACGCCTGTGTTCCACCGTTCACCATCGAGGTGAAGGCCAACACATATGAAAAGTACTGGCCATTTTACCAGAAGCAAGGAGGGCATTATTTTCCCAAAGATCATTTGAAAAAGGCTGTTGCTGAAATTGAAGAAATGTGCAATATTTTAAAAACGGAAGGAGTGACAGTAAGGAGGCCTGACCCCATTGACTGGTCATTGAAGTATAAAACTCCTGATTTTGAGTCTACGGGTTTATACAGTGCAATGCCTCGAGACATCCTGATAGTTGTGGGCAATGAGATTATCGAGGCTCCCATGGCATGGCGTTCACGCTTCTTTGAGTACCGAGCGTACAGGTCAATTATCAAAGACTACTTCCACCGTGGCGCCAAGTGGACAACAGCTCCTAAGCCCACAATGGCTGATGAGCTTTATAACCAGGATTATCCCATCCACTCTGTAGAAGATAGACACAAATTGGCTGCTCAGGGAAAATTTGTGACAACTGAGTTTGAGCCATGCTTTGATGCTGCTGACTTCATTCGAGCTGGAAGAGATATTTTTGCACAGAGAAGCCAGGTTACAAACTACCTAGGCATTGAATGGATGCGTAGGCATCTTGCTCCAGACTACAGAGTGCATATCATCTCCTTTAAAGATCCCAATCCCATGCATATTGATGCTACCTTCAACATCATTGGACCTGGTATTGTGCTTTCCAACCCTGACCGACCATGTCACCAGATTGATCTTTTCAAGAAAGCAGGATGGACTATCATTACTCCTCCAACACCAATCATCCCAGACGATCATCCACTCTGGATGTCATCCAAATGGCTTTCCATGAATGTCTTAATGCTAGATGAAAAACGTGTTATGGTGGATGCCAATGAAGTTCCAATTCAAAAGATGTTTGAAAAGCTGGGTATCACTACCATTAAAGTTAACATTCGTAATGCCAATTCCCTGGGAGGAGGCTTCCATTGCTGGACCTGCGATGTCCGGCGCCGAGGCACCTTACAGTCCTACTTGGAC'.upper())

# SUFU (ctag)
SUFU = Seq('ATGGCGGAGCTGCGGCCTTCTGGTGCCCCTGGACCTACTGCCCCACCCGCTCCAGGTCCAACAGCTCCACCGGCCTTCGCTTCGCTCTTTCCCCCGGGACTGCACGCCATCTACGGAGAGTGCCGCCGCCTTTACCCTGACCAGCCGAACCCGCTCCAGGTTACCGCTATCGTCAAGTACTGGTTGGGTGGCCCAGACCCCTTGGACTATGTTAGCATGTACAGGAATGTGGGGAGCCCTTCTGCTAACATCCCCGAGCACTGGCACTACATCAGCTTCGGCCTGAGTGATCTCTATGGTGACAACAGAGTCCATGAGTTTACAGGAACAGATGGACCTAGTGGTTTTGGCTTTGAGTTGACCTTTCGTCTGAAGAGAGAAACTGGGGAGTCTGCCCCACCAACATGGCCCGCAGAGTTAATGCAGGGCTTGGCACGATACGTGTTCCAGTCAGAGAACACCTTCTGCAGTGGGGACCATGTGTCCTGGCACAGCCCTTTGGATAACAGTGAGTCAAGAATTCAGCACATGCTGCTGACAGAGGACCCACAGATGCAGCCCGTGCAGACACCCTTTGGGGTAGTTACCTTCCTCCAGATCGTTGGTGTCTGCACTGAGGAACTACACTCAGCCCAGCAGTGGAACGGGCAGGGCATCCTGGAGCTGCTGCGGACAGTGCCTATTGCTGGCGGCCCCTGGCTGATAACTGACATGCGGAGGGGAGAAACCATATTTGAGATCGATCCACACCTCCAAGAGAGAGTTGACAAAGGCATCGAGACAGATGGCTCCAACCTGAGTGGTGTCAGTGCCAAGTGTGCCTGGGATGACCTGAGCCGGCCCCCCGAGGATGACGAGGACAGCCGGAGCATCTGCATCGGCACACAGCCCCGGCGACTCTCTGGCAAAGACACAGAGCAGATCCGGGAAACCCTGAGGAGAGGACTCGAGATCAACAGCAAACCTGTCCTTCCACCAATCAACCCTCAGCGGCAGAATGGCCTCGCCCACGACCGGGCCCCGAGCCGCAAAGACAGCCTGGAAAGTGACAGCTCCACGGCCATCATTCCCCATGAGCTGATTCGCACGCGGCAGCTTGAGAGCGTACATCTGAAATTCAACCAGGAGTCCGGAGCCCTCATTCCTCTCTGCCTAAGGGGCAGGCTCCTGCATGGACGGCACTTTACATATAAAAGTATCACAGGTGACATGGCCATCACGTTTGTCTCCACGGGAGTGGAAGGCGCCTTTGCCACTGAGGAGCATCCTTACGCGGCTCATGGACCCTGGTTACAAATTCTGTTGACCGAAGAGTTTGTAGAGAAAATGTTGGAGGATTTAGAAGATTTGACTTCTCCAGAGGAATTCAAACTTCCCAAAGAGTACAGCTGGCCTGAAAAGAAGCTGAAGGTGTCCATCCTGCCTGACGTGGTGTTCGACAGTCCGCTACAC'.upper())

# BCL10 (ctag)
BCL10 = Seq('ATGGAGCCCACCGCACCGTCCCTCACCGAGGAGGACCTCACTGAAGTGAAGAAGGACGCCTTAGAAAATTTACGTGTATACCTGTGTGAGAAAATCATAGCTGAGAGACATTTTGATCATCTACGTGCAAAAAAAATACTCAGTAGAGAAGACACTGAAGAAATTTCTTGTCGAACATCAAGTAGAAAAAGGGCTGGAAAATTGTTAGACTACTTACAGGAAAACCCAAAAGGTCTGGACACCCTTGTTGAATCTATTCGGCGAGAAAAAACACAGAACTTCCTGATACAGAAGATTACAGATGAAGTGCTGAAACTTAGAAATATAAAACTAGAACATCTGAAAGGACTAAAATGTAGCAGTTGTGAACCTTTTCCAGATGGAGCCACGAACAACCTCTCCAGATCAAATTCAGATGAGAGTAATTTCTCTGAAAAACTGAGGGCATCCACTGTCATGTACCATCCAGAAGGAGAATCCAGCACGACGCCCTTTTTTTCTACTAATTCTTCTCTGAATTTGCCTGTTCTAGAAGTAGGCAGAACTGAAAATACCATCTTCTCTTCAACTACACTTCCCAGACCTGGGGACCCAGGGGCTCCTCCTTTGCCACCAGATCTACAGTTAGAAGAAGAAGGAACTTGTGCAAACTCTAGTGAGATGTTTCTTCCCTTAAGATCACGTACTGTTTCACGACAA'.upper())

# STK11 (ctag)
STK11 = Seq('ATGGAGGTGGTGGACCCGCAGCAGCTGGGCATGTTCACGGAGGGCGAGCTGATGTCGGTGGGTATGGACACGTTCATCCACCGCATCGACTCCACCGAGGTCATCTACCAGCCGCGCCGCAAGCGGGCCAAGCTCATCGGCAAGTACCTGATGGGGGACCTGCTGGGGGAAGGCTCTTACGGCAAGGTGAAGGAGGTGCTGGACTCGGAAACGCTGTGCAGGAGGGCCGTCAAGATCCTCAAGAAGAAGAAGTTGCGAAGGATCCCCAACGGGGAGGCCAACGTGAAGAAGGAAATTCAACTACTGAGGAGGTTACGGCACAAAAATGTCATCCAGCTGGTGGATGTGTTATACAACGAAGAGAAGCAGAAAATGTATATGGTGATGGAGTACTGCGTGTGTGGCATGCAGGAAATGCTGGACAGCGTGCCGGAGAAGCGTTTCCCAGTGTGCCAGGCCCACGGGTACTTCTGTCAGCTGATTGACGGCCTGGAGTACCTGCATAGCCAGGGCATTGTGCACAAGGACATCAAGCCGGGGAACCTGCTGCTCACCACCGGTGGCACCCTCAAAATCTCCGACCTGGGCGTGGCCGAGGCACTGCACCCGTTCGCGGCGGACGACACATGCCGGACCAGCCAGGGCTCCCCGGCTTTCCAGCCGCCCGAGATTGCCAACGGCCTGGACACCTTCTCCGGCTTCAAGGTGGACATCTGGTCGGCTGGGGTCACCCTCTACAACATCACCACGGGTCTGTACCCCTTCGAAGGGGACAACATCTACAAGTTGTTTGAGAACATCGGGAAGGGGAGCTACGCCATCCCGGGCGACTGTGGCCCCCCGCTCTCTGACCTGCTGAAAGGGATGCTTGAGTACGAACCGGCCAAGAGGTTCTCCATCCGGCAGATCCGGCAGCACAGCTGGTTCCGGAAGAAACATCCTCCGGCTGAAGCACCAGTGCCCATCCCACCGAGCCCAGACACCAAGGACCGGTGGCGCAGCATGACTGTGGTGCCGTACTTGGAGGACCTGCACGGCGCGGACGAGGACGAGGACCTCTTCGACATCGAGGATGACATCATCTACACTCAGGACTTCACGGTGCCCGGACAGGTCCCAGAAGAGGAGGCCAGTCACAATGGACAGCGCCGGGGCCTCCCCAAGGCCGTGTGTATGAACGGCACAGAGGCGGCGCAGCTGAGCACCAAATCCAGGGCGGAGGGCCGGGCCCCCAACCCTGCCCGCAAGGCCTGCTCCGCCAGCAGCAAGATCCGCCGGCTGTCGGCCTGCAAGCAGCAG'.upper())

# PRKAG2 (ctag)
PRKAG2 = Seq('ATGGGAAGCGCGGTTATGGACACCAAGAAGAAAAAAGATGTTTCCAGCCCCGGCGGGAGCGGCGGCAAGAAAAATGCCAGCCAGAAGAGGCGTTCGCTGCGCGTGCACATTCCGGACCTGAGCTCCTTCGCCATGCCGCTCCTGGACGGCGACCTGGAGGGTTCCGGAAAGCATTCCTCTCGAAAGGTGGACAGCCCCTTCGGCCCGGGCAGCCCCTCCAAAGGGTTCTTCTCCAGAGGCCCCCAGCCCCGGCCCTCCAGCCCCATGTCTGCACCTGTGAGGCCCAAGACCAGCCCCGGCTCTCCCAAAACCGTGTTCCCGTTCTCCTACCAGGAGTCCCCGCCACGCTCCCCTCGACGCATGAGCTTCAGTGGGATCTTCCGCTCCTCCTCCAAAGAGTCTTCCCCCAACTCCAACCCTGCTACCTCGCCCGGGGGCATCAGGTTTTTCTCCCGCTCCAGAAAAACCTCCGGCCTCTCCTCCTCTCCGTCAACACCCACCCAAGTGACCAAGCAGCACACGTTTCCCCTGGAATCCTATAAGCACGAGCCTGAACGGTTAGAGAATCGCATCTATGCCTCGTCTTCCCCCCCGGACACAGGGCAGAGGTTCTGCCCGTCTTCCTTCCAGAGCCCGACCAGGCCTCCACTGGCATCACCGACACACTATGCTCCCTCCAAAGCCGCGGCGCTGGCGGCGGCCCTGGGACCCGCGGAAGCCGGCATGCTGGAGAAGCTGGAGTTCGAGGACGAAGCAGTAGAAGACTCAGAAAGTGGTGTTTACATGCGATTCATGAGGTCACACAAGTGTTATGACATCGTTCCAACCAGTTCAAAGCTTGTTGTCTTTGATACTACATTACAAGTTAAAAAGGCCTTCTTTGCTTTGGTAGCCAACGGTGTCCGAGCAGCGCCACTGTGGGAGAGTAAAAAACAAAGTTTTGTAGGAATGCTAACAATTACAGATTTCATAAATATACTACATAGATACTATAAATCACCTATGGTACAGATTTATGAATTAGAGGAACATAAAATTGAAACATGGAGGGAGCTTTATTTACAAGAAACATTTAAGCCTTTAGTGAATATATCTCCAGATGCAAGCCTCTTCGATGCTGTATACTCCTTGATCAAAAATAAAATCCACAGATTGCCCGTTATTGACCCTATCAGTGGGAATGCACTTTATATACTTACCCACAAAAGAATCCTCAAGTTCCTCCAGCTTTTTATGTCTGATATGCCAAAGCCTGCCTTCATGAAGCAGAACCTGGATGAGCTTGGAATAGGAACGTACCACAACATTGCCTTCATACATCCAGACACTCCCATCATCAAAGCCTTGAACATATTTGTGGAAAGACGAATATCAGCTCTGCCTGTTGTGGATGAGTCAGGAAAAGTTGTAGATATTTATTCCAAATTTGATGTAATTAATCTTGCTGCTGAGAAAACATACAATAACCTAGATATCACGGTGACCCAGGCCCTTCAGCACCGTTCACAGTATTTTGAAGGTGTTGTGAAGTGCAATAAGCTGGAAATACTGGAGACAATCGTGGACAGAATAGTAAGAGCTGAGGTCCATCGGCTGGTGGTGGTAAATGAAGCAGATAGTATTGTGGGTATTATTTCCCTGTCGGACATTCTGCAAGCCCTGATCCTCACACCAGCCGGTGCCAAACAAAAGGAGACAGAAACGGAG'.upper())

# CEBPA (ctag)
CEBPA = Seq('ATGGAGTCGGCCGACTTCTACGAGGCGGAGCCGCGGCCCCCGATGAGCAGCCACCTCCAGAGCCCCCCGCACGCGCCCAGCAGCGCCGCCTTCGGCTTTCCTAGAGGAGCTGGACCAGCTCAACCACCAGCTCCACCAGCAGCTCCAGAACCGCTGGGCGGCATCTGCGAGCACGAAACGTCCATCGACATCAGCGCCTACATCGACCCGGCCGCCTTCAACGACGAGTTCCTGGCCGACCTGTTCCAGCACAGCCGGCAGCAGGAGAAGGCCAAGGCGGCCGTGGGCCCCACGGGTGGTGGAGGAGGTGGTGACTTTGACTACCCTGGAGCTCCAGCTGGACCAGGTGGTGCTGTTATGCCAGGTGGAGCACACGGGCCCCCGCCCGGCTACGGCTGCGCGGCCGCCGGCTACCTGGACGGCAGGCTGGAGCCCCTGTACGAGCGCGTCGGGGCGCCGGCGCTGCGGCCGCTGGTGATCAAGCAGGAGCCCCGCGAGGAGGATGAAGCCAAGCAGCTGGCGCTGGCCGGCCTCTTCCCTTACCAGCCTCCTCCACCACCACCACCCTCGCATCCACATCCACATCCACCACCCGCGCACCTGGCCGCCCCGCACCTCCAGTTCCAGATCGCGCACTGCGGCCAGACCACCATGCACCTCCAGCCCGGTCATCCAACACCACCACCTACACCTGTTCCATCACCACATCCTGCACCAGCTTTAGGTGCAGCTGGATTACCAGGTCCTGGATCTGCTTTAAAAGGATTAGGTGCAGCACACCCCGACCTCCGCGCGAGTGGCGGCAGCGGCGCGGGCAAGGCCAAGAAGTCGGTGGACAAGAACAGCAACGAGTACCGGGTGCGGCGCGAGCGCAACAACATCGCGGTGCGCAAGAGCCGCGACAAGGCCAAGCAGCGCAACGTGGAGACGCAGCAGAAGGTGCTGGAGCTGACCAGTGACAATGACCGCCTGCGCAAGCGGGTGGAACAGCTGAGCCGCGAACTGGACACGCTGCGGGGCATCTTCCGCCAGCTGCCAGAGAGCTCCTTGGTCAAGGCCATGGGCAACTGCGCG'.upper())

# AKT1 (ctag)
AKT1 = Seq('ATGAGCGACGTGGCTATTGTGAAGGAGGGTTGGCTGCACAAACGAGGGGAGTACATCAAGACCTGGCGGCCACGCTACTTCCTCCTCAAGAATGATGGCACCTTCATTGGCTACAAGGAGCGGCCGCAGGATGTGGACCAACGTGAGGCTCCCCTCAACAACTTCTCTGTGGCGCAGTGCCAGCTGATGAAGACGGAGCGGCCCCGGCCCAACACCTTCATCATCCGCTGCCTGCAGTGGACCACTGTCATCGAACGCACCTTCCATGTGGAGACTCCTGAGGAGCGGGAGGAGTGGACAACCGCCATCCAGACTGTGGCTGACGGCCTCAAGAAGCAGGAGGAGGAGGAGATGGACTTCCGGTCGGGCTCACCCAGTGACAACTCAGGGGCTGAAGAGATGGAGGTGTCCCTGGCCAAGCCCAAGCACCGCGTGACCATGAACGAGTTTGAGTACCTGAAGCTGCTGGGCAAGGGCACTTTCGGCAAGGTGATCCTGGTGAAGGAGAAGGCCACAGGCCGCTACTACGCCATGAAGATCCTCAAGAAGGAAGTCATCGTGGCCAAGGACGAGGTGGCCCACACACTCACCGAGAACCGCGTCCTGCAGAACTCCAGGCACCCCTTCCTCACAGCCCTGAAGTACTCTTTCCAGACCCACGACCGCCTCTGCTTTGTCATGGAGTACGCCAACGGGGGCGAGCTGTTCTTCCACCTGTCCCGGGAGCGTGTGTTCTCCGAGGACCGGGCCCGCTTCTATGGCGCTGAGATTGTGTCAGCCCTGGACTACCTGCACTCGGAGAAGAACGTGGTGTACCGGGACCTCAAGCTGGAGAACCTCATGCTGGACAAGGACGGGCACATTAAGATCACAGACTTCGGGCTGTGCAAGGAGGGGATCAAGGACGGTGCCACCATGAAGACCTTTTGCGGCACACCTGAGTACCTGGCCCCCGAGGTGCTGGAGGACAATGACTACGGCCGTGCAGTGGACTGGTGGGGGCTGGGCGTGGTCATGTACGAGATGATGTGCGGTCGCCTGCCCTTCTACAACCAGGACCATGAGAAGCTTTTTGAGCTCATCCTCATGGAGGAGATCCGCTTCCCGCGCACGCTTGGTCCCGAGGCCAAGTCCTTGCTTTCAGGGCTGCTCAAGAAGGACCCCAAGCAGAGGCTTGGCGGGGGCTCCGAGGACGCCAAGGAGATCATGCAGCATCGCTTCTTTGCCGGTATCGTGTGGCAGCACGTGTACGAGAAGAAGCTCAGCCCACCCTTCAAGCCCCAGGTCACGTCGGAGACTGACACCAGGTATTTTGATGAGGAGTTCACGGCCCAGATGATCACCATCACACCACCTGACCAAGATGACAGCATGGAGTGTGTGGACAGCGAGCGCAGGCCCCACTTCCCCCAGTTCTCCTACTCGGCCAGCGGCACGGCC'.upper())

# ntags:
# RHEB (ntag)
RHEB = Seq('ATGCCGCAGTCCAAGTCCCGGAAGATCGCGATCCTGGGCTACCGGTCTGTGGGGAAATCCTCATTGACGATTCAATTTGTTGAAGGCCAATTTGTGGACTCCTACGATCCAACCATAGAAAACACTTTTACAAAGTTGATCACAGTAAATGGACAAGAATATCATCTTCAACTTGTAGACACAGCCGGGCAAGATGAATATTCTATCTTTCCTCAGACATACTCCATAGATATTAATGGCTATATTCTTGTGTATTCTGTTACATCAATCAAAAGTTTTGAAGTGATTAAAGTTATCCATGGCAAATTGTTGGATATGGTGGGGAAAGTACAAATACCTATTATGTTGGTTGGGAATAAGAAAGACCTGCATATGGAAAGGGTGATCAGTTATGAAGAAGGGAAAGCTTTGGCAGAATCTTGGAATGCAGCTTTTTTGGAATCTTCTGCTAAAGAAAATCAGACTGCTGTGGATGTTTTTCGAAGGATAATTTTGGAGGCAGAAAAAATGGACGGGGCAGCTTCACAAGGCAAGTCCTCATGCTCGGTGATG'.upper())

# RAP1A (ntag)
RAP1A = Seq('ATGCGTGAGTACAAGCTAGTGGTCCTTGGTTCAGGAGGCGTTGGGAAGTCTGCTCTGACAGTTCAGTTTGTTCAGGGAATTTTTGTTGAAAAATATGACCCAACGATAGAAGATTCCTACAGAAAGCAAGTTGAAGTCGATTGCCAACAGTGTATGCTCGAAATCCTGGATACTGCAGGGACAGAGCAATTTACAGCAATGAGGGATTTGTATATGAAGAACGGCCAAGGTTTTGCACTAGTATATTCTATTACAGCTCAGTCCACGTTTAACGACTTACAGGACCTGAGGGAACAGATTTTACGGGTTAAGGACACGGAAGATGTTCCAATGATTTTGGTTGGCAATAAATGTGACCTGGAAGATGAGCGAGTAGTTGGCAAAGAGCAGGGCCAGAATTTAGCAAGACAGTGGTGTAACTGTGCCTTTTTAGAATCTTCTGCAAAGTCAAAGATCAATGTTAATGAGATATTTTATGACCTGGTCAGACAGATAAATAGGAAAACACCAGTGGAAAAGAAGAAGCCTAAAAAGAAATCATGTCTGCTGCTC'.upper())

# NRAS (ntag)
NRAS = Seq('ATGACTGAGTACAAACTGGTGGTGGTTGGAGCCGGTGGTGTTGGGAAAAGCGCACTGACAATCCAGCTAATCCAGAACCACTTTGTAGATGAATATGATCCCACCATAGAGGATTCTTACAGAAAACAAGTGGTTATAGATGGTGAAACCTGTTTGTTGGACATACTGGATACAGCTGGACAAGAAGAGTACAGTGCCATGAGGGACCAATACATGAGGACAGGCGAAGGCTTCCTCTGTGTATTTGCCATCAATAATAGCAAGTCATTTGCGGATATTAACCTCTACAGGGAGCAGATTAAGCGAGTAAAAGACTCGGATGATGTACCTATGGTGCTAGTGGGAAACAAGTGTGATTTGCCAACAAGGACAGTTGATACAAAACAAGCCCACGAACTGGCCAAGAGTTACGGGATTCCATTCATTGAAACCTCAGCCAAGACCAGACAGGGTGTTGAAGATGCTTTTTACACACTGGTAAGAGAAATACGCCAGTACCGAATGAAAAAACTCAACAGCAGTGATGATGGGACTCAGGGTTGTATGGGATTGCCATGTGTGGTGATG'.upper())

# CDC42 (ntag)
CDC42 = Seq('ATGCAGACAATTAAGTGTGTTGTTGTGGGCGATGGTGCTGTTGGTAAAACATGTCTCCTGATATCCTACACAACAAACAAATTTCCATCGGAATATGTACCGACTGTTTTTGACAACTATGCAGTCACAGTTATGATTGGTGGAGAACCATATACTCTTGGACTTTTTGATACTGCAGGGCAAGAGGATTATGACAGATTACGACCGCTGAGTTATCCACAAACAGATGTATTTCTAGTCTGTTTTTCAGTGGTGTCTCCATCTTCATTTGAAAACGTGAAAGAAAAGTGGGTGCCTGAGATAACTCACCACTGTCCAAAGACTCCTTTCTTGCTTGTTGGGACTCAAATTGATCTCAGAGATGACCCCTCTACTATTGAGAAACTTGCCAAGAACAAACAGAAGCCTATCACTCCAGAGACTGCTGAAAAGCTGGCCCGTGACCTGAAGGCTGTCAAGTATGTGGAGTGTTCTGCACTTACACAGAAAGGCCTAAAGAATGTATTTGACGAAGCAATATTGGCTGCCCTGGAGCCTCCAGAACCGAAGAAAAGCCGCAGATGTGTGCTGCTA'.upper())

# Rac1 (ntag)
Rac1 = Seq('ATGCAGGCCATCAAGTGTGTGGTGGTGGGAGATGGAGCTGTAGGTAAAACTTGCCTACTGATCAGTTACACAACCAATGCATTTCCTGGAGAATATATCCCTACTGTCTTTGACAATTATTCTGCCAATGTTATGGTAGATGGAAAACCGGTGAATCTGGGCTTATGGGATACAGCTGGACAAGAAGATTATGACAGATTACGCCCCCTATCCTATCCGCAAACAGATGTGTTCTTAATTTGCTTTTCCCTTGTGAGTCCTGCATCATTTGAAAATGTCCGTGCAAAGTGGTATCCTGAGGTGCGGCACCACTGTCCCAACACTCCCATCATCCTAGTGGGAACTAAACTTGATCTTAGGGATGATAAAGACACGATCGAGAAACTGAAGGAGAAGAAGCTGACTCCCATCACCTATCCGCAGGGTCTAGCCATGGCTAAGGAGATTGGTGCTGTAAAATACCTGGAGTGCTCGGCGCTCACACAGCGAGGCCTCAAGACAGTGTTTGACGAAGCGATCCGAGCAGTCCTCTGCCCGCCTCCCGTGAAGAAGAGGAAGAGAAAATGCCTGCTGTTG'.upper())

# RHOA (ntag)
RHOA = Seq('ATGGCTGCCATCCGGAAGAAACTGGTGATTGTTGGTGATGGAGCCTGTGGAAAGACATGCTTGCTCATAGTGTTCAGCAAGGACCAGTTCCCAGAGGTGTATGTGCCCACAGTGTTTGAGAACTATGTGGCAGATATCGAGGTGGATGGAAAGCAGGTAGAGTTGGCTTTGTGGGACACAGCTGGGCAGGAAGATTATGATCGCCTGAGGCCCCTCTCCTACCCAGATACCGATGTTATACTGATGTGTTTTTCCATCGACAGCCCTGATAGTTTAGAAAACATCCCAGAAAAGTGGACCCCAGAAGTCAAGCATTTCTGTCCCAACGTGCCCATCATCCTGGTTGGGAATAAGAAGGATCTTCGGAATGATGAGCACACAAGGCGGGAGCTGGCCAAGATGAAGCAGGAGCCGGTGAAACCTGAAGAAGGCAGAGATATGGCAAACAGGATTGGCGCTTTTGGGTACATGGAGTGTTCAGCAAAGACCAAAGATGGAGTGAGAGAGGTTTTTGAAATGGCTACGAGAGCTGCTCTGCAAGCTAGACGTGGGAAGAAAAAATCTGGGTGCCTTGTCTTG'.upper())

# RAB1A (ntag)
RAB1A = Seq('ATGTCCAGCATGAATCCCGAATATGATTATTTATTCAAGTTACTTCTGATTGGCGACTCAGGGGTTGGAAAGTCTTGCCTTCTTCTTAGGTTTGCAGATGATACATATACAGAAAGCTACATCAGCACAATTGGTGTGGATTTCAAAATAAGAACTATAGAGTTAGACGGGAAAACAATCAAGCTTCAAATATGGGACACAGCAGGCCAGGAAAGATTTCGAACAATCACCTCCAGTTATTACAGAGGAGCCCATGGCATCATAGTTGTGTATGATGTGACAGATCAGGAGTCCTTCAATAATGTTAAACAGTGGCTGCAGGAAATAGATCGTTATGCCAGTGAAAATGTCAACAAATTGTTGGTAGGGAACAAATGTGATCTGACCACAAAGAAAGTAGTAGACTACACAACAGCGAAGGAATTTGCTGATTCCCTTGGAATTCCGTTTTTGGAAACCAGTGCTAAGAATGCAACGAATGTAGAACAGTCTTTCATGACGATGGCAGCTGAGATTAAAAAGCGAATGGGTCCCGGAGCAACAGCTGGTGGTGCTGAGAAGTCCAATGTTAAAATTCAGAGCACTCCAGTCAAGCAGTCAGGTGGAGGTTGCTGC'.upper())

# RALA (ntag)
RALA = Seq('ATGGCTGCAAATAAGCCCAAGGGTCAGAATTCTTTGGCTTTACACAAAGTCATCATGGTGGGCAGTGGTGGCGTGGGCAAGTCAGCTCTGACTCTACAGTTCATGTACGATGAGTTTGTGGAGGACTATGAGCCTACCAAAGCAGACAGCTATCGGAAGAAGGTAGTGCTAGATGGGGAGGAAGTCCAGATCGATATCTTAGATACAGCTGGGCAGGAGGACTACGCTGCAATTAGAGACAACTACTTCCGAAGTGGGGAGGGGTTCCTCTGTGTTTTCTCTATTACAGAAATGGAATCCTTTGCAGCTACAGCTGACTTCAGGGAGCAGATTTTAAGAGTAAAAGAAGATGAGAATGTTCCATTTCTACTGGTTGGTAACAAATCAGATTTAGAAGATAAAAGACAGGTTTCTGTAGAAGAGGCAAAAAACAGAGCTGAGCAGTGGAATGTTAACTACGTGGAAACATCTGCTAAAACACGAGCTAATGTTGACAAGGTATTTTTTGATTTAATGAGAGAAATTCGAGCGAGAAAGATGGAAGATAGCAAAGAAAAGAATGGAAAAAAGAAGAGGAAAAGTTTAGCCAAGAGAATCAGAGAAAGATGCTGCATTTTA'.upper())

# RAN (ntag)
RAN = Seq('ATGGCTGCGCAGGGAGAGCCCCAGGTCCAGTTCAAACTTGTATTGGTTGGTGATGGTGGTACTGGAAAAACGACCTTCGTGAAACGTCATTTGACTGGTGAATTTGAGAAGAAGTATGTAGCCACCTTGGGTGTTGAGGTTCATCCCCTAGTGTTCCACACCAACAGAGGACCTATTAAGTTCAATGTATGGGACACAGCCGGCCAGGAGAAATTCGGTGGACTGAGAGATGGCTATTATATCCAAGCCCAGTGTGCCATCATAATGTTTGATGTAACATCGAGAGTTACTTACAAGAATGTGCCTAACTGGCATAGAGATCTGGTACGAGTGTGTGAAAACATCCCCATTGTGTTGTGTGGCAACAAAGTGGATATTAAGGACAGGAAAGTGAAGGCGAAATCCATTGTGTTCCACCGAAAGAAGAATCTTCAGTACTACGACATTTCTGCCAAAAGTAACTACAACTTTGAAAAGCCCTTCCTCTGGCTTGCTAGGAAGCTCATTGGCGACCCTAACTTGGAATTTGTTGCCATGCCTGCTCTCGCCCCACCAGAAGTTGTCATGGACCCAGCTTTGGCAGCACAGTATGAGCACGACTTAGAGGTTGCTCAGACAACTGCTCTCCCGGATGAGGATGATGACCTG'.upper())

# RIT1 (ntag)
RIT1 = Seq('ATGGATTCTGGAACTCGCCCAGTTGGTAGCTGCTGTAGCAGCCCCGCTGGGCTCTCACGGGAGTACAAACTAGTGATGCTGGGTGCTGGTGGTGTAGGGAAGAGTGCCATGACCATGCAGTTCATCAGCCACCGATTCCCAGAAGATCATGATCCCACCATTGAAGATGCTTATAAGATCAGGATCCGTATTGATGATGAGCCTGCCAATCTGGACATTTTGGATACAGCTGGACAGGCAGAGTTTACAGCCATGCGGGACCAGTATATGAGGGCAGGAGAAGGGTTTATCATCTGTTACTCTATCACGGATCGTCGAAGTTTCCATGAAGTTCGTGAGTTTAAACAGCTTATTTATCGAGTCCGACGTACTGACGATACACCTGTGGTTCTTGTGGGAAACAAGTCAGACCTCAAACAGCTAAGACAGGTCACCAAGGAAGAAGGATTGGCCTTGGCCCGAGAATTCAGCTGTCCCTTTTTTGAGACATCTGCTGCATACCGCTACTATATTGATGATGTTTTCCATGCCCTTGTACGGGAGATACGTAGGAAAGAAAAGGAGGCAGTACTGGCCATGGAGAAAAAATCTAAGCCCAAAAACAGTGTATGGAAGAGGCTAAAATCACCATTCCGGAAGAAGAAAGATTCAGTAACT'.upper())

# NHLRC1 (ntag)
NHLRC1 = Seq('ATGGCGGCCGAAGCCTCGGAGAGCGGGCCAGCGCTGCATGAGCTCATGCGCGAGGCGGAGATCAGCCTGCTCGAGTGCAAGGTGTGCTTTGAGAAGTTTGGCCACCGGCAGCAGCGGCGCCCGCGCAACCTGTCCTGTGGACATGTTGTGTGTTTGGCATGTGTTGCAGCTTTAGCCCATCCAAGAACACTGGCCCTCGAGTGCCCATTCTGCAGGCGAGCTTGCCGGGGCTGCGACACCAGCGACTGCCTGCCGGTGCTGCACCTCATAGAGCTCCTGGGCTCAGCTTTACGTCAATCTCCTGCTGCTCATAGAGCGGCACCTAGTGCACCTGGAGCCCTCACCTGTCACCACACCTTCGGCGGCTGGGGGACCCTGGTCAACCCCACCGGACTGGCGCTTTGTCCCAAGACGGGGCGTGTCGTGGTGGTGCACGACGGCAGGAGGCGTGTCAAGATTTTTGACTCAGGGGGAGGATGCGCGCATCAGTTTGGAGAGAAGGGGGACGCTGCCCAAGACATTAGGTACCCTGTGGATGTCACCATCACCAACGACTGCCATGTGGTTGTCACTGACGCCGGCGATCGCTCCATCAAAGTGTTTGATTTTTTTGGCCAGATCAAGCTTGTCATTGGAGGCCAATTCTCCTTACCTTGGGGTGTGGAAACCACCCCTCAGAATGGGATTGTGGTAACTGATGCGGAGGCAGGGTCCCTGCACCTCCTGGACGTCGACTTCGCGGAAGGGGTCCTTCGGAGAACTGAAAGGTTGCAAGCTCATCTGTGCAATCCCCGAGGGGTGGCAGTGTCTTGGCTCACCGGGGCCATTGCGGTCCTGGAGCACCCCCTGGCCCTGGGGACTGGGGTTTGCAGCACCAGGGTGAAAGTGTTTAGCTCAAGTATGCAGCTTGTCGGCCAAGTGGATACCTTTGGGCTGAGCCTCTACTTTCCCTCCAAAATAACTGCCTCCGCTGTGACCTTTGATCACCAGGGAAATGTGATTGTTGCAGATACATCTGGTCCAGCTATCCTTTGCTTAGGAAAACCTGAGGAGTTTCCAGTACCGAAGCCCATGGTCACTCATGGTCTTTCGCATCCTGTGGCTCTTACCTTCACCAAGGAGAATTCTCTTCTTGTGCTGGACACAGCATCTCATTCTATAAAAGTCTATAAAGTTGACTGGGGG'.upper())

# SMS (ntag)
SMS = Seq('ATGGCAGCAGCACGGCACAGCACGCTCGACTTCATGCTCGGCGCCAAAGCTGATGGTGAAACCATTCTAAAAGGCCTCCAGTCCATTTTCCAGGAGCAGGGGATGGCGGAGTCGGTGCACACCTGGCAGGACCATGGCTATTTAGCAACCTACACAAACAAGAACGGCAGCTTTGCCAATTTGAGAATTTACCCACATGGATTGGTGTTGCTGGACCTTCAGAGTTATGATGGTGATGCGCAAGGCAAAGAAGAGATCGACAGTATTTTGAACAAAGTAGAGGAAAGAATGAAAGAATTGAGTCAGGACAGTACTGGGCGGGTGAAACGATTACCACCCATAGTGCGAGGAGGAGCCATCGACAGATACTGGCCCACCGCCGACGGGCGCCTGGTTGAATATGACATAGATGAAGTGGTATATGACGAAGATTCACCTTATCAAAATATAAAAATTCTACACTCGAAGCAGTTTGGAAATATTCTCATCCTTAGTGGGGATGTTAATTTGGCAGAGAGTGATTTGGCATATACCCGGGCCATCATGGGCAGTGGCAAAGAAGATTACACTGGCAAAGATGTACTCATTCTGGGAGGTGGAGATGGAGGCATATTGTGTGAAATAGTCAAACTAAAACCAAAGATGGTCACTATGGTAGAGATTGACCAAATGGTGATTGATGGGTGTAAGAAATACATGCGAAAAACGTGTGGCGATGTCTTAGACAATCTTAAAGGAGACTGCTATCAGGTTCTAATAGAAGATTGTATCCCGGTACTGAAGAGGTACGCCAAAGAAGGGAGAGAATTTGATTATGTGATTAATGATTTGACAGCTGTTCCAATCTCCACGTCCCCAGAAGAAGATTCCACATGGGAGTTTCTCAGACTGATTCTTGACCTCTCAATGAAAGTGTTGAAACAGGATGGGAAATATTTTACACAGGGGAACTGTGTCAATCTGACAGAAGCACTGTCGCTCTATGAAGAACAGCTGGGGCGCCTGTATTGTCCTGTGGAATTTTCAAAGGAGATCGTCTGTGTCCCTTCATACTTGGAATTGTGGGTATTTTACACTGTTTGGAAGAAAGCTAAACCC'.upper())

# ETV6 (ntag)
ETV6 = Seq('ATGTCTGAGACTCCTGCTCAGTGTAGCATTAAGCAGGAACGAATTTCATATACACCTCCAGAGAGCCCAGTGCCGAGTTACGCTTCCTCGACGCCACTTCATGTTCCAGTGCCTCGAGCGCTCAGGATGGAGGAGGACTCGATCCGCCTGCCTGCGCACCTCCGCTTGCAGCCAATTTACTGGAGCAGGGATGACGTAGCCCAGTGGCTCAAGTGGGCTGAAAATGAGTTTTCTTTAAGGCCAATTGACAGCAACACGTTTGAAATGAATGGCAAAGCTCTCCTGCTGCTGACCAAAGAGGACTTTCGCTATCGATCTCCTCATTCAGGTGATGTGCTCTATGAACTCCTTCAGCATATTCTGAAGCAGAGGAAACCTCGGATTCTTTTTTCACCATTCTTCCACCCTGGAAACTCTATACACACACAGCCGGAGGTCATACTGCATCAGAACCATGAAGAAGATAACTGTGTCCAGAGGACCCCCAGGCCATCCGTGGATAATGTGCACCATAACCCTCCCACCATTGAACTGTTGCACCGCTCCAGGTCACCTATCACGACAAATCACCGGCCTTCTCCTGACCCCGAGCAGCGGCCCCTCCGGTCCCCCCTGGACAACATGATCCGCCGCCTCTCCCCGGCTGAGAGAGCTCAGGGACCCAGGCCGCACCAGGAGAACAACCACCAGGAGTCCTACCCTCTGTCAGTGTCTCCCATGGAGAATAATCACTGCCCAGCGTCCTCCGAGTCCCACCCGAAGCCATCCAGCCCCCGGCAGGAGAGCACACGCGTGATCCAGCTGATGCCCAGCCCCATCATGCACCCTCTGATCCTGAACCCCCGGCACTCCGTGGATTTCAAACAGTCCAGGCTCTCCGAGGACGGGCTGCATAGGGAAGGGAAGCCCATCAACCTCTCTCATCGGGAAGAACTGGCTTACATGAACCACATCATGGTGTCTGTCTCCCCGCCTGAGGAGCACGCCATGCCCATTGGGAGAATAGCAGACTGTAGACTGCTTTGGGATTACGTCTATCAGTTGCTTTCTGACAGCCGGTACGAAAACTTCATCCGATGGGAGGACAAAGAATCCAAAATATTCCGGATAGTGGATCCCAACGGACTGGCTCGACTGTGGGGAAACCATAAGAACAGAACAAACATGACCTATGAGAAAATGTCCAGAGCCCTGCGCCACTACTACAAACTAAACATTATCAGGAAGGAGCCAGGACAAAGGCTTTTGTTCAGGTTTATGAAAACCCCAGATGAAATCATGAGTGGCCGAACAGACCGTCTGGAGCACCTAGAGTCCCAGGAGCTGGATGAACAAATATACCAAGAAGATGAATGC'.upper())

# RUNX1 (ntag)
RUNX1 = Seq('ATGGCTTCAGACAGCATATTTGAGTCATTTCCTTCGTACCCACAGTGCTTCATGAGAGAATGCATACTTGGAATGAATCCTTCTAGAGATGTCCACGATGCCAGCACGAGCCGCCGCTTCACGCCGCCTTCCACCGCGCTGAGCCCAGGCAAGATGAGTGAAGCTCTCCCACTTGGAGCACCTGATGCTGGAGCAGCTTTAGCTGGTAAGCTGAGGAGCGGCGACCGCAGCATGGTGGAGGTGCTGGCCGACCACCCGGGCGAGCTGGTGCGCACCGACAGCCCCAACTTCCTCTGCTCCGTGCTGCCTACGCACTGGCGCTGCAACAAGACCCTGCCCATCGCTTTCAAGGTGGTGGCCCTAGGGGATGTTCCAGATGGCACTCTGGTCACTGTGATGGCTGGCAATGATGAAAACTACTCGGCTGAGCTGAGAAATGCTACCGCAGCCATGAAGAACCAGGTTGCAAGATTTAATGACCTCAGGTTTGTCGGTCGAAGTGGAAGAGGGAAAAGCTTCACTCTGACCATCACTGTGTTCACAAACCCACCGCAAGTCGCCACCTACCACAGAGCCATCAAAATCACAGTGGATGGGCCCCGAGAACCTCGGAGACATCGGCAGAAACTAGATGATCAGACCAAGCCCGGGAGCTTGTCCTTTTCCGAGCGGCTCAGTGAACTGGAGCAGCTGCGGCGCACAGCCATGAGGGTCAGCCCACACCACCCAGCCCCCACGCCCAACCCTCGTGCCTCCCTGAACCACTCCACTGCCTTTAACCCTCAGCCTCAGAGTCAGATGCAGGATACAAGGCAGATCCAACCATCCCCACCGTGGTCCTACGATCAGTCCTACCAATACCTGGGATCCATTGCCTCTCCTTCTGTGCACCCAGCAACGCCCATTTCACCTGGACGTGCCAGCGGCATGACAACCCTCTCTGCAGAACTTTCCAGTCGACTCTCAACGGCACCCGACCTGACAGCGTTCAGCGACCCGCGCCAGTTCCCCGCGCTGCCCTCCATCTCCGACCCCCGCATGCACTATCCAGGCGCCTTCACCTACTCCCCGACGCCGGTCACCTCGGGCATCGGCATCGGCATGTCGGCCATGGGCTCGGCCACGCGCTACCACACCTACCTGCCGCCGCCCTACCCCGGCTCGTCGCAAGCGCAGGGAGGCCCGTTCCAAGCCAGCTCGCCCTCCTACCACCTGTACTACGGCGCCTCGGCCGGCTCCTACCAGTTCTCCATGGTGGGCGGCGAGCGCTCGCCGCCGCGCATCCTGCCGCCCTGCACCAACGCCTCCACCGGCTCCGCGCTGCTCAACCCCAGCCTCCCGAACCAGAGCGACGTGGTGGAGGCCGAGGGCAGCCACAGCAACTCCCCCACCAACATGGCGCCCTCCGCGCGCCTGGAGGAGGCCGTGTGGAGGCCCTAC'.upper())

# GATA2 (ntag)
GATA2 = Seq('ATGGAGGTGGCGCCCGAGCAGCCGCGCTGGATGGCGCACCCGGCCGTGCTGAATGCGCAGCACCCCGACTCACACCACCCGGGCCTGGCGCACAACTACATGGAACCCGCGCAGCTGCTGCCTCCAGACGAGGTGGACGTGTTCTTCAATCACCTCGACTCGCAGGGCAACCCCTACTATGCCAATCCGGCACATGCTAGGGCCCGGGTATCTTATAGTCCAGCTCATGCGAGATTAACCGGAGGCCAGATGTGCCGCCCACACTTGTTGCACAGCCCGGGTTTACCATGGTTAGATGGGGGCAAAGCAGCCCTCTCTGCCGCTGCGGCCCACCACCACAACCCCTGGACCGTGAGCCCCTTCTCCAAGACGCCACTGCACCCCTCAGCTGCTGGAGGCCCTGGAGGCCCACTCTCTGTGTACCCAGGGGCTGGGGGTGGGAGCGGGGGAGGCAGCGGGAGCTCAGTGGCCTCCCTCACCCCTACAGCAGCCCACTCTGGCTCCCACCTTTTCGGCTTCCCACCCACGCCACCCAAAGAAGTGTCTCCTGACCCTAGCACCACGGGGGCTGCGTCCCCAGCCTCATCTTCCGCGGGGGGTAGTGCAGCCCGAGGAGAGGACAAGGACGGCGTCAAGTACCAGGTGTCACTGACGGAGAGCATGAAGATGGAAAGTGGCAGTCCCCTGCGCCCAGGCCTAGCTACTATGGGCACCCAGCCTGCTACACACCACCCCATCCCCACCTACCCCTCCTATGTGCCGGCGGCTGCCCACGACTACAGCAGCGGACTCTTCCACCCCGGAGGCTTCCTGGGGGGACCGGCCTCCAGCTTCACCCCTAAGCAGCGCAGCAAGGCTCGTTCCTGTTCAGAAGGCCGGGAGTGTGTCAACTGTGGGGCCACAGCCACCCCTCTCTGGCGGCGGGACGGCACCGGCCACTACCTGTGCAATGCATGTGGTCTGTATCACAAGATGAATGGGCAGAACCGACCACTCATCAAGCCCAAGCGAAGGCTGTCGGCCGCCAGGAGAGCCGGCACCTGTTGTGCAAATTGTCAGACGACAACCACCACCTTATGGCGCCGAAACGCCAACGGGGACCCTGTCTGCAACGCCTGTGGCCTCTACTACAAGCTGCACAATGTTAACAGGCCACTGACCATGAAGAAGGAAGGGATCCAGACTCGGAACCGGAAGATGTCCAACAAGTCCAAGAAAAGCAAGAAAGGGGCGGAGTGCTTCGAGGAGCTGTCAAAGTGCATGCAGGAGAAGTCATCCCCCTTCAGTGCAGCTGCCCTGGCTGGACACATGGCACCTGTGGGCCACCTCCCGCCCTTCAGCCACTCCGGACACATCCTGCCCACTCCGACGCCCATCCACCCCTCCTCCAGCCTCTCCTTCGGCCACCCCCACCCGTCCAGCATGGTGACCGCCATGGGC'.upper())

# RAB7A (ntag)
RAB7A = Seq('ATGACCTCTAGGAAGAAAGTGTTGCTGAAGGTTATCATCCTGGGAGATTCTGGAGTCGGGAAGACATCACTCATGAACCAGTATGTGAATAAGAAATTCAGCAATCAGTACAAAGCCACAATAGGAGCTGACTTTCTGACCAAGGAGGTGATGGTGGATGACAGGCTAGTCACAATGCAGATATGGGACACAGCAGGACAGGAACGGTTCCAGTCTCTCGGTGTGGCCTTCTACAGAGGTGCAGACTGCTGCGTTCTGGTATTTGATGTGACTGCCCCCAACACATTCAAAACCCTAGATAGCTGGAGAGATGAGTTTCTCATCCAGGCCAGTCCCCGAGATCCTGAAAACTTCCCATTTGTTGTGTTGGGAAACAAGATTGACCTCGAAAACAGACAAGTGGCCACAAAGCGGGCACAGGCCTGGTGCTACAGCAAAAACAACATTCCCTACTTTGAAACCAGTGCCAAGGAGGCCATCAACGTGGAGCAGGCGTTCCAGACGATTGCACGGAATGCACTTAAGCAGGAAACGGAGGTGGAGCTGTACAACGAATTTCCTGAACCTATCAAACTGGACAAGAATGACCGGGCCAAGGCCTCGGCAGAAAGCTGCAGTTGC'.upper())

# RAB11B (ntag)
RAB11B = Seq('ATGGGGACCCGGGACGACGAGTACGACTACCTATTCAAAGTGGTGCTCATCGGGGACTCAGGCGTGGGCAAGAGCAACCTGCTGTCGCGCTTCACCCGCAACGAGTTCAACCTGGAGAGCAAGAGCACCATCGGCGTGGAGTTCGCCACCCGCAGCATCCAGGTGGACGGCAAGACCATCAAGGCGCAGATCTGGGACACCGCTGGCCAGGAGCGCTACCGCGCCATCACCTCCGCGTACTACCGTGGTGCAGTGGGCGCCCTGCTGGTGTACGACATCGCCAAGCACCTGACCTATGAGAACGTGGAGCGCTGGCTGAAGGAGCTGCGGGACCACGCAGACAGCAACATCGTCATCATGCTGGTGGGCAACAAGAGTGACCTGCGCCATCTGCGGGCTGTGCCCACTGACGAGGCCCGCGCCTTCGCAGAAAAGAACAACTTGTCCTTCATCGAAACCTCAGCCTTGGATTCCACTAACGTAGAGGAAGCATTCAAGAACATCCTCACAGAGATCTACCGCATCGTGTCACAGAAACAGATCGCAGACCGCGCTGCCCACGACGAGTCCCCGGGGAACAACGTGGTGGACATCAGCGTGCCGCCCACCACGGACGGACAGAAGCCCAACAAGCTGCAGTGCTGCCAGAACCTG'.upper())

# RAB18 (ntag)
RAB18 = Seq('ATGGACGAGGACGTGCTAACCACCCTGAAGATCCTCATCATCGGCGAGAGTGGGGTGGGCAAGTCCAGCCTGCTCTTGAGGTTCACAGATGATACGTTTGATCCAGAACTTGCAGCAACAATAGGTGTTGACTTTAAGGTGAAAACAATTTCAGTGGATGGAAATAAGGCTAAACTTGCAATATGGGATACTGCTGGTCAAGAGAGGTTTAGAACATTAACTCCCAGCTATTATAGAGGTGCACAGGGTGTTATATTAGTTTATGATGTCACAAGAAGAGATACATTTGTTAAACTGGATAATTGGTTAAATGAATTGGAAACATACTGTACAAGAAATGACATAGTAAACATGCTAGTTGGAAATAAAATCGATAAGGAAAATCGTGAAGTCGATAGAAATGAAGGCCTGAAATTTGCACGAAAGCATTCCATGTTATTTATAGAGGCAAGTGCAAAAACCTGTGATGGTGTACAATGTGCCTTTGAAGAACTTGTTGAAAAGATCATTCAGACCCCTGGACTGTGGGAAAGTGAGAACCAGAATAAAGGAGTCAAACTGTCACACAGGGAAGAAGGCCAAGGAGGAGGAGCCTGTGGTGGTTATTGCTCTGTGTTA'.upper())

# RAB23 (ntag)
#RAB23 = Seq('ATGTTGGAGGAAGATATGGAAGTCGCCATAAAGATGGTGGTTGTAGGGAATGGAGCAGTTGGAAAATCAAGTATGATTCAGCGATATTGCAAAGGCATTTTTACAAAAGACTACAAGAAAACCATTGGAGTTGATTTTTTGGAGCGACAAATTCAAGTTAATGATGAAGATGTCAGACTAATGTTATGGGACACTGCAGGTCAGGAGGAATTTGATGCAATTACAAAGGCCTACTATCGAGGAGCCCAGGCTTGTGTGCTCGTGTTCTCTACCACAGATAGGGAATCTTTTGAAGCAGTTTCCAGTTGGAGAGAGAAAGTAGTAGCCGAAGTGGGAGATATACCAACTGTACTTGTGCAAAACAAGATTGATCTTCTGGATGATTCTTGTATAAAGAATGAGGAAGCTGAGGCACTGGCAAAAAGGTTAAAGTTAAGATTCTACAGAACATCAGTGAAAGAAGATCTAAATGTGAATGAAGTTTTTAAGTATTTGGCTGAAAAATACCTTCAGAAACTCAAACAACAAATAGCTGAGGATCCAGAACTAACGCATTCAAGTAGTAACAAGATTGGTGTCTTTAATACATCTGGTGGAAGTCACTCCGGTCAGAATTCAGGTACCCTCAATGGTGGAGATGTCATCAATCTTAGACCCAACAAACAAAGGACCAAGAAAAACAGAAATCCTTTTAGCAGCTGTAGCATACCC'.upper())

# RAB27A (ntag)
RAB27A = Seq('ATGTCTGATGGAGATTATGATTACCTCATCAAGTTTTTAGCTTTGGGAGACTCTGGTGTAGGGAAGACCAGTGTACTTTACCAATATACAGATGGTAAATTTAACTCCAAATTTATCACAACAGTGGGCATTGATTTCAGGGAAAAAAGAGTGGTGTACAGAGCCAGTGGGCCGGATGGAGCCACTGGCAGAGGCCAGAGAATCCATCTGCAGTTATGGGACACAGCAGGGCAGGAGAGGTTTCGTAGCTTAACGACAGCGTTCTTCAGAGATGCTATGGGTTTTCTTCTACTTTTTGATCTGACAAATGAGCAAAGTTTCCTCAATGTCAGAAACTGGATAAGCCAGCTACAGATGCATGCATATTGTGAAAACCCAGATATAGTGCTGTGTGGAAACAAGAGTGATCTGGAGGACCAGAGAGTAGTGAAAGAGGAGGAAGCCATAGCACTCGCAGAGAAATATGGAATCCCCTACTTTGAAACTAGTGCTGCCAATGGGACAAACATAAGCCAAGCAATTGAGATGCTTCTGGACCTGATAATGAAGCGAATGGAACGGTGTGTGGACAAGTCCTGGATTCCTGAAGGAGTGGTGCGATCAAATGGTCATGCCTCTACGGATCAGTTAAGTGAAGAAAAGGAGAAAGGGGCATGTGGCTGT'.upper())

# FOXP3 (ntag)
FOXP3 = Seq('ATGCCCAACCCCAGGCCTGGCAAGCCCTCGGCCCCTTCCTTGGCCCTTGGCCCATCCCCAGGAGCCTCGCCCAGCTGGAGGGCTGCACCCAAAGCCTCAGACCTGCTGGGGGCCCGGGGCCCAGGGGGAACCTTCCAGGGCCGAGATCTTCGAGGCGGGGCCCATGCCTCCTCTTCTTCCTTGAACCCCATGCCACCATCGCAGCTGCAGCTGCCCACACTGCCCCTAGTCATGGTGGCACCCTCCGGGGCACGGCTGGGCCCCTTGCCCCACTTACAGGCACTCCTCCAGGACAGGCCACATTTCATGCACCAGCTCTCAACGGTGGATGCCCACGCCCGGACCCCTGTGCTGCAGGTCCACCCCCTGGAGAGCCCAGCCATGATCAGCCTCACACCACCCACCACCGCCACTGGGGTCTTCTCCCTCAAGGCCCGGCCTGGCCTCCCACCTGGGATCAACGTGGCCAGCCTGGAATGGGTGTCCAGGGAGCCGGCACTGCTCTGCACCTTCCCAAATCCCAGTGCACCCAGGAAGGACAGCACCCTTTCGGCTGTGCCCCAGAGCTCCTACCCACTGCTGGCAAATGGTGTCTGCAAGTGGCCCGGATGTGAGAAGGTCTTCGAGGAGCCAGAGGACTTCCTCAAGCACTGCCAGGCGGACCATCTTCTGGATGAGAAGGGCAGGGCACAATGTCTCCTCCAGAGAGAGATGGTACAGTCTCTGGAGCAGCAGCTGGTGCTGGAGAAGGAGAAGCTGAGTGCCATGCAGGCCCACCTGGCTGGGAAAATGGCACTGACCAAGGCTTCATCTGTGGCATCATCCGACAAGGGCTCCTGCTGCATCGTAGCTGCTGGCAGCCAAGGCCCTGTCGTCCCAGCCTGGTCTGGCCCCCGGGAGGCCCCTGACAGCCTGTTTGCTGTCCGGAGGCACCTGTGGGGTAGCCATGGAAACAGCACATTCCCAGAGTTCCTCCACAACATGGACTACTTCAAGTTCCACAACATGCGACCCCCTTTCACCTACGCCACGCTCATCCGCTGGGCCATCCTGGAGGCTCCAGAGAAGCAGCGGACACTCAATGAGATCTACCACTGGTTCACACGCATGTTTGCCTTCTTCAGAAACCATCCTGCCACCTGGAAGAACGCCATCCGCCACAACCTGAGTCTGCACAAGTGCTTTGTGCGGGTGGAGAGCGAGAAGGGGGCTGTGTGGACCGTGGATGAGCTGGAGTTCCGCAAGAAACGGAGCCAGAGGCCCAGCAGATGTTCCAACCCTACACCTGGCCCC'.upper())

# BBS1 (ntag)
BBS1 = Seq('ATGGCCGCTGCGTCCTCATCGGATTCCGACGCCTGCGGAGCTGAGAGCAATGAGGCCAATTCGAAGTGGTTGGATGCGCACTACGACCCAATGGCCAATATCCACACCTTTTCTGCCTGCCTAGCGCTGGCAGATTTACATGGGGATGGGGAATACAAGCTGGTGGTAGGGGACCTTGGCCCTGGTGGGCAGCAGCCCCGCCTGAAGGTGCTCAAAGGACCACTGGTGATGACCGAAAGCCCGCTACCTGCTCTGCCAGCTGCTGCTGCCACCTTCCTCATGGAGCAACATGAGCCCCGGACCCCAGCTCTGGCACTTGCTTCAGGCCCTTGTGTCTATGTGTATAAGAATCTCAGACCCTACTTCAAGTTCAGCCTGCCCCAATTGCCTCCAAATCCTCTGGAACAAGACCTTTGGAACCAGGCCAAAGAGGACCGAATCGACCCCTTAACCCTGAAGGAGATGCTGGAGAGCATCCGGGAAACGGCAGAGGAGCCTTTGTCCATCCAGTCACTCAGGTTTCTGCAGCTGGAGCTAAGTGAAATGGAGGCATTTGTAAACCAACACAAGTCCAACTCCATCAAGCGGCAGACAGTCATCACCACCATGACCACCTTGAAGAAGAACCTGGCTGACGAGGATGCTGTGTCTTGCCTGGTGCTGGGCACCGAGAACAAGGAGCTCCTGGTGCTTGACCCCGAGGCCTTCACCATTTTAGCCAAGATGAGCCTTCCCAGCGTCCCCGTCTTCCTAGAGGTTTCTGGCCAGTTTGATGTTGAGTTCCGGCTTGCCGCGGCCTGCCGCAATGGAAACATCTATATTCTGAGAAGAGACTCCAAGCACCCCAAGTACTGCATCGAGCTGAGCGCCCAGCCTGTGGGACTTATCCGGGTACACAAGGTCCTAGTGGTGGGCAGCACCCAAGACAGCCTGCATGGCTTCACCCACAAGGGGAAGAAGCTGTGGACAGTGCAGATGCCCGCAGCCATCCTGACCATGAACCTCCTGGAGCAGCATTCCCGGGGCCTGCAGGCCGTCATGGCTGGGCTGGCCAATGGAGAGGTCCGCATTTATCGTGACAAGGCCCTGCTCAATGTCATCCACACCCCGGATGCAGTGACCAGCCTTTGCTTTGGCCGGTACGGGCGGGAGGACAACACCCTCATCATGACCACTCGAGGTGGTGGCCTGATCATCAAGATCCTGAAGCGTACAGCAGTGTTTGTAGAGGGAGGAAGTGAGGTGGGTCCCCCACCAGCCCAGGCCATGAAACTCAATGTGCCCCGAAAGACCCGGCTTTACGTGGATCAGACACTGCGAGAGCGGGAGGCTGGCACCGCCATGCACCGGGCCTTCCAGACAGACCTATACCTGCTGCGCCTACGTGCTGCCCGCGCCTACCTGCAGGCCCTCGAGTCCAGCCTGAGCCCCCTGTCCACGACAGCCCGAGAGCCACTCAAGCTGCACGCCGTGGTTCAGGGCCTTGGCCCCACCTTTAAGCTCACACTTCATCTGCAGAACACCTCAACAACCCGTCCTGTCCTGGGGCTGCTGGTCTGCTTCCTGTACAACGAGGCGCTCTATTCCCTGCCCCGGGCCTTCTTCAAGGTACCCTTGCTGGTGCCAGGGCTCAACTACCCCCTGGAGACATTTGTGGAGAGTCTCAGTAACAAGGGCATCTCAGACATCATCAAGGTGCTGGTGCTTCGAGAAGGCCAAAGTGCACCCCTGCTGAGTGCCCACGTCAACATGCCTGGGAGCGAGGGGCTGGCGGCCGCC'.upper())

# SMAD4 (ntag)
SMAD4 = Seq('ATGGACAATATGTCTATTACGAATACACCAACAAGTAATGATGCCTGTCTGAGCATTGTGCATAGTTTGATGTGCCATAGACAAGGTGGAGAGAGTGAAACATTTGCAAAAAGAGCAATTGAAAGTTTGGTAAAGAAGCTGAAGGAGAAAAAAGATGAATTGGATTCTTTAATAACAGCTATAACTACAAATGGAGCTCATCCTAGTAAATGTGTTACCATACAGAGAACATTGGATGGGAGGCTTCAGGTGGCTGGTCGGAAAGGATTTCCTCATGTGATCTATGCCCGGCTCTGGAGGTGGCCTGATCTTCACAAAAATGAACTAAAACATGTTAAATATTGTCAGTATGCGTTTGACTTAAAATGTGATAGTGTCTGTGTGAATCCATATCACTACGAACGAGTTGTATCACCTGGAATTGATCTCTCAGGATTAACACTGCAGAGTAATGCTCCATCAAGTATGATGGTGAAGGATGAATATGTGCATGACTTTGAGGGACAGCCATCGTTGTCCACTGAAGGACATTCAATTCAAACCATCCAGCATCCACCAAGTAATCGTGCATCGACAGAGACATACAGCACCCCAGCTCTGTTAGCCCCATCTGAGTCTAATGCTACCAGCACTGCCAACTTTCCCAACATTCCTGTGGCTTCCACAAGTCAGCCTGCCAGTATACTGGGGGGCAGCCATAGTGAAGGACTGTTGCAGATAGCATCAGGGCCTCAGCCAGGACAGCAGCAGAATGGATTTACTGGTCAGCCAGCTACTTACCATCATAACAGCACTACCACCTGGACTGGAAGTAGGACTGCACCATACACACCTAATTTGCCTCACCACCAAAACGGCCATCTTCAGCACCACCCGCCTATGCCGCCCCATCCCGGACATTACTGGCCTGTTCACAATGAGCTTGCATTCCAGCCTCCCATTTCCAATCATCCTGCTCCTGAGTATTGGTGTTCCATTGCTTACTTTGAAATGGATGTTCAGGTAGGAGAGACATTTAAGGTTCCTTCAAGCTGCCCTATTGTTACTGTTGATGGATACGTGGACCCTTCTGGAGGAGATCGCTTTTGTTTGGGTCAACTCTCCAATGTCCACAGGACAGAAGCCATTGAGAGAGCAAGGTTGCACATAGGCAAAGGTGTGCAGTTGGAATGTAAAGGTGAAGGTGATGTTTGGGTCAGGTGCCTTAGTGACCACGCGGTCTTTGTACAGAGTTACTACTTAGACAGAGAAGCTGGGCGTGCACCTGGAGATGCTGTTCATAAGATCTACCCAAGTGCATATATAAAGGTCTTTGATTTGCGTCAGTGTCATCGACAGATGCAGCAGCAGGCGGCTACTGCACAAGCTGCAGCAGCTGCCCAGGCAGCAGCCGTGGCAGGAAACATCCCTGGCCCAGGATCAGTAGGTGGAATAGCTCCAGCTATCAGTCTGTCAGCTGCTGCTGGAATTGGTGTTGATGACCTTCGTCGCTTATGCATACTCAGGATGAGTTTTGTGAAAGGCTGGGGACCGGATTACCCAAGACAGAGCATCAAAGAAACACCTTGCTGGATTGAAATTCACTTACACCGGGCCCTCCAGCTCCTAGACGAAGTACTTCATACCATGCCGATTGCAGACCCACAACCTTTAGAC'.upper())

#complicated (special cases)

# STAT3 (ctag, nterm)
STAT3nterm = Seq('CAGCTCTCAGAGGATCCCGGAAATTTAACATTCTGGGCACAAACACAAAAGTGATGAACATGGAAGAATCCAACAACGGCAGCCTCTCTGCAGAATTCAAACACTTGACCCTGAGGGAGCAGAGATGTGGGAATGGGGGCCGAGCCAATTGTGATGCTTCCCTGATTGTGACTGAGGAGCTGCACCTGATCACCTTTGAGACAGAGGTGTATCACCAAGGCCTCAAGATTGACCTAGAAACCCACTCCTTGCCAGTTGTGGTGATCTCCAACATCTGTCAGATGCCAAATGCCTGGGCGTCCATCCTGTGGTACAACATGCTGACCAACAATCCCAAGAATGTAAACTTTTTTACCAAGCCCCCAATTGGAACCTGGGATCAAGTGGCCGAGGTCCTGAGCTGGCAGTTCTCCTCCACCACCAAGCGAGGACTGAGCATCGAGCAGCTGACTACACTGGCAGAGAAACTCTTGGGACCTGGTGTGAATTATTCAGGGTGTCAGATCACATGGGCTAAATTTTGCAAAGAAAACATGGCTGGCAAGGGCTTCTCCTTCTGGGTCTGGCTGGACAATATCATTGACCTTGTGAAAAAGTACATCCTGGCCCTTTGGAACGAAGGGTACATCATGGGCTTTATCAGTAAGGAGCGGGAGCGGGCCATCTTGAGCACTAAGCCTCCAGGCACCTTCCTGCTAAGATTCAGTGAAAGCAGCAAAGAAGGAGGCGTCACTTTCACTTGGGTGGAGAAGGACATCAGCGGTAAGACCCAGATCCAGTCCGTGGAACCATACACAAAGCAGCAGCTGAACAACATGTCATTTGCTGAAATCATCATGGGCTATAAGATCATGGATGCTACCAATATCCTGGTGTCTCCACTGGTCTATCTCTATCCTGACATTCCCAAGGAGGAGGCATTCGGAAAGTATTGTCGGCCAGAGAGCCAGGAGCATCCTGAAGCTGACCCAGGTAGCGCTGCCCCATACCTGAAGACCAAGTTTATCTGTGTGACACCAACGACCTGCAGCAATACCATTGACCTGCCGATGTCCCCCCGCACTTTAGATTCATTGATGCAGTTTGGAAATAATGGTGAAGGTGCTGAACCCTCAGCAGGAGGGCAGTTTGAGTCCCTCACCTTTGACATGGAGTTGACCTCGGAGTGCGCTACCTCCCCCATG'.upper())

# STAT3 (ctag, cterm)
STAT3cterm = Seq('ATGGCCCAATGGAATCAGCTACAGCAGCTTGACACACGGTACCTGGAGCAGCTCCATCAGCTCTACAGTGACAGCTTCCCAATGGAGCTGCGGCAGTTTCTGGCCCCTTGGATTGAGAGTCAAGATTGGGCATATGCGGCCAGCAAAGAATCACATGCCACTTTGGTGTTTCATAATCTCCTGGGAGAGATTGACCAGCAGTATAGCCGCTTCCTGCAAGAGTCGAATGTTCTCTATCAGCACAATCTACGAAGAATCAAGCAGTTTCTTCAGAGCAGGTATCTTGAGAAGCCAATGGAGATTGCCCGGATTGTGGCCCGGTGCCTGTGGGAAGAATCACGCCTTCTACAGACTGCAGCCACTGCGGCCCAGCAAGGGGGCCAGGCCAACCACCCCACAGCAGCCGTGGTGACGGAGAAGCAGCAGATGCTGGAGCAGCACCTTCAGGATGTCCGGAAGAGAGTGCAGGATCTAGAACAGAAAATGAAAGTGGTAGAGAATCTCCAGGATGACTTTGATTTCAACTATAAAACCCTCAAGAGTCAAGGAGACATGCAAGATCTGAATGGAAACAACCAGTCAGTGACCAGGCAGAAGATGCAGCAGCTGGAACAGATGCTCACTGCGCTGGACCAGATGCGGAGAAGCATCGTGAGTGAGCTGGCGGGGCTTTTGTCAGCGATGGAGTACGTGCAGAAAACTCTCACGGACGAGGAGCTGGCTGACTGGAAGAGGCGGCAACAGATTGCCTGCATTGGAGGCCCGCCCAACATCTGCCTAGATCGGCTAGAAAACTGGATAACGTCATTAGCAGAATCTCAACTTCAGACCCGTCAACAAATTAAGAAACTGGAGGAGTTGCAGCAAAAAGTTTCCTACAAAGGGGACCCCATTGTACAGCACCGGCCGATGCTGGAGGAGAGAATCGTGGAGCTGTTTAGAAACTTAATGAAAAGTGCCTTTGTGGTGGAGCGGCAGCCCTGCATGCCCATGCATCCTGACCGGCCCCTCGTCATCAAGACCGGCGTCCAGTTCACTACTAAAGTCAGGTTGCTGGTCAAATTCCCTGAGTTGAATTATCAGCTTAAAATTAAAGTGTGCATTGACAAAGACTCTGGGGACGTTGCAGCTCTCAGAGGATCCCGGAAATTTAACATTCTGGGCACAA'.upper())

# HPS1 (ntag)
HPS1 = Seq('ATGAAGTGCGTCTTGGTGGCCACTGAGGGCGCAGAGGTCCTCTTCTACTGGACAGATCAGGAGTTTGAAGAGAGTCTCCGGCTGAAGTTCGGGCAGTCAGAGAATGAGGAAGAGGAGCTCCCTGCCCTGGAGGACCAGCTCAGCACCCTCCTAGCCCCGGTCATCATCTCCTCCATGACGATGCTGGAGAAGCTCTCGGACACCTACACATGCTTCTCCACGGAAAATGGCAACTTCCTGTATGTCCTTCACCTGTTTGGAGAATGCCTGTTCATTGCCATCAATGGTGACCACACCGAGAGCGAGGGGGACCTGCGGCGGAAGCTGTATGTGCTCAAGTACCTGTTTGAAGTGCACTTTGGGCTGGTGACTGTGGACGGTCATCTTATCCGAAAGGAGCTGCGGCCCCCAGACCTGGCGCAGCGTGTCCAGCTGTGGGAGCACTTCCAGAGCCTGCTGTGGACCTACAGCCGCCTGCGGGAGCAGGAGCAGTGCTTCGCCGTGGAGGCCCTGGAGCGACTGATTCACCCCCAGCTCTGTGAGCTGTGCATAGAGGCGCTGGAGCGGCACGTCATCCAGGCTGTCAACACCAGCCCCGAGCGGGGAGGCGAGGAGGCCCTGCATGCCTTCCTGCTCGTGCACTCCAAGCTGCTGGCATTCTACTCTAGCCACAGTGCCAGCTCCCTGCGCCCGGCCGACCTGCTTGCCCTCATCCTCCTGGTTCAGGACCTCTACCCCAGCGAGAGCACAGCAGAGGACGACATTCAGCCTTCCCCGCGGAGGGCCCGGAGCAGCCAGAACATCCCCGTGCAGCAGGCCTGGAGCCCTCACTCCACGGGCCCAACTGGGGGGAGCTCTGCAGAAACGGAGACAGACAGCTTCTCCCTCCCTGAGGAGTACTTCACACCAGCTCCTTCCCCTGGCGATCAGAGCTCAGGTAGCACCATCTGGCTGGAGGGGGGCACCCCCCCCATGGATGCCCTTCAGATAGCAGAGGACACCCTCCAAACACTGGTTCCCCACTGCCCTGTGCCTTCCGGCCCCAGAAGGATCTTCCTGGATGCCAACGTGAAGGAAAGCTACTGCCCCCTAGTGCCCCACACCATGTACTGCCTGCCCCTGTGGCAGGGCATCAACCTGGTGCTCCTGACCAGGAGCCCCAGCGCGCCCCTGGCCCTGGTTCTGTCCCAGCTGATGGATGGCTTCTCCATGCTGGAGAAGAAGCTGAAGGAAGGGCCGGAGCCCGGGGCCTCCCTGCGCTCCCAGCCCCTCGTGGGGGACCTGCGCCAGAGGATGGACAAGTTTGTCAAGAATCGAGGGGCACAGGAGATTCAGAGCACCTGGCTGGAGTTTAAGGCCAAGGCTTTCTCCAAAAGTGAGCCCGGATCCTCCTGGGAGCTGCTCCAGGCATGTGGGAAGCTGAAGCGGCAGCTCTGCGCCATCTACCGGCTGAACTTTCTGACCACAGCCCCCAGCAGGGGAGGCCCACATCTGCCCCAGCATCTGCAGGACCAAGTGCAGAGGCTCATGCGGGAGAAGCTGACGGACTGGAAGGACTTCTTGCTGGTGAAAAGCAGGAGGAACATCACCATGGTGTCCTACCTAGAAGACTTCCCAGGCTTGGTGCACTTCATCTATGTGGACCGCACCACTGGGCAGATGGTGGCGCCTTCCCTCAACTGCAGTCAAAAGACCTCGTCGGAGTTGGGCAAGGGGCCGCTGGCTGCCTTTGTCAAAACTAAGGTCTGGTCCCTGATCCAGCTGGCGCGCAGATACCTGCAGAAGGGCTACACCACGCTGCTGTTCCAGGAGGGGGATTTCTACTGCTCCTACTTCCTGTGGTTCGAGAATGACATGGGGTACAAACTCCAGATGATCGAGGTGCCCGTCCTCTCCGACGACTCAGTGCCTATCGGCATGCTGGGAGGAGACTACTACAGGAAGCTCCTGCGCTACTACAGCAAGAACCGCCCAACCGAGGCTGTCAGGTGCTACGAGCTGCTGGCCCTGCACCTGTCTGTCATCCCCACTGACCTGCTGGTGCAGCAGGCCGGCCAGCTGGCCCGGCGCCTCTGGGAGGCCTCCCGTATCCCCCTGCTC'.upper())

# PIK3CA (ntag)
PIK3CA = Seq('ATGCCTCCACGACCATCATCAGGTGAACTGTGGGGCATCCACTTGATGCCCCCAAGAATCCTAGTAGAATGTTTACTACCAAATGGAATGATAGTGACTTTAGAATGCCTCCGTGAGGCTACATTAATAACCATAAAGCATGAACTATTTAAAGAAGCAAGAAAATACCCCCTCCATCAACTTCTTCAAGATGAATCTTCTTACATTTTCGTAAGTGTTACTCAAGAAGCAGAAAGGGAAGAATTTTTTGATGAAACAAGACGACTTTGTGACCTTCGGCTTTTTCAACCCTTTTTAAAAGTAATTGAACCAGTAGGCAACCGTGAAGAAAAGATCCTCAATCGAGAAATTGGTTTTGCTATCGGCATGCCAGTGTGTGAATTTGATATGGTTAAAGATCCAGAAGTACAGGACTTCCGAAGAAATATTCTGAACGTTTGTAAAGAAGCTGTGGATCTTAGGGACCTCAATTCACCTCATAGTAGAGCAATGTATGTCTATCCTCCAAATGTAGAATCTTCACCAGAATTGCCAAAGCACATATATAATAAATTAGATAAAGGGCAAATAATAGTGGTGATCTGGGTAATAGTTTCTCCAAATAATGACAAGCAGAAGTATACTCTGAAAATCAACCATGACTGTGTACCAGAACAAGTAATTGCTGAAGCAATCAGGAAAAAAACTCGAAGTATGTTGCTATCCTCTGAACAACTAAAACTCTGTGTTTTAGAATATCAGGGCAAGTATATTTTAAAAGTGTGTGGATGTGATGAATACTTCCTAGAAAAATATCCTCTGAGTCAGTATAAGTATATAAGAAGCTGTATAATGCTTGGGAGGATGCCCAATTTGATGTTGATGGCTAAAGAAAGCCTTTATTCTCAACTGCCAATGGACTGTTTTACAATGCCATCTTATTCCAGACGCATTTCCACAGCTACACCATATATGAATGGAGAAACATCTACAAAATCCCTTTGGGTTATAAATAGTGCACTCAGAATAAAAATTCTTTGTGCAACCTACGTGAATGTAAATATTCGAGACATTGATAAGATCTATGTTCGAACAGGTATCTACCATGGAGGAGAACCCTTATGTGACAATGTGAACACTCAAAGAGTACCTTGTTCCAATCCCAGGTGGAATGAATGGCTGAATTATGATATATACATTCCTGATCTTCCTCGTGCTGCTCGACTTTGCCTTTCCATTTGCTCTGTTAAAGGCCGAAAGGGTGCTAAAGAGGAACACTGTCCATTGGCATGGGGAAATATAAACTTGTTTGATTACACAGACACTCTAGTATCTGGAAAAATGGCTTTGAATCTTTGGCCAGTACCTCATGGATTAGAAGATTTGCTGAACCCTATTGGTGTTACTGGATCAAATCCAAATAAAGAAACTCCATGCTTAGAGTTGGAGTTTGACTGGTTCAGCAGTGTGGTAAAGTTCCCAGATATGTCAGTGATTGAGGAGCATGCCAATTGGTCTGTATCCCGAGAAGCAGGATTTAGCTATTCCCACGCAGGACTGAGTAACAGACTAGCTAGAGACAATGAATTAAGGGAAAATGACAAAGAACAGCTCAAAGCAATTTCTACACGAGATCCTCTCTCTGAAATCACTGAGCAGGAGAAAGATTTTCTATGGAGTCACAGACACTATTGTGTAACTATCCCCGAAATTCTACCCAAATTGCTTCTGTCTGTTAAATGGAATTCTAGAGATGAAGTAGCCCAGATGTATTGCTTGGTAAAAGATTGGCCTCCAATCAAACCTGAACAGGCTATGGAACTTCTGGACTGTAATTACCCAGATCCTATGGTTCGAGGTTTTGCTGTTCGGTGCTTGGAAAAATATTTAACAGATGACAAACTTTCTCAGTATTTAATTCAGCTAGTACAGGTCCTAAAATATGAACAATATTTGGATAACTTGCTTGTGAGATTTTTACTGAAGAAAGCATTGACTAATCAAAGGATTGGGCACTTTTTCTTTTGGCATTTAAAATCTGAGATGCACAATAAAACAGTTAGCCAGAGGTTTGGCCTGCTTTTGGAGTCCTATTGTCGTGCATGTGGGATGTATTTGAAGCACCTGAATAGGCAAGTCGAGGCAATGGAAAAGCTCATTAACTTAACTGACATTCTCAAACAGGAGAAGAAGGATGAAACACAAAAGGTACAGATGAAGTTTTTAGTTGAGCAAATGAGGCGACCAGATTTCATGGATGCTCTACAGGGCTTTCTGTCTCCTCTAAACCCTGCTCATCAACTAGGAAACCTCAGGCTTGAAGAGTGTCGAATTATGTCCTCTGCAAAAAGGCCACTGTGGTTGAATTGGGAGAACCCAGACATCATGTCAGAGTTACTGTTTCAGAACAATGAGATCATCTTTAAAAATGGGGATGATTTACGGCAAGATATGCTAACACTTCAAATTATTCGTATTATGGAAAATATCTGGCAAAATCAAGGTCTTGATCTTCGAATGTTACCTTATGGTTGTCTGTCAATCGGTGACTGTGTGGGACTTATTGAGGTGGTGCGAAATTCTCACACTATTATGCAAATTCAGTGCAAAGGCGGCTTGAAAGGTGCACTGCAGTTCAACAGCCACACACTACATCAGTGGCTCAAAGACAAGAACAAAGGAGAAATATATGATGCAGCCATTGACCTGTTTACACGTTCATGTGCTGGATACTGTGTAGCTACCTTCATTTTGGGAATTGGAGATCGTCACAATAGTAACATCATGGTGAAAGACGATGGACAACTGTTTCATATAGATTTTGGACACTTTTTGGATCACAAGAAGAAAAAATTTGGTTATAAACGAGAACGTGTGCCATTTGTTTTGACACAGGATTTCTTAATAGTGATTAGTAAAGGAGCCCAAGAATGCACAAAGACAAGAGAATTTGAGAGGTTTCAGGAGATGTGTTACAAGGCTTATCTAGCTATTCGACAGCATGCCAATCTCTTCATAAATCTTTTCTCAATGATGCTTGGCTCTGGAATGCCAGAACTACAATCTTTTGATGACATTGCATACATTCGAAAGACCCTAGCCTTAGATAAAACTGAGCAAGAGGCTTTGGAGTATTTCATGAAACAAATGAATGATGCACATCATGGTGGCTGGACAACAAAAATGGATTGGATCTTCCACACAATTAAACAGCATGCATTGAAC'.upper())


#just for checking primers 
#Felicia lib
felicia_lib = Seq('GGTCTCTAATCGCCGATCCGAAATGGGAATTTCCTAGGAAGAATCTCGTTCTTGGCAAGACGCTTGGGGAGGGGGAGTTCGGGAAGGTAGTAAAAGCGACGGCTTTTCATCTCAAAGGGCGCGCGGGTTATACTACCGTCGCTGTTAAAATAGAGACC'.upper())

#Arielle lib
arielle_lib = Seq('gattataccgcaactacacgccaccatgctgcagaacgagcttgctcttaagttggctggacttgatattaacaagactggaggaggttctcatcatcatcatcatcatggtatggctagcatgactggtggacagcaaatgggtcgggatctgtacgacgatgacgataaggatctcgcaacaatggtcgactcatcgcgacgtaagtggaataagtggggtcacgcagtcagagctataggtcggctgagctcagcgaacaacaccgaaatgatgtacccagcggatggtggtctgcgtggttacactcacatggcgctgaaagttgatggcggcggtcacctgtcctgttctttcgtgaccacctaccgctccaaaaagactgtcggcaacattaagatgcctgccattcattacgtcagccaccgtctggagcgcctggaggagagcgataacgaaatgtttgtcgtacagcgtgaacacgcagttgccaagtttgtgggcctgggtggtggcggcggtaccggagggagcatgaactccctgatcaaggagaacatgcgtatgaaagtggttctggaaggctccgtaaacggccaccagttcaaatgcactggtgaaggcgaaggcaacccgtatatgggcacccagactatgcgtatcaaagtgatcgagggtggtccgctgccgtttgcgttcgacatcctggcgacgtcctttatgtatggctcccgtaccttcatcaaatatccgaaaggcatcccggatttctttaagcagtccttcccggaaggttttacctgggaacgtgtgacccgttacgaagacggcggcgtaattaccgttatgcaagacacgtctctggaggatggctgcctggtgtatcacgtgcaggttcgcggtgtgaacttcccgagcaatggtgctgtaatgcaaaagaaaaccaaaggttgggagcctacggactcccaactgactgaagagcagatcgcagaatttaaagaggctttctccctatttgacaaggacggggatgggacaataacaaccaaggagatggggacggtgatgcggtctctggggcagaaccccacagaagcagagctgcaggacatgatcaatgaagtagatgccgacggtgacggcacaatcgacttccctgagttcctgattatgatggcaggcaaaatgaaatacacagacagtgaagaagaaattagagaagcgttcggcgtgtttgataaggatggcaatggctacatcagtgcagcagagcttcgccacgtgatgacaaaccttggagagaagttaacagatgaagaggttgatgaaatgatcagggaagcagacagcgatggggatggtcaggtaaactacgaagagtttgtacaaatgatgacagcgaagggcagcggagctactaattca'.upper())

#Jessica lib 
circRNA_lib = Seq('CGTCTCACACGGCTCTGATGCGTCTGGATTACGACTCGTCGTCACTAGATGTACATGAAGAGCTATGCCTGCAGGCACTCGCTCTTCATGTACAACGTCCGGGGCCCACACTCTACTCGACACGTACCCTGATGGTGTACGTGGACCCGACCGTCCCCGGACGTGCGGTGAGACGTGAGGTTTGGCCGGAAGTTGAGTCTCTCCAAGAC'.upper())

#LMNA Lib
LMNA_tile = Seq('GGTCTCTCCAGGTGGGCGGACCCATCTCCTCTGGCTCCTCTGCCTCCAGTGTCACGGTCACTCGCAGCTACCGCAGTGTGGGGGGCAGTGGGGGTGGCAGCTTCGGGGACAATCTGGTCACCCGCTCCTACCTCCTGGGCAACTCCAGCCCCCGAACCCAGAGCCCCCAGAACTGCAGCATCATGtagAGAGACC'.upper())

#Gria3 (tile)
GRIA3 = Seq('GGTCTCTTCGCTGCTTTCCTGACTGTGGAGAGGATGGTTTCTCCCATAGAGAGTGCTGAAGATTTAGCTAAACAGACTGAAATTGCATATGGGACCCTGGACTCCGGTTCAACAAAAGAATTTTTCAGAAGATCCAAAATTGCTGTGTACGAGAAAATGTGGTCTTACATGAAATCAGCGGAGCCATCTGTGAGAGACC'.upper())

#GRIN2A (tile)
GRIN2A = Seq('CGTCCACCTGCCTAGATGGGACGGGTAGGATATTGGACTCTTTTGGTACTTCCAGCTCTGCTTGTGTGGCGAGGGCCCGCTCCATCCGCAGCAGCCGAGAAAGGACCCCCTGCACTCAACATCGCTGTTATGCTCGGACATAGTCATGATGTCACTGAACGAGAGCTCCGTACTCTTTGGGGTCCAGAACAAGCAGCCGGCCTTCCTCTTGATGTCAATGTCGTGGCGTTGTTGATGAATCGGACTGATCCGAAATCTTTGATTACCCATGTCTGTGATTTAATGTCAGGCGCGAGGATTCATGGTCTGGTATTCGGAGATGATACCGATCAAGAAGCTGTTGCTCAAATGCTTGACTTCATTTCTTCACATACTTTTGTTCCAATATTAGGAATCCACGGCGGAGCATCCATGATTATGGCGGATAAAGACCCCACTAGCACATTCTTTCAATTCGGTGCCAGCATTCAACAACAGGCGACCGTTATGCTCAAAATTATGCAAGACTACGATTGGCACGTTTTCAGCCTCGTCACTACGATCTTTCCCGGATATCGTGAGTTCATATCTTTTGTGAAAACTACTGTCGATAATTCTTTCGTAGGTTGGGATATGCAAAACGTCATTACGTTGGATACAAGTTTCGAAGATGCCAAAACCCAGGTGCAACTCAAGAAAATTCATAGCTCAGTGATACTTCTGTATTGCAGTAAGGATGAAGCGGTGCTGATCTTGTCAGAAGCTCGGAGCCTGGGGCTGACAGGCTACGACTTCTTTTGGATAGTACCTAGTCTGGTTAGCGGCAATACCGAACTGATTCCCAAGGAATTCCCTAGCGGCCTTATCTCAGTTTCTTATGACGATTGGGATTATTCTTTGGAAGCCAGGGTCAGAGATGGTATCGGAATTTTAACGACTGCAGCTTCCAGTATGCTCGAAAAGTTTAGCTATATTCCTGAAGCTAAAGCTTCCTGTTATGGTCAAATGGAAAGACCTGAAGTACCCATGCATACACTCCATCCTTTCATGGTAAACGTGACCTGGGACGGGAAGGATCTCAGCTTTACAGAAGAGGGATATCAAGTCCATCCACGGCTTGTCGTAATAGTCTTGAATAAGGATAGGGAGTGGGAGAAAGTAGGGAAATGGGAAAATCACACCCTTTCACTTCGCCATGCTGTTTGGCCACGGTATAAAAGCTTTTCTGATTGCGAACCCGACGATAATCACCTGTCAATAGTGACGTTGGAAGAAGCACCTTTTGTTATTGTTGAGGATATCGATCCTCTCACTGAAACATGCGTTCGAAATACTGTTCCCTGCCGTAAATTTGTGAAGATTAATAACAGCACAAACGAAGGTATGAACGTCAAGAAGTGTTGTAAAGGCTTTTGTATCGACATACTCAAGAAACTCAGTCGCACAGTTAAATTCACCTATGATTTGTACTTGGTTACAAACGGCAAACACGGTAAGAAGGTGAATAACGTTTGGAACGGTATGATAGGCGAGGTCGTGTACCAGAGGGCTGTGATGGCTGTAGGATCATTAACAATTAACGAGGAGCGCAGCGAGGTTGTCGATTTTAGTGTTCCGTTCGTTGAGACAGGCATATCTGTGATGGTGTCCCGGAGCAACGGGACAGTGTCCCCCAGTGCCTTCCTGGAGCCGTTTTCCGCTTCAGTGTGGGTTATGATGTTCGTCATGTTGCTGATCGTGTCCGCTATCGCAGTGTTCGTGTTCGAGTATTTTAGCCCCGTGGGCTATAATCGGAATCTTGCGAAGGGCAAGGCGCCTCACGGCCCATCCTTCACCATCGGGAAGGCCATTTGGTTGCTCTGGGGTCTTGTCTTTAACAATTCTGTCCCCGTGCAAAACCCAAAGGGTACTACTTCTAAAATTATGGTGAGTGTGTGGGCTTTCTTTGCAGTTATCTTTCTTGCATCCTATACTGCTAACTTAGCGGCTTTTATGATTCAGGAAGAGTTCGTTGATCAGGTCACTGGATTGAGCGATAAGAAATTCCAAAGGCCCCACGATTACAGTCCCCCGTTCAGGTTCGGAACCGTCCCAAACGGGTCTACAGAAAGGAATATCCGCAACAATTACCCATATATGCACCAATATATGACTAAGTTCAACCAGAAAGGGGTGGAAGATGCACTCGTTTCCTTAAAGACCGGAAAATTGGATGCCTTTATTTATGACGCGGCCGTTTTAAACTATAAAGCCGGTCGTGACGAGGGGTGTAAACTCGTCACAATTGGAAGCGGCTATATTTTCGCTACAACTGGCTACGGCATCGCTCTGCAAAAGGGTAGTCCATGGAAACGGCAAATTGATTTGGCGCTTCTGCAATTCGTCGGAGATGGCGAAATGGAAGAACTTGAAACATTATGGCTGACCGGGATTTGTCATAATGAAAAGAATGAAGTCATGTCATCTCAACTTGATATCGATAATATGGCTGGGGTGTTTTATATGCTTGCGGCTGCTATGGCATTGAGTCTGATTACTTTTATTTGGGAACATCTGTTTTATTGGAAACTTAGGTTTTGCTTTACAGGAGTTTGTTCTGATCGCCCAGGCCTTCTGTTTAGCATATCACGGGGAATTTATTCCTGTATCCACGGGGTTCATATCGAGGAGAAGAAGAAAAGCCCTGATTTTAACCTTACAGGGAGCCAATCCAATATGCTGAAGTTGCTGAGGTCTGCTAAGAATATCAGTTCCATGTCAAATATGAATTCTAGTCGTATGGATAGTCCTAAGCGGGCCGCAGATTTTATTCAGCGGGGTAGTCTGATTATGGATATGGTGTCCGACAAAGGCAACCTGATGTATAGCGATAATCGTAGTTTCCAAGGTAAGGAATCAATATTCGGCGATAATATGAATGAGCTGCAGACTTTCGTCGCAAATCGCCAAAAGGACAATCTGAACAATTACGTGTTTCAAGGCCAGCACCCCTTAACGCTGAACGAATCAAATCCCAATACCGTTGAAGTTGCAGTCTCCACCGAGTCTAAGGCTAATAGCCGGCCACGCCAACTCTGGAAGAAGTCTGTCGACAGTATCAGACAAGACAGCTTGAGCCAAAACCCGGTGAGTCAACGTGACGAAGCTACTGCCGAAAACAGAACTCATAGCCTGAAATCTCCCCGTTACCTGCCCGAGGAAATGGCACATTCCGATATCTCTGAGACATCCAACAGAGCTACATGTCATAGAGAGCCAGATAATTCCAAGAATCATAAGACAAAAGATAATTTCAAGAGAAGCGTCGCCAGCAAGTATCCAAAAGATTGCTCCGAAGTTGAAAGGACATATCTCAAGACAAAGAGCTCTAGCCCACGAGATAAAATTTATACAATCGACGGGGAGAAAGAACCCGGGTTTCATCTTGACCCGCCTCAATTCGTGGAGAACGTTACATTGCCAGAAAATGTAGATTTTCCTGATCCATATCAAGACCCGAGCGAGAATTTTCGGAAAGGTGATAGCACCTTACCGATGAATCGCAATCCTCTGCACAACGAGGAAGGCCTGAGCAATAATGATCAATACAAGCTGTATAGCAAACATTTTACACTGAAGGATAAAGGCTCACCTCATAGCGAAACGTCTGAAAGGTATAGACAAAATTCAACTCATTGTCGCAGTTGTCTGAGCAATATGCCAACTTACAGTGGTCATTTTACTATGCGGTCCCCATTTAAATGTGACGCTTGTCTCAGGATGGGTAATTTGTACGATATTGACGAGGATCAAATGCTGCAAGAAACTGGGAATCCTGCTACAGGCGAACAAGTGTATCAACAAGATTGGGCCCAAAATAACGCTCTCCAGCTGCAGAAGAATAAACTGCGGATAAGTAGGCAACACAGCTATGACAATATCGTAGATAAGCCCCGGGAATTAGATCTGTCACGCCCTTCCAGATCCATCTCACTGAAAGATCGCGAGAGATTGCTCGAAGGGAACTTCTATGGGTCCCTCTTCAGCGTACCTAGCTCCAAGCTTTCAGGCAAGAAGAGTTCACTGTTTCCTCAGGGGTTAGAAGATAGTAAAAGATCTAAAAGTCTGCTCCCTGATCATACAAGTGACAATCCATTTCTGCATAGCCATCGGGACGATCAGCGATTAGTAATCGGTCGATGTCCTTCCGATCCCTATAAGCATTCCCTTCCCTCTCAAGCCGTTAACGATTCATACCTGCGATCAAGTCTGCGTAGCACAGCCTCTTATTGCTCACGCGATTCAAGAGGTCATAACGACGTCTACATCTCCGAACACGTGATGCCCTACGCCGCTAACAAGAACAACATGTATAGTACACCGCGGGTGCTGAACAGCTGTTCCAACCGCCGTGTCTATAAGAAGATGCCCAGCATAGAGAGTGACGTATGACTAGGCAGGTGGCAT'.upper())

#TRIO (tile) xbaI site added downstream of tile
TRIO = Seq('CGTCCACCTGCCTAGATGTCTGGTTCATCTGGCGGTGCAGCTGCTCCTGCGGCATCTAGCGGCCCAGCTGCAGCTGCTTCTGCTGCCGGGAGTGGGTGTGGGGGGGGCGCGGGTGAAGGCGCTGAAGAAGCAGCAAAAGATCTTGCTGATATTGCAGCATTCTTTCGGAGCGGCTTCAGGAAGAATGACGAGATGAAGGCAATGGACGTGCTGCCTATCTTAAAAGAGAAGGTAGCGTATCTGTCCGGCGGCCGCGACAAGAGAGGCGGGCCTATACTGACATTCCCCGCAAGGTCTAACCACGATCGTATTCGTCAAGAGGACTTACGACGCCTGATATCATACCTGGCTTGCATCCCAAGTGAAGAAGTGTGTAAACGCGGATTTACCGTTATTGTTGATATGAGAGGCAGCAAATGGGATAGCATTAAACCACTCCTCAAAATTTTACAAGAAAGTTTTCCGTGTTGTATACACGTCGCTCTCATAATTAAACCTGATAATTTCTGGCAAAAGCAACGGACGAACTTCGGTTCATCCAAGTTCGAGTTCGAAACCAACATGGTTAGCCTTGAGGGACTCACTAAGGTCGTGGACCCAAGCCAACTTACCCCAGAATTCGACGGATGTCTCGAGTATAATCATGAGGAGTGGATCGAGATACGTGTCGCCTTCGAGGATTATATCAGTAACGCTACACATATGCTTAGCAGGCTTGAAGAACTGCAAGATATTCTTGCAAAGAAAGAACTCCCACAAGACCTGGAAGGCGCCAGGAACATGATAGAGGAGCACAGTCAACTTAAGAAGAAAGTCATCAAAGCTCCTATTGAAGATTTAGACCTCGAAGGGCAGAAATTGCTGCAACGAATCCAATCTTCCGAGTCTTTCCCTAAGAAGAATAGTGGATCTGGAAACGCCGATTTACAAAATTTGCTGCCAAAAGTCTCAACTATGTTGGATAGACTCCATAGCACTCGCCAACACTTGCATCAAATGTGGCACGTTCGTAAACTTAAATTAGATCAATGTTTTCAACTCAGACTCTTCGAGCAAGACGCAGAAAAGATGTTCGATTGGATTACGCATAATAAGGGGTTGTTCCTGAATTCCTATACGGAAATCGGCACAAGTCATCCCCACGCGATGGAACTCCAAACCCAACATAACCATTTCGCGATGAATTGCATGAATGTATACGTGAACATTAATAGGATTATGAGTGTCGCTAACAGGTTAGTCGAATCCGGACATTACGCTTCACAACAAATTAGACAAATTGCCTCACAATTGGAACAAGAATGGAAAGCCTTCGCTGCTGCTTTGGACGAACGATCAACACTCTTGGATATGAGCAGTATCTTTCATCAGAAAGCTGAGAAATACATGAGTAATGTTGACTCCTGGTGCAAGGCCTGTGGAGAAGTGGATCTGCCGAGCGAACTTCAAGATCTCGAGGACGCTATACACCATCATCAAGGCATCTACGAGCACATTACACTCGCCTACAGTGAAGTATCTCAGGACGGAAAGAGCCTTCTCGATAAACTGCAAAGGCCGCTGACGCCTGGGTCCTCTGACTCACTTACTGCGAGCGCTAATTATAGCAAAGCTGTCCATCACGTGCTCGACGTGATACATGAAGTACTCCATCATCAACGACAATTAGAAAATATTTGGCAGCATCGGAAAGTGCGGCTCCACCAACGCTTACAACTTTGCGTGTTTCAACAAGATGTGCAACAAGTTCTGGATTGGATTGAAAATCATGGCGAGGCGTTCCTCTCAAAGCACACCGGAGTTGGAAAGTCCCTGCACAGGGCTCGTGCCCTGCAAAAGAGACACGAGGACTTCGAGGAAGTCGCGCAAAATACTTATACTAACGCCGACAAGCTGCTCGAGGCCGCGGAGCAATTGGCGCAAACAGGAGAGTGCGATCCTGAGGAAATCTACCAAGCAGCACACCAATTGGAAGATAGAATACAGGACTTTGTGAGGCGGGTGGAACAAAGAAAGATTCTCCTCGATATGAGTGTAAGTTTCCATACACACGTAAAGGAACTCTGGACCTGGTTAGAAGAACTCCAGAAAGAACTCTTGGATGATGTCTACGCTGAATCTGTCGAAGCTGTTCAAGATCTTATTAAAAGATTCGGGCAACAACAACAAACAACGCTCCAAGTCACCGTGAATGTCATTAAAGAGGGCGAAGATCTGATACAACAACTGCGCGATTCAGCAATAAGCTCAAATAAGACTCCACATAATTCCTCTATTAATCATATCGAAACAGTTTTGCAACAGCTTGATGAAGCACAATCCCAAATGGAAGAACTGTTTCAGGAACGTAAAATAAAGCTCGAACTGTTCCTTCAGCTCAGAATTTTCGAAAGAGATGCTATTGACATAATTTCTGATCTGGAAAGCTGGAACGACGAACTGAGTCAACAGATGAACGATTTTGATACCGAGGACCTGACCATCGCGGAACAACGGTTGCAACATCACGCCGATAAGGCTTTAACTATGAATAATCTGACCTTCGATGTGATACATCAGGGCCAGGACCTGCTCCAATACGTGAACGAAGTTCAAGCTAGTGGAGTCGAACTCCTCTGCGACCGTGACGTGGATATGGCTACCAGAGTTCAAGATTTGTTGGAATTCTTGCACGAGAAGCAACAAGAGCTGGACCTCGCTGCGGAACAACACAGGAAGCATCTCGAACAATGTGTTCAATTACGTCATTTGCAAGCCGAGGTCAAGCAAGTCCTCGGATGGATTAGGAATGGTGAAAGCATGCTGAACGCAGGCCTGATTACCGCTTCTTCACTCCAGGAAGCCGAACAACTGCAACGGGAACATGAACAATTTCAACACGCTATCGAAAAGACTCACCAATCAGCTTTGCAAGTCCAACAGAAAGCTGAGGCAATGCTGCAAGCAAATCATTATGATATGGATATGATTCGCGATTGTGCAGAGAAAGTCGCCTCACATTGGCAGCAACTGATGCTTAAAATGGAAGATCGTTTGAAACTGGTGAATGCTTCAGTTGCCTTTTATAAGACATCCGAACAAGTTTGTTCCGTGCTGGAATCCCTCGAGCAAGAATATAAAAGGGAAGAGGATTGGTGCGGCGGCGCCGACAAACTCGGACCCAATAGCGAAACTGATCATGTTACCCCAATGATAAGTAAACATCTTGAACAGAAAGAAGCTTTTCTTAAAGCCTGTACTCTGGCAAGACGGAACGCCGATGTGTTTCTTAAGTATCTCCATCGAAATTCTGTCAATATGCCCGGTATGGTTACTCATATTAAGGCCCCCGAGCAACAGGTCAAGAACATTCTGAACGAGCTGTTTCAGAGAGAAAATAGAGTCCTGCACTATTGGACAATGCGGAAAAGGAGACTCGATCAATGCCAACAATATGTTGTATTCGAACGGAGCGCTAAACAAGCACTTGAGTGGATTCACGATAACGGAGAATTTTATCTGTCAACCCATACTAGCACAGGTTCATCAATTCAACATACACAAGAACTGCTTAAGGAACATGAAGAATTTCAAATCACCGCTAAACAGACTAAGGAACGGGTCAAACTCCTGATCCAATTAGCCGACGGTTTCTGCGAGAAGGGACACGCTCACGCCGCCGAAATCAAGAAGTGCGTGACAGCAGTCGACAAACGCTATCGGGACTTTTCACTCAGAATGGAAAAGTATCGGACATCCCTCGAGAAGGCTTTGGGTATAAGCTCCGACTCAAATAAGTCTAGCAAGTCCTTACAATTGGACATAATCCCGGCCAGCATACCCGGGAGCGAAGTTAAGTTGAGAGATGCCGCCCACGAGTTGAACGAGGAAAAGCGCAAGAGTGCTAGACGGAAGGAATTTATCATGGCCGAACTGATACAGACAGAGAAAGCCTACGTGCGCGATCTGAGGGAGTGCATGGACACATATCTCTGGGAGATGACGTCCGGAGTCGAGGAAATCCCGCCCGGTATCGTCAATAAAGAGCTGATTATTTTCGGCAATATGCAGGAGATTTATGAGTTCCACAACAATATCTTTCTTAAAGAACTTGAGAAGTACGAGCAATTACCCGAAGATGTGGGTCACTGCTTCGTGACCTGGGCTGATAAATTCCAAATGTACGTGACTTACTGTAAGAACAAACCCGACTCAACACAACTCATTCTTGAGCACGCCGGAAGCTACTTCGATGAAATCCAACAAAGGCACGGTCTGGCTAACAGCATCTCCAGCTATCTGATCAAGCCTGTACAAAGGATCACTAAATACCAACTGCTGCTGAAGGAACTCCTCACATGTTGCGAAGAGGGCAAAGGTGAAATCAAGGACGGGTTAGAAGTTATGCTGTCAGTTCCTAAAAGAGCTAACGACGCTATGCATTTAAGTATGCTCGAGGGCTTCGACGAGAATATCGAAAGCCAAGGGGAGTTAATACTGCAAGAGTCATTTCAGGTCTGGGATCCAAAGACGCTTATCAGGAAAGGGAGAGAGCGACACTTGTTTCTGTTCGAGATGTCACTGGTGTTCAGCAAAGAGGTAAAGGACAGTTCCGGACGATCCAAATATCTCTACAAGTCAAAGCTGTTCACGAGTGAACTGGGCGTGACGGAGCACGTCGAGGGCGATCCATGTAAGTTCGCATTATGGGTAGGACGCACTCCGACATCTGACAACAAGATCGTGCTGAAAGCAAGCTCAATCGAAAATAAACAAGATTGGATTAAACACATTCGAGAGGTGATTCAAGAGAGAACAATTCATCTCAAAGGCGCTTTAAAAGAACCTATCCATATTCCCAAAACAGCCCCAGCTACCCGGCAGAAAGGTCGGCGCGACGGCGAGGACTTAGATTCTCAGGGCGATGGATCCAGTCAACCAGACACCATCTCAATTGCAAGCAGGACAAGTCAAAATACCCTCGATAGTGACAAACTTTCAGGCGGGTGCGAATTGACGGTTGTCATACACGATTTTACTGCCTGTAATTCTAATGAACTCACTATACGGAGAGGGCAAACAGTAGAGGTGTTGGAAAGACCACACGATAAACCAGATTGGTGCCTTGTAAGAACCACCGATAGGTCACCGGCTGCTGAGGGTCTTGTGCCATGCGGAAGCTTGTGTATAGCGCATAGCCGATCATCCATGGAGATGGAAGGGATATTTAATCATAAGGATTCCCTGTCAGTGAGTAGTAACGATGCATCACCCCCTGCCAGCGTCGCCAGCTTGCAACCTCATATGATTGGCGCTCAATCTTCCCCTGGACCTAAAAGGCCCGGGAATACTCTCAGGAAATGGCTTACAAGTCCAGTCAGACGCCTGTCCAGCGGAAAAGCTGATGGACATGTCAAGAAGCTCGCCCATAAACATAAGAAATCCCGAGAAGTGCGGAAAAGTGCTGATGCTGGGAGCCAAAAGGATAGCGATGATTCAGCAGCTACTCCACAAGATGAAACTGTGGAAGAACGGGGTCGAAATGAAGGGCTTTCCTCTGGCACACTGTCAAAGTCAAGTAGTTCCGGCATGCAATCTTGCGGCGAGGAAGAGGGTGAAGAAGGAGCTGATGCAGTCCCTTTGCCCCCACCAATGGCAATTCAACAACATTCTCTGTTGCAACCTGATAGCCAAGACGATAAAGCATCCTCTAGACTGTTGGTTAGACCAACATCTAGCGAGACACCCAGCGCTGCAGAACTGGTGTCTGCGATCGAAGAGTTGGTCAAGTCAAAGATGGCCCTCGAGGACAGACCTTCTTCCCTGCTGGTAGATCAAGGCGACTCTTCCTCACCCTCTTTTAATCCGAGTGACAACAGCTTGCTTTCCAGCAGCAGTCCGATAGACGAAATGGAAGAACGCAAGAGCAGTTCCTTGAAACGTCGACATTATGTGCTGCAGGAGTTGGTCGAAACTGAACGCGATTACGTCCGAGATCTCGGATACGTCGTCGAAGGTTATATGGCCTTGATGAAAGAGGACGGAGTACCAGACGATATGAAGGGCAAGGATAAGATCGTCTTTGGTAATATACACCAAATCTATGATTGGCATCGTGATTTCTTCCTGGGCGAATTGGAGAAATGTCTGGAAGATCCTGAGAAGTTGGGTAGCCTGTTCGTGAAGCATGAACGCCGGCTTCATATGTATATTGCCTACTGCCAGAACAAGCCGAAAAGCGAACATATCGTGTCCGAGTATATCGACACATTCTTCGAAGATCTGAAACAAAGACTGGGTCATAGACTTCAATTGACTGACCTTTTAATAAAGCCCGTTCAACGAATTATGAAATACCAACTCCTTCTCAAAGATTTCCTGAAATACTCAAAGAAAGCAAGTCTTGACACTAGCGAGCTGGAAAGGGCCGTTGAGGTAATGTGTATCGTGCCAAGAAGGTGTAATGATATGATGAATGTAGGTAGATTGCAGGGCTTTGATGGCAAGATTGTGGCTCAAGGCAAGCTTCTGCTTCAAGATACGTTTCTGGTGACTGATCAGGACGCTGGGCTGTTGCCAAGGTGTCGCGAACGTAGAATTTTCTTATTCGAACAAATTGTGATCTTTTCCGAGCCCCTGGACAAGAAGAAAGGATTTTCTATGCCCGGTTTTCTCTTCAAGAATAGCATTAAAGTTTCTTGTCTGTGTTTGGAAGAGAACGTCGAGAACGACCCATGCAAGTTCGCCCTCACGTCCCGTACTGGGGATGTTGTTGAAACTTTTATTCTCCACTCTAGCAGCCCTAGCGTTCGCCAGACATGGATACACGAGATTAATCAGATCCTGGAGAATCAAAGAAACTTCCTGAACGCATTAACGAGCCCTATTGAATATCAACGCAATCATTCCGGTGGTGGTGGTGGAGGAGGATCTGGCGGAAGTGGTGGTGGTGGCGGGTCCGGTGGAGGAGGAGCTCCATCTGGCGGGTCCGGACATAGCGGTGGGCCTTCAAGTTGTGGTGGAGCGCCATCTACAAGTCGATCCCGCCCTAGCAGGATACCTCAACCCGTTAGGCATCATCCACCGGTCCTTGTTTCAAGTGCCGCTTCCAGTCAAGCTGAAGCGGATAAAATGTCCGGAACAAGCACACCAGGACCGTCACTGCCCCCACCAGGTGCTGCGCCGGAAGCTGGACCAAGTGCACCTTCACGGCGACCACCAGGAGCTGATGCTGAAGGAAGCGAACGGGAAGCTGAACCAATTCCTAAAATGAAAGTTTTAGAATCTCCAAGAAAGGGAGCGGCCAATGCTTCTGGCAGCTCTCCCGATGCTCCAGCGAAAGATGCCAGGGCCTCATTGGGAACACTTCCCCTGGGCAAACCTAGAGCAGGAGCAGCCAGTCCACTTAATAGTCCATTAAGCAGTGCCGTGCCAAGTCTTGGGAAAGAACCTTTTCCACCATCATCACCTTTGCAGAAAGGTGGGAGCTTTTGGTCCAGCATACCAGCAAGCCCAGCTTCCCGGCCGGGAAGCTTTACGTTTCCTGGAGATTCTGATAGTCTGCAAAGGCAAACTCCGCGGCATGCCGCGCCCGGAAAAGACACGGATAGGATGTCCACCTGTAGCAGTGCTTCAGAACAAAGCGTCCAATCTACGCAAAGTAATGGATCTGAGAGTTCCTCTTCTTCCAATATATCTACTATGCTTGTAACGCATGACTATACCGCGGTCAAAGAGGACGAAATAAATGTTTATCAAGGGGAAGTAGTGCAGATCCTCGCAAGTAATCAACAAAATATGTTCTTAGTCTTTAGGGCTGCGACAGATCAATGTCCAGCTGCCGAAGGTTGGATCCCCGGTTTCGTGCTCGGACATACATCTGCTGTGATTGTTGAAAATCCCGATGGCACCCTGAAGAAATCCACTTCATGGCATACTGCCCTTCGGCTCAGAAAGAAGTCAGAAAAGAAGGACAAGGATGGTAAGCGTGAGGGTAAACTGGAAAATGGATACAGGAAAAGTCGTGAGGGGCTTTCAAATAAAGTGAGCGTCAAACTCCTGAACCCAAATTATATATACGATGTGCCGCCCGAGTTTGTGATACCCCTTTCTGAAGTTACCTGCGAAACTGGCGAAACAGTAGTCCTCAGGTGCCGTGTGTGCGGACGGCCTAAGGCTAGTATCACATGGAAAGGGCCCGAGCATAATACACTTAATAATGACGGCCATTATTCAATTAGTTATTCTGATTTGGGCGAAGCAACACTTAAAATCGTTGGGGTCACTACCGAGGACGATGGAATATATACCTGTATAGCCGTAAACGATATGGGAAGTGCTAGTTCTTCCGCTTCCCTGCGTGTTTTGGGCCCTGGAATGGACGGCATTATGGTCACATGGAAGGATAATTTCGATTCTTTCTATTCAGAGGTTGCCGAACTCGGGCGGGGAAGGTTTAGTGTGGTGAAGAAGTGCGACCAAAAGGGGACTAAAAGGGCTGTTGCAACGAAATTCGTAAATAAGAAGCTGATGAAACGGGATCAAGTTACACACGAACTCGGTATTTTGCAAAGTTTACAACATCCTTTACTGGTGGGTCTTTTAGATACATTCGAAACGCCTACGTCATATATTCTCGTGCTGGAGATGGCCGATCAAGGGAGATTGCTTGATTGTGTTGTACGTTGGGGCTCATTGACCGAGGGCAAAATTCGGGCACATCTTGGCGAAGTCCTTGAGGCGGTAAGATATTTGCATAATTGTAGAATCGCCCATCTCGATCTGAAACCAGAAAACATTCTCGTAGACGAATCACTCGCTAAACCCACAATAAAGTTAGCCGATTTCGGGGACGCGGTGCAACTGAATACCACTTATTATATACATCAATTGCTCGGAAATCCAGAGTTTGCGGCTCCGGAGATTATTCTGGGAAATCCCGTGAGCTTGACTTCAGACACATGGAGCGTGGGCGTCCTGACTTATGTGTTGCTGAGCGGGGTTTCTCCATTCTTAGACGATTCCGTTGAGGAAACTTGTCTTAATATCTGTCGATTGGATTTCTCCTTTCCCGACGATTATTTCAAGGGCGTGTCACAAAAGGCTAAAGAATTTGTCTGTTTTCTTTTACAAGAAGATCCTGCAAAACGCCCTAGCGCAGCCTTGGCACTTCAAGAACAATGGCTTCAAGCAGGAAATGGCAGGTCCACAGGAGTTCTTGATACTTCTCGCCTCACCAGCTTTATCGAAAGACGTAAGCATCAAAACGACGTCCGGCCAATTAGGTCCATCAAGAATTTCCTTCAATCCAGACTCCTTCCGAGGGTGTAACTAGGCAGGTGGCAT'.upper())

#SLC6A8 (tile)
SLC6A8_T1 = Seq('GGTCTCTGGGCCCTCACAGCCCTGGGCAGCTACAACCGCTTCAACAACAACTGCTACAAGGACGCCATCATCCTGGCTCTCATCAACAGTGGGACCAGCTTCTTTGCTGGCTTCGTGGTGTTCTCCATCCTGGAGAGACC'.upper())
SLC6A8_T2 = Seq('GGTCTCTACAGTGGGACCAGCTTCTTTGCTGGCTTCGTGGTGTTCTCCATCCTGGGCTTCATGGCTGCAGAGCAGGGCGTGCACATCTCCAAGGTGGCAGAGTCAGGGCCGGGCCTGGCCTTCATCGCCTACCCGCGGGCTGTCACGCTGATAGAGACC'.upper())



In [14]:
# Define genes and check primers against them
genes = {
           'ARF1':ARF1,
           'RHEB':RHEB,
           'RAP1a':RAP1A,
           'NRAS':NRAS,
           'CDC42':CDC42,
           'Rac1':Rac1,
           'RHOA':RHOA,
           'RAB1A':RAB1A,
           'RALA':RALA,
           'RAN':RAN, 
           'RIT':RIT1,
           'EPM2A':EPM2A,
           'NHLRC1':NHLRC1,
           'TANGO2':TANGO2,
           #'GATM':GATM,
           'SMS':SMS,
           'SUFU':SUFU,
           'ETV6':ETV6,
           'RUNX1':RUNX1,
           'GATA2':GATA2,
           'CEBPA':CEBPA,
           'RAB7A':RAB7A,
           'RAB11B':RAB11B,
           'RAB18':RAB18,
           #'RAB23':RAB23,
           'RAB27A':RAB27A,
           'AKT1':AKT1,
           'FOXP3':FOXP3,
           'STK11':STK11,
           'PRKAG2':PRKAG2,
           'BBS1':BBS1,
           'SMAD4':SMAD4,
           'STAT3nterm':STAT3nterm,
           'STAT3cterm':STAT3cterm,
           'BCL10':BCL10,
           'HPS1':HPS1,
           'PIK3CA':PIK3CA,
           'GRIA3':GRIA3,
           'GRIN2A':GRIN2A,
           'TRIO':TRIO,
           'SLC6A8_T1':SLC6A8_T1,
           'SLC6A8_T2':SLC6A8_T2,
           'felicia_lib':felicia_lib,
           'arielle_lib':arielle_lib,
           'circRNA_lib':circRNA_lib,
           'LMNA_tile':LMNA_tile}

forward_primer_array = np.zeros((len(orthogonal_F['PrimerEnd']),len(genes)))
for j, genename in enumerate(genes.keys()):
    print('\n Processing forward primers against ' + genename)
    gene = genes[genename]
    for i, primer in enumerate(orthogonal_F['PrimerEnd']):
        forward_primer_array[i,j] = check_nonspecific(primer, gene)
        
reverse_primer_array = np.zeros((len(orthogonal_R['PrimerEnd']),len(genes)))
for j, genename in enumerate(genes.keys()):
    print('\n Processing reverse primers against ' + genename)
    gene = genes[genename]
    for i, primer in enumerate(orthogonal_R['PrimerEnd']):
        reverse_primer_array[i,j] = check_nonspecific(primer, gene)
        
# Compute number of nonspecific binding sites for each primer
orthogonal_F['Num_Nonspecific_Binding_Sites'] = np.sum(forward_primer_array, axis=1)
orthogonal_R['Num_Nonspecific_Binding_Sites'] = np.sum(reverse_primer_array, axis=1)



 Processing forward primers against ARF1
Found non-specific match at 471bp:
 match:CACCTGTGCCACCAGCGGCG
primer:GGAAACAATAACCATCGGCG Tm:21.6

 Processing forward primers against RHEB
Found non-specific match at 171bp:
 match:ACTTGTAGACACAGCCGGGC
primer:AGCTATAAGAATTGCCGGGC Tm:23.5

 Processing forward primers against RAP1a


  return THERMO_ANALYSIS.calcHeterodimer(


Found non-specific match at 210bp:
 match:GTATATGAAGAACGGCCAAG
primer:TATACTGAAGAACGGCCCAG Tm:37.5

 Processing forward primers against NRAS

 Processing forward primers against CDC42

 Processing forward primers against Rac1

 Processing forward primers against RHOA
Found non-specific match using Primer3 at 351bp:
 match:TATCTGGGTAGGAGAGGGGC
primer:GACCATGCAAGGAGAGGTAC Tm:19.7
Found non-specific match at 558bp:
 match:GTTTCTTCCGGATGGCAGCC
primer:AATCAGTTTCTTTGGCAGCC Tm:22.038901619925298

 Processing forward primers against RAB1A
Found non-specific match at 445bp:
 match:CCGTTTTTGGAAACCAGTGC
primer:TATCAATCCGGAACCAGTGC Tm:20.9
Found non-specific match at 346bp:
 match:CACAACTATGATGCCATGGG
primer:TAGTGACCTAATGCCATGGG Tm:28.295278203887733

 Processing forward primers against RALA

 Processing forward primers against RAN

 Processing forward primers against RIT

 Processing forward primers against EPM2A
Found non-specific match at 828bp:
 match:CCTGCAGGGCCAGGGCCCCG
primer:GATACATAGACTTG

In [15]:
# Process primers
orthogonal_F_touse = orthogonal_F[(orthogonal_F.BsaI_Site_Present == False) & \
                                  (orthogonal_F.Num_Nonspecific_Binding_Sites == 0) & \
                                  (orthogonal_F.Exclude == False)][['Well Position','PrimerEnd','Worse']]
orthogonal_R_touse = orthogonal_R[(orthogonal_R.BsaI_Site_Present == False) & \
                                  (orthogonal_R.Num_Nonspecific_Binding_Sites == 0) & \
                                  (orthogonal_R.Exclude == False)][['Well Position','PrimerEnd','Worse']]

# Remove primer sequences manually - they bind to one of the genes on the gene list and don't pass oligo qc
primer_seqs_to_remove = ['AGTTGTAATATCACCCGCGC', 'TATACTGAAGAACGGCCCAG' ,'AGTTGTAATATCACCCGCGC', 'ATCCTAGAAAAGGCGAAGGC', 'GGAAACAATAACCATCGGCG','TCCAATTATACGGAGCAGGC', 'ATACTGTAAGAACCACGCGG','AGTTGTAATATCACCCGCGC','TAAGATAGCACCACGGATGG', 'AGAACATAGCATTCACGGGG', 'AGATAGATGCTCCGTCAAGC', 'TTAGTAGGCAAGCATACCCG']
orthogonal_F_touse_filt = orthogonal_F_touse[~orthogonal_F_touse['PrimerEnd'].isin(primer_seqs_to_remove)]
orthogonal_R_touse_filt = orthogonal_R_touse[~orthogonal_R_touse['PrimerEnd'].isin(primer_seqs_to_remove)]


In [16]:
# Import prevalidated primer combos
validated_primer_combos = pd.read_csv('./Validated Gene Primer Pair Combinations.csv')
validated_primer_combos


Unnamed: 0,Gene,Block,Validated F,Validated R
0,ARAFcterm,6,F1,A4
1,ARAFnterm,5,A11,A11
2,ARAFnterm,2,E10,F10
3,BRAFcterm,6,A4,A4
4,BRAFcterm,7,B4,B4
...,...,...,...,...
90,ERBB2cterm,2,B2,A7
91,ERBB2cterm,3,C2,B7
92,ERBB2cterm,4,D2,C7
93,ERBB2cterm,10,B3,A8


In [17]:
# Remove validated combos that are not usable since they bind to other genes
print('Forward wells validated but not usable:')
validated_not_usable_F = []
validated_not_usable_R = []
for f_well in np.unique(validated_primer_combos['Validated F'].values):
    if f_well not in orthogonal_F_touse_filt['Well Position'].values:
        print(f_well)
        validated_not_usable_F.append(f_well)
        
print('Reverse wells validated but not usable')
for r_well in np.unique(validated_primer_combos['Validated R'].values):
    if r_well not in orthogonal_R_touse_filt['Well Position'].values:
        print(r_well)
        validated_not_usable_R.append(r_well)

# filter out unusable primers
validated_primer_combos_filtered = validated_primer_combos[~validated_primer_combos['Validated F'].isin(validated_not_usable_F) & \
                                                           ~validated_primer_combos['Validated R'].isin(validated_not_usable_R)] \
                        .rename(columns={'Validated F':'Forward Name',
                                                 'Validated R':'Reverse Name'})

# add primer sequences
validated_primer_combos_filtered = pd.merge(validated_primer_combos_filtered,
         orthogonal_F[['Well Position','PrimerEnd']].rename(columns={'Well Position':'Forward Name',
                                                                     'PrimerEnd': 'Forward Primer'}), on='Forward Name')
validated_primer_combos_filtered = pd.merge(validated_primer_combos_filtered,
         orthogonal_R[['Well Position','PrimerEnd']].rename(columns={'Well Position':'Reverse Name',
                                                                     'PrimerEnd': 'Reverse Primer'}), on='Reverse Name')


Forward wells validated but not usable:
B10
B8
C10
C6
C7
D4
D6
D9
E1
G5
H7
Reverse wells validated but not usable
A10
A4
A7
C4
E4
F3
G1
H10


In [18]:
# Generate more random combos of orthogonal F and reverse primers that are not "red" and that are not validated
# concatenate F and R primers into pandas dataframe
niters = 4
minlens_FR = min(len(orthogonal_F_touse_filt), len(orthogonal_R_touse_filt))
orthogonal_primers_iter = []
for i in range(niters):
    orthogonal_primers_iter.append(pd.concat([orthogonal_F_touse_filt.sample(n=minlens_FR).reset_index().\
                                                  drop(['index'], axis=1).rename(columns={'Well Position':'Forward Name',
                                                                                         'PrimerEnd':'Forward Primer'}),
                                              orthogonal_R_touse_filt.sample(n=minlens_FR).reset_index().\
                                                  drop(['index'], axis=1).rename(columns={'Well Position':'Reverse Name',
                                                                                         'PrimerEnd':'Reverse Primer'})],
                                                axis=1)\
                                    .drop(columns='Worse'))
orthogonal_primers_touse = pd.concat(orthogonal_primers_iter).drop_duplicates()
    
# remove any prevalidated combos-removed this for 20250610 library
# orthogonal_primers_touse = orthogonal_primers_touse[~orthogonal_primers_touse[['Forward Name','Reverse Name']]\
#                                 .apply(lambda a: (validated_primer_combos_filtered[['Forward Name','Reverse Name']] == a).all(1).any(),axis=1)]\
#                                 .reset_index(drop=True)
orthogonal_primers_touse.to_csv('./orthogonal_primer_random_combos_used_DH_20250613.csv')


In [19]:
#Import BsaI data
bsaI_empirical = pd.read_csv('./bsaI_empirical.csv')
bsaI_empirical.index = bsaI_empirical['Overhang']
bsaI_empirical = bsaI_empirical.drop(columns=['Overhang'])
bsaI_empirical = bsaI_empirical + 1
bsaI_empirical


Unnamed: 0_level_0,AAAA,AAAC,AAAG,AAAT,AACA,AACC,AACG,AACT,AAGA,AAGC,...,TTCG,TTCT,TTGA,TTGC,TTGG,TTGT,TTTA,TTTC,TTTG,TTTT
Overhang,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TTTT,636,9,41,17,3,1,1,1,8,1,...,1,1,1,1,1,1,1,1,1,1
GTTT,4,477,5,46,1,21,1,2,1,16,...,1,1,1,1,1,1,1,1,1,1
CTTT,2,2,597,3,1,1,19,1,1,1,...,1,1,1,1,1,1,1,1,1,1
ATTT,9,5,2,643,1,1,1,7,1,2,...,1,1,1,1,1,1,1,1,1,1
TGTT,1,1,1,1,494,17,65,57,3,1,...,1,1,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ACAA,1,1,1,1,1,1,1,1,1,1,...,1,1,11,3,8,480,1,1,1,1
TAAA,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,362,1,11,4
GAAA,1,1,1,1,1,1,1,1,1,1,...,1,1,1,3,1,1,6,716,2,20
CAAA,1,1,1,1,1,1,1,1,1,1,...,5,1,1,1,1,1,4,1,486,1


In [20]:
#Import information about codon usage for mutagenesis
codons_ranked_by_usage = {
    "A": ["GCC", "GCT", "GCA", "GCG"],
    "C": ["TGC", "TGT"],
    "D": ["GAC", "GAT"],
    "E": ["GAG", "GAA"],
    "F": ["TTC", "TTT"],
    "G": ["GGC", "GGA", "GGG", "GGT"],
    "H": ["CAC", "CAT"],
    "I": ["ATC", "ATT", "ATA"],
    "K": ["AAG", "AAA"],
    "L": ["CTG", "CTC", "CTT", "TTG", "TTA", "CTA"],
    "M": ["ATG"],
    "N": ["AAC", "AAT"],
    "P": ["CCC", "CCT", "CCA", "CCG"],
    "Q": ["CAG", "CAA"],
    "R": ["CGG", "AGA", "AGG", "CGC", "CGA", "CGT"],
    "S": ["AGC", "TCC", "TCT", "AGT", "TCA", "TCG"],
    "T": ["ACC", "ACA", "ACT", "ACG"],
    "V": ["GTG", "GTC", "GTT", "GTA"],
    "W": ["TGG"],
    "Y": ["TAC", "TAT"],
}


In [21]:
#Set blacklist of inefficient or nonspecific codons (DH updated 20250501)
overhang_blacklist = []

for codon in bsaI_empirical.index:
    # make sure we're working with plain strings
    codon_str = str(codon)
    rc_str    = str(Seq(codon_str).reverse_complement())

    # first, drop any palindromic (self-complementary) codons
    if codon_str == rc_str:
        overhang_blacklist.append(codon_str)
        continue

    # then test your efficiency threshold, but only if that rc exists as a column
    if rc_str in bsaI_empirical.columns:
        eff = bsaI_empirical.loc[codon_str, rc_str]
        if eff < 300:
            overhang_blacklist.append(codon_str)
    else:
        # optionally warn if you have no measurement for that pair
        print(f"Warning: no data for {codon_str} → {rc_str}")

overhang_blacklist

['AATT',
 'ACGT',
 'AGCT',
 'ATAT',
 'GTTG',
 'GGTG',
 'CATG',
 'GTGG',
 'GGGG',
 'GCGG',
 'CCGG',
 'GGCG',
 'CGCG',
 'CTAG',
 'GATC',
 'GCGC',
 'CCGC',
 'GGCC',
 'CGCC',
 'CCCC',
 'CACC',
 'GTAC',
 'CCAC',
 'CAAC',
 'TATA',
 'TCGA',
 'TGCA',
 'TTAA']

In [23]:
# Set defaults
block_size_range = [150, 198]
max_oligo_size = 250
first_last_block_reduction = 8
slack = 5
randomsequencepad = "ACGCCGCCACGTGTTCGTTAACTGTTGATTGGTGGCACATAAGTAATACCATGGTCCCTGAAATTCGGCTCAGTTACTTCGAGCGTAATGTCTCAAATGGCGTAGAACGGCAATGACTGTTTGACACTAGGTGGTGTTCAGTTCGGTAACGGAGAGTCTGTGCGGCATTCTTATTAATACATTTGAAACGCGCCCAACTGACGCTAGGCAAGTCAGTGCAGGCTCCCGTGTTAGGATAAGGGTAAACATACAAGTCGATAGAAGATGGGTAGGGGCCTTCAATTCATCCAGCACTCTACG"
#20250611 Import primer file so that it isn't remade
orthogonal_primers_touse= pd.read_csv('./orthogonal_primer_random_combos_used_DH_20250613.csv')

In [24]:
def post_qc(amp_primer_set, wt_oligos, primer_set, melt_temp_threshold = 35, check_all_primers=True):
    print("Running QC for primer specificity on WT oligos")
    f_primer_map = {}
    r_primer_map = {}
    # invert the primer to subpool map
    for k, v in amp_primer_set.items():
        f_primer_map[v[1]] = f_primer_map.get(v[1], []) + [k]
        r_primer_map[v[3]] = r_primer_map.get(v[3], []) + [k]
    
    # initialize list of nonspecific problems
    nonspecific = {}
    
    # add unused primers if check_all_primers
    if check_all_primers:
        all_f_primers = np.unique(primer_set['Forward Primer'])
        all_r_primers = np.unique(primer_set['Reverse Primer'])
        for f_primer in all_f_primers:
            if f_primer not in f_primer_map.keys():
                f_primer_map[f_primer] = []
        for r_primer in all_r_primers:
            if r_primer not in r_primer_map.keys():
                r_primer_map[r_primer] = []
        
    for f_primer, subpools_used in f_primer_map.items():
    # iterate over every barcode primer pair and match to each oligo to check for nonspecific amplification
        anneal_locs = []
        for subpoolcheck, fragmentcheck in wt_oligos.items():  # iterate over every WT oligo
            if (subpoolcheck not in subpools_used):  # ignore designed annealing (same name)
                if check_nonspecific(f_primer, fragmentcheck, Tm_rem = melt_temp_threshold, verbose=False) > 0: #use high Tm_rem
                    anneal_locs.append(subpoolcheck)
        if anneal_locs:
            nonspecific.update({f_primer:[a[0] + '_block' + str(a[1]+1) for a in anneal_locs]})
    for r_primer, subpools_used in r_primer_map.items():
    # iterate over every barcode primer pair and match to each oligo to check for nonspecific amplification
        anneal_locs = []
        for subpoolcheck, fragmentcheck in wt_oligos.items():  # iterate over every WT oligo
            if (subpoolcheck not in subpools_used):  # ignore designed annealing (same name)
                if check_nonspecific(r_primer, fragmentcheck, Tm_rem = melt_temp_threshold, verbose=False) > 0: #use high Tm_rem
                    anneal_locs.append(subpoolcheck)
        if anneal_locs:
            nonspecific.update({r_primer:[a[0] + '_block' + str(a[1]+1) for a in anneal_locs]})
    if nonspecific:
        print("Nonspecific Primers: (Manually removing primer sequence recommended)")
        print(nonspecific)
    else:
        print("No non-specific primers detected")
        
    return nonspecific

def build_kmers(sequence, 
                ksize):
    kmers = []
    n_kmers = len(sequence) - ksize + 1

    for i in range(n_kmers):
        kmer = sequence[i:i + ksize]
        kmers.append(kmer)

    return kmers

def compute_overlaps(breakpoints, 
                     inclusion_array, 
                     gene):
    
    overlaps = [[gene[val:val+4].reverse_complement(), gene[val:val+4]] for val in breakpoints]
    counter = 0
    for val in inclusion_array:
        if val == -1:
            (overlaps[counter][1],overlaps[counter+1][0]) = (overlaps[counter+1][0],overlaps[counter][1])
            counter += 1
        elif val == 0:
            overlaps[counter][1] = overlaps[counter+1][1]
            del overlaps[counter+1]
        
    return overlaps

def score_breakpoints(gene, 
                      breakpoint_pair, 
                      empirical, 
                      overhang_blacklist=overhang_blacklist):
    
    #subset empirical matrix by the set of all overlaps
    all_overlaps = []
    for breakpoint in breakpoint_pair:
        all_overlaps.append(gene[breakpoint:(breakpoint+4)])
        all_overlaps.append(gene[breakpoint:(breakpoint+4)].reverse_complement())
    all_overlaps = [str(o) for o in all_overlaps]
    if (len(np.unique(all_overlaps)) == len(all_overlaps)) & (len(set(all_overlaps).intersection(set(overhang_blacklist))) == 0):
        empirical_subset = empirical.loc[all_overlaps,all_overlaps]

        #compute fidelity score
        empirical_subset = empirical_subset/empirical_subset.sum(axis=1)
        fidelity_score = 1
        for breakpoint in breakpoint_pair:
            fidelity_score = fidelity_score * \
                empirical_subset.loc[str(gene[breakpoint:(breakpoint+4)]),
                                     str(gene[breakpoint:(breakpoint+4)].reverse_complement())]
    
    else:
        fidelity_score = 0
    
    return fidelity_score
    
def optimize_breakpoints(gene, 
                         breakpoint_pair, 
                         indices_to_shift, 
                         indices_of_array,
                         slack, 
                         empirical=bsaI_empirical, 
                         overhang_blacklist=overhang_blacklist):
    
    #compute all enrichments
    shifts = list(range(-slack,slack+1))
    if (len(indices_to_shift) > 2) | (len(indices_to_shift) < 1):
        print('Error -- too many or too few breakpoints!')
        optimum_breakpoint = breakpoint_pair
        optimum_score = 0
    elif (len(indices_to_shift) == 1): #external pair
        scores = [0]*len(shifts)
        for i,shift in enumerate(shifts):
            scores[i] = score_breakpoints(gene, breakpoint_pair[0:indices_to_shift[0]] + [breakpoint_pair[indices_to_shift[0]]+shift] + breakpoint_pair[(indices_to_shift[0]+1):], 
                                          empirical=bsaI_empirical, overhang_blacklist=overhang_blacklist)
        
        optimum_shift = np.argmax(scores)
        optimum_breakpoint = breakpoint_pair[0:indices_to_shift[0]] + [breakpoint_pair[indices_to_shift[0]]+shifts[optimum_shift]] + breakpoint_pair[(indices_to_shift[0]+1):]
        optimum_score = scores[optimum_shift]
        optimum_length = optimum_breakpoint[indices_of_array[1]] - optimum_breakpoint[indices_of_array[0]]
            
            
    else: #internal pair
        indices_to_shift = sorted(indices_to_shift)
        scores = np.zeros((len(shifts),len(shifts)))
        for i,shift1 in enumerate(shifts):
            for j,shift2 in enumerate(shifts):
                scores[i,j] = score_breakpoints(gene, breakpoint_pair[0:indices_to_shift[0]] + [breakpoint_pair[indices_to_shift[0]]+shift1] + \
                                                            breakpoint_pair[(indices_to_shift[0]+1):indices_to_shift[1]] + \
                                                            [breakpoint_pair[indices_to_shift[1]]+shift2] + breakpoint_pair[(indices_to_shift[1]+1):], 
                                              empirical=bsaI_empirical, overhang_blacklist=overhang_blacklist)
                
        optimum_shift = np.unravel_index(np.argmax(scores,axis=None), scores.shape)
        optimum_breakpoint = breakpoint_pair[0:indices_to_shift[0]] + [breakpoint_pair[indices_to_shift[0]]+shifts[optimum_shift[0]]] + \
                                            breakpoint_pair[(indices_to_shift[0]+1):indices_to_shift[1]] + \
                                            [breakpoint_pair[indices_to_shift[1]]+shifts[optimum_shift[1]]] + breakpoint_pair[(indices_to_shift[1]+1):]
        optimum_score = scores[optimum_shift]
        optimum_length = optimum_breakpoint[indices_of_array[1]] - optimum_breakpoint[indices_of_array[0]] + 4
    
    return optimum_breakpoint, optimum_score, optimum_length

def optimize_gene(gene, 
                  max_tile_size,
                  first_last_block_reduction,
                  block_size_range=block_size_range, 
                  slack=slack, 
                  empirical=bsaI_empirical, 
                  overhang_blacklist=overhang_blacklist): 
    
    #setup initial inputs to optimization
    gene_size = len(gene)
    protein_size = len(gene.translate())
        
    #exclude gene if it is too big
    if protein_size > 1000:
        print('Protein size too big!')
        
    #divide genes between 500 and 1000aa into two blocks
    elif protein_size > 620:
        print('Protein size too big! Will add two superblock (620aa+ proteins) soon.')
        
    else:
        #gene is one superblock 
        #print('Protein is one superblock.')
        block_size = block_size_range[0] + np.argmin(
            [abs((gene_size+2*first_last_block_reduction)/(i+block_size_range[0]) - \
                     round((gene_size+2*first_last_block_reduction)/(i+block_size_range[0]))) \
                 for i in range(0, block_size_range[1]-block_size_range[0])])
        
        
        # now, set initial breakpoints
        fragment_number = int((gene_size+2*first_last_block_reduction)/block_size)
        
        # if any of the tiles are too big?
        tile_lengths = [1000]
        while max(tile_lengths) > (max_tile_size-2*slack):
            fragment_number = fragment_number + 1
            first_breakpoint = 0
            last_breakpoint = gene_size-4
            step = (last_breakpoint - first_breakpoint + 2*first_last_block_reduction)/fragment_number
            evenly_spaced_floats = [first_breakpoint] + [step * i - first_last_block_reduction for i in range(1,fragment_number)] + [last_breakpoint]        
            initial_breakpoints = [[first_breakpoint,int(evenly_spaced_floats[1])+slack+2,last_breakpoint]] + \
                                    [[0,int(evenly_spaced_floats[i])-2-slack,int(evenly_spaced_floats[i+1])+slack+2,last_breakpoint] for i in range(1,len(evenly_spaced_floats)-2)] + \
                                    [[0,int(evenly_spaced_floats[-2])-2-slack,last_breakpoint]]
            tile_lengths = [int(evenly_spaced_floats[1])+slack+2-first_breakpoint+first_last_block_reduction+4] + \
                                    [int(evenly_spaced_floats[i+1])-int(evenly_spaced_floats[i])+2*(slack+2)+4 for i in range(1,len(evenly_spaced_floats)-2)] + \
                                    [last_breakpoint+4-int(evenly_spaced_floats[-2])+slack+2+first_last_block_reduction]
            

        #optimize each breakpoint
        optimum_breakpoints = []
        optimum_scores = []
        optimum_lengths = []
        oligo_array_indices = []
        for k,breakpoint in enumerate(initial_breakpoints):
            if len(breakpoint) == 3:
                indices_of_array = [0, 1] if k==0 else [1, 2]
                optimum_breakpoint, optimum_score, optimum_length = optimize_breakpoints(gene, breakpoint, [1], indices_of_array,
                                                            slack, empirical=bsaI_empirical, overhang_blacklist=overhang_blacklist)
                optimum_breakpoints.append(optimum_breakpoint)
                optimum_scores.append(optimum_score)
                optimum_lengths.append(optimum_length)
                oligo_array_indices.append(indices_of_array)
            else:
                indices_of_array = [1, 2]
                optimum_breakpoint, optimum_score, optimum_length = optimize_breakpoints(gene, breakpoint, [1, 2], indices_of_array,
                                                            slack, empirical=bsaI_empirical, overhang_blacklist=overhang_blacklist)
                optimum_breakpoints.append(optimum_breakpoint)
                optimum_scores.append(optimum_score)
                optimum_lengths.append(optimum_length)
                oligo_array_indices.append(indices_of_array)
    
    optimum_overlaps = [[str(gene[t:(t+4)]) for t in s] for s in optimum_breakpoints]
    if all([s >= 0.95 for s in optimum_scores]):
        print('All regions are high fidelity!')
    elif all([s >= 0.9 for s in optimum_scores]):
        print('Some regions are medium fidelity.')
    else:
        print('Some regions are low fidelity. Look closer')
        
    return optimum_breakpoints, optimum_overlaps, optimum_scores, optimum_lengths, oligo_array_indices

def generate_primer(DNA_seq,
                     Fwd=True,
                     extendtoCG=False,
                     smallest_primer_size=16,
                     largest_primer_size=30,
                     Tm=55):
    
    #Setup melting temperature arrays
    melt_temp_array = np.zeros(largest_primer_size-smallest_primer_size+1)
    
    if Fwd:
        DNA_seq_touse = DNA_seq
    else:
        DNA_seq_touse = DNA_seq.reverse_complement()
            
    #Make melting temperature arrays
    primer_length = 0
    for i in range(smallest_primer_size,largest_primer_size+1):
        melt_temp_array[i-smallest_primer_size] = mt.Tm_NN(DNA_seq_touse[0:i])
        
        #Pick F primer when Tm is first >F_Tm
        if (melt_temp_array[i-smallest_primer_size] >= Tm) & (primer_length==0):
            primer_length = i
    
    #If Tm isnt high enough after max bases, just set primer length to be max and hope it works
    if (primer_length == 0):
        primer_length = largest_primer_size
        
    if extendtoCG:
        while ((DNA_seq_touse[primer_length-1] == 'A') | (DNA_seq_touse[primer_length-1] == 'T')) & \
                    (primer_length < largest_primer_size):
            primer_length += 1
    
    return DNA_seq_touse[0:primer_length]

def make_mutations(region_name,
                       region,
                       region_flanks=[Seq(''),Seq('')],
                       nt_start=0, #zero-indexed!
                       wt_only=False,
                       synonymous=True,
                       stops='TAA',
                       all3ntdeletions=True,
                       mutation_list=False,
                       codons_ranked_by_usage=codons_ranked_by_usage,
                       aa_start=0):
                       
    oligo_array = {}
    #Check that region has size divisible by three
    if (len(region)/3 != len(region)//3) | (nt_start/3 != nt_start//3):
        print('Region is not translatable!')
        
    else:
        #add wt seq to oligo array
        oligo_name = region_name + '_WT'
        wt_seq = \
            region_flanks[0] + region + region_flanks[1]
        oligo_array[oligo_name] = wt_seq
        
        if not wt_only:
            
            # see if a mutation list was given
            if mutation_list == False:
                    
                #loop over amino acids
                for j in range(0,len(region),3):

                    #add all missense variants
                    aa = region[j:(j+3)].translate()
                    for aa_to in codons_ranked_by_usage.keys():
                        if aa_to != aa:
                            oligo_name = region_name + '_' + str(aa) + str((nt_start+j)//3+1+aa_start) + str(aa_to)
                            seq_to_append = \
                                region_flanks[0] + \
                                region[0:j] + Seq(codons_ranked_by_usage[aa_to][0]) + \
                                region[(j+3):] + \
                                region_flanks[1]
                            oligo_array[oligo_name] = seq_to_append

                    #add synonymous variant if True and if possible, 
                    # using the most common codon that is NOT the codon in the gene
                    if synonymous:
                        if len(codons_ranked_by_usage[aa]) > 1:
                            oligo_name = region_name + '_' + str(aa) + str((nt_start+j)//3+1+aa_start) + str(aa)
                            possible_codons = codons_ranked_by_usage[aa].copy()
                            possible_codons.remove(region[j:(j+3)])
                            seq_to_append = \
                                region_flanks[0] + \
                                region[0:j] + Seq(possible_codons[0]) + \
                                region[(j+3):] + \
                                region_flanks[1]
                            oligo_array[oligo_name] = seq_to_append

                    #add stops if true
                    if stops:
                        oligo_name = region_name + '_' + str(aa) + str((nt_start+j)//3+1+aa_start) + 'X'
                        seq_to_append = \
                            region_flanks[0] + \
                            region[0:j] + Seq(stops) + \
                            region[(j+3):] + \
                            region_flanks[1]
                        oligo_array[oligo_name] = seq_to_append

                    #add all 3nt deletions if True
                    if all3ntdeletions:
                        for k in range(0,3):
                            if j+k+3 <= len(region):
                                oligo_name = region_name + '_' + 'del' + str(nt_start+j+k+1+3*aa_start)
                                seq_to_append = \
                                    region_flanks[0] + \
                                    region[0:(j+k)] + \
                                    region[(j+k+3):] + \
                                    region_flanks[1]
                                oligo_array[oligo_name] = seq_to_append
                                
            else:
                
                #loop over mutation list
                for i in range(len(mutation_list)):
                        
                    #iterate over every single aa change
                    oligo_name = region_name + '_' + 'variant' + str(i+1)
                    seq_to_append = region
                    for k,v in enumerate(mutation_list[i]):
                        aa_from = v[0]
                        aa_to = v[-1]
                        pos = int(v[1:-1])
                        j=3*(pos-(nt_start//3+1))
                        aa = region[j:(j+3)].translate()
                        if aa != aa_from:
                            print('Check mutation list!')
                        else:
                            seq_to_append = \
                                seq_to_append[0:j] + \
                                Seq(codons_ranked_by_usage[aa_to][0]) + \
                                seq_to_append[(j+3):]

                    # append oligo to array
                    seq_to_append = region_flanks[0] + \
                                    seq_to_append + \
                                    region_flanks[1]
                    oligo_array[oligo_name] = seq_to_append
        
    return oligo_array


def write_oligo_library(genes,
                        oligo_file='./oligo_test.csv',
                        primer_file='./primer_test.tsv',
                        gbl_file='./gbl_test.tsv',
                        gbl_large_file='./gbl_test_large.tsv',
                        amp_primer_key_file='./amp_primer_key.tsv',
                        breakpoint_file='./breakpoints.tsv',
                        primer_set=orthogonal_primers_touse,
                        codons_ranked_by_usage=codons_ranked_by_usage,
                        block_size_range=block_size_range, 
                        max_oligo_size=max_oligo_size,
                        slack=slack, 
                        empirical=bsaI_empirical, 
                        overhang_blacklist=overhang_blacklist,
                        validated_primer_set=False,
                        aa_start=False,
                        wt_only=False,
                        synonymous=True,
                        stops='TAA',
                        all3ntdeletions=True,
                        mutations_to_use=False,
                        find_pcr_primers=True,
                        smallest_primer_size=16,
                        largest_primer_size=30,
                        Tm=55,
                        extendtoCG=True,
                        bsaI_firstoverlap='CGTC',
                        bsaI_lastoverlap='GCAT',
                        all_blocks=True,
                        blocks_to_include=False,
                        tile_boundaries=False,
                        paqcIcapF=True,
                        paqcIcapR=True,
                        check_all_primers=True,
                        qc_melt_temp_threshold=32,
                        gblock_min_size=300,
                        gblock_large_threshold=1000,
                        randomsequencepad=randomsequencepad):
    
    #Split up primer set into F and R primers, cannot do more than 82 sublibraries
    oligo_primer_counter = 0
    oligo_array = {}
    amp_primers = {}
    gblocks = {}
    num_blocks = {}
    amp_primer_dict = {}
    breakpoint_dict = {}
    
    #Convert genes to Seq and genes to list
    gene_names = list(genes.keys())
    genes = [Seq(genes[gene_name].upper()) for gene_name in gene_names]
    
    #PaqCI and BsaI site sequences and overhangs for paqcI
    paqcI_seq = Seq('CACCTGC')
    paqcI_overhang_nterm_ctag = Seq('CCAC')
    paqcI_overhang_cterm_ctag = Seq('CGGG')
    paqcI_overhang_nterm_ntag = Seq('TGGC')
    paqcI_overhang_cterm_ntag = Seq('TAGG')
    paqcI_seqplusfour = Seq('CACCTGCCTAG')
    bsaI_seq = Seq('GGTCTC')
    bsaI_seqplusone = Seq('GGTCTCT')
    pcr_capseq = Seq('GGCTAC') + bsaI_seqplusone
    gbl_capseq_F = Seq('CCGCGTGATTACGAGTCG') + pcr_capseq
    gbl_capseq_R = Seq('GGGTTAGCAAGTGGCAGCCT') + pcr_capseq
    
    # set max size of a tile
    primer_len = len(primer_set['Forward Primer'][0])
    max_tile_size = max_oligo_size - 2*primer_len - 2*len(bsaI_seqplusone)
    first_last_block_reduction = len(paqcI_seqplusfour)
    
    # iterate over genes
    for r,gene in enumerate(genes):
        
        gene_name = gene_names[r]
        print('Processing gene ' + str(r+1) + ': ' + gene_name)
        
        # amino acid to start at
        if aa_start != False:
            if gene_name in aa_start.keys():
                aa_start_gene=aa_start[gene_name]-1
            else:
                aa_start_gene=0
        else:
            aa_start_gene=0
        
        #exclude if gene size is not divisible by three
        if len(gene)/3 != len(gene)//3:
            print('Gene length is not divisible by 3!')
    
        #exclude if there is a paqcI site in the gene
        elif any([True for kmer in build_kmers(gene, len(paqcI_seq)) if kmer==paqcI_seq]) | \
            any([True for kmer in build_kmers(gene.reverse_complement(), len(paqcI_seq)) if kmer==paqcI_seq]):
            print('Gene has paqcI site!')
        
        #exclude if there is a BsaI site in the gene
        elif any([True for kmer in build_kmers(gene, len(bsaI_seq)) if kmer==bsaI_seq]) | \
            any([True for kmer in build_kmers(gene.reverse_complement(), len(bsaI_seq)) if kmer==bsaI_seq]):
            print('Gene has BsaI site!')
            
        else:
            print('Gene has no paqcI or BsaI sites! Performing GoldenGate optimization...')
            
            #cap gene with BsaI breakpoints and possible paqcI sites 
            if paqcIcapF & paqcIcapR:
                gene_capped = bsaI_firstoverlap + paqcI_seqplusfour +  paqcI_overhang_nterm_ntag + \
                        gene + paqcI_overhang_cterm_ntag + paqcI_seqplusfour.reverse_complement() + bsaI_lastoverlap
                capping_length_F = len(bsaI_firstoverlap + paqcI_seqplusfour + paqcI_overhang_nterm_ntag)
                capping_length_R = len(paqcI_overhang_cterm_ntag + paqcI_seqplusfour.reverse_complement() + \
                                   bsaI_lastoverlap)
            elif paqcIcapF:
                gene_capped = bsaI_firstoverlap + paqcI_seqplusfour + paqcI_overhang_nterm_ntag + \
                        gene + bsaI_lastoverlap
                capping_length_F = len(bsaI_firstoverlap + paqcI_seqplusfour + paqcI_overhang_nterm_ntag)
                capping_length_R = len(bsaI_lastoverlap)
            elif paqcIcapR:
                gene_capped = bsaI_firstoverlap + \
                        gene + paqcI_overhang_cterm_ntag + paqcI_seqplusfour.reverse_complement() + bsaI_lastoverlap
                capping_length_F = len(bsaI_firstoverlap)
                capping_length_R = len(paqcI_overhang_cterm_ntag + paqcI_seqplusfour.reverse_complement() + \
                                   bsaI_lastoverlap)
            else:
                gene_capped = bsaI_firstoverlap + gene + bsaI_lastoverlap
                capping_length_F = len(bsaI_firstoverlap)
                capping_length_R = len(bsaI_lastoverlap)
            
            #Optimize gene if tile boundaries are not given
            if tile_boundaries is False:
                optimum_breakpoints, optimum_overlaps, optimum_scores, optimum_lengths, oligo_array_indices = \
                optimize_gene(gene_capped, 
                          max_tile_size=max_tile_size,
                          first_last_block_reduction=first_last_block_reduction,
                          block_size_range=block_size_range, 
                          slack=slack, 
                          empirical=bsaI_empirical, 
                          overhang_blacklist=overhang_blacklist)
                pprint.pprint({'Optimum Breakpoints': optimum_breakpoints, 
                       'Optimum Overlaps': optimum_overlaps, 
                       'Optimum Scores': optimum_scores})
                num_blocks[gene_name] = len(optimum_breakpoints)
            else:
                optimum_breakpoints = tile_boundaries[gene_name]
                if len(optimum_breakpoints[0])==4:
                    if len(optimum_breakpoints)>1:
                        #multiple tiles, including one at beginning of gene
                        oligo_array_indices = [[0,1]] + [[1,2]]*(len(optimum_breakpoints)-1)
                    else:
                        # one tile
                        if ((optimum_breakpoints[1]-optimum_breakpoints[0]) > (optimum_breakpoints[2]-optimum_breakpoints[1])):
                            # at end of gene
                            oligo_array_indices = [[1,2]]
                        else:
                            # at beginning of gene
                            oligo_array_indices = [[0,1]]
                else:
                    # multiple tiles, starting in the middle
                    oligo_array_indices = [[1,2]]*(len(optimum_breakpoints))
                num_blocks[gene_name] = len(optimum_breakpoints)
                
            
            #add primers for gene_F and gene_R that are repeated constantly throughout the PCRs
            #note: should probably prevalidate these primers!
            if find_pcr_primers:
                F_primer = generate_primer(gene,
                                           Fwd=True,
                                           extendtoCG=extendtoCG,
                                           smallest_primer_size=smallest_primer_size,
                                           largest_primer_size=largest_primer_size,
                                           Tm=Tm)
                F_primer = pcr_capseq + bsaI_firstoverlap + paqcI_seqplusfour + F_primer
                amp_primers[gene_name+'_gene'+'_ampF'] = F_primer
                R_primer = generate_primer(gene,
                                           Fwd=False,
                                           extendtoCG=extendtoCG,
                                           smallest_primer_size=smallest_primer_size,
                                           largest_primer_size=largest_primer_size,
                                           Tm=Tm)
                R_primer = pcr_capseq + Seq(bsaI_lastoverlap).reverse_complement() + \
                            paqcI_seqplusfour + R_primer
                amp_primers[gene_name+'_gene'+'_ampR'] = R_primer
            
            #make oligos, primers, gblocks for each block
            for i,breakpoint in enumerate(optimum_breakpoints):
                
                #find indices of breakpoint that correspond to oligo vs need to be PCRed/gblock
                pcr_indices = [[j,j+1] for j in range(len(breakpoint)-1)]
                pcr_indices.remove(oligo_array_indices[i])
                
                #find mutagenic window of oligo
                oligo_breaks = [breakpoint[j] for j in oligo_array_indices[i]]
                oligo_mutagenic_window = [int(3*np.ceil(max(oligo_breaks[0]+4-capping_length_F,3)/3)), int(3*np.floor(min(oligo_breaks[1]-capping_length_F,len(gene)-1)/3))]
                
                #subset the right block if needed
                if (all_blocks == True) | ((i+1) in blocks_to_include[r] if blocks_to_include != False else True): #subset on allowed blocks
                
                    #add pcr primers and gblocks, one segment at a time
                    for k,pcr_index in enumerate(pcr_indices):
                        piece_name = gene_name + '_block' + str(i+1) + '_s' + str(k+1)
                        pcr_breaks = [breakpoint[j] for j in pcr_index]
                                            
                        #get pcr primers
                        if find_pcr_primers:
                            if pcr_breaks[0] == breakpoint[0]: #Fragment beginning at gene start 
                                R_primer = generate_primer(gene_capped[pcr_breaks[0]:(pcr_breaks[1]+4)],
                                                           Fwd=False,
                                                           extendtoCG=extendtoCG,
                                                           smallest_primer_size=smallest_primer_size,
                                                           largest_primer_size=largest_primer_size,
                                                           Tm=Tm)
                                R_primer = pcr_capseq + R_primer
                                amp_primers[piece_name+'_ampR'] = R_primer
                            elif pcr_breaks[1] == breakpoint[-1]: #Fragment ending at gene end
                                F_primer = generate_primer(gene_capped[pcr_breaks[0]:(pcr_breaks[1]+4)],
                                                           Fwd=True,
                                                           extendtoCG=extendtoCG,
                                                           smallest_primer_size=smallest_primer_size,
                                                           largest_primer_size=largest_primer_size,
                                                           Tm=Tm)
                                F_primer = pcr_capseq + F_primer
                                amp_primers[piece_name+'_ampF'] = F_primer
                            else:
                                F_primer = generate_primer(gene_capped[pcr_breaks[0]:(pcr_breaks[1]+4)],
                                                           Fwd=True,
                                                           extendtoCG=extendtoCG,
                                                           smallest_primer_size=smallest_primer_size,
                                                           largest_primer_size=largest_primer_size,
                                                           Tm=Tm)
                                F_primer = pcr_capseq + F_primer
                                R_primer = generate_primer(gene_capped[pcr_breaks[0]:(pcr_breaks[1]+4)],
                                                           Fwd=False,
                                                           extendtoCG=extendtoCG,
                                                           smallest_primer_size=smallest_primer_size,
                                                           largest_primer_size=largest_primer_size,
                                                           Tm=Tm)
                                R_primer = pcr_capseq + R_primer
                                amp_primers[piece_name+'_ampF'] = F_primer
                                amp_primers[piece_name+'_ampR'] = R_primer

                        #make gblocks
                        gbl = gbl_capseq_F + gene_capped[pcr_breaks[0]:(pcr_breaks[1]+4)] + gbl_capseq_R.reverse_complement()
                        gblocks[piece_name] = gbl
                        
                    #add oligos to oligo array, checking first for validated primers if they are given
                    validated=False
                    if validated_primer_set is not False:
                        validated_combo = validated_primer_set.query('Gene == @gene_name & Block == (@i+1)')
                        if not validated_combo.empty:
                            validated=True
                            name_primer_F, primer_F, name_primer_R, primer_R = \
                                validated_combo[['Forward Name',
                                                  'Forward Primer',
                                                  'Reverse Name',
                                                  'Reverse Primer']].values[0]
                        else:
                            name_primer_F, primer_F, name_primer_R, primer_R = \
                                primer_set.iloc[oligo_primer_counter,][['Forward Name',
                                                                      'Forward Primer',
                                                                      'Reverse Name',
                                                                      'Reverse Primer']]
                            oligo_primer_counter += 1
                    else:
                        name_primer_F, primer_F, name_primer_R, primer_R = \
                                primer_set.iloc[oligo_primer_counter,][['Forward Name',
                                                                      'Forward Primer',
                                                                      'Reverse Name',
                                                                      'Reverse Primer']]
                        oligo_primer_counter += 1
                        
                    # if mutations are given, use those - otherwise make all mutations or wtonly
                    if mutations_to_use == False:
                        add_on_array = make_mutations(gene_name + '_block' + str(i+1),
                                           gene[oligo_mutagenic_window[0]:oligo_mutagenic_window[1]],
                                           region_flanks=[Seq(primer_F) + \
                                                          bsaI_seqplusone + \
                                                          gene_capped[oligo_breaks[0]:(oligo_mutagenic_window[0]+capping_length_F)] ,
                                                          gene_capped[(oligo_mutagenic_window[1]+capping_length_F):(oligo_breaks[1]+4)] + \
                                                          bsaI_seqplusone.reverse_complement() + \
                                                          Seq(primer_R).reverse_complement()],
                                           nt_start=oligo_mutagenic_window[0],
                                           wt_only=wt_only,
                                           synonymous=synonymous,
                                           stops=stops,
                                           all3ntdeletions=all3ntdeletions,
                                           codons_ranked_by_usage=codons_ranked_by_usage,
                                           aa_start=aa_start_gene)
                    else:
                        add_on_array = make_mutations(gene_name + '_block' + str(i+1),
                                           gene[oligo_mutagenic_window[0]:oligo_mutagenic_window[1]],
                                           region_flanks=[Seq(primer_F) + \
                                                          bsaI_seqplusone + \
                                                          gene_capped[oligo_breaks[0]:(oligo_mutagenic_window[0]+capping_length_F)] ,
                                                          gene_capped[(oligo_mutagenic_window[1]+capping_length_F):(oligo_breaks[1]+4)] + \
                                                          bsaI_seqplusone.reverse_complement() + \
                                                          Seq(primer_R).reverse_complement()],
                                           nt_start=oligo_mutagenic_window[0],
                                           mutation_list=mutations_to_use[(gene_name,i+1)],
                                           codons_ranked_by_usage=codons_ranked_by_usage,
                                           aa_start=aa_start_gene)
                    oligo_array.update(add_on_array)
                    amp_primer_dict.update({(gene_name,i+1): (name_primer_F,primer_F,name_primer_R,primer_R,validated)})
                    breakpoint_dict.update({(gene_name,i+1): oligo_mutagenic_window})
                    
    #Check that max oligo is less than the max oligo length
    if sum([len(s)>max_oligo_size for s in oligo_array.values()]) == 0:
        print('All oligos are below the maximum 250bp!')
    else:
        print('Some oligos are TOO BIG!')
        
    #Check for nonspecific amplification
    wt_oligos = {tuple([key.split('_')[0],
                        int((key.split('_block')[1]).split('_')[0])]
                      ):oligo_array[key] \
                     for key in oligo_array.keys() if 'WT' in key}
    nonspecific_primers = post_qc(amp_primer_dict, 
                                  wt_oligos,
                                  primer_set, 
                                  melt_temp_threshold=qc_melt_temp_threshold,
                                  check_all_primers=check_all_primers)
    
    # Check that all mutagenic windows overlap
    breakpoint_df = pd.DataFrame.from_dict(breakpoint_dict, orient='index', columns=['Mutagenesis Start','Mutagenesis End'])
    breakpoint_df.index = pd.MultiIndex.from_tuples(breakpoint_dict.keys())
    breakpoint_df = breakpoint_df.reset_index().rename(columns={'level_0':'Gene',
                                                                'level_1':'Block'})
    if (mutations_to_use == False) and (all_blocks == True):
        missed_counter = 0
        for r,gene_group_breakpoints in breakpoint_df.groupby('Gene'):
            for k,row in gene_group_breakpoints.iterrows():
                # look at whether the current row start is later than the last row end
                if row['Block'] > 1:
                    if row['Mutagenesis Start'] > end:
                        missed_counter += 1
                        print('Mutagenic window missed at ' + str(r) + ' block ' + str(row['Block']))
                start,end = row['Mutagenesis Start'],row['Mutagenesis End']
        if missed_counter == 0:
            print('All mutagenic windows overlap!')
        else:
            print(str(missed_counter) + ' number of times the mutagenic window does not close!')
                
    #Remove any oligos with additional BsaI sites or paqcI sites
    bad_oligos = []
    for name,oligo in oligo_array.items():
        #check for paqcI sites
        paqcI_F = sum([True for kmer in build_kmers(oligo, len(paqcI_seq)) if kmer==paqcI_seq])
        paqcI_R = sum([True for kmer in build_kmers(oligo.reverse_complement(), len(paqcI_seq)) if kmer==paqcI_seq])
        #check that oligo is block 1 if it contains a paqcI site in the forward orientation 
        if paqcI_F > 0:
            if ('block1' not in name) | (paqcI_F > 1):
                bad_oligos.append(name)
        #check that the oligo is block final if it contains a paqcI site in the reverse orientation
        if paqcI_R > 0:
            if ('block'+str(num_blocks[name.split('_')[0]]) not in name) | (paqcI_R > 1):
                bad_oligos.append(name)
        #check for more than one BsaI site
        bsaI_F = sum([True for kmer in build_kmers(oligo, len(bsaI_seq)) if kmer==bsaI_seq])
        bsaI_R = sum([True for kmer in build_kmers(oligo.reverse_complement(), len(bsaI_seq)) if kmer==bsaI_seq])
        if (bsaI_F != 1) | (bsaI_R != 1):
            bad_oligos.append(name)
    bad_oligos=np.unique(bad_oligos)
    for oligo_name in bad_oligos:
        del oligo_array[oligo_name]
    print(str(len(bad_oligos)) + ' oligos deleted due to errant restriction sites.')
    
    #Remove any duplicate oligos
    new_dict = {}
    seen_values = set()
    counter=0
    for key, value in oligo_array.items():
        if value not in seen_values:
            new_dict[key] = value
            seen_values.add(value)
        else:
            counter += 1
    print(str(counter) + ' oligos removed due to duplication.')
    oligo_array = new_dict
    del new_dict
    
    #write oligo array to file
    with open(oligo_file, 'w') as f:
        for key in oligo_array.keys():
            f.write("%s,%s\n"%(key,oligo_array[key]))
    f.close()
            
    #write primers to file
    if find_pcr_primers:
        primer_order_sheet = []
        for key in amp_primers.keys():
            primer_order_sheet.append(key + '\t' + \
                     str(amp_primers[key]) + \
                     '\t' + '25nm' + '\t' + 'STD\n')
        print(*primer_order_sheet)
        with open(primer_file, 'w') as f:
            for line in primer_order_sheet:
                f.write(line)
        f.close()
    
    #write amplification primer key to file
    amp_primer_key = ['Gene' + '\t' + 'Block' + '\t' + \
                      'Forward Primer Well' + '\t' + 'Forward Primer' + '\t' + \
                      'Reverse Primer Well' + '\t' + 'Reverse Primer' + '\t' + 'Validated' + '\n']
    for key in amp_primer_dict.keys():
        genename, geneblock = key[0], str(key[1])
        name_primer_F, primer_F, name_primer_R, primer_R, validated = amp_primer_dict[key]
        amp_primer_key.append(genename + '\t' + geneblock + '\t' + \
                 name_primer_F + '\t' + primer_F + '\t' + \
                 name_primer_R + '\t' + primer_R + '\t' + str(validated) + '\n')
    print(*amp_primer_key)
    with open(amp_primer_key_file, 'w') as f:
        for line in amp_primer_key:
            f.write(line)
    f.close()
    
    #write breakpoint dict to file
    breakpoint_df.to_csv(breakpoint_file, sep='\t')
    
    #write gblocks to file
    gblock_order_sheet = []
    gblock_large_order_sheet = []
    for key in gblocks.keys():
        # pad gblock if it is not 300bp for Twist
        if len(gblocks[key]) < gblock_min_size:
            gblocks[key] = Seq(randomsequencepad[0:(gblock_min_size-len(gblocks[key]))]) + gblocks[key]
        if len(gblocks[key]) < gblock_large_threshold:
            gblock_order_sheet.append(key + '\t' + \
                     str(gblocks[key]) + '\n')
        else:
            gblock_large_order_sheet.append(key + '\t' + \
                     str(gblocks[key]) + '\n')
    print(*gblock_order_sheet)
    print(*gblock_large_order_sheet)
    with open(gbl_file, 'w') as f:
        for line in gblock_order_sheet:
            f.write(line)
    f.close()
    with open(gbl_large_file, 'w') as f:
        for line in gblock_large_order_sheet:
            f.write(line)
    f.close()
    
    return oligo_array,amp_primers,gblocks,amp_primer_dict,breakpoint_df
                

In [25]:
# Make the library for N term
oligo_array,amp_primers,gblocks,amp_primer_dict,breakpoint_df = write_oligo_library({
                                            'RHEB':RHEB,
                                            'RAP1a':RAP1A,
                                            'NRAS':NRAS,
                                            'CDC42':CDC42,
                                            'Rac1':Rac1,
                                            'RHOA':RHOA,
                                            'RAB1A':RAB1A,
                                            'RALA':RALA,
                                            #'RAN':RAN, #-this will be done on its own to avoid the mutagenic window problems
                                            'RIT':RIT1,
                                            'NHLRC1':NHLRC1,
                                            'SMS':SMS,
                                            'ETV6':ETV6,
                                            'RUNX1':RUNX1,
                                            'GATA2':GATA2,
                                            'RAB7A':RAB7A,
                                            'RAB11B':RAB11B,
                                            'RAB18':RAB18,
                                            #'RAB23':RAB23,
                                            'RAB27A':RAB27A,
                                            'FOXP3':FOXP3,
                                            #'BBS1':BBS1, #-this will be done on its own to avoid the mutagenic window problems
                                            'SMAD4':SMAD4},
                                          oligo_file='./L_Seq_Lib2/nterm/circRNA_oligos.csv',
                                          primer_file='./L_Seq_Lib2/nterm/circRNA_primers.tsv',
                                          gbl_file='./L_Seq_Lib2/nterm/circRNA_gblocks.tsv',
                                          gbl_large_file='./L_Seq_Lib2/nterm/circRNA_gblocks_large.tsv',
                                          amp_primer_key_file='./L_Seq_Lib2/nterm/circRNA_ampkey.tsv',
                                          breakpoint_file='./L_Seq_Lib2/nterm/circRNA_breakpoints.tsv',
                                          block_size_range=[165,177],
                                          primer_set=orthogonal_primers_touse,
                                          validated_primer_set=validated_primer_combos_filtered,
                                          aa_start={                    })

Processing gene 1: RHEB
Gene has no paqcI or BsaI sites! Performing GoldenGate optimization...
Some regions are medium fidelity.
{'Optimum Breakpoints': [[0, 151, 586],
                         [0, 135, 299, 586],
                         [0, 291, 456, 586],
                         [0, 436, 586]],
 'Optimum Overlaps': [['CGTC', 'AAGT', 'GCAT'],
                      ['CGTC', 'AGAA', 'AAGT', 'GCAT'],
                      ['CGTC', 'AAGT', 'GGAA', 'GCAT'],
                      ['CGTC', 'TCTT', 'GCAT']],
 'Optimum Scores': [0.9686111584258357,
                    0.9492248770241213,
                    0.9488582960103636,
                    0.9695086923749419]}




Processing gene 2: RAP1a
Gene has no paqcI or BsaI sites! Performing GoldenGate optimization...
Some regions are medium fidelity.
{'Optimum Breakpoints': [[0, 146, 586],
                         [0, 132, 299, 586],
                         [0, 290, 450, 586],
                         [0, 443, 586]],
 'Optimum Overlaps': [['CGTC', 'AAGT', 'GCAT'],
                      ['CGTC', 'TTCC', 'AGGA', 'GCAT'],
                      ['CGTC', 'ACGA', 'AGAA', 'GCAT'],
                      ['CGTC', 'CCTT', 'GCAT']],
 'Optimum Scores': [0.9686111584258357,
                    0.9503362656703902,
                    0.9498283531731762,
                    0.9682407337419413]}




Processing gene 3: NRAS
Gene has no paqcI or BsaI sites! Performing GoldenGate optimization...
Some regions are medium fidelity.
{'Optimum Breakpoints': [[0, 146, 601],
                         [0, 132, 302, 601],
                         [0, 290, 465, 601],
                         [0, 445, 601]],
 'Optimum Overlaps': [['CGTC', 'AAGT', 'GCAT'],
                      ['CGTC', 'TTCT', 'TCTA', 'GCAT'],
                      ['CGTC', 'CGGA', 'ACAG', 'GCAT'],
                      ['CGTC', 'GAAA', 'GCAT']],
 'Optimum Scores': [0.9686111584258357,
                    0.9482281973334376,
                    0.9458881584285265,
                    0.9690616370636065]}
Processing gene 4: CDC42
Gene has no paqcI or BsaI sites! Performing GoldenGate optimization...
Some regions are medium fidelity.
{'Optimum Breakpoints': [[0, 152, 607],
                         [0, 134, 314, 607],
                         [0, 299, 463, 607],
                         [0, 451, 607]],
 'Optimum Overlaps': [['CGTC'



Processing gene 5: Rac1
Gene has no paqcI or BsaI sites! Performing GoldenGate optimization...
Some regions are medium fidelity.
{'Optimum Breakpoints': [[0, 156, 610],
                         [0, 138, 314, 610],
                         [0, 303, 472, 610],
                         [0, 458, 610]],
 'Optimum Overlaps': [['CGTC', 'AGAT', 'GCAT'],
                      ['CGTC', 'TTCT', 'CTGA', 'GCAT'],
                      ['CGTC', 'AAAG', 'GTAA', 'GCAT'],
                      ['CGTC', 'AGGA', 'GCAT']],
 'Optimum Scores': [0.9683683622263004,
                    0.9498025225652811,
                    0.9459803582414805,
                    0.9696915390447625]}




Processing gene 6: RHOA
Gene has no paqcI or BsaI sites! Performing GoldenGate optimization...
Some regions are medium fidelity.
{'Optimum Breakpoints': [[0, 150, 613],
                         [0, 137, 310, 613],
                         [0, 294, 477, 613],
                         [0, 453, 613]],
 'Optimum Overlaps': [['CGTC', 'AGAT', 'GCAT'],
                      ['CGTC', 'AGAA', 'AAGT', 'GCAT'],
                      ['CGTC', 'AGAA', 'TTTT', 'GCAT'],
                      ['CGTC', 'AGAT', 'GCAT']],
 'Optimum Scores': [0.9683683622263004,
                    0.9492248770241213,
                    0.9486858609670515,
                    0.9683683622263004]}
Processing gene 7: RAB1A
Gene has no paqcI or BsaI sites! Performing GoldenGate optimization...




Some regions are medium fidelity.
{'Optimum Breakpoints': [[0, 168, 649],
                         [0, 146, 327, 649],
                         [0, 319, 501, 649],
                         [0, 484, 649]],
 'Optimum Overlaps': [['CGTC', 'AAGA', 'GCAT'],
                      ['CGTC', 'CAAT', 'ACAG', 'GCAT'],
                      ['CGTC', 'AATG', 'AGAA', 'GCAT'],
                      ['CGTC', 'AAGA', 'GCAT']],
 'Optimum Scores': [0.9695086923749419,
                    0.9441783879763741,
                    0.9485209081858764,
                    0.9695086923749419]}
Processing gene 8: RALA
Gene has no paqcI or BsaI sites! Performing GoldenGate optimization...
Some regions are medium fidelity.
{'Optimum Breakpoints': [[0, 125, 652],
                         [0, 111, 265, 652],
                         [0, 254, 403, 652],
                         [0, 382, 537, 652],
                         [0, 519, 652]],
 'Optimum Overlaps': [['CGTC', 'ACGA', 'GCAT'],
                      ['CGTC', '



Processing gene 9: RIT
Gene has no paqcI or BsaI sites! Performing GoldenGate optimization...
Some regions are medium fidelity.
{'Optimum Breakpoints': [[0, 143, 691],
                         [0, 119, 281, 691],
                         [0, 263, 424, 691],
                         [0, 408, 568, 691],
                         [0, 549, 691]],
 'Optimum Overlaps': [['CGTC', 'TCAG', 'GCAT'],
                      ['CGTC', 'AGAG', 'AGTA', 'GCAT'],
                      ['CGTC', 'TTAC', 'TCAG', 'GCAT'],
                      ['CGTC', 'TCTT', 'AGGA', 'GCAT'],
                      ['CGTC', 'CCTT', 'GCAT']],
 'Optimum Scores': [0.9690333618956276,
                    0.9455261077896948,
                    0.9477673860911271,
                    0.9509100474180537,
                    0.9682407337419413]}
Processing gene 10: NHLRC1
Gene has no paqcI or BsaI sites! Performing GoldenGate optimization...
Some regions are medium fidelity.
{'Optimum Breakpoints': [[0, 151, 1219],
                 



Processing gene 11: SMS
Gene has no paqcI or BsaI sites! Performing GoldenGate optimization...
Some regions are medium fidelity.
{'Optimum Breakpoints': [[0, 140, 1132],
                         [0, 130, 283, 1132],
                         [0, 267, 431, 1132],
                         [0, 416, 575, 1132],
                         [0, 559, 719, 1132],
                         [0, 706, 857, 1132],
                         [0, 845, 1004, 1132],
                         [0, 990, 1132]],
 'Optimum Overlaps': [['CGTC', 'CCTG', 'GCAT'],
                      ['CGTC', 'TCGG', 'ATTT', 'GCAT'],
                      ['CGTC', 'AGAA', 'AAGT', 'GCAT'],
                      ['CGTC', 'AATA', 'AAGA', 'GCAT'],
                      ['CGTC', 'ATCA', 'GAAA', 'GCAT'],
                      ['CGTC', 'AAGA', 'TTCC', 'GCAT'],
                      ['CGTC', 'ATTT', 'AAGA', 'GCAT'],
                      ['CGTC', 'ACTG', 'GCAT']],
 'Optimum Scores': [0.9670848413591582,
                    0.9472074554361593



Some regions are medium fidelity.
{'Optimum Breakpoints': [[0, 149, 1390],
                         [0, 142, 307, 1390],
                         [0, 299, 465, 1390],
                         [0, 454, 625, 1390],
                         [0, 607, 776, 1390],
                         [0, 763, 935, 1390],
                         [0, 920, 1091, 1390],
                         [0, 1081, 1251, 1390],
                         [0, 1240, 1390]],
 'Optimum Overlaps': [['CGTC', 'AGGA', 'GCAT'],
                      ['CGTC', 'AGGA', 'CTGA', 'GCAT'],
                      ['CGTC', 'TCCT', 'TCAG', 'GCAT'],
                      ['CGTC', 'GTCA', 'TCCC', 'GCAT'],
                      ['CGTC', 'GAGC', 'CGAA', 'GCAT'],
                      ['CGTC', 'TCCG', 'TCTC', 'GCAT'],
                      ['CGTC', 'GGAA', 'TCCG', 'GCAT'],
                      ['CGTC', 'GAAA', 'AGGA', 'GCAT'],
                      ['CGTC', 'AGGA', 'GCAT']],
 'Optimum Scores': [0.9696915390447625,
                    0.950259



Some regions are medium fidelity.
{'Optimum Breakpoints': [[0, 157, 1474],
                         [0, 147, 332, 1474],
                         [0, 311, 489, 1474],
                         [0, 479, 655, 1474],
                         [0, 641, 832, 1474],
                         [0, 815, 997, 1474],
                         [0, 980, 1158, 1474],
                         [0, 1147, 1324, 1474],
                         [0, 1310, 1474]],
 'Optimum Overlaps': [['CGTC', 'CTGA', 'GCAT'],
                      ['CGTC', 'TTCC', 'ACTG', 'GCAT'],
                      ['CGTC', 'TCTG', 'ATTT', 'GCAT'],
                      ['CGTC', 'AGGT', 'CAGA', 'GCAT'],
                      ['CGTC', 'AGAA', 'CAGA', 'GCAT'],
                      ['CGTC', 'AGAT', 'CTGA', 'GCAT'],
                      ['CGTC', 'TCTC', 'CCTG', 'GCAT'],
                      ['CGTC', 'TACC', 'TCCA', 'GCAT'],
                      ['CGTC', 'CCTG', 'GCAT']],
 'Optimum Scores': [0.9690333618956276,
                    0.947627



Some regions are medium fidelity.
{'Optimum Breakpoints': [[0, 161, 1474],
                         [0, 146, 332, 1474],
                         [0, 312, 490, 1474],
                         [0, 481, 663, 1474],
                         [0, 641, 826, 1474],
                         [0, 811, 988, 1474],
                         [0, 978, 1159, 1474],
                         [0, 1145, 1320, 1474],
                         [0, 1314, 1474]],
 'Optimum Overlaps': [['CGTC', 'TCTT', 'GCAT'],
                      ['CGTC', 'ACGA', 'TCTC', 'GCAT'],
                      ['CGTC', 'AGAT', 'TCCC', 'GCAT'],
                      ['CGTC', 'TCAG', 'GTCA', 'GCAT'],
                      ['CGTC', 'AGGA', 'TTCC', 'GCAT'],
                      ['CGTC', 'TTCC', 'AAGA', 'GCAT'],
                      ['CGTC', 'TCTG', 'AATG', 'GCAT'],
                      ['CGTC', 'ACTA', 'TGGA', 'GCAT'],
                      ['CGTC', 'CCTG', 'GCAT']],
 'Optimum Scores': [0.9695086923749419,
                    0.948521



Processing gene 16: RAB11B
Gene has no paqcI or BsaI sites! Performing GoldenGate optimization...
Some regions are medium fidelity.
{'Optimum Breakpoints': [[0, 139, 688],
                         [0, 119, 279, 688],
                         [0, 269, 425, 688],
                         [0, 407, 563, 688],
                         [0, 551, 688]],
 'Optimum Overlaps': [['CGTC', 'AAGA', 'GCAT'],
                      ['CGTC', 'ACGA', 'CCTG', 'GCAT'],
                      ['CGTC', 'CAGT', 'CTGA', 'GCAT'],
                      ['CGTC', 'ATCT', 'CAGA', 'GCAT'],
                      ['CGTC', 'AGAA', 'GCAT']],
 'Optimum Scores': [0.9695086923749419,
                    0.9467214379992452,
                    0.9475502359306585,
                    0.9475096741264747,
                    0.9693578138231524]}
Processing gene 17: RAB18
Gene has no paqcI or BsaI sites! Performing GoldenGate optimization...
Some regions are medium fidelity.
{'Optimum Breakpoints': [[0, 126, 652],
               



Processing gene 18: RAB27A
Gene has no paqcI or BsaI sites! Performing GoldenGate optimization...
Some regions are medium fidelity.
{'Optimum Breakpoints': [[0, 140, 697],
                         [0, 124, 281, 697],
                         [0, 271, 425, 697],
                         [0, 418, 573, 697],
                         [0, 556, 697]],
 'Optimum Overlaps': [['CGTC', 'CAGT', 'GCAT'],
                      ['CGTC', 'TCCA', 'TCTT', 'GCAT'],
                      ['CGTC', 'ACGA', 'ATCT', 'GCAT'],
                      ['CGTC', 'AAGA', 'GGAA', 'GCAT'],
                      ['CGTC', 'CTGA', 'GCAT']],
 'Optimum Scores': [0.967710450999788,
                    0.9490519875816658,
                    0.9484751671218126,
                    0.9500860381186731,
                    0.9690333618956276]}
Processing gene 19: FOXP3
Gene has no paqcI or BsaI sites! Performing GoldenGate optimization...




Some regions are medium fidelity.
{'Optimum Breakpoints': [[0, 145, 1327],
                         [0, 128, 290, 1327],
                         [0, 281, 440, 1327],
                         [0, 430, 598, 1327],
                         [0, 586, 741, 1327],
                         [0, 727, 890, 1327],
                         [0, 881, 1050, 1327],
                         [0, 1035, 1195, 1327],
                         [0, 1183, 1327]],
 'Optimum Overlaps': [['CGTC', 'GGAA', 'GCAT'],
                      ['CGTC', 'GGGC', 'ACTT', 'GCAT'],
                      ['CGTC', 'CCTT', 'TCTC', 'GCAT'],
                      ['CGTC', 'ACTG', 'CTGG', 'GCAT'],
                      ['CGTC', 'TCCT', 'TCTG', 'GCAT'],
                      ['CGTC', 'GAGA', 'TCGT', 'GCAT'],
                      ['CGTC', 'AAGG', 'GCTC', 'GCAT'],
                      ['CGTC', 'TTTC', 'AAGT', 'GCAT'],
                      ['CGTC', 'CTGA', 'GCAT']],
 'Optimum Scores': [0.969089677906989,
                    0.9425354



All oligos are below the maximum 250bp!
Running QC for primer specificity on WT oligos


  return THERMO_ANALYSIS.calcHeterodimer(


No non-specific primers detected
All mutagenic windows overlap!
859 oligos deleted due to errant restriction sites.
5144 oligos removed due to duplication.
RHEB_gene_ampF	GGCTACGGTCTCTCGTCCACCTGCCTAGATGCCGCAGTCCAAGTCCC	25nm	STD
 RHEB_gene_ampR	GGCTACGGTCTCTATGCCACCTGCCTAGCATCACCGAGCATGAGGACTTG	25nm	STD
 RHEB_block1_s1_ampF	GGCTACGGTCTCTAAGTTGATCACAGTAAATGGACAAGAATAT	25nm	STD
 RHEB_block2_s1_ampR	GGCTACGGTCTCTTTCTATGGTTGGATCGTAGGAGTCC	25nm	STD
 RHEB_block2_s2_ampF	GGCTACGGTCTCTAAGTGATTAAAGTTATCCATGGCAAATTGT	25nm	STD
 RHEB_block3_s1_ampR	GGCTACGGTCTCTACTTTTGATTGATGTAACAGAATACACAAG	25nm	STD
 RHEB_block3_s2_ampF	GGCTACGGTCTCTGGAATCTTCTGCTAAAGAAAATCAGACTGC	25nm	STD
 RHEB_block4_s1_ampR	GGCTACGGTCTCTAAGATTCTGCCAAAGCTTTCCCTTC	25nm	STD
 RAP1a_gene_ampF	GGCTACGGTCTCTCGTCCACCTGCCTAGATGCGTGAGTACAAGCTAGTGGTC	25nm	STD
 RAP1a_gene_ampR	GGCTACGGTCTCTATGCCACCTGCCTAGGAGCAGCAGACATGATTTCTTTTTAGG	25nm	STD
 RAP1a_block1_s1_ampF	GGCTACGGTCTCTAAGTTGAAGTCGATTGCCAACAGTG	25nm	STD
 RAP1a_block2_s1_ampR	GGCTACGGT

In [26]:
# Subtract the primer pairs used in making the nterm tags from the remaining pairs to make the list for the next set

#1. Read the two primer tables
df_used = pd.read_csv(
    "L_Seq_Lib2/nterm/circRNA_ampkey.tsv",
    sep="\t",
    usecols=["Forward Primer", "Reverse Primer"]
)

df_all = pd.read_csv("orthogonal_primer_random_combos_used_DH_20250613.csv")   # keep every column

#2. Build helper columns to subracting 
df_used_norm = df_used.assign(
    FWD=df_used["Forward Primer"].str.strip().str.upper(),
    REV=df_used["Reverse Primer"].str.strip().str.upper()
)

df_all_norm = df_all.assign(
    FWD=df_all["Forward Primer"].str.strip().str.upper(),
    REV=df_all["Reverse Primer"].str.strip().str.upper()
)

#3. Identify pairs that are already used
used_index = df_used_norm.set_index(["FWD", "REV"]).index
mask       = ~df_all_norm.set_index(["FWD", "REV"]).index.isin(used_index)

#4. Keep the rows (and *all* original columns) that remain
orthogonal_primers_remaining_iter1 = df_all.loc[mask].reset_index(drop=True)

print(f"{len(orthogonal_primers_remaining_iter1)} primer pairs remain.")
orthogonal_primers_remaining_iter1.head()
orthogonal_primers_remaining_iter1.to_csv("orthogonal_primers_remaining_iter1.csv", index=False)


101 primer pairs remain.


In [27]:
# Just RAN
oligo_array,amp_primers,gblocks,amp_primer_dict,breakpoint_df = write_oligo_library({
                                            'RAN':RAN},
                                          oligo_file='./L_Seq_Lib2/nterm/RAN_circRNA_oligos.csv',
                                          primer_file='./L_Seq_Lib2/nterm/RAN_circRNA_primers.tsv',
                                          gbl_file='./L_Seq_Lib2/nterm/RAN_circRNA_gblocks.tsv',
                                          gbl_large_file='./L_Seq_Lib2/nterm/RAN_circRNA_gblocks_large.tsv',
                                          amp_primer_key_file='./L_Seq_Lib2/nterm/RAN_circRNA_ampkey.tsv',
                                          breakpoint_file='./L_Seq_Lib2/nterm/RAN_circRNA_breakpoints.tsv',
                                          block_size_range=[120,140],
                                          primer_set=orthogonal_primers_remaining_iter1,
                                          validated_primer_set=validated_primer_combos_filtered,
                                          aa_start={                    })

Processing gene 1: RAN
Gene has no paqcI or BsaI sites! Performing GoldenGate optimization...
Some regions are medium fidelity.
{'Optimum Breakpoints': [[0, 112, 682],
                         [0, 99, 227, 682],
                         [0, 214, 345, 682],
                         [0, 335, 465, 682],
                         [0, 447, 584, 682],
                         [0, 573, 682]],
 'Optimum Overlaps': [['CGTC', 'ACTG', 'GCAT'],
                      ['CGTC', 'GAAA', 'AGAA', 'GCAT'],
                      ['CGTC', 'ACAG', 'ACGA', 'GCAT'],
                      ['CGTC', 'GAGA', 'TTCT', 'GCAT'],
                      ['CGTC', 'TCTT', 'TGGA', 'GCAT'],
                      ['CGTC', 'AGAA', 'GCAT']],
 'Optimum Scores': [0.967710450999788,
                    0.949841214886651,
                    0.947538969517558,
                    0.9489393726161777,
                    0.9490519875816659,
                    0.9693578138231524]}




All oligos are below the maximum 250bp!
Running QC for primer specificity on WT oligos


  return THERMO_ANALYSIS.calcHeterodimer(


No non-specific primers detected
All mutagenic windows overlap!
22 oligos deleted due to errant restriction sites.
188 oligos removed due to duplication.
RAN_gene_ampF	GGCTACGGTCTCTCGTCCACCTGCCTAGATGGCTGCGCAGGGAGAG	25nm	STD
 RAN_gene_ampR	GGCTACGGTCTCTATGCCACCTGCCTAGCAGGTCATCATCCTCATCCGGG	25nm	STD
 RAN_block1_s1_ampF	GGCTACGGTCTCTACTGGTGAATTTGAGAAGAAGTATGTAGC	25nm	STD
 RAN_block2_s1_ampR	GGCTACGGTCTCTTTTCACGAAGGTCGTTTTTCCAGTAC	25nm	STD
 RAN_block2_s2_ampF	GGCTACGGTCTCTAGAAATTCGGTGGACTGAGAGATGG	25nm	STD
 RAN_block3_s1_ampR	GGCTACGGTCTCTCTGTGTCCCATACATTGAACTTAATAGGTC	25nm	STD
 RAN_block3_s2_ampF	GGCTACGGTCTCTACGAGTGTGTGAAAACATCCCC	25nm	STD
 RAN_block4_s1_ampR	GGCTACGGTCTCTTCTCTATGCCAGTTAGGCACATTCTTG	25nm	STD
 RAN_block4_s2_ampF	GGCTACGGTCTCTTTCTGCCAAAAGTAACTACAACTTTGAAAA	25nm	STD
 RAN_block5_s1_ampR	GGCTACGGTCTCTAAGATTCTTCTTTCGGTGGAACACAATG	25nm	STD
 RAN_block5_s2_ampF	GGCTACGGTCTCTTGGACCCAGCTTTGGCAGC	25nm	STD
 RAN_block6_s1_ampR	GGCTACGGTCTCTTTCTGGTGGGGCGAGAGC	25nm	STD

Gene	Block	Forwa

In [28]:
# Subtract the primer pairs used in making the RAN library from the remaining pairs to make the list for the next set

#1. Read the two primer tables
df_used = pd.read_csv(
    "L_Seq_Lib2/nterm/RAN_circRNA_ampkey.tsv",
    sep="\t",
    usecols=["Forward Primer", "Reverse Primer"]
)

df_all = orthogonal_primers_remaining_iter1  # keep every column

#2. Build helper columns to subracting 
df_used_norm = df_used.assign(
    FWD=df_used["Forward Primer"].str.strip().str.upper(),
    REV=df_used["Reverse Primer"].str.strip().str.upper()
)

df_all_norm = df_all.assign(
    FWD=df_all["Forward Primer"].str.strip().str.upper(),
    REV=df_all["Reverse Primer"].str.strip().str.upper()
)

#3. Identify pairs that are already used
used_index = df_used_norm.set_index(["FWD", "REV"]).index
mask       = ~df_all_norm.set_index(["FWD", "REV"]).index.isin(used_index)

#4. Keep the rows (and *all* original columns) that remain
orthogonal_primers_remaining_iter2 = df_all.loc[mask].reset_index(drop=True)

print(f"{len(orthogonal_primers_remaining_iter2)} primer pairs remain.")
orthogonal_primers_remaining_iter2.head()
orthogonal_primers_remaining_iter2.to_csv("orthogonal_primers_remaining_iter2.csv", index=False)


95 primer pairs remain.


In [29]:
# Just BBS1
oligo_array,amp_primers,gblocks,amp_primer_dict,breakpoint_df = write_oligo_library({
                                            'BBS1':BBS1},
                                          oligo_file='./L_Seq_Lib2/nterm/BBS1_circRNA_oligos.csv',
                                          primer_file='./L_Seq_Lib2/nterm/BBS1_circRNA_primers.tsv',
                                          gbl_file='./L_Seq_Lib2/nterm/BBS1_circRNA_gblocks.tsv',
                                          gbl_large_file='./L_Seq_Lib2/nterm/BBS1_circRNA_gblocks_large.tsv',
                                          amp_primer_key_file='./L_Seq_Lib2/nterm/BBS1_circRNA_ampkey.tsv',
                                          breakpoint_file='./L_Seq_Lib2/nterm/BBS1_circRNA_breakpoints.tsv',
                                          block_size_range=[185,198],
                                          primer_set=orthogonal_primers_remaining_iter2,
                                          validated_primer_set=validated_primer_combos_filtered,
                                          aa_start={                    })

Processing gene 1: BBS1
Gene has no paqcI or BsaI sites! Performing GoldenGate optimization...




Some regions are medium fidelity.
{'Optimum Breakpoints': [[0, 161, 1813],
                         [0, 150, 333, 1813],
                         [0, 316, 498, 1813],
                         [0, 479, 667, 1813],
                         [0, 653, 826, 1813],
                         [0, 815, 992, 1813],
                         [0, 982, 1160, 1813],
                         [0, 1144, 1326, 1813],
                         [0, 1317, 1494, 1813],
                         [0, 1487, 1663, 1813],
                         [0, 1652, 1813]],
 'Optimum Overlaps': [['CGTC', 'GGGA', 'GCAT'],
                      ['CGTC', 'AGAT', 'ACTT', 'GCAT'],
                      ['CGTC', 'CGGA', 'GGAA', 'GCAT'],
                      ['CGTC', 'AGAT', 'TCTT', 'GCAT'],
                      ['CGTC', 'ACGA', 'GGAA', 'GCAT'],
                      ['CGTC', 'CCTG', 'AGAT', 'GCAT'],
                      ['CGTC', 'TGGA', 'ACGG', 'GCAT'],
                      ['CGTC', 'CTTT', 'TCAG', 'GCAT'],
                     

  return THERMO_ANALYSIS.calcHeterodimer(


No non-specific primers detected
All mutagenic windows overlap!
121 oligos deleted due to errant restriction sites.
513 oligos removed due to duplication.
BBS1_gene_ampF	GGCTACGGTCTCTCGTCCACCTGCCTAGATGGCCGCTGCGTCCTC	25nm	STD
 BBS1_gene_ampR	GGCTACGGTCTCTATGCCACCTGCCTAGGGCGGCCGCCAGCCCC	25nm	STD
 BBS1_block1_s1_ampF	GGCTACGGTCTCTGGGATGGGGAATACAAGCTGGTG	25nm	STD
 BBS1_block2_s1_ampR	GGCTACGGTCTCTATCTGCCAGCGCTAGGCAG	25nm	STD
 BBS1_block2_s2_ampF	GGCTACGGTCTCTACTTGCTTCAGGCCCTTGTGTC	25nm	STD
 BBS1_block3_s1_ampR	GGCTACGGTCTCTTCCGGGGCTCATGTTGCTC	25nm	STD
 BBS1_block3_s2_ampF	GGCTACGGTCTCTGGAAACGGCAGAGGAGCCTTTG	25nm	STD
 BBS1_block4_s1_ampR	GGCTACGGTCTCTATCTCCTTCAGGGTTAAGGGGTC	25nm	STD
 BBS1_block4_s2_ampF	GGCTACGGTCTCTTCTTGCCTGGTGCTGGGC	25nm	STD
 BBS1_block5_s1_ampR	GGCTACGGTCTCTTCGTCAGCCAGGTTCTTCTTCAAG	25nm	STD
 BBS1_block5_s2_ampF	GGCTACGGTCTCTGGAAACATCTATATTCTGAGAAGAGACTCC	25nm	STD
 BBS1_block6_s1_ampR	GGCTACGGTCTCTCAGGCCGCGGCAAGCC	25nm	STD
 BBS1_block6_s2_ampF	GGCTACGGTCTCTAGATGCCCGCAGCCA

In [30]:
# Subtract the primer pairs used in making the BBS1 library from the remaining pairs to make the list for the next set

#1. Read the two primer tables
df_used = pd.read_csv(
    "L_Seq_Lib2/nterm/BBS1_circRNA_ampkey.tsv",
    sep="\t",
    usecols=["Forward Primer", "Reverse Primer"]
)

df_all = orthogonal_primers_remaining_iter2  # keep every column

#2. Build helper columns to subracting 
df_used_norm = df_used.assign(
    FWD=df_used["Forward Primer"].str.strip().str.upper(),
    REV=df_used["Reverse Primer"].str.strip().str.upper()
)

df_all_norm = df_all.assign(
    FWD=df_all["Forward Primer"].str.strip().str.upper(),
    REV=df_all["Reverse Primer"].str.strip().str.upper()
)

#3. Identify pairs that are already used
used_index = df_used_norm.set_index(["FWD", "REV"]).index
mask       = ~df_all_norm.set_index(["FWD", "REV"]).index.isin(used_index)

#4. Keep the rows (and *all* original columns) that remain
orthogonal_primers_remaining_iter3 = df_all.loc[mask].reset_index(drop=True)

print(f"{len(orthogonal_primers_remaining_iter3)} primer pairs remain.")
orthogonal_primers_remaining_iter3.head()
orthogonal_primers_remaining_iter3.to_csv("orthogonal_primers_remaining_iter3.csv", index=False)


84 primer pairs remain.


In [43]:
##This is the shit version that shifts the frame of the tag 
# #redefine functions for cterm tagging
# def post_qc(amp_primer_set, wt_oligos, primer_set, melt_temp_threshold = 35, check_all_primers=True):
#     print("Running QC for primer specificity on WT oligos")
#     f_primer_map = {}
#     r_primer_map = {}
#     # invert the primer to subpool map
#     for k, v in amp_primer_set.items():
#         f_primer_map[v[1]] = f_primer_map.get(v[1], []) + [k]
#         r_primer_map[v[3]] = r_primer_map.get(v[3], []) + [k]
    
#     # initialize list of nonspecific problems
#     nonspecific = {}
    
#     # add unused primers if check_all_primers
#     if check_all_primers:
#         all_f_primers = np.unique(primer_set['Forward Primer'])
#         all_r_primers = np.unique(primer_set['Reverse Primer'])
#         for f_primer in all_f_primers:
#             if f_primer not in f_primer_map.keys():
#                 f_primer_map[f_primer] = []
#         for r_primer in all_r_primers:
#             if r_primer not in r_primer_map.keys():
#                 r_primer_map[r_primer] = []
        
#     for f_primer, subpools_used in f_primer_map.items():
#     # iterate over every barcode primer pair and match to each oligo to check for nonspecific amplification
#         anneal_locs = []
#         for subpoolcheck, fragmentcheck in wt_oligos.items():  # iterate over every WT oligo
#             if (subpoolcheck not in subpools_used):  # ignore designed annealing (same name)
#                 if check_nonspecific(f_primer, fragmentcheck, Tm_rem = melt_temp_threshold, verbose=False) > 0: #use high Tm_rem
#                     anneal_locs.append(subpoolcheck)
#         if anneal_locs:
#             nonspecific.update({f_primer:[a[0] + '_block' + str(a[1]+1) for a in anneal_locs]})
#     for r_primer, subpools_used in r_primer_map.items():
#     # iterate over every barcode primer pair and match to each oligo to check for nonspecific amplification
#         anneal_locs = []
#         for subpoolcheck, fragmentcheck in wt_oligos.items():  # iterate over every WT oligo
#             if (subpoolcheck not in subpools_used):  # ignore designed annealing (same name)
#                 if check_nonspecific(r_primer, fragmentcheck, Tm_rem = melt_temp_threshold, verbose=False) > 0: #use high Tm_rem
#                     anneal_locs.append(subpoolcheck)
#         if anneal_locs:
#             nonspecific.update({r_primer:[a[0] + '_block' + str(a[1]+1) for a in anneal_locs]})
#     if nonspecific:
#         print("Nonspecific Primers: (Manually removing primer sequence recommended)")
#         print(nonspecific)
#     else:
#         print("No non-specific primers detected")
        
#     return nonspecific

# def build_kmers(sequence, 
#                 ksize):
#     kmers = []
#     n_kmers = len(sequence) - ksize + 1

#     for i in range(n_kmers):
#         kmer = sequence[i:i + ksize]
#         kmers.append(kmer)

#     return kmers

# def compute_overlaps(breakpoints, 
#                      inclusion_array, 
#                      gene):
    
#     overlaps = [[gene[val:val+4].reverse_complement(), gene[val:val+4]] for val in breakpoints]
#     counter = 0
#     for val in inclusion_array:
#         if val == -1:
#             (overlaps[counter][1],overlaps[counter+1][0]) = (overlaps[counter+1][0],overlaps[counter][1])
#             counter += 1
#         elif val == 0:
#             overlaps[counter][1] = overlaps[counter+1][1]
#             del overlaps[counter+1]
        
#     return overlaps

# def score_breakpoints(gene, 
#                       breakpoint_pair, 
#                       empirical, 
#                       overhang_blacklist=overhang_blacklist):
    
#     #subset empirical matrix by the set of all overlaps
#     all_overlaps = []
#     for breakpoint in breakpoint_pair:
#         all_overlaps.append(gene[breakpoint:(breakpoint+4)])
#         all_overlaps.append(gene[breakpoint:(breakpoint+4)].reverse_complement())
#     all_overlaps = [str(o) for o in all_overlaps]
#     if (len(np.unique(all_overlaps)) == len(all_overlaps)) & (len(set(all_overlaps).intersection(set(overhang_blacklist))) == 0):
#         empirical_subset = empirical.loc[all_overlaps,all_overlaps]

#         #compute fidelity score
#         empirical_subset = empirical_subset/empirical_subset.sum(axis=1)
#         fidelity_score = 1
#         for breakpoint in breakpoint_pair:
#             fidelity_score = fidelity_score * \
#                 empirical_subset.loc[str(gene[breakpoint:(breakpoint+4)]),
#                                      str(gene[breakpoint:(breakpoint+4)].reverse_complement())]
    
#     else:
#         fidelity_score = 0
    
#     return fidelity_score
    
# def optimize_breakpoints(gene, 
#                          breakpoint_pair, 
#                          indices_to_shift, 
#                          indices_of_array,
#                          slack, 
#                          empirical=bsaI_empirical, 
#                          overhang_blacklist=overhang_blacklist):
    
#     #compute all enrichments
#     shifts = list(range(-slack,slack+1))
#     if (len(indices_to_shift) > 2) | (len(indices_to_shift) < 1):
#         print('Error -- too many or too few breakpoints!')
#         optimum_breakpoint = breakpoint_pair
#         optimum_score = 0
#     elif (len(indices_to_shift) == 1): #external pair
#         scores = [0]*len(shifts)
#         for i,shift in enumerate(shifts):
#             scores[i] = score_breakpoints(gene, breakpoint_pair[0:indices_to_shift[0]] + [breakpoint_pair[indices_to_shift[0]]+shift] + breakpoint_pair[(indices_to_shift[0]+1):], 
#                                           empirical=bsaI_empirical, overhang_blacklist=overhang_blacklist)
        
#         optimum_shift = np.argmax(scores)
#         optimum_breakpoint = breakpoint_pair[0:indices_to_shift[0]] + [breakpoint_pair[indices_to_shift[0]]+shifts[optimum_shift]] + breakpoint_pair[(indices_to_shift[0]+1):]
#         optimum_score = scores[optimum_shift]
#         optimum_length = optimum_breakpoint[indices_of_array[1]] - optimum_breakpoint[indices_of_array[0]]
            
            
#     else: #internal pair
#         indices_to_shift = sorted(indices_to_shift)
#         scores = np.zeros((len(shifts),len(shifts)))
#         for i,shift1 in enumerate(shifts):
#             for j,shift2 in enumerate(shifts):
#                 scores[i,j] = score_breakpoints(gene, breakpoint_pair[0:indices_to_shift[0]] + [breakpoint_pair[indices_to_shift[0]]+shift1] + \
#                                                             breakpoint_pair[(indices_to_shift[0]+1):indices_to_shift[1]] + \
#                                                             [breakpoint_pair[indices_to_shift[1]]+shift2] + breakpoint_pair[(indices_to_shift[1]+1):], 
#                                               empirical=bsaI_empirical, overhang_blacklist=overhang_blacklist)
                
#         optimum_shift = np.unravel_index(np.argmax(scores,axis=None), scores.shape)
#         optimum_breakpoint = breakpoint_pair[0:indices_to_shift[0]] + [breakpoint_pair[indices_to_shift[0]]+shifts[optimum_shift[0]]] + \
#                                             breakpoint_pair[(indices_to_shift[0]+1):indices_to_shift[1]] + \
#                                             [breakpoint_pair[indices_to_shift[1]]+shifts[optimum_shift[1]]] + breakpoint_pair[(indices_to_shift[1]+1):]
#         optimum_score = scores[optimum_shift]
#         optimum_length = optimum_breakpoint[indices_of_array[1]] - optimum_breakpoint[indices_of_array[0]] + 4
    
#     return optimum_breakpoint, optimum_score, optimum_length

# def optimize_gene(gene, 
#                   max_tile_size,
#                   first_last_block_reduction,
#                   block_size_range=block_size_range, 
#                   slack=slack, 
#                   empirical=bsaI_empirical, 
#                   overhang_blacklist=overhang_blacklist): 
    
#     #setup initial inputs to optimization
#     gene_size = len(gene)
#     protein_size = len(gene.translate())
        
#     #exclude gene if it is too big
#     if protein_size > 1000:
#         print('Protein size too big!')
        
#     #divide genes between 500 and 1000aa into two blocks
#     elif protein_size > 620:
#         print('Protein size too big! Will add two superblock (620aa+ proteins) soon.')
        
#     else:
#         #gene is one superblock 
#         #print('Protein is one superblock.')
#         block_size = block_size_range[0] + np.argmin(
#             [abs((gene_size+2*first_last_block_reduction)/(i+block_size_range[0]) - \
#                      round((gene_size+2*first_last_block_reduction)/(i+block_size_range[0]))) \
#                  for i in range(0, block_size_range[1]-block_size_range[0])])
        
        
#         # now, set initial breakpoints
#         fragment_number = int((gene_size+2*first_last_block_reduction)/block_size)
        
#         # if any of the tiles are too big?
#         tile_lengths = [1000]
#         while max(tile_lengths) > (max_tile_size-2*slack):
#             fragment_number = fragment_number + 1
#             first_breakpoint = 0
#             last_breakpoint = gene_size-4
#             step = (last_breakpoint - first_breakpoint + 2*first_last_block_reduction)/fragment_number
#             evenly_spaced_floats = [first_breakpoint] + [step * i - first_last_block_reduction for i in range(1,fragment_number)] + [last_breakpoint]        
#             initial_breakpoints = [[first_breakpoint,int(evenly_spaced_floats[1])+slack+2,last_breakpoint]] + \
#                                     [[0,int(evenly_spaced_floats[i])-2-slack,int(evenly_spaced_floats[i+1])+slack+2,last_breakpoint] for i in range(1,len(evenly_spaced_floats)-2)] + \
#                                     [[0,int(evenly_spaced_floats[-2])-2-slack,last_breakpoint]]
#             tile_lengths = [int(evenly_spaced_floats[1])+slack+2-first_breakpoint+first_last_block_reduction+4] + \
#                                     [int(evenly_spaced_floats[i+1])-int(evenly_spaced_floats[i])+2*(slack+2)+4 for i in range(1,len(evenly_spaced_floats)-2)] + \
#                                     [last_breakpoint+4-int(evenly_spaced_floats[-2])+slack+2+first_last_block_reduction]
            

#         #optimize each breakpoint
#         optimum_breakpoints = []
#         optimum_scores = []
#         optimum_lengths = []
#         oligo_array_indices = []
#         for k,breakpoint in enumerate(initial_breakpoints):
#             if len(breakpoint) == 3:
#                 indices_of_array = [0, 1] if k==0 else [1, 2]
#                 optimum_breakpoint, optimum_score, optimum_length = optimize_breakpoints(gene, breakpoint, [1], indices_of_array,
#                                                             slack, empirical=bsaI_empirical, overhang_blacklist=overhang_blacklist)
#                 optimum_breakpoints.append(optimum_breakpoint)
#                 optimum_scores.append(optimum_score)
#                 optimum_lengths.append(optimum_length)
#                 oligo_array_indices.append(indices_of_array)
#             else:
#                 indices_of_array = [1, 2]
#                 optimum_breakpoint, optimum_score, optimum_length = optimize_breakpoints(gene, breakpoint, [1, 2], indices_of_array,
#                                                             slack, empirical=bsaI_empirical, overhang_blacklist=overhang_blacklist)
#                 optimum_breakpoints.append(optimum_breakpoint)
#                 optimum_scores.append(optimum_score)
#                 optimum_lengths.append(optimum_length)
#                 oligo_array_indices.append(indices_of_array)
    
#     optimum_overlaps = [[str(gene[t:(t+4)]) for t in s] for s in optimum_breakpoints]
#     if all([s >= 0.95 for s in optimum_scores]):
#         print('All regions are high fidelity!')
#     elif all([s >= 0.9 for s in optimum_scores]):
#         print('Some regions are medium fidelity.')
#     else:
#         print('Some regions are low fidelity. Look closer')
        
#     return optimum_breakpoints, optimum_overlaps, optimum_scores, optimum_lengths, oligo_array_indices

# def generate_primer(DNA_seq,
#                      Fwd=True,
#                      extendtoCG=False,
#                      smallest_primer_size=16,
#                      largest_primer_size=30,
#                      Tm=55):
    
#     #Setup melting temperature arrays
#     melt_temp_array = np.zeros(largest_primer_size-smallest_primer_size+1)
    
#     if Fwd:
#         DNA_seq_touse = DNA_seq
#     else:
#         DNA_seq_touse = DNA_seq.reverse_complement()
            
#     #Make melting temperature arrays
#     primer_length = 0
#     for i in range(smallest_primer_size,largest_primer_size+1):
#         melt_temp_array[i-smallest_primer_size] = mt.Tm_NN(DNA_seq_touse[0:i])
        
#         #Pick F primer when Tm is first >F_Tm
#         if (melt_temp_array[i-smallest_primer_size] >= Tm) & (primer_length==0):
#             primer_length = i
    
#     #If Tm isnt high enough after max bases, just set primer length to be max and hope it works
#     if (primer_length == 0):
#         primer_length = largest_primer_size
        
#     if extendtoCG:
#         while ((DNA_seq_touse[primer_length-1] == 'A') | (DNA_seq_touse[primer_length-1] == 'T')) & \
#                     (primer_length < largest_primer_size):
#             primer_length += 1
    
#     return DNA_seq_touse[0:primer_length]

# def make_mutations(region_name,
#                        region,
#                        region_flanks=[Seq(''),Seq('')],
#                        nt_start=0, #zero-indexed!
#                        wt_only=False,
#                        synonymous=True,
#                        stops='TAA',
#                        all3ntdeletions=True,
#                        mutation_list=False,
#                        codons_ranked_by_usage=codons_ranked_by_usage,
#                        aa_start=0):
                       
#     oligo_array = {}
#     #Check that region has size divisible by three
#     if (len(region)/3 != len(region)//3) | (nt_start/3 != nt_start//3):
#         print('Region is not translatable!')
        
#     else:
#         #add wt seq to oligo array
#         oligo_name = region_name + '_WT'
#         wt_seq = \
#             region_flanks[0] + region + region_flanks[1]
#         oligo_array[oligo_name] = wt_seq
        
#         if not wt_only:
            
#             # see if a mutation list was given
#             if mutation_list == False:
                    
#                 #loop over amino acids
#                 for j in range(0,len(region),3):

#                     #add all missense variants
#                     aa = region[j:(j+3)].translate()
#                     for aa_to in codons_ranked_by_usage.keys():
#                         if aa_to != aa:
#                             oligo_name = region_name + '_' + str(aa) + str((nt_start+j)//3+1+aa_start) + str(aa_to)
#                             seq_to_append = \
#                                 region_flanks[0] + \
#                                 region[0:j] + Seq(codons_ranked_by_usage[aa_to][0]) + \
#                                 region[(j+3):] + \
#                                 region_flanks[1]
#                             oligo_array[oligo_name] = seq_to_append

#                     #add synonymous variant if True and if possible, 
#                     # using the most common codon that is NOT the codon in the gene
#                     if synonymous:
#                         if len(codons_ranked_by_usage[aa]) > 1:
#                             oligo_name = region_name + '_' + str(aa) + str((nt_start+j)//3+1+aa_start) + str(aa)
#                             possible_codons = codons_ranked_by_usage[aa].copy()
#                             possible_codons.remove(region[j:(j+3)])
#                             seq_to_append = \
#                                 region_flanks[0] + \
#                                 region[0:j] + Seq(possible_codons[0]) + \
#                                 region[(j+3):] + \
#                                 region_flanks[1]
#                             oligo_array[oligo_name] = seq_to_append

#                     #add stops if true
#                     if stops:
#                         oligo_name = region_name + '_' + str(aa) + str((nt_start+j)//3+1+aa_start) + 'X'
#                         seq_to_append = \
#                             region_flanks[0] + \
#                             region[0:j] + Seq(stops) + \
#                             region[(j+3):] + \
#                             region_flanks[1]
#                         oligo_array[oligo_name] = seq_to_append

#                     #add all 3nt deletions if True
#                     if all3ntdeletions:
#                         for k in range(0,3):
#                             if j+k+3 <= len(region):
#                                 oligo_name = region_name + '_' + 'del' + str(nt_start+j+k+1+3*aa_start)
#                                 seq_to_append = \
#                                     region_flanks[0] + \
#                                     region[0:(j+k)] + \
#                                     region[(j+k+3):] + \
#                                     region_flanks[1]
#                                 oligo_array[oligo_name] = seq_to_append
                                
#             else:
                
#                 #loop over mutation list
#                 for i in range(len(mutation_list)):
                        
#                     #iterate over every single aa change
#                     oligo_name = region_name + '_' + 'variant' + str(i+1)
#                     seq_to_append = region
#                     for k,v in enumerate(mutation_list[i]):
#                         aa_from = v[0]
#                         aa_to = v[-1]
#                         pos = int(v[1:-1])
#                         j=3*(pos-(nt_start//3+1))
#                         aa = region[j:(j+3)].translate()
#                         if aa != aa_from:
#                             print('Check mutation list!')
#                         else:
#                             seq_to_append = \
#                                 seq_to_append[0:j] + \
#                                 Seq(codons_ranked_by_usage[aa_to][0]) + \
#                                 seq_to_append[(j+3):]

#                     # append oligo to array
#                     seq_to_append = region_flanks[0] + \
#                                     seq_to_append + \
#                                     region_flanks[1]
#                     oligo_array[oligo_name] = seq_to_append
        
#     return oligo_array


# def write_oligo_library(genes,
#                         oligo_file='./oligo_test.csv',
#                         primer_file='./primer_test.tsv',
#                         gbl_file='./gbl_test.tsv',
#                         gbl_large_file='./gbl_test_large.tsv',
#                         amp_primer_key_file='./amp_primer_key.tsv',
#                         breakpoint_file='./breakpoints.tsv',
#                         primer_set=orthogonal_primers_touse,
#                         codons_ranked_by_usage=codons_ranked_by_usage,
#                         block_size_range=block_size_range, 
#                         max_oligo_size=max_oligo_size,
#                         slack=slack, 
#                         empirical=bsaI_empirical, 
#                         overhang_blacklist=overhang_blacklist,
#                         validated_primer_set=False,
#                         aa_start=False,
#                         wt_only=False,
#                         synonymous=True,
#                         stops='TAA',
#                         all3ntdeletions=True,
#                         mutations_to_use=False,
#                         find_pcr_primers=True,
#                         smallest_primer_size=16,
#                         largest_primer_size=30,
#                         Tm=55,
#                         extendtoCG=True,
#                         bsaI_firstoverlap='CGTC',
#                         bsaI_lastoverlap='GCAT',
#                         all_blocks=True,
#                         blocks_to_include=False,
#                         tile_boundaries=False,
#                         paqcIcapF=True,
#                         paqcIcapR=True,
#                         check_all_primers=True,
#                         qc_melt_temp_threshold=32,
#                         gblock_min_size=300,
#                         gblock_large_threshold=1000,
#                         randomsequencepad=randomsequencepad):
    
#     #Split up primer set into F and R primers, cannot do more than 82 sublibraries
#     oligo_primer_counter = 0
#     oligo_array = {}
#     amp_primers = {}
#     gblocks = {}
#     num_blocks = {}
#     amp_primer_dict = {}
#     breakpoint_dict = {}
    
#     #Convert genes to Seq and genes to list
#     gene_names = list(genes.keys())
#     genes = [Seq(genes[gene_name].upper()) for gene_name in gene_names]
    
#     #PaqCI and BsaI site sequences and overhangs for paqcI
#     paqcI_seq = Seq('CACCTGC')
#     paqcI_overhang_nterm_ctag = Seq('CCAC')
#     paqcI_overhang_cterm_ctag = Seq('CGGG')
#     paqcI_overhang_nterm_ntag = Seq('TGGC')
#     paqcI_overhang_cterm_ntag = Seq('TAGG')
#     paqcI_seqplusfour = Seq('CACCTGCCTAG')
#     bsaI_seq = Seq('GGTCTC')
#     bsaI_seqplusone = Seq('GGTCTCT')
#     pcr_capseq = Seq('GGCTAC') + bsaI_seqplusone
#     gbl_capseq_F = Seq('CCGCGTGATTACGAGTCG') + pcr_capseq
#     gbl_capseq_R = Seq('GGGTTAGCAAGTGGCAGCCT') + pcr_capseq
    
#     # set max size of a tile
#     primer_len = len(primer_set['Forward Primer'][0])
#     max_tile_size = max_oligo_size - 2*primer_len - 2*len(bsaI_seqplusone)
#     first_last_block_reduction = len(paqcI_seqplusfour)
    
#     # iterate over genes
#     for r,gene in enumerate(genes):
        
#         gene_name = gene_names[r]
#         print('Processing gene ' + str(r+1) + ': ' + gene_name)
        
#         # amino acid to start at
#         if aa_start != False:
#             if gene_name in aa_start.keys():
#                 aa_start_gene=aa_start[gene_name]-1
#             else:
#                 aa_start_gene=0
#         else:
#             aa_start_gene=0
        
#         #exclude if gene size is not divisible by three
#         if len(gene)/3 != len(gene)//3:
#             print('Gene length is not divisible by 3!')
    
#         #exclude if there is a paqcI site in the gene
#         elif any([True for kmer in build_kmers(gene, len(paqcI_seq)) if kmer==paqcI_seq]) | \
#             any([True for kmer in build_kmers(gene.reverse_complement(), len(paqcI_seq)) if kmer==paqcI_seq]):
#             print('Gene has paqcI site!')
        
#         #exclude if there is a BsaI site in the gene
#         elif any([True for kmer in build_kmers(gene, len(bsaI_seq)) if kmer==bsaI_seq]) | \
#             any([True for kmer in build_kmers(gene.reverse_complement(), len(bsaI_seq)) if kmer==bsaI_seq]):
#             print('Gene has BsaI site!')
            
#         else:
#             print('Gene has no paqcI or BsaI sites! Performing GoldenGate optimization...')
            
#             #cap gene with BsaI breakpoints and possible paqcI sites 
#             if paqcIcapF & paqcIcapR:
#                 gene_capped = bsaI_firstoverlap + paqcI_seqplusfour +  paqcI_overhang_nterm_ctag  +\
#                         gene + paqcI_overhang_cterm_ctag + paqcI_seqplusfour.reverse_complement() + \
#                         bsaI_lastoverlap
#                 capping_length_F = len(bsaI_firstoverlap + paqcI_seqplusfour + paqcI_overhang_nterm_ctag)
#                 capping_length_R = len(paqcI_overhang_cterm_ctag + paqcI_seqplusfour.reverse_complement() + \
#                                    bsaI_lastoverlap)
#             elif paqcIcapF:
#                 gene_capped = bsaI_firstoverlap + paqcI_seqplusfour + paqcI_overhang_nterm_ctag + \
#                         gene + bsaI_lastoverlap
#                 capping_length_F = len(bsaI_firstoverlap + paqcI_seqplusfour + paqcI_overhang_nterm_ctag)
#                 capping_length_R = len(bsaI_lastoverlap)
#             elif paqcIcapR:
#                 gene_capped = bsaI_firstoverlap + \
#                         gene + paqcI_overhang_cterm_ctag + paqcI_seqplusfour.reverse_complement() + bsaI_lastoverlap
#                 capping_length_F = len(bsaI_firstoverlap)
#                 capping_length_R = len(paqcI_overhang_cterm_ctag + paqcI_seqplusfour.reverse_complement() + \
#                                    bsaI_lastoverlap)
#             else:
#                 gene_capped = bsaI_firstoverlap + gene + bsaI_lastoverlap
#                 capping_length_F = len(bsaI_firstoverlap)
#                 capping_length_R = len(bsaI_lastoverlap)
            
#             #Optimize gene if tile boundaries are not given
#             if tile_boundaries is False:
#                 optimum_breakpoints, optimum_overlaps, optimum_scores, optimum_lengths, oligo_array_indices = \
#                 optimize_gene(gene_capped, 
#                           max_tile_size=max_tile_size,
#                           first_last_block_reduction=first_last_block_reduction,
#                           block_size_range=block_size_range, 
#                           slack=slack, 
#                           empirical=bsaI_empirical, 
#                           overhang_blacklist=overhang_blacklist)
#                 pprint.pprint({'Optimum Breakpoints': optimum_breakpoints, 
#                        'Optimum Overlaps': optimum_overlaps, 
#                        'Optimum Scores': optimum_scores})
#                 num_blocks[gene_name] = len(optimum_breakpoints)
#             else:
#                 optimum_breakpoints = tile_boundaries[gene_name]
#                 if len(optimum_breakpoints[0])==3:
#                     if len(optimum_breakpoints)>1:
#                         #multiple tiles, including one at beginning of gene
#                         oligo_array_indices = [[0,1]] + [[1,2]]*(len(optimum_breakpoints)-1)
#                     else:
#                         # one tile
#                         if ((optimum_breakpoints[1]-optimum_breakpoints[0]) > (optimum_breakpoints[2]-optimum_breakpoints[1])):
#                             # at end of gene
#                             oligo_array_indices = [[1,2]]
#                         else:
#                             # at beginning of gene
#                             oligo_array_indices = [[0,1]]
#                 else:
#                     # multiple tiles, starting in the middle
#                     oligo_array_indices = [[1,2]]*(len(optimum_breakpoints))
#                 num_blocks[gene_name] = len(optimum_breakpoints)
                
            
#             #add primers for gene_F and gene_R that are repeated constantly throughout the PCRs
#             #note: should probably prevalidate these primers!
#             if find_pcr_primers:
#                 F_primer = generate_primer(gene,
#                                            Fwd=True,
#                                            extendtoCG=extendtoCG,
#                                            smallest_primer_size=smallest_primer_size,
#                                            largest_primer_size=largest_primer_size,
#                                            Tm=Tm)
#                 F_primer = pcr_capseq + bsaI_firstoverlap + paqcI_seqplusfour + F_primer
#                 amp_primers[gene_name+'_gene'+'_ampF'] = F_primer
#                 R_primer = generate_primer(gene,
#                                            Fwd=False,
#                                            extendtoCG=extendtoCG,
#                                            smallest_primer_size=smallest_primer_size,
#                                            largest_primer_size=largest_primer_size,
#                                            Tm=Tm)
#                 R_primer = pcr_capseq + Seq(bsaI_lastoverlap).reverse_complement() + \
#                             paqcI_seqplusfour + R_primer
#                 amp_primers[gene_name+'_gene'+'_ampR'] = R_primer
            
#             #make oligos, primers, gblocks for each block
#             for i,breakpoint in enumerate(optimum_breakpoints):
                
#                 #find indices of breakpoint that correspond to oligo vs need to be PCRed/gblock
#                 pcr_indices = [[j,j+1] for j in range(len(breakpoint)-1)]
#                 pcr_indices.remove(oligo_array_indices[i])
                
#                 #find mutagenic window of oligo
#                 oligo_breaks = [breakpoint[j] for j in oligo_array_indices[i]]
#                 oligo_mutagenic_window = [int(3*np.ceil(max(oligo_breaks[0]+4-capping_length_F,3)/3)), int(3*np.floor(min(oligo_breaks[1]-capping_length_F,len(gene)-1)/3))]
                
#                 #subset the right block if needed
#                 if (all_blocks == True) | ((i+1) in blocks_to_include[r] if blocks_to_include != False else True): #subset on allowed blocks
                
#                     #add pcr primers and gblocks, one segment at a time
#                     for k,pcr_index in enumerate(pcr_indices):
#                         piece_name = gene_name + '_block' + str(i+1) + '_s' + str(k+1)
#                         pcr_breaks = [breakpoint[j] for j in pcr_index]
                                            
#                         #get pcr primers
#                         if find_pcr_primers:
#                             if pcr_breaks[0] == breakpoint[0]: #Fragment beginning at gene start 
#                                 R_primer = generate_primer(gene_capped[pcr_breaks[0]:(pcr_breaks[1]+4)],
#                                                            Fwd=False,
#                                                            extendtoCG=extendtoCG,
#                                                            smallest_primer_size=smallest_primer_size,
#                                                            largest_primer_size=largest_primer_size,
#                                                            Tm=Tm)
#                                 R_primer = pcr_capseq + R_primer
#                                 amp_primers[piece_name+'_ampR'] = R_primer
#                             elif pcr_breaks[1] == breakpoint[-1]: #Fragment ending at gene end
#                                 F_primer = generate_primer(gene_capped[pcr_breaks[0]:(pcr_breaks[1]+4)],
#                                                            Fwd=True,
#                                                            extendtoCG=extendtoCG,
#                                                            smallest_primer_size=smallest_primer_size,
#                                                            largest_primer_size=largest_primer_size,
#                                                            Tm=Tm)
#                                 F_primer = pcr_capseq + F_primer
#                                 amp_primers[piece_name+'_ampF'] = F_primer
#                             else:
#                                 F_primer = generate_primer(gene_capped[pcr_breaks[0]:(pcr_breaks[1]+4)],
#                                                            Fwd=True,
#                                                            extendtoCG=extendtoCG,
#                                                            smallest_primer_size=smallest_primer_size,
#                                                            largest_primer_size=largest_primer_size,
#                                                            Tm=Tm)
#                                 F_primer = pcr_capseq + F_primer
#                                 R_primer = generate_primer(gene_capped[pcr_breaks[0]:(pcr_breaks[1]+4)],
#                                                            Fwd=False,
#                                                            extendtoCG=extendtoCG,
#                                                            smallest_primer_size=smallest_primer_size,
#                                                            largest_primer_size=largest_primer_size,
#                                                            Tm=Tm)
#                                 R_primer = pcr_capseq + R_primer
#                                 amp_primers[piece_name+'_ampF'] = F_primer
#                                 amp_primers[piece_name+'_ampR'] = R_primer

#                         #make gblocks
#                         gbl = gbl_capseq_F + gene_capped[pcr_breaks[0]:(pcr_breaks[1]+4)] + gbl_capseq_R.reverse_complement()
#                         gblocks[piece_name] = gbl
                        
#                     #add oligos to oligo array, checking first for validated primers if they are given
#                     validated=False
#                     if validated_primer_set is not False:
#                         validated_combo = validated_primer_set.query('Gene == @gene_name & Block == (@i+1)')
#                         if not validated_combo.empty:
#                             validated=True
#                             name_primer_F, primer_F, name_primer_R, primer_R = \
#                                 validated_combo[['Forward Name',
#                                                   'Forward Primer',
#                                                   'Reverse Name',
#                                                   'Reverse Primer']].values[0]
#                         else:
#                             name_primer_F, primer_F, name_primer_R, primer_R = \
#                                 primer_set.iloc[oligo_primer_counter,][['Forward Name',
#                                                                       'Forward Primer',
#                                                                       'Reverse Name',
#                                                                       'Reverse Primer']]
#                             oligo_primer_counter += 1
#                     else:
#                         name_primer_F, primer_F, name_primer_R, primer_R = \
#                                 primer_set.iloc[oligo_primer_counter,][['Forward Name',
#                                                                       'Forward Primer',
#                                                                       'Reverse Name',
#                                                                       'Reverse Primer']]
#                         oligo_primer_counter += 1
                        
#                     # if mutations are given, use those - otherwise make all mutations or wtonly
#                     if mutations_to_use == False:
#                         add_on_array = make_mutations(gene_name + '_block' + str(i+1),
#                                            gene[oligo_mutagenic_window[0]:oligo_mutagenic_window[1]],
#                                            region_flanks=[Seq(primer_F) + \
#                                                           bsaI_seqplusone + \
#                                                           gene_capped[oligo_breaks[0]:(oligo_mutagenic_window[0]+capping_length_F)] ,
#                                                           gene_capped[(oligo_mutagenic_window[1]+capping_length_F):(oligo_breaks[1]+4)] + \
#                                                           bsaI_seqplusone.reverse_complement() + \
#                                                           Seq(primer_R).reverse_complement()],
#                                            nt_start=oligo_mutagenic_window[0],
#                                            wt_only=wt_only,
#                                            synonymous=synonymous,
#                                            stops=stops,
#                                            all3ntdeletions=all3ntdeletions,
#                                            codons_ranked_by_usage=codons_ranked_by_usage,
#                                            aa_start=aa_start_gene)
#                     else:
#                         add_on_array = make_mutations(gene_name + '_block' + str(i+1),
#                                            gene[oligo_mutagenic_window[0]:oligo_mutagenic_window[1]],
#                                            region_flanks=[Seq(primer_F) + \
#                                                           bsaI_seqplusone + \
#                                                           gene_capped[oligo_breaks[0]:(oligo_mutagenic_window[0]+capping_length_F)] ,
#                                                           gene_capped[(oligo_mutagenic_window[1]+capping_length_F):(oligo_breaks[1]+4)] + \
#                                                           bsaI_seqplusone.reverse_complement() + \
#                                                           Seq(primer_R).reverse_complement()],
#                                            nt_start=oligo_mutagenic_window[0],
#                                            mutation_list=mutations_to_use[(gene_name,i+1)],
#                                            codons_ranked_by_usage=codons_ranked_by_usage,
#                                            aa_start=aa_start_gene)
#                     oligo_array.update(add_on_array)
#                     amp_primer_dict.update({(gene_name,i+1): (name_primer_F,primer_F,name_primer_R,primer_R,validated)})
#                     breakpoint_dict.update({(gene_name,i+1): oligo_mutagenic_window})
                    
#     #Check that max oligo is less than the max oligo length
#     if sum([len(s)>max_oligo_size for s in oligo_array.values()]) == 0:
#         print('All oligos are below the maximum 250bp!')
#     else:
#         print('Some oligos are TOO BIG!')
        
#     #Check for nonspecific amplification
#     wt_oligos = {tuple([key.split('_')[0],
#                         int((key.split('_block')[1]).split('_')[0])]
#                       ):oligo_array[key] \
#                      for key in oligo_array.keys() if 'WT' in key}
#     nonspecific_primers = post_qc(amp_primer_dict, 
#                                   wt_oligos,
#                                   primer_set, 
#                                   melt_temp_threshold=qc_melt_temp_threshold,
#                                   check_all_primers=check_all_primers)
    
#     # Check that all mutagenic windows overlap
#     breakpoint_df = pd.DataFrame.from_dict(breakpoint_dict, orient='index', columns=['Mutagenesis Start','Mutagenesis End'])
#     breakpoint_df.index = pd.MultiIndex.from_tuples(breakpoint_dict.keys())
#     breakpoint_df = breakpoint_df.reset_index().rename(columns={'level_0':'Gene',
#                                                                 'level_1':'Block'})
#     if (mutations_to_use == False) and (all_blocks == True):
#         missed_counter = 0
#         for r,gene_group_breakpoints in breakpoint_df.groupby('Gene'):
#             for k,row in gene_group_breakpoints.iterrows():
#                 # look at whether the current row start is later than the last row end
#                 if row['Block'] > 1:
#                     if row['Mutagenesis Start'] > end:
#                         missed_counter += 1
#                         print('Mutagenic window missed at ' + str(r) + ' block ' + str(row['Block']))
#                 start,end = row['Mutagenesis Start'],row['Mutagenesis End']
#         if missed_counter == 0:
#             print('All mutagenic windows overlap!')
#         else:
#             print(str(missed_counter) + ' number of times the mutagenic window does not close!')
                
#     #Remove any oligos with additional BsaI sites or paqcI sites
#     bad_oligos = []
#     for name,oligo in oligo_array.items():
#         #check for paqcI sites
#         paqcI_F = sum([True for kmer in build_kmers(oligo, len(paqcI_seq)) if kmer==paqcI_seq])
#         paqcI_R = sum([True for kmer in build_kmers(oligo.reverse_complement(), len(paqcI_seq)) if kmer==paqcI_seq])
#         #check that oligo is block 1 if it contains a paqcI site in the forward orientation 
#         if paqcI_F > 0:
#             if ('block1' not in name) | (paqcI_F > 1):
#                 bad_oligos.append(name)
#         #check that the oligo is block final if it contains a paqcI site in the reverse orientation
#         if paqcI_R > 0:
#             if ('block'+str(num_blocks[name.split('_')[0]]) not in name) | (paqcI_R > 1):
#                 bad_oligos.append(name)
#         #check for more than one BsaI site
#         bsaI_F = sum([True for kmer in build_kmers(oligo, len(bsaI_seq)) if kmer==bsaI_seq])
#         bsaI_R = sum([True for kmer in build_kmers(oligo.reverse_complement(), len(bsaI_seq)) if kmer==bsaI_seq])
#         if (bsaI_F != 1) | (bsaI_R != 1):
#             bad_oligos.append(name)
#     bad_oligos=np.unique(bad_oligos)
#     for oligo_name in bad_oligos:
#         del oligo_array[oligo_name]
#     print(str(len(bad_oligos)) + ' oligos deleted due to errant restriction sites.')
    
#     #Remove any duplicate oligos
#     new_dict = {}
#     seen_values = set()
#     counter=0
#     for key, value in oligo_array.items():
#         if value not in seen_values:
#             new_dict[key] = value
#             seen_values.add(value)
#         else:
#             counter += 1
#     print(str(counter) + ' oligos removed due to duplication.')
#     oligo_array = new_dict
#     del new_dict
    
#     #write oligo array to file
#     with open(oligo_file, 'w') as f:
#         for key in oligo_array.keys():
#             f.write("%s,%s\n"%(key,oligo_array[key]))
#     f.close()
            
#     #write primers to file
#     if find_pcr_primers:
#         primer_order_sheet = []
#         for key in amp_primers.keys():
#             primer_order_sheet.append(key + '\t' + \
#                      str(amp_primers[key]) + \
#                      '\t' + '25nm' + '\t' + 'STD\n')
#         print(*primer_order_sheet)
#         with open(primer_file, 'w') as f:
#             for line in primer_order_sheet:
#                 f.write(line)
#         f.close()
    
#     #write amplification primer key to file
#     amp_primer_key = ['Gene' + '\t' + 'Block' + '\t' + \
#                       'Forward Primer Well' + '\t' + 'Forward Primer' + '\t' + \
#                       'Reverse Primer Well' + '\t' + 'Reverse Primer' + '\t' + 'Validated' + '\n']
#     for key in amp_primer_dict.keys():
#         genename, geneblock = key[0], str(key[1])
#         name_primer_F, primer_F, name_primer_R, primer_R, validated = amp_primer_dict[key]
#         amp_primer_key.append(genename + '\t' + geneblock + '\t' + \
#                  name_primer_F + '\t' + primer_F + '\t' + \
#                  name_primer_R + '\t' + primer_R + '\t' + str(validated) + '\n')
#     print(*amp_primer_key)
#     with open(amp_primer_key_file, 'w') as f:
#         for line in amp_primer_key:
#             f.write(line)
#     f.close()
    
#     #write breakpoint dict to file
#     breakpoint_df.to_csv(breakpoint_file, sep='\t')
    
#     #write gblocks to file
#     gblock_order_sheet = []
#     gblock_large_order_sheet = []
#     for key in gblocks.keys():
#         # pad gblock if it is not 300bp for Twist
#         if len(gblocks[key]) < gblock_min_size:
#             gblocks[key] = Seq(randomsequencepad[0:(gblock_min_size-len(gblocks[key]))]) + gblocks[key]
#         if len(gblocks[key]) < gblock_large_threshold:
#             gblock_order_sheet.append(key + '\t' + \
#                      str(gblocks[key]) + '\n')
#         else:
#             gblock_large_order_sheet.append(key + '\t' + \
#                      str(gblocks[key]) + '\n')
#     print(*gblock_order_sheet)
#     print(*gblock_large_order_sheet)
#     with open(gbl_file, 'w') as f:
#         for line in gblock_order_sheet:
#             f.write(line)
#     f.close()
#     with open(gbl_large_file, 'w') as f:
#         for line in gblock_large_order_sheet:
#             f.write(line)
#     f.close()
    
#     return oligo_array,amp_primers,gblocks,amp_primer_dict,breakpoint_df
                

In [66]:
# #redefine functions for cterm tagging 20250610
# def post_qc(amp_primer_set, wt_oligos, primer_set, melt_temp_threshold = 35, check_all_primers=True):
#     print("Running QC for primer specificity on WT oligos")
#     f_primer_map = {}
#     r_primer_map = {}
#     # invert the primer to subpool map
#     for k, v in amp_primer_set.items():
#         f_primer_map[v[1]] = f_primer_map.get(v[1], []) + [k]
#         r_primer_map[v[3]] = r_primer_map.get(v[3], []) + [k]
    
#     # initialize list of nonspecific problems
#     nonspecific = {}
    
#     # add unused primers if check_all_primers
#     if check_all_primers:
#         all_f_primers = np.unique(primer_set['Forward Primer'])
#         all_r_primers = np.unique(primer_set['Reverse Primer'])
#         for f_primer in all_f_primers:
#             if f_primer not in f_primer_map.keys():
#                 f_primer_map[f_primer] = []
#         for r_primer in all_r_primers:
#             if r_primer not in r_primer_map.keys():
#                 r_primer_map[r_primer] = []
        
#     for f_primer, subpools_used in f_primer_map.items():
#     # iterate over every barcode primer pair and match to each oligo to check for nonspecific amplification
#         anneal_locs = []
#         for subpoolcheck, fragmentcheck in wt_oligos.items():  # iterate over every WT oligo
#             if (subpoolcheck not in subpools_used):  # ignore designed annealing (same name)
#                 if check_nonspecific(f_primer, fragmentcheck, Tm_rem = melt_temp_threshold, verbose=False) > 0: #use high Tm_rem
#                     anneal_locs.append(subpoolcheck)
#         if anneal_locs:
#             nonspecific.update({f_primer:[a[0] + '_block' + str(a[1]+1) for a in anneal_locs]})
#     for r_primer, subpools_used in r_primer_map.items():
#     # iterate over every barcode primer pair and match to each oligo to check for nonspecific amplification
#         anneal_locs = []
#         for subpoolcheck, fragmentcheck in wt_oligos.items():  # iterate over every WT oligo
#             if (subpoolcheck not in subpools_used):  # ignore designed annealing (same name)
#                 if check_nonspecific(r_primer, fragmentcheck, Tm_rem = melt_temp_threshold, verbose=False) > 0: #use high Tm_rem
#                     anneal_locs.append(subpoolcheck)
#         if anneal_locs:
#             nonspecific.update({r_primer:[a[0] + '_block' + str(a[1]+1) for a in anneal_locs]})
#     if nonspecific:
#         print("Nonspecific Primers: (Manually removing primer sequence recommended)")
#         print(nonspecific)
#     else:
#         print("No non-specific primers detected")
        
#     return nonspecific

# def build_kmers(sequence, 
#                 ksize):
#     kmers = []
#     n_kmers = len(sequence) - ksize + 1

#     for i in range(n_kmers):
#         kmer = sequence[i:i + ksize]
#         kmers.append(kmer)

#     return kmers

# def compute_overlaps(breakpoints, 
#                      inclusion_array, 
#                      gene):
    
#     overlaps = [[gene[val:val+4].reverse_complement(), gene[val:val+4]] for val in breakpoints]
#     counter = 0
#     for val in inclusion_array:
#         if val == -1:
#             (overlaps[counter][1],overlaps[counter+1][0]) = (overlaps[counter+1][0],overlaps[counter][1])
#             counter += 1
#         elif val == 0:
#             overlaps[counter][1] = overlaps[counter+1][1]
#             del overlaps[counter+1]
        
#     return overlaps

# def score_breakpoints(gene, 
#                       breakpoint_pair, 
#                       empirical, 
#                       overhang_blacklist=overhang_blacklist):
    
#     #subset empirical matrix by the set of all overlaps
#     all_overlaps = []
#     for breakpoint in breakpoint_pair:
#         all_overlaps.append(gene[breakpoint:(breakpoint+4)])
#         all_overlaps.append(gene[breakpoint:(breakpoint+4)].reverse_complement())
#     all_overlaps = [str(o) for o in all_overlaps]
#     if (len(np.unique(all_overlaps)) == len(all_overlaps)) & (len(set(all_overlaps).intersection(set(overhang_blacklist))) == 0):
#         empirical_subset = empirical.loc[all_overlaps,all_overlaps]

#         #compute fidelity score
#         empirical_subset = empirical_subset/empirical_subset.sum(axis=1)
#         fidelity_score = 1
#         for breakpoint in breakpoint_pair:
#             fidelity_score = fidelity_score * \
#                 empirical_subset.loc[str(gene[breakpoint:(breakpoint+4)]),
#                                      str(gene[breakpoint:(breakpoint+4)].reverse_complement())]
    
#     else:
#         fidelity_score = 0
    
#     return fidelity_score
    
# def optimize_breakpoints(gene, 
#                          breakpoint_pair, 
#                          indices_to_shift, 
#                          indices_of_array,
#                          slack, 
#                          empirical=bsaI_empirical, 
#                          overhang_blacklist=overhang_blacklist):
    
#     #compute all enrichments
#     shifts = list(range(-slack,slack+1))
#     if (len(indices_to_shift) > 2) | (len(indices_to_shift) < 1):
#         print('Error -- too many or too few breakpoints!')
#         optimum_breakpoint = breakpoint_pair
#         optimum_score = 0
#     elif (len(indices_to_shift) == 1): #external pair
#         scores = [0]*len(shifts)
#         for i,shift in enumerate(shifts):
#             scores[i] = score_breakpoints(gene, breakpoint_pair[0:indices_to_shift[0]] + [breakpoint_pair[indices_to_shift[0]]+shift] + breakpoint_pair[(indices_to_shift[0]+1):], 
#                                           empirical=bsaI_empirical, overhang_blacklist=overhang_blacklist)
        
#         optimum_shift = np.argmax(scores)
#         optimum_breakpoint = breakpoint_pair[0:indices_to_shift[0]] + [breakpoint_pair[indices_to_shift[0]]+shifts[optimum_shift]] + breakpoint_pair[(indices_to_shift[0]+1):]
#         optimum_score = scores[optimum_shift]
#         optimum_length = optimum_breakpoint[indices_of_array[1]] - optimum_breakpoint[indices_of_array[0]]
            
            
#     else: #internal pair
#         indices_to_shift = sorted(indices_to_shift)
#         scores = np.zeros((len(shifts),len(shifts)))
#         for i,shift1 in enumerate(shifts):
#             for j,shift2 in enumerate(shifts):
#                 scores[i,j] = score_breakpoints(gene, breakpoint_pair[0:indices_to_shift[0]] + [breakpoint_pair[indices_to_shift[0]]+shift1] + \
#                                                             breakpoint_pair[(indices_to_shift[0]+1):indices_to_shift[1]] + \
#                                                             [breakpoint_pair[indices_to_shift[1]]+shift2] + breakpoint_pair[(indices_to_shift[1]+1):], 
#                                               empirical=bsaI_empirical, overhang_blacklist=overhang_blacklist)
                
#         optimum_shift = np.unravel_index(np.argmax(scores,axis=None), scores.shape)
#         optimum_breakpoint = breakpoint_pair[0:indices_to_shift[0]] + [breakpoint_pair[indices_to_shift[0]]+shifts[optimum_shift[0]]] + \
#                                             breakpoint_pair[(indices_to_shift[0]+1):indices_to_shift[1]] + \
#                                             [breakpoint_pair[indices_to_shift[1]]+shifts[optimum_shift[1]]] + breakpoint_pair[(indices_to_shift[1]+1):]
#         optimum_score = scores[optimum_shift]
#         optimum_length = optimum_breakpoint[indices_of_array[1]] - optimum_breakpoint[indices_of_array[0]] + 4
    
#     return optimum_breakpoint, optimum_score, optimum_length

# def optimize_gene(gene, 
#                   max_tile_size,
#                   first_last_block_reduction,
#                   block_size_range=block_size_range, 
#                   slack=slack, 
#                   empirical=bsaI_empirical, 
#                   overhang_blacklist=overhang_blacklist): 
    
#     #setup initial inputs to optimization
#     gene_size = len(gene)
#     protein_size = len(gene.translate())
        
#     #exclude gene if it is too big
#     if protein_size > 1000:
#         print('Protein size too big!')
        
#     #divide genes between 500 and 1000aa into two blocks
#     elif protein_size > 620:
#         print('Protein size too big! Will add two superblock (620aa+ proteins) soon.')
        
#     else:
#         #gene is one superblock 
#         #print('Protein is one superblock.')
#         block_size = block_size_range[0] + np.argmin(
#             [abs((gene_size+2*first_last_block_reduction)/(i+block_size_range[0]) - \
#                      round((gene_size+2*first_last_block_reduction)/(i+block_size_range[0]))) \
#                  for i in range(0, block_size_range[1]-block_size_range[0])])
        
        
#         # now, set initial breakpoints
#         fragment_number = int((gene_size+2*first_last_block_reduction)/block_size)
        
#         # if any of the tiles are too big?
#         tile_lengths = [1000]
#         while max(tile_lengths) > (max_tile_size-2*slack):
#             fragment_number = fragment_number + 1
#             first_breakpoint = 0
#             last_breakpoint = gene_size-4
#             step = (last_breakpoint - first_breakpoint + 2*first_last_block_reduction)/fragment_number
#             evenly_spaced_floats = [first_breakpoint] + [step * i - first_last_block_reduction for i in range(1,fragment_number)] + [last_breakpoint]        
#             initial_breakpoints = [[first_breakpoint,int(evenly_spaced_floats[1])+slack+2,last_breakpoint]] + \
#                                     [[0,int(evenly_spaced_floats[i])-2-slack,int(evenly_spaced_floats[i+1])+slack+2,last_breakpoint] for i in range(1,len(evenly_spaced_floats)-2)] + \
#                                     [[0,int(evenly_spaced_floats[-2])-2-slack,last_breakpoint]]
#             tile_lengths = [int(evenly_spaced_floats[1])+slack+2-first_breakpoint+first_last_block_reduction+4] + \
#                                     [int(evenly_spaced_floats[i+1])-int(evenly_spaced_floats[i])+2*(slack+2)+4 for i in range(1,len(evenly_spaced_floats)-2)] + \
#                                     [last_breakpoint+4-int(evenly_spaced_floats[-2])+slack+2+first_last_block_reduction]
            

#         #optimize each breakpoint
#         optimum_breakpoints = []
#         optimum_scores = []
#         optimum_lengths = []
#         oligo_array_indices = []
#         for k,breakpoint in enumerate(initial_breakpoints):
#             if len(breakpoint) == 3:
#                 indices_of_array = [0, 1] if k==0 else [1, 2]
#                 optimum_breakpoint, optimum_score, optimum_length = optimize_breakpoints(gene, breakpoint, [1], indices_of_array,
#                                                             slack, empirical=bsaI_empirical, overhang_blacklist=overhang_blacklist)
#                 optimum_breakpoints.append(optimum_breakpoint)
#                 optimum_scores.append(optimum_score)
#                 optimum_lengths.append(optimum_length)
#                 oligo_array_indices.append(indices_of_array)
#             else:
#                 indices_of_array = [1, 2]
#                 optimum_breakpoint, optimum_score, optimum_length = optimize_breakpoints(gene, breakpoint, [1, 2], indices_of_array,
#                                                             slack, empirical=bsaI_empirical, overhang_blacklist=overhang_blacklist)
#                 optimum_breakpoints.append(optimum_breakpoint)
#                 optimum_scores.append(optimum_score)
#                 optimum_lengths.append(optimum_length)
#                 oligo_array_indices.append(indices_of_array)
    
#     optimum_overlaps = [[str(gene[t:(t+4)]) for t in s] for s in optimum_breakpoints]
#     if all([s >= 0.95 for s in optimum_scores]):
#         print('All regions are high fidelity!')
#     elif all([s >= 0.9 for s in optimum_scores]):
#         print('Some regions are medium fidelity.')
#     else:
#         print('Some regions are low fidelity. Look closer')
        
#     return optimum_breakpoints, optimum_overlaps, optimum_scores, optimum_lengths, oligo_array_indices

# def generate_primer(DNA_seq,
#                      Fwd=True,
#                      extendtoCG=False,
#                      smallest_primer_size=16,
#                      largest_primer_size=30,
#                      Tm=55):
    
#     #Setup melting temperature arrays
#     melt_temp_array = np.zeros(largest_primer_size-smallest_primer_size+1)
    
#     if Fwd:
#         DNA_seq_touse = DNA_seq
#     else:
#         DNA_seq_touse = DNA_seq.reverse_complement()
            
#     #Make melting temperature arrays
#     primer_length = 0
#     for i in range(smallest_primer_size,largest_primer_size+1):
#         melt_temp_array[i-smallest_primer_size] = mt.Tm_NN(DNA_seq_touse[0:i])
        
#         #Pick F primer when Tm is first >F_Tm
#         if (melt_temp_array[i-smallest_primer_size] >= Tm) & (primer_length==0):
#             primer_length = i
    
#     #If Tm isnt high enough after max bases, just set primer length to be max and hope it works
#     if (primer_length == 0):
#         primer_length = largest_primer_size
        
#     if extendtoCG:
#         while ((DNA_seq_touse[primer_length-1] == 'A') | (DNA_seq_touse[primer_length-1] == 'T')) & \
#                     (primer_length < largest_primer_size):
#             primer_length += 1
    
#     return DNA_seq_touse[0:primer_length]

# def make_mutations(region_name,
#                        region,
#                        region_flanks=[Seq(''),Seq('')],
#                        nt_start=0, #zero-indexed!
#                        wt_only=False,
#                        synonymous=True,
#                        stops='TAA',
#                        all3ntdeletions=True,
#                        mutation_list=False,
#                        codons_ranked_by_usage=codons_ranked_by_usage,
#                        aa_start=0):
                       
#     oligo_array = {}
#     #Check that region has size divisible by three
#     if (len(region)/3 != len(region)//3) | (nt_start/3 != nt_start//3):
#         print('Region is not translatable!')
        
#     else:
#         #add wt seq to oligo array
#         oligo_name = region_name + '_WT'
#         wt_seq = \
#             region_flanks[0] + region + region_flanks[1]
#         oligo_array[oligo_name] = wt_seq
        
#         if not wt_only:
            
#             # see if a mutation list was given
#             if mutation_list == False:
                    
#                 #loop over amino acids
#                 for j in range(0,len(region),3):

#                     #add all missense variants
#                     aa = region[j:(j+3)].translate()
#                     for aa_to in codons_ranked_by_usage.keys():
#                         if aa_to != aa:
#                             oligo_name = region_name + '_' + str(aa) + str((nt_start+j)//3+1+aa_start) + str(aa_to)
#                             seq_to_append = \
#                                 region_flanks[0] + \
#                                 region[0:j] + Seq(codons_ranked_by_usage[aa_to][0]) + \
#                                 region[(j+3):] + \
#                                 region_flanks[1]
#                             oligo_array[oligo_name] = seq_to_append

#                     #add synonymous variant if True and if possible, 
#                     # using the most common codon that is NOT the codon in the gene
#                     if synonymous:
#                         if len(codons_ranked_by_usage[aa]) > 1:
#                             oligo_name = region_name + '_' + str(aa) + str((nt_start+j)//3+1+aa_start) + str(aa)
#                             possible_codons = codons_ranked_by_usage[aa].copy()
#                             possible_codons.remove(region[j:(j+3)])
#                             seq_to_append = \
#                                 region_flanks[0] + \
#                                 region[0:j] + Seq(possible_codons[0]) + \
#                                 region[(j+3):] + \
#                                 region_flanks[1]
#                             oligo_array[oligo_name] = seq_to_append

#                     #add stops if true
#                     if stops:
#                         oligo_name = region_name + '_' + str(aa) + str((nt_start+j)//3+1+aa_start) + 'X'
#                         seq_to_append = \
#                             region_flanks[0] + \
#                             region[0:j] + Seq(stops) + \
#                             region[(j+3):] + \
#                             region_flanks[1]
#                         oligo_array[oligo_name] = seq_to_append

#                     #add all 3nt deletions if True
#                     if all3ntdeletions:
#                         for k in range(0,3):
#                             if j+k+3 <= len(region):
#                                 oligo_name = region_name + '_' + 'del' + str(nt_start+j+k+1+3*aa_start)
#                                 seq_to_append = \
#                                     region_flanks[0] + \
#                                     region[0:(j+k)] + \
#                                     region[(j+k+3):] + \
#                                     region_flanks[1]
#                                 oligo_array[oligo_name] = seq_to_append
                                
#             else:
                
#                 #loop over mutation list
#                 for i in range(len(mutation_list)):
                        
#                     #iterate over every single aa change
#                     oligo_name = region_name + '_' + 'variant' + str(i+1)
#                     seq_to_append = region
#                     for k,v in enumerate(mutation_list[i]):
#                         aa_from = v[0]
#                         aa_to = v[-1]
#                         pos = int(v[1:-1])
#                         j=3*(pos-(nt_start//3+1))
#                         aa = region[j:(j+3)].translate()
#                         if aa != aa_from:
#                             print('Check mutation list!')
#                         else:
#                             seq_to_append = \
#                                 seq_to_append[0:j] + \
#                                 Seq(codons_ranked_by_usage[aa_to][0]) + \
#                                 seq_to_append[(j+3):]

#                     # append oligo to array
#                     seq_to_append = region_flanks[0] + \
#                                     seq_to_append + \
#                                     region_flanks[1]
#                     oligo_array[oligo_name] = seq_to_append
        
#     return oligo_array


# def write_oligo_library(genes,
#                         oligo_file='./oligo_test.csv',
#                         primer_file='./primer_test.tsv',
#                         gbl_file='./gbl_test.tsv',
#                         gbl_large_file='./gbl_test_large.tsv',
#                         amp_primer_key_file='./amp_primer_key.tsv',
#                         breakpoint_file='./breakpoints.tsv',
#                         primer_set=orthogonal_primers_touse,
#                         codons_ranked_by_usage=codons_ranked_by_usage,
#                         block_size_range=block_size_range, 
#                         max_oligo_size=max_oligo_size,
#                         slack=slack, 
#                         empirical=bsaI_empirical, 
#                         overhang_blacklist=overhang_blacklist,
#                         validated_primer_set=False,
#                         aa_start=False,
#                         wt_only=False,
#                         synonymous=True,
#                         stops='TAA',
#                         all3ntdeletions=True,
#                         mutations_to_use=False,
#                         find_pcr_primers=True,
#                         smallest_primer_size=16,
#                         largest_primer_size=30,
#                         Tm=55,
#                         extendtoCG=True,
#                         bsaI_firstoverlap='CGTC',
#                         bsaI_lastoverlap='GCAT',
#                         all_blocks=True,
#                         blocks_to_include=False,
#                         tile_boundaries=False,
#                         paqcIcapF=True,
#                         paqcIcapR=True,
#                         check_all_primers=True,
#                         qc_melt_temp_threshold=32,
#                         gblock_min_size=300,
#                         gblock_large_threshold=1000,
#                         randomsequencepad=randomsequencepad):
    
#     #Split up primer set into F and R primers, cannot do more than 82 sublibraries
#     oligo_primer_counter = 0
#     oligo_array = {}
#     amp_primers = {}
#     gblocks = {}
#     num_blocks = {}
#     amp_primer_dict = {}
#     breakpoint_dict = {}
    
#     #Convert genes to Seq and genes to list
#     gene_names = list(genes.keys())
#     genes = [Seq(genes[gene_name].upper()) for gene_name in gene_names]
    
#     #PaqCI and BsaI site sequences and overhangs for paqcI
#     paqcI_seq = Seq('CACCTGC')
#     paqcI_overhang_nterm_ctag = Seq('CCAC')
#     paqcI_overhang_cterm_ctag = Seq('AGCGGG')
#     paqcI_overhang_nterm_ntag = Seq('TGGC')
#     paqcI_overhang_cterm_ntag = Seq('TAGG')
#     paqcI_seqplusfour = Seq('CACCTGCCTAG')
#     bsaI_seq = Seq('GGTCTC')
#     bsaI_seqplusone = Seq('GGTCTCT')
#     pcr_capseq = Seq('GGCTAC') + bsaI_seqplusone
#     gbl_capseq_F = Seq('CCGCGTGATTACGAGTCG') + pcr_capseq
#     gbl_capseq_R = Seq('GGGTTAGCAAGTGGCAGCCT') + pcr_capseq
    
#     # set max size of a tile
#     primer_len = len(primer_set['Forward Primer'][0])
#     max_tile_size = max_oligo_size - 2*primer_len - 2*len(bsaI_seqplusone)
#     first_last_block_reduction = len(paqcI_seqplusfour)
    
#     # iterate over genes
#     for r,gene in enumerate(genes):
        
#         gene_name = gene_names[r]
#         print('Processing gene ' + str(r+1) + ': ' + gene_name)
        
#         # amino acid to start at
#         if aa_start != False:
#             if gene_name in aa_start.keys():
#                 aa_start_gene=aa_start[gene_name]-1
#             else:
#                 aa_start_gene=0
#         else:
#             aa_start_gene=0
        
#         #exclude if gene size is not divisible by three
#         if len(gene)/3 != len(gene)//3:
#             print('Gene length is not divisible by 3!')
    
#         #exclude if there is a paqcI site in the gene
#         elif any([True for kmer in build_kmers(gene, len(paqcI_seq)) if kmer==paqcI_seq]) | \
#             any([True for kmer in build_kmers(gene.reverse_complement(), len(paqcI_seq)) if kmer==paqcI_seq]):
#             print('Gene has paqcI site!')
        
#         #exclude if there is a BsaI site in the gene
#         elif any([True for kmer in build_kmers(gene, len(bsaI_seq)) if kmer==bsaI_seq]) | \
#             any([True for kmer in build_kmers(gene.reverse_complement(), len(bsaI_seq)) if kmer==bsaI_seq]):
#             print('Gene has BsaI site!')
            
#         else:
#             print('Gene has no paqcI or BsaI sites! Performing GoldenGate optimization...')
            
#             #cap gene with BsaI breakpoints and possible paqcI sites 
#             if paqcIcapF & paqcIcapR:
#                 gene_capped = bsaI_firstoverlap + paqcI_seqplusfour +  paqcI_overhang_nterm_ctag  +\
#                         gene + paqcI_overhang_cterm_ctag + paqcI_seqplusfour.reverse_complement() + \
#                         bsaI_lastoverlap
#                 capping_length_F = len(bsaI_firstoverlap + paqcI_seqplusfour + paqcI_overhang_nterm_ctag)
#                 capping_length_R = len(paqcI_overhang_cterm_ctag + paqcI_seqplusfour.reverse_complement() + \
#                                    bsaI_lastoverlap)
#             elif paqcIcapF:
#                 gene_capped = bsaI_firstoverlap + paqcI_seqplusfour + paqcI_overhang_nterm_ctag + \
#                         gene + bsaI_lastoverlap
#                 capping_length_F = len(bsaI_firstoverlap + paqcI_seqplusfour + paqcI_overhang_nterm_ctag)
#                 capping_length_R = len(bsaI_lastoverlap)
#             elif paqcIcapR:
#                 gene_capped = bsaI_firstoverlap + \
#                         gene + paqcI_overhang_cterm_ctag + paqcI_seqplusfour.reverse_complement() + bsaI_lastoverlap
#                 capping_length_F = len(bsaI_firstoverlap)
#                 capping_length_R = len(paqcI_overhang_cterm_ctag + paqcI_seqplusfour.reverse_complement() + \
#                                    bsaI_lastoverlap)
#             else:
#                 gene_capped = bsaI_firstoverlap + gene + bsaI_lastoverlap
#                 capping_length_F = len(bsaI_firstoverlap)
#                 capping_length_R = len(bsaI_lastoverlap)
            
#             #Optimize gene if tile boundaries are not given
#             if tile_boundaries is False:
#                 optimum_breakpoints, optimum_overlaps, optimum_scores, optimum_lengths, oligo_array_indices = \
#                 optimize_gene(gene_capped, 
#                           max_tile_size=max_tile_size,
#                           first_last_block_reduction=first_last_block_reduction,
#                           block_size_range=block_size_range, 
#                           slack=slack, 
#                           empirical=bsaI_empirical, 
#                           overhang_blacklist=overhang_blacklist)
#                 pprint.pprint({'Optimum Breakpoints': optimum_breakpoints, 
#                        'Optimum Overlaps': optimum_overlaps, 
#                        'Optimum Scores': optimum_scores})
#                 num_blocks[gene_name] = len(optimum_breakpoints)
#             else:
#                 optimum_breakpoints = tile_boundaries[gene_name]
#                 if len(optimum_breakpoints[0])==3:
#                     if len(optimum_breakpoints)>1:
#                         #multiple tiles, including one at beginning of gene
#                         oligo_array_indices = [[0,1]] + [[1,2]]*(len(optimum_breakpoints)-1)
#                     else:
#                         # one tile
#                         if ((optimum_breakpoints[1]-optimum_breakpoints[0]) > (optimum_breakpoints[2]-optimum_breakpoints[1])):
#                             # at end of gene
#                             oligo_array_indices = [[1,2]]
#                         else:
#                             # at beginning of gene
#                             oligo_array_indices = [[0,1]]
#                 else:
#                     # multiple tiles, starting in the middle
#                     oligo_array_indices = [[1,2]]*(len(optimum_breakpoints))
#                 num_blocks[gene_name] = len(optimum_breakpoints)
                
            
#             #add primers for gene_F and gene_R that are repeated constantly throughout the PCRs
#             #note: should probably prevalidate these primers!
#             if find_pcr_primers:
#                 F_primer = generate_primer(gene,
#                                            Fwd=True,
#                                            extendtoCG=extendtoCG,
#                                            smallest_primer_size=smallest_primer_size,
#                                            largest_primer_size=largest_primer_size,
#                                            Tm=Tm)
#                 F_primer = pcr_capseq + bsaI_firstoverlap + paqcI_seqplusfour + F_primer
#                 amp_primers[gene_name+'_gene'+'_ampF'] = F_primer
#                 R_primer = generate_primer(gene,
#                                            Fwd=False,
#                                            extendtoCG=extendtoCG,
#                                            smallest_primer_size=smallest_primer_size,
#                                            largest_primer_size=largest_primer_size,
#                                            Tm=Tm)
#                 R_primer = pcr_capseq + Seq(bsaI_lastoverlap).reverse_complement() + \
#                             paqcI_seqplusfour + R_primer
#                 amp_primers[gene_name+'_gene'+'_ampR'] = R_primer
            
#             #make oligos, primers, gblocks for each block
#             for i,breakpoint in enumerate(optimum_breakpoints):
                
#                 #find indices of breakpoint that correspond to oligo vs need to be PCRed/gblock
#                 pcr_indices = [[j,j+1] for j in range(len(breakpoint)-1)]
#                 pcr_indices.remove(oligo_array_indices[i])
                
#                 #find mutagenic window of oligo
#                 oligo_breaks = [breakpoint[j] for j in oligo_array_indices[i]]
#                 oligo_mutagenic_window = [int(3*np.ceil(max(oligo_breaks[0]+4-capping_length_F,3)/3)), int(3*np.floor(min(oligo_breaks[1]-capping_length_F,len(gene)-1)/3))]
                
#                 #subset the right block if needed
#                 if (all_blocks == True) | ((i+1) in blocks_to_include[r] if blocks_to_include != False else True): #subset on allowed blocks
                
#                     #add pcr primers and gblocks, one segment at a time
#                     for k,pcr_index in enumerate(pcr_indices):
#                         piece_name = gene_name + '_block' + str(i+1) + '_s' + str(k+1)
#                         pcr_breaks = [breakpoint[j] for j in pcr_index]
                                            
#                         #get pcr primers
#                         if find_pcr_primers:
#                             if pcr_breaks[0] == breakpoint[0]: #Fragment beginning at gene start 
#                                 R_primer = generate_primer(gene_capped[pcr_breaks[0]:(pcr_breaks[1]+4)],
#                                                            Fwd=False,
#                                                            extendtoCG=extendtoCG,
#                                                            smallest_primer_size=smallest_primer_size,
#                                                            largest_primer_size=largest_primer_size,
#                                                            Tm=Tm)
#                                 R_primer = pcr_capseq + R_primer
#                                 amp_primers[piece_name+'_ampR'] = R_primer
#                             elif pcr_breaks[1] == breakpoint[-1]: #Fragment ending at gene end
#                                 F_primer = generate_primer(gene_capped[pcr_breaks[0]:(pcr_breaks[1]+4)],
#                                                            Fwd=True,
#                                                            extendtoCG=extendtoCG,
#                                                            smallest_primer_size=smallest_primer_size,
#                                                            largest_primer_size=largest_primer_size,
#                                                            Tm=Tm)
#                                 F_primer = pcr_capseq + F_primer
#                                 amp_primers[piece_name+'_ampF'] = F_primer
#                             else:
#                                 F_primer = generate_primer(gene_capped[pcr_breaks[0]:(pcr_breaks[1]+4)],
#                                                            Fwd=True,
#                                                            extendtoCG=extendtoCG,
#                                                            smallest_primer_size=smallest_primer_size,
#                                                            largest_primer_size=largest_primer_size,
#                                                            Tm=Tm)
#                                 F_primer = pcr_capseq + F_primer
#                                 R_primer = generate_primer(gene_capped[pcr_breaks[0]:(pcr_breaks[1]+4)],
#                                                            Fwd=False,
#                                                            extendtoCG=extendtoCG,
#                                                            smallest_primer_size=smallest_primer_size,
#                                                            largest_primer_size=largest_primer_size,
#                                                            Tm=Tm)
#                                 R_primer = pcr_capseq + R_primer
#                                 amp_primers[piece_name+'_ampF'] = F_primer
#                                 amp_primers[piece_name+'_ampR'] = R_primer

#                         #make gblocks
#                         gbl = gbl_capseq_F + gene_capped[pcr_breaks[0]:(pcr_breaks[1]+4)] + gbl_capseq_R.reverse_complement()
#                         gblocks[piece_name] = gbl
                        
#                     #add oligos to oligo array, checking first for validated primers if they are given
#                     validated=False
#                     if validated_primer_set is not False:
#                         validated_combo = validated_primer_set.query('Gene == @gene_name & Block == (@i+1)')
#                         if not validated_combo.empty:
#                             validated=True
#                             name_primer_F, primer_F, name_primer_R, primer_R = \
#                                 validated_combo[['Forward Name',
#                                                   'Forward Primer',
#                                                   'Reverse Name',
#                                                   'Reverse Primer']].values[0]
#                         else:
#                             name_primer_F, primer_F, name_primer_R, primer_R = \
#                                 primer_set.iloc[oligo_primer_counter,][['Forward Name',
#                                                                       'Forward Primer',
#                                                                       'Reverse Name',
#                                                                       'Reverse Primer']]
#                             oligo_primer_counter += 1
#                     else:
#                         name_primer_F, primer_F, name_primer_R, primer_R = \
#                                 primer_set.iloc[oligo_primer_counter,][['Forward Name',
#                                                                       'Forward Primer',
#                                                                       'Reverse Name',
#                                                                       'Reverse Primer']]
#                         oligo_primer_counter += 1
                        
#                     # if mutations are given, use those - otherwise make all mutations or wtonly
#                     if mutations_to_use == False:
#                         add_on_array = make_mutations(gene_name + '_block' + str(i+1),
#                                            gene[oligo_mutagenic_window[0]:oligo_mutagenic_window[1]],
#                                            region_flanks=[Seq(primer_F) + \
#                                                           bsaI_seqplusone + \
#                                                           gene_capped[oligo_breaks[0]:(oligo_mutagenic_window[0]+capping_length_F)] ,
#                                                           gene_capped[(oligo_mutagenic_window[1]+capping_length_F):(oligo_breaks[1]+4)] + \
#                                                           bsaI_seqplusone.reverse_complement() + \
#                                                           Seq(primer_R).reverse_complement()],
#                                            nt_start=oligo_mutagenic_window[0],
#                                            wt_only=wt_only,
#                                            synonymous=synonymous,
#                                            stops=stops,
#                                            all3ntdeletions=all3ntdeletions,
#                                            codons_ranked_by_usage=codons_ranked_by_usage,
#                                            aa_start=aa_start_gene)
#                     else:
#                         add_on_array = make_mutations(gene_name + '_block' + str(i+1),
#                                            gene[oligo_mutagenic_window[0]:oligo_mutagenic_window[1]],
#                                            region_flanks=[Seq(primer_F) + \
#                                                           bsaI_seqplusone + \
#                                                           gene_capped[oligo_breaks[0]:(oligo_mutagenic_window[0]+capping_length_F)] ,
#                                                           gene_capped[(oligo_mutagenic_window[1]+capping_length_F):(oligo_breaks[1]+4)] + \
#                                                           bsaI_seqplusone.reverse_complement() + \
#                                                           Seq(primer_R).reverse_complement()],
#                                            nt_start=oligo_mutagenic_window[0],
#                                            mutation_list=mutations_to_use[(gene_name,i+1)],
#                                            codons_ranked_by_usage=codons_ranked_by_usage,
#                                            aa_start=aa_start_gene)
#                     oligo_array.update(add_on_array)
#                     amp_primer_dict.update({(gene_name,i+1): (name_primer_F,primer_F,name_primer_R,primer_R,validated)})
#                     breakpoint_dict.update({(gene_name,i+1): oligo_mutagenic_window})
                    
#     #Check that max oligo is less than the max oligo length
#     if sum([len(s)>max_oligo_size for s in oligo_array.values()]) == 0:
#         print('All oligos are below the maximum 250bp!')
#     else:
#         print('Some oligos are TOO BIG!')
        
#     #Check for nonspecific amplification
#     wt_oligos = {tuple([key.split('_')[0],
#                         int((key.split('_block')[1]).split('_')[0])]
#                       ):oligo_array[key] \
#                      for key in oligo_array.keys() if 'WT' in key}
#     nonspecific_primers = post_qc(amp_primer_dict, 
#                                   wt_oligos,
#                                   primer_set, 
#                                   melt_temp_threshold=qc_melt_temp_threshold,
#                                   check_all_primers=check_all_primers)
    
#     # Check that all mutagenic windows overlap
#     breakpoint_df = pd.DataFrame.from_dict(breakpoint_dict, orient='index', columns=['Mutagenesis Start','Mutagenesis End'])
#     breakpoint_df.index = pd.MultiIndex.from_tuples(breakpoint_dict.keys())
#     breakpoint_df = breakpoint_df.reset_index().rename(columns={'level_0':'Gene',
#                                                                 'level_1':'Block'})
#     if (mutations_to_use == False) and (all_blocks == True):
#         missed_counter = 0
#         for r,gene_group_breakpoints in breakpoint_df.groupby('Gene'):
#             for k,row in gene_group_breakpoints.iterrows():
#                 # look at whether the current row start is later than the last row end
#                 if row['Block'] > 1:
#                     if row['Mutagenesis Start'] > end:
#                         missed_counter += 1
#                         print('Mutagenic window missed at ' + str(r) + ' block ' + str(row['Block']))
#                 start,end = row['Mutagenesis Start'],row['Mutagenesis End']
#         if missed_counter == 0:
#             print('All mutagenic windows overlap!')
#         else:
#             print(str(missed_counter) + ' number of times the mutagenic window does not close!')
                
#     #Remove any oligos with additional BsaI sites or paqcI sites
#     bad_oligos = []
#     for name,oligo in oligo_array.items():
#         #check for paqcI sites
#         paqcI_F = sum([True for kmer in build_kmers(oligo, len(paqcI_seq)) if kmer==paqcI_seq])
#         paqcI_R = sum([True for kmer in build_kmers(oligo.reverse_complement(), len(paqcI_seq)) if kmer==paqcI_seq])
#         #check that oligo is block 1 if it contains a paqcI site in the forward orientation 
#         if paqcI_F > 0:
#             if ('block1' not in name) | (paqcI_F > 1):
#                 bad_oligos.append(name)
#         #check that the oligo is block final if it contains a paqcI site in the reverse orientation
#         if paqcI_R > 0:
#             if ('block'+str(num_blocks[name.split('_')[0]]) not in name) | (paqcI_R > 1):
#                 bad_oligos.append(name)
#         #check for more than one BsaI site
#         bsaI_F = sum([True for kmer in build_kmers(oligo, len(bsaI_seq)) if kmer==bsaI_seq])
#         bsaI_R = sum([True for kmer in build_kmers(oligo.reverse_complement(), len(bsaI_seq)) if kmer==bsaI_seq])
#         if (bsaI_F != 1) | (bsaI_R != 1):
#             bad_oligos.append(name)
#     bad_oligos=np.unique(bad_oligos)
#     for oligo_name in bad_oligos:
#         del oligo_array[oligo_name]
#     print(str(len(bad_oligos)) + ' oligos deleted due to errant restriction sites.')
    
#     #Remove any duplicate oligos
#     new_dict = {}
#     seen_values = set()
#     counter=0
#     for key, value in oligo_array.items():
#         if value not in seen_values:
#             new_dict[key] = value
#             seen_values.add(value)
#         else:
#             counter += 1
#     print(str(counter) + ' oligos removed due to duplication.')
#     oligo_array = new_dict
#     del new_dict
    
#     #write oligo array to file
#     with open(oligo_file, 'w') as f:
#         for key in oligo_array.keys():
#             f.write("%s,%s\n"%(key,oligo_array[key]))
#     f.close()
            
#     #write primers to file
#     if find_pcr_primers:
#         primer_order_sheet = []
#         for key in amp_primers.keys():
#             primer_order_sheet.append(key + '\t' + \
#                      str(amp_primers[key]) + \
#                      '\t' + '25nm' + '\t' + 'STD\n')
#         print(*primer_order_sheet)
#         with open(primer_file, 'w') as f:
#             for line in primer_order_sheet:
#                 f.write(line)
#         f.close()
    
#     #write amplification primer key to file
#     amp_primer_key = ['Gene' + '\t' + 'Block' + '\t' + \
#                       'Forward Primer Well' + '\t' + 'Forward Primer' + '\t' + \
#                       'Reverse Primer Well' + '\t' + 'Reverse Primer' + '\t' + 'Validated' + '\n']
#     for key in amp_primer_dict.keys():
#         genename, geneblock = key[0], str(key[1])
#         name_primer_F, primer_F, name_primer_R, primer_R, validated = amp_primer_dict[key]
#         amp_primer_key.append(genename + '\t' + geneblock + '\t' + \
#                  name_primer_F + '\t' + primer_F + '\t' + \
#                  name_primer_R + '\t' + primer_R + '\t' + str(validated) + '\n')
#     print(*amp_primer_key)
#     with open(amp_primer_key_file, 'w') as f:
#         for line in amp_primer_key:
#             f.write(line)
#     f.close()
    
#     #write breakpoint dict to file
#     breakpoint_df.to_csv(breakpoint_file, sep='\t')
    
#     #write gblocks to file
#     gblock_order_sheet = []
#     gblock_large_order_sheet = []
#     for key in gblocks.keys():
#         # pad gblock if it is not 300bp for Twist
#         if len(gblocks[key]) < gblock_min_size:
#             gblocks[key] = Seq(randomsequencepad[0:(gblock_min_size-len(gblocks[key]))]) + gblocks[key]
#         if len(gblocks[key]) < gblock_large_threshold:
#             gblock_order_sheet.append(key + '\t' + \
#                      str(gblocks[key]) + '\n')
#         else:
#             gblock_large_order_sheet.append(key + '\t' + \
#                      str(gblocks[key]) + '\n')
#     print(*gblock_order_sheet)
#     print(*gblock_large_order_sheet)
#     with open(gbl_file, 'w') as f:
#         for line in gblock_order_sheet:
#             f.write(line)
#     f.close()
#     with open(gbl_large_file, 'w') as f:
#         for line in gblock_large_order_sheet:
#             f.write(line)
#     f.close()
    
#     return oligo_array,amp_primers,gblocks,amp_primer_dict,breakpoint_df
                

In [31]:
#redefine functions for cterm tagging (again) 20250613
def post_qc(amp_primer_set, wt_oligos, primer_set, melt_temp_threshold = 35, check_all_primers=True):
    print("Running QC for primer specificity on WT oligos")
    f_primer_map = {}
    r_primer_map = {}
    # invert the primer to subpool map
    for k, v in amp_primer_set.items():
        f_primer_map[v[1]] = f_primer_map.get(v[1], []) + [k]
        r_primer_map[v[3]] = r_primer_map.get(v[3], []) + [k]
    
    # initialize list of nonspecific problems
    nonspecific = {}
    
    # add unused primers if check_all_primers
    if check_all_primers:
        all_f_primers = np.unique(primer_set['Forward Primer'])
        all_r_primers = np.unique(primer_set['Reverse Primer'])
        for f_primer in all_f_primers:
            if f_primer not in f_primer_map.keys():
                f_primer_map[f_primer] = []
        for r_primer in all_r_primers:
            if r_primer not in r_primer_map.keys():
                r_primer_map[r_primer] = []
        
    for f_primer, subpools_used in f_primer_map.items():
    # iterate over every barcode primer pair and match to each oligo to check for nonspecific amplification
        anneal_locs = []
        for subpoolcheck, fragmentcheck in wt_oligos.items():  # iterate over every WT oligo
            if (subpoolcheck not in subpools_used):  # ignore designed annealing (same name)
                if check_nonspecific(f_primer, fragmentcheck, Tm_rem = melt_temp_threshold, verbose=False) > 0: #use high Tm_rem
                    anneal_locs.append(subpoolcheck)
        if anneal_locs:
            nonspecific.update({f_primer:[a[0] + '_block' + str(a[1]+1) for a in anneal_locs]})
    for r_primer, subpools_used in r_primer_map.items():
    # iterate over every barcode primer pair and match to each oligo to check for nonspecific amplification
        anneal_locs = []
        for subpoolcheck, fragmentcheck in wt_oligos.items():  # iterate over every WT oligo
            if (subpoolcheck not in subpools_used):  # ignore designed annealing (same name)
                if check_nonspecific(r_primer, fragmentcheck, Tm_rem = melt_temp_threshold, verbose=False) > 0: #use high Tm_rem
                    anneal_locs.append(subpoolcheck)
        if anneal_locs:
            nonspecific.update({r_primer:[a[0] + '_block' + str(a[1]+1) for a in anneal_locs]})
    if nonspecific:
        print("Nonspecific Primers: (Manually removing primer sequence recommended)")
        print(nonspecific)
    else:
        print("No non-specific primers detected")
        
    return nonspecific

def build_kmers(sequence, 
                ksize):
    kmers = []
    n_kmers = len(sequence) - ksize + 1

    for i in range(n_kmers):
        kmer = sequence[i:i + ksize]
        kmers.append(kmer)

    return kmers

def compute_overlaps(breakpoints, 
                     inclusion_array, 
                     gene):
    
    overlaps = [[gene[val:val+4].reverse_complement(), gene[val:val+4]] for val in breakpoints]
    counter = 0
    for val in inclusion_array:
        if val == -1:
            (overlaps[counter][1],overlaps[counter+1][0]) = (overlaps[counter+1][0],overlaps[counter][1])
            counter += 1
        elif val == 0:
            overlaps[counter][1] = overlaps[counter+1][1]
            del overlaps[counter+1]
        
    return overlaps

def score_breakpoints(gene, 
                      breakpoint_pair, 
                      empirical, 
                      overhang_blacklist=overhang_blacklist):
    
    #subset empirical matrix by the set of all overlaps
    all_overlaps = []
    for breakpoint in breakpoint_pair:
        all_overlaps.append(gene[breakpoint:(breakpoint+4)])
        all_overlaps.append(gene[breakpoint:(breakpoint+4)].reverse_complement())
    all_overlaps = [str(o) for o in all_overlaps]
    if (len(np.unique(all_overlaps)) == len(all_overlaps)) & (len(set(all_overlaps).intersection(set(overhang_blacklist))) == 0):
        empirical_subset = empirical.loc[all_overlaps,all_overlaps]

        #compute fidelity score
        empirical_subset = empirical_subset/empirical_subset.sum(axis=1)
        fidelity_score = 1
        for breakpoint in breakpoint_pair:
            fidelity_score = fidelity_score * \
                empirical_subset.loc[str(gene[breakpoint:(breakpoint+4)]),
                                     str(gene[breakpoint:(breakpoint+4)].reverse_complement())]
    
    else:
        fidelity_score = 0
    
    return fidelity_score
    
def optimize_breakpoints(gene, 
                         breakpoint_pair, 
                         indices_to_shift, 
                         indices_of_array,
                         slack, 
                         empirical=bsaI_empirical, 
                         overhang_blacklist=overhang_blacklist):
    
    #compute all enrichments
    shifts = list(range(-slack,slack+1))
    if (len(indices_to_shift) > 2) | (len(indices_to_shift) < 1):
        print('Error -- too many or too few breakpoints!')
        optimum_breakpoint = breakpoint_pair
        optimum_score = 0
    elif (len(indices_to_shift) == 1): #external pair
        scores = [0]*len(shifts)
        for i,shift in enumerate(shifts):
            scores[i] = score_breakpoints(gene, breakpoint_pair[0:indices_to_shift[0]] + [breakpoint_pair[indices_to_shift[0]]+shift] + breakpoint_pair[(indices_to_shift[0]+1):], 
                                          empirical=bsaI_empirical, overhang_blacklist=overhang_blacklist)
        
        optimum_shift = np.argmax(scores)
        optimum_breakpoint = breakpoint_pair[0:indices_to_shift[0]] + [breakpoint_pair[indices_to_shift[0]]+shifts[optimum_shift]] + breakpoint_pair[(indices_to_shift[0]+1):]
        optimum_score = scores[optimum_shift]
        optimum_length = optimum_breakpoint[indices_of_array[1]] - optimum_breakpoint[indices_of_array[0]]
            
            
    else: #internal pair
        indices_to_shift = sorted(indices_to_shift)
        scores = np.zeros((len(shifts),len(shifts)))
        for i,shift1 in enumerate(shifts):
            for j,shift2 in enumerate(shifts):
                scores[i,j] = score_breakpoints(gene, breakpoint_pair[0:indices_to_shift[0]] + [breakpoint_pair[indices_to_shift[0]]+shift1] + \
                                                            breakpoint_pair[(indices_to_shift[0]+1):indices_to_shift[1]] + \
                                                            [breakpoint_pair[indices_to_shift[1]]+shift2] + breakpoint_pair[(indices_to_shift[1]+1):], 
                                              empirical=bsaI_empirical, overhang_blacklist=overhang_blacklist)
                
        optimum_shift = np.unravel_index(np.argmax(scores,axis=None), scores.shape)
        optimum_breakpoint = breakpoint_pair[0:indices_to_shift[0]] + [breakpoint_pair[indices_to_shift[0]]+shifts[optimum_shift[0]]] + \
                                            breakpoint_pair[(indices_to_shift[0]+1):indices_to_shift[1]] + \
                                            [breakpoint_pair[indices_to_shift[1]]+shifts[optimum_shift[1]]] + breakpoint_pair[(indices_to_shift[1]+1):]
        optimum_score = scores[optimum_shift]
        optimum_length = optimum_breakpoint[indices_of_array[1]] - optimum_breakpoint[indices_of_array[0]] + 4
    
    return optimum_breakpoint, optimum_score, optimum_length

def optimize_gene(gene, 
                  max_tile_size,
                  first_last_block_reduction,
                  block_size_range=block_size_range, 
                  slack=slack, 
                  empirical=bsaI_empirical, 
                  overhang_blacklist=overhang_blacklist): 
    
    #setup initial inputs to optimization
    gene_size = len(gene)
    protein_size = len(gene.translate())
        
    #exclude gene if it is too big
    if protein_size > 1000:
        print('Protein size too big!')
        
    #divide genes between 500 and 1000aa into two blocks
    elif protein_size > 620:
        print('Protein size too big! Will add two superblock (620aa+ proteins) soon.')
        
    else:
        #gene is one superblock 
        #print('Protein is one superblock.')
        block_size = block_size_range[0] + np.argmin(
            [abs((gene_size+2*first_last_block_reduction)/(i+block_size_range[0]) - \
                     round((gene_size+2*first_last_block_reduction)/(i+block_size_range[0]))) \
                 for i in range(0, block_size_range[1]-block_size_range[0])])
        
        
        # now, set initial breakpoints
        fragment_number = int((gene_size+2*first_last_block_reduction)/block_size)
        
        # if any of the tiles are too big?
        tile_lengths = [1000]
        while max(tile_lengths) > (max_tile_size-2*slack):
            fragment_number = fragment_number + 1
            first_breakpoint = 0
            last_breakpoint = gene_size-4
            step = (last_breakpoint - first_breakpoint + 2*first_last_block_reduction)/fragment_number
            evenly_spaced_floats = [first_breakpoint] + [step * i - first_last_block_reduction for i in range(1,fragment_number)] + [last_breakpoint]        
            initial_breakpoints = [[first_breakpoint,int(evenly_spaced_floats[1])+slack+2,last_breakpoint]] + \
                                    [[0,int(evenly_spaced_floats[i])-2-slack,int(evenly_spaced_floats[i+1])+slack+2,last_breakpoint] for i in range(1,len(evenly_spaced_floats)-2)] + \
                                    [[0,int(evenly_spaced_floats[-2])-2-slack,last_breakpoint]]
            tile_lengths = [int(evenly_spaced_floats[1])+slack+2-first_breakpoint+first_last_block_reduction+4] + \
                                    [int(evenly_spaced_floats[i+1])-int(evenly_spaced_floats[i])+2*(slack+2)+4 for i in range(1,len(evenly_spaced_floats)-2)] + \
                                    [last_breakpoint+4-int(evenly_spaced_floats[-2])+slack+2+first_last_block_reduction]
            

        #optimize each breakpoint
        optimum_breakpoints = []
        optimum_scores = []
        optimum_lengths = []
        oligo_array_indices = []
        for k,breakpoint in enumerate(initial_breakpoints):
            if len(breakpoint) == 3:
                indices_of_array = [0, 1] if k==0 else [1, 2]
                optimum_breakpoint, optimum_score, optimum_length = optimize_breakpoints(gene, breakpoint, [1], indices_of_array,
                                                            slack, empirical=bsaI_empirical, overhang_blacklist=overhang_blacklist)
                optimum_breakpoints.append(optimum_breakpoint)
                optimum_scores.append(optimum_score)
                optimum_lengths.append(optimum_length)
                oligo_array_indices.append(indices_of_array)
            else:
                indices_of_array = [1, 2]
                optimum_breakpoint, optimum_score, optimum_length = optimize_breakpoints(gene, breakpoint, [1, 2], indices_of_array,
                                                            slack, empirical=bsaI_empirical, overhang_blacklist=overhang_blacklist)
                optimum_breakpoints.append(optimum_breakpoint)
                optimum_scores.append(optimum_score)
                optimum_lengths.append(optimum_length)
                oligo_array_indices.append(indices_of_array)
    
    optimum_overlaps = [[str(gene[t:(t+4)]) for t in s] for s in optimum_breakpoints]
    if all([s >= 0.95 for s in optimum_scores]):
        print('All regions are high fidelity!')
    elif all([s >= 0.9 for s in optimum_scores]):
        print('Some regions are medium fidelity.')
    else:
        print('Some regions are low fidelity. Look closer')
        
    return optimum_breakpoints, optimum_overlaps, optimum_scores, optimum_lengths, oligo_array_indices

def generate_primer(DNA_seq,
                     Fwd=True,
                     extendtoCG=False,
                     smallest_primer_size=16,
                     largest_primer_size=30,
                     Tm=55):
    
    #Setup melting temperature arrays
    melt_temp_array = np.zeros(largest_primer_size-smallest_primer_size+1)
    
    if Fwd:
        DNA_seq_touse = DNA_seq
    else:
        DNA_seq_touse = DNA_seq.reverse_complement()
            
    #Make melting temperature arrays
    primer_length = 0
    for i in range(smallest_primer_size,largest_primer_size+1):
        melt_temp_array[i-smallest_primer_size] = mt.Tm_NN(DNA_seq_touse[0:i])
        
        #Pick F primer when Tm is first >F_Tm
        if (melt_temp_array[i-smallest_primer_size] >= Tm) & (primer_length==0):
            primer_length = i
    
    #If Tm isnt high enough after max bases, just set primer length to be max and hope it works
    if (primer_length == 0):
        primer_length = largest_primer_size
        
    if extendtoCG:
        while ((DNA_seq_touse[primer_length-1] == 'A') | (DNA_seq_touse[primer_length-1] == 'T')) & \
                    (primer_length < largest_primer_size):
            primer_length += 1
    
    return DNA_seq_touse[0:primer_length]

def make_mutations(region_name,
                       region,
                       region_flanks=[Seq(''),Seq('')],
                       nt_start=0, #zero-indexed!
                       wt_only=False,
                       synonymous=True,
                       stops='TAA',
                       all3ntdeletions=True,
                       mutation_list=False,
                       codons_ranked_by_usage=codons_ranked_by_usage,
                       aa_start=0):
                       
    oligo_array = {}
    #Check that region has size divisible by three
    if (len(region)/3 != len(region)//3) | (nt_start/3 != nt_start//3):
        print('Region is not translatable!')
        
    else:
        #add wt seq to oligo array
        oligo_name = region_name + '_WT'
        wt_seq = \
            region_flanks[0] + region + region_flanks[1]
        oligo_array[oligo_name] = wt_seq
        
        if not wt_only:
            
            # see if a mutation list was given
            if mutation_list == False:
                    
                #loop over amino acids
                for j in range(0,len(region),3):

                    #add all missense variants
                    aa = region[j:(j+3)].translate()
                    for aa_to in codons_ranked_by_usage.keys():
                        if aa_to != aa:
                            oligo_name = region_name + '_' + str(aa) + str((nt_start+j)//3+1+aa_start) + str(aa_to)
                            seq_to_append = \
                                region_flanks[0] + \
                                region[0:j] + Seq(codons_ranked_by_usage[aa_to][0]) + \
                                region[(j+3):] + \
                                region_flanks[1]
                            oligo_array[oligo_name] = seq_to_append

                    #add synonymous variant if True and if possible, 
                    # using the most common codon that is NOT the codon in the gene
                    if synonymous:
                        if len(codons_ranked_by_usage[aa]) > 1:
                            oligo_name = region_name + '_' + str(aa) + str((nt_start+j)//3+1+aa_start) + str(aa)
                            possible_codons = codons_ranked_by_usage[aa].copy()
                            possible_codons.remove(region[j:(j+3)])
                            seq_to_append = \
                                region_flanks[0] + \
                                region[0:j] + Seq(possible_codons[0]) + \
                                region[(j+3):] + \
                                region_flanks[1]
                            oligo_array[oligo_name] = seq_to_append

                    #add stops if true
                    if stops:
                        oligo_name = region_name + '_' + str(aa) + str((nt_start+j)//3+1+aa_start) + 'X'
                        seq_to_append = \
                            region_flanks[0] + \
                            region[0:j] + Seq(stops) + \
                            region[(j+3):] + \
                            region_flanks[1]
                        oligo_array[oligo_name] = seq_to_append

                    #add all 3nt deletions if True
                    if all3ntdeletions:
                        for k in range(0,3):
                            if j+k+3 <= len(region):
                                oligo_name = region_name + '_' + 'del' + str(nt_start+j+k+1+3*aa_start)
                                seq_to_append = \
                                    region_flanks[0] + \
                                    region[0:(j+k)] + \
                                    region[(j+k+3):] + \
                                    region_flanks[1]
                                oligo_array[oligo_name] = seq_to_append
                                
            else:
                
                #loop over mutation list
                for i in range(len(mutation_list)):
                        
                    #iterate over every single aa change
                    oligo_name = region_name + '_' + 'variant' + str(i+1)
                    seq_to_append = region
                    for k,v in enumerate(mutation_list[i]):
                        aa_from = v[0]
                        aa_to = v[-1]
                        pos = int(v[1:-1])
                        j=3*(pos-(nt_start//3+1))
                        aa = region[j:(j+3)].translate()
                        if aa != aa_from:
                            print('Check mutation list!')
                        else:
                            seq_to_append = \
                                seq_to_append[0:j] + \
                                Seq(codons_ranked_by_usage[aa_to][0]) + \
                                seq_to_append[(j+3):]

                    # append oligo to array
                    seq_to_append = region_flanks[0] + \
                                    seq_to_append + \
                                    region_flanks[1]
                    oligo_array[oligo_name] = seq_to_append
        
    return oligo_array


def write_oligo_library(genes,
                        oligo_file='./oligo_test.csv',
                        primer_file='./primer_test.tsv',
                        gbl_file='./gbl_test.tsv',
                        gbl_large_file='./gbl_test_large.tsv',
                        amp_primer_key_file='./amp_primer_key.tsv',
                        breakpoint_file='./breakpoints.tsv',
                        primer_set=orthogonal_primers_touse,
                        codons_ranked_by_usage=codons_ranked_by_usage,
                        block_size_range=block_size_range, 
                        max_oligo_size=max_oligo_size,
                        slack=slack, 
                        empirical=bsaI_empirical, 
                        overhang_blacklist=overhang_blacklist,
                        validated_primer_set=False,
                        aa_start=False,
                        wt_only=False,
                        synonymous=True,
                        stops='TAA',
                        all3ntdeletions=True,
                        mutations_to_use=False,
                        find_pcr_primers=True,
                        smallest_primer_size=16,
                        largest_primer_size=30,
                        Tm=55,
                        extendtoCG=True,
                        bsaI_firstoverlap='CGTC',
                        bsaI_lastoverlap='GCAT',
                        all_blocks=True,
                        blocks_to_include=False,
                        tile_boundaries=False,
                        paqcIcapF=True,
                        paqcIcapR=True,
                        check_all_primers=True,
                        qc_melt_temp_threshold=32,
                        gblock_min_size=300,
                        gblock_large_threshold=1000,
                        randomsequencepad=randomsequencepad):
    
    #Split up primer set into F and R primers, cannot do more than 82 sublibraries
    oligo_primer_counter = 0
    oligo_array = {}
    amp_primers = {}
    gblocks = {}
    num_blocks = {}
    amp_primer_dict = {}
    breakpoint_dict = {}
    
    #Convert genes to Seq and genes to list
    gene_names = list(genes.keys())
    genes = [Seq(genes[gene_name].upper()) for gene_name in gene_names]
    
    #PaqCI and BsaI site sequences and overhangs for paqcI
    paqcI_seq = Seq('CACCTGC')
    paqcI_overhang_nterm_ctag = Seq('CCACC')
    paqcI_overhang_cterm_ctag = Seq('ACGGG')
    paqcI_overhang_nterm_ntag = Seq('TGGC')
    paqcI_overhang_cterm_ntag = Seq('TAGG')
    paqcI_seqplusfour = Seq('CACCTGCCTAG')
    bsaI_seq = Seq('GGTCTC')
    bsaI_seqplusone = Seq('GGTCTCT')
    pcr_capseq = Seq('GGCTAC') + bsaI_seqplusone
    gbl_capseq_F = Seq('CCGCGTGATTACGAGTCG') + pcr_capseq
    gbl_capseq_R = Seq('GGGTTAGCAAGTGGCAGCCT') + pcr_capseq
    
    # set max size of a tile
    primer_len = len(primer_set['Forward Primer'][0])
    max_tile_size = max_oligo_size - 2*primer_len - 2*len(bsaI_seqplusone)
    first_last_block_reduction = len(paqcI_seqplusfour)
    
    # iterate over genes
    for r,gene in enumerate(genes):
        
        gene_name = gene_names[r]
        print('Processing gene ' + str(r+1) + ': ' + gene_name)
        
        # amino acid to start at
        if aa_start != False:
            if gene_name in aa_start.keys():
                aa_start_gene=aa_start[gene_name]-1
            else:
                aa_start_gene=0
        else:
            aa_start_gene=0
        
        #exclude if gene size is not divisible by three
        if len(gene)/3 != len(gene)//3:
            print('Gene length is not divisible by 3!')
    
        #exclude if there is a paqcI site in the gene
        elif any([True for kmer in build_kmers(gene, len(paqcI_seq)) if kmer==paqcI_seq]) | \
            any([True for kmer in build_kmers(gene.reverse_complement(), len(paqcI_seq)) if kmer==paqcI_seq]):
            print('Gene has paqcI site!')
        
        #exclude if there is a BsaI site in the gene
        elif any([True for kmer in build_kmers(gene, len(bsaI_seq)) if kmer==bsaI_seq]) | \
            any([True for kmer in build_kmers(gene.reverse_complement(), len(bsaI_seq)) if kmer==bsaI_seq]):
            print('Gene has BsaI site!')
            
        else:
            print('Gene has no paqcI or BsaI sites! Performing GoldenGate optimization...')
            
            #cap gene with BsaI breakpoints and possible paqcI sites 
            if paqcIcapF & paqcIcapR:
                gene_capped = bsaI_firstoverlap + paqcI_seqplusfour +  paqcI_overhang_nterm_ctag  +\
                        gene + paqcI_overhang_cterm_ctag + paqcI_seqplusfour.reverse_complement() + \
                        bsaI_lastoverlap
                capping_length_F = len(bsaI_firstoverlap + paqcI_seqplusfour + paqcI_overhang_nterm_ctag)
                capping_length_R = len(paqcI_overhang_cterm_ctag + paqcI_seqplusfour.reverse_complement() + \
                                   bsaI_lastoverlap)
            elif paqcIcapF:
                gene_capped = bsaI_firstoverlap + paqcI_seqplusfour + paqcI_overhang_nterm_ctag + \
                        gene + bsaI_lastoverlap
                capping_length_F = len(bsaI_firstoverlap + paqcI_seqplusfour + paqcI_overhang_nterm_ctag)
                capping_length_R = len(bsaI_lastoverlap)
            elif paqcIcapR:
                gene_capped = bsaI_firstoverlap + \
                        gene + paqcI_overhang_cterm_ctag + paqcI_seqplusfour.reverse_complement() + bsaI_lastoverlap
                capping_length_F = len(bsaI_firstoverlap)
                capping_length_R = len(paqcI_overhang_cterm_ctag + paqcI_seqplusfour.reverse_complement() + \
                                   bsaI_lastoverlap)
            else:
                gene_capped = bsaI_firstoverlap + gene + bsaI_lastoverlap
                capping_length_F = len(bsaI_firstoverlap)
                capping_length_R = len(bsaI_lastoverlap)
            
            #Optimize gene if tile boundaries are not given
            if tile_boundaries is False:
                optimum_breakpoints, optimum_overlaps, optimum_scores, optimum_lengths, oligo_array_indices = \
                optimize_gene(gene_capped, 
                          max_tile_size=max_tile_size,
                          first_last_block_reduction=first_last_block_reduction,
                          block_size_range=block_size_range, 
                          slack=slack, 
                          empirical=bsaI_empirical, 
                          overhang_blacklist=overhang_blacklist)
                pprint.pprint({'Optimum Breakpoints': optimum_breakpoints, 
                       'Optimum Overlaps': optimum_overlaps, 
                       'Optimum Scores': optimum_scores})
                num_blocks[gene_name] = len(optimum_breakpoints)
            else:
                optimum_breakpoints = tile_boundaries[gene_name]
                if len(optimum_breakpoints[0])==3:
                    if len(optimum_breakpoints)>1:
                        #multiple tiles, including one at beginning of gene
                        oligo_array_indices = [[0,1]] + [[1,2]]*(len(optimum_breakpoints)-1)
                    else:
                        # one tile
                        if ((optimum_breakpoints[1]-optimum_breakpoints[0]) > (optimum_breakpoints[2]-optimum_breakpoints[1])):
                            # at end of gene
                            oligo_array_indices = [[1,2]]
                        else:
                            # at beginning of gene
                            oligo_array_indices = [[0,1]]
                else:
                    # multiple tiles, starting in the middle
                    oligo_array_indices = [[1,2]]*(len(optimum_breakpoints))
                num_blocks[gene_name] = len(optimum_breakpoints)
                
            
            #add primers for gene_F and gene_R that are repeated constantly throughout the PCRs
            #note: should probably prevalidate these primers!
            if find_pcr_primers:
                F_primer = generate_primer(gene,
                                           Fwd=True,
                                           extendtoCG=extendtoCG,
                                           smallest_primer_size=smallest_primer_size,
                                           largest_primer_size=largest_primer_size,
                                           Tm=Tm)
                F_primer = pcr_capseq + bsaI_firstoverlap + paqcI_seqplusfour + F_primer
                amp_primers[gene_name+'_gene'+'_ampF'] = F_primer
                R_primer = generate_primer(gene,
                                           Fwd=False,
                                           extendtoCG=extendtoCG,
                                           smallest_primer_size=smallest_primer_size,
                                           largest_primer_size=largest_primer_size,
                                           Tm=Tm)
                R_primer = pcr_capseq + Seq(bsaI_lastoverlap).reverse_complement() + \
                            paqcI_seqplusfour + R_primer
                amp_primers[gene_name+'_gene'+'_ampR'] = R_primer
            
            #make oligos, primers, gblocks for each block
            for i,breakpoint in enumerate(optimum_breakpoints):
                
                #find indices of breakpoint that correspond to oligo vs need to be PCRed/gblock
                pcr_indices = [[j,j+1] for j in range(len(breakpoint)-1)]
                pcr_indices.remove(oligo_array_indices[i])
                
                #find mutagenic window of oligo
                oligo_breaks = [breakpoint[j] for j in oligo_array_indices[i]]
                oligo_mutagenic_window = [int(3*np.ceil(max(oligo_breaks[0]+4-capping_length_F,3)/3)), int(3*np.floor(min(oligo_breaks[1]-capping_length_F,len(gene)-1)/3))]
                
                #subset the right block if needed
                if (all_blocks == True) | ((i+1) in blocks_to_include[r] if blocks_to_include != False else True): #subset on allowed blocks
                
                    #add pcr primers and gblocks, one segment at a time
                    for k,pcr_index in enumerate(pcr_indices):
                        piece_name = gene_name + '_block' + str(i+1) + '_s' + str(k+1)
                        pcr_breaks = [breakpoint[j] for j in pcr_index]
                                            
                        #get pcr primers
                        if find_pcr_primers:
                            if pcr_breaks[0] == breakpoint[0]: #Fragment beginning at gene start 
                                R_primer = generate_primer(gene_capped[pcr_breaks[0]:(pcr_breaks[1]+4)],
                                                           Fwd=False,
                                                           extendtoCG=extendtoCG,
                                                           smallest_primer_size=smallest_primer_size,
                                                           largest_primer_size=largest_primer_size,
                                                           Tm=Tm)
                                R_primer = pcr_capseq + R_primer
                                amp_primers[piece_name+'_ampR'] = R_primer
                            elif pcr_breaks[1] == breakpoint[-1]: #Fragment ending at gene end
                                F_primer = generate_primer(gene_capped[pcr_breaks[0]:(pcr_breaks[1]+4)],
                                                           Fwd=True,
                                                           extendtoCG=extendtoCG,
                                                           smallest_primer_size=smallest_primer_size,
                                                           largest_primer_size=largest_primer_size,
                                                           Tm=Tm)
                                F_primer = pcr_capseq + F_primer
                                amp_primers[piece_name+'_ampF'] = F_primer
                            else:
                                F_primer = generate_primer(gene_capped[pcr_breaks[0]:(pcr_breaks[1]+4)],
                                                           Fwd=True,
                                                           extendtoCG=extendtoCG,
                                                           smallest_primer_size=smallest_primer_size,
                                                           largest_primer_size=largest_primer_size,
                                                           Tm=Tm)
                                F_primer = pcr_capseq + F_primer
                                R_primer = generate_primer(gene_capped[pcr_breaks[0]:(pcr_breaks[1]+4)],
                                                           Fwd=False,
                                                           extendtoCG=extendtoCG,
                                                           smallest_primer_size=smallest_primer_size,
                                                           largest_primer_size=largest_primer_size,
                                                           Tm=Tm)
                                R_primer = pcr_capseq + R_primer
                                amp_primers[piece_name+'_ampF'] = F_primer
                                amp_primers[piece_name+'_ampR'] = R_primer

                        #make gblocks
                        gbl = gbl_capseq_F + gene_capped[pcr_breaks[0]:(pcr_breaks[1]+4)] + gbl_capseq_R.reverse_complement()
                        gblocks[piece_name] = gbl
                        
                    #add oligos to oligo array, checking first for validated primers if they are given
                    validated=False
                    if validated_primer_set is not False:
                        validated_combo = validated_primer_set.query('Gene == @gene_name & Block == (@i+1)')
                        if not validated_combo.empty:
                            validated=True
                            name_primer_F, primer_F, name_primer_R, primer_R = \
                                validated_combo[['Forward Name',
                                                  'Forward Primer',
                                                  'Reverse Name',
                                                  'Reverse Primer']].values[0]
                        else:
                            name_primer_F, primer_F, name_primer_R, primer_R = \
                                primer_set.iloc[oligo_primer_counter,][['Forward Name',
                                                                      'Forward Primer',
                                                                      'Reverse Name',
                                                                      'Reverse Primer']]
                            oligo_primer_counter += 1
                    else:
                        name_primer_F, primer_F, name_primer_R, primer_R = \
                                primer_set.iloc[oligo_primer_counter,][['Forward Name',
                                                                      'Forward Primer',
                                                                      'Reverse Name',
                                                                      'Reverse Primer']]
                        oligo_primer_counter += 1
                        
                    # if mutations are given, use those - otherwise make all mutations or wtonly
                    if mutations_to_use == False:
                        add_on_array = make_mutations(gene_name + '_block' + str(i+1),
                                           gene[oligo_mutagenic_window[0]:oligo_mutagenic_window[1]],
                                           region_flanks=[Seq(primer_F) + \
                                                          bsaI_seqplusone + \
                                                          gene_capped[oligo_breaks[0]:(oligo_mutagenic_window[0]+capping_length_F)] ,
                                                          gene_capped[(oligo_mutagenic_window[1]+capping_length_F):(oligo_breaks[1]+4)] + \
                                                          bsaI_seqplusone.reverse_complement() + \
                                                          Seq(primer_R).reverse_complement()],
                                           nt_start=oligo_mutagenic_window[0],
                                           wt_only=wt_only,
                                           synonymous=synonymous,
                                           stops=stops,
                                           all3ntdeletions=all3ntdeletions,
                                           codons_ranked_by_usage=codons_ranked_by_usage,
                                           aa_start=aa_start_gene)
                    else:
                        add_on_array = make_mutations(gene_name + '_block' + str(i+1),
                                           gene[oligo_mutagenic_window[0]:oligo_mutagenic_window[1]],
                                           region_flanks=[Seq(primer_F) + \
                                                          bsaI_seqplusone + \
                                                          gene_capped[oligo_breaks[0]:(oligo_mutagenic_window[0]+capping_length_F)] ,
                                                          gene_capped[(oligo_mutagenic_window[1]+capping_length_F):(oligo_breaks[1]+4)] + \
                                                          bsaI_seqplusone.reverse_complement() + \
                                                          Seq(primer_R).reverse_complement()],
                                           nt_start=oligo_mutagenic_window[0],
                                           mutation_list=mutations_to_use[(gene_name,i+1)],
                                           codons_ranked_by_usage=codons_ranked_by_usage,
                                           aa_start=aa_start_gene)
                    oligo_array.update(add_on_array)
                    amp_primer_dict.update({(gene_name,i+1): (name_primer_F,primer_F,name_primer_R,primer_R,validated)})
                    breakpoint_dict.update({(gene_name,i+1): oligo_mutagenic_window})
                    
    #Check that max oligo is less than the max oligo length
    if sum([len(s)>max_oligo_size for s in oligo_array.values()]) == 0:
        print('All oligos are below the maximum 250bp!')
    else:
        print('Some oligos are TOO BIG!')
        
    #Check for nonspecific amplification
    wt_oligos = {tuple([key.split('_')[0],
                        int((key.split('_block')[1]).split('_')[0])]
                      ):oligo_array[key] \
                     for key in oligo_array.keys() if 'WT' in key}
    nonspecific_primers = post_qc(amp_primer_dict, 
                                  wt_oligos,
                                  primer_set, 
                                  melt_temp_threshold=qc_melt_temp_threshold,
                                  check_all_primers=check_all_primers)
    
    # Check that all mutagenic windows overlap
    breakpoint_df = pd.DataFrame.from_dict(breakpoint_dict, orient='index', columns=['Mutagenesis Start','Mutagenesis End'])
    breakpoint_df.index = pd.MultiIndex.from_tuples(breakpoint_dict.keys())
    breakpoint_df = breakpoint_df.reset_index().rename(columns={'level_0':'Gene',
                                                                'level_1':'Block'})
    if (mutations_to_use == False) and (all_blocks == True):
        missed_counter = 0
        for r,gene_group_breakpoints in breakpoint_df.groupby('Gene'):
            for k,row in gene_group_breakpoints.iterrows():
                # look at whether the current row start is later than the last row end
                if row['Block'] > 1:
                    if row['Mutagenesis Start'] > end:
                        missed_counter += 1
                        print('Mutagenic window missed at ' + str(r) + ' block ' + str(row['Block']))
                start,end = row['Mutagenesis Start'],row['Mutagenesis End']
        if missed_counter == 0:
            print('All mutagenic windows overlap!')
        else:
            print(str(missed_counter) + ' number of times the mutagenic window does not close!')
                
    #Remove any oligos with additional BsaI sites or paqcI sites
    bad_oligos = []
    for name,oligo in oligo_array.items():
        #check for paqcI sites
        paqcI_F = sum([True for kmer in build_kmers(oligo, len(paqcI_seq)) if kmer==paqcI_seq])
        paqcI_R = sum([True for kmer in build_kmers(oligo.reverse_complement(), len(paqcI_seq)) if kmer==paqcI_seq])
        #check that oligo is block 1 if it contains a paqcI site in the forward orientation 
        if paqcI_F > 0:
            if ('block1' not in name) | (paqcI_F > 1):
                bad_oligos.append(name)
        #check that the oligo is block final if it contains a paqcI site in the reverse orientation
        if paqcI_R > 0:
            if ('block'+str(num_blocks[name.split('_')[0]]) not in name) | (paqcI_R > 1):
                bad_oligos.append(name)
        #check for more than one BsaI site
        bsaI_F = sum([True for kmer in build_kmers(oligo, len(bsaI_seq)) if kmer==bsaI_seq])
        bsaI_R = sum([True for kmer in build_kmers(oligo.reverse_complement(), len(bsaI_seq)) if kmer==bsaI_seq])
        if (bsaI_F != 1) | (bsaI_R != 1):
            bad_oligos.append(name)
    bad_oligos=np.unique(bad_oligos)
    for oligo_name in bad_oligos:
        del oligo_array[oligo_name]
    print(str(len(bad_oligos)) + ' oligos deleted due to errant restriction sites.')
    
    #Remove any duplicate oligos
    new_dict = {}
    seen_values = set()
    counter=0
    for key, value in oligo_array.items():
        if value not in seen_values:
            new_dict[key] = value
            seen_values.add(value)
        else:
            counter += 1
    print(str(counter) + ' oligos removed due to duplication.')
    oligo_array = new_dict
    del new_dict
    
    #write oligo array to file
    with open(oligo_file, 'w') as f:
        for key in oligo_array.keys():
            f.write("%s,%s\n"%(key,oligo_array[key]))
    f.close()
            
    #write primers to file
    if find_pcr_primers:
        primer_order_sheet = []
        for key in amp_primers.keys():
            primer_order_sheet.append(key + '\t' + \
                     str(amp_primers[key]) + \
                     '\t' + '25nm' + '\t' + 'STD\n')
        print(*primer_order_sheet)
        with open(primer_file, 'w') as f:
            for line in primer_order_sheet:
                f.write(line)
        f.close()
    
    #write amplification primer key to file
    amp_primer_key = ['Gene' + '\t' + 'Block' + '\t' + \
                      'Forward Primer Well' + '\t' + 'Forward Primer' + '\t' + \
                      'Reverse Primer Well' + '\t' + 'Reverse Primer' + '\t' + 'Validated' + '\n']
    for key in amp_primer_dict.keys():
        genename, geneblock = key[0], str(key[1])
        name_primer_F, primer_F, name_primer_R, primer_R, validated = amp_primer_dict[key]
        amp_primer_key.append(genename + '\t' + geneblock + '\t' + \
                 name_primer_F + '\t' + primer_F + '\t' + \
                 name_primer_R + '\t' + primer_R + '\t' + str(validated) + '\n')
    print(*amp_primer_key)
    with open(amp_primer_key_file, 'w') as f:
        for line in amp_primer_key:
            f.write(line)
    f.close()
    
    #write breakpoint dict to file
    breakpoint_df.to_csv(breakpoint_file, sep='\t')
    
    #write gblocks to file
    gblock_order_sheet = []
    gblock_large_order_sheet = []
    for key in gblocks.keys():
        # pad gblock if it is not 300bp for Twist
        if len(gblocks[key]) < gblock_min_size:
            gblocks[key] = Seq(randomsequencepad[0:(gblock_min_size-len(gblocks[key]))]) + gblocks[key]
        if len(gblocks[key]) < gblock_large_threshold:
            gblock_order_sheet.append(key + '\t' + \
                     str(gblocks[key]) + '\n')
        else:
            gblock_large_order_sheet.append(key + '\t' + \
                     str(gblocks[key]) + '\n')
    print(*gblock_order_sheet)
    print(*gblock_large_order_sheet)
    with open(gbl_file, 'w') as f:
        for line in gblock_order_sheet:
            f.write(line)
    f.close()
    with open(gbl_large_file, 'w') as f:
        for line in gblock_large_order_sheet:
            f.write(line)
    f.close()
    
    return oligo_array,amp_primers,gblocks,amp_primer_dict,breakpoint_df
                

In [42]:
# Make the library for C term
oligo_array,amp_primers,gblocks,amp_primer_dict,breakpoint_df = write_oligo_library({
                                            'ARF1':ARF1,
                                            'EPM2A':EPM2A,
                                            'TANGO2':TANGO2,
                                            #'GATM':GATM,
                                            #'SUFU':SUFU,-run separately
                                            #'CEBPA':CEBPA, -removed
                                            #'AKT1':AKT1, -removed 
                                            #'STK11':STK11, -run separately
                                            'PRKAG2':PRKAG2,
                                            'BCL10':BCL10},
                                          oligo_file='./L_Seq_Lib2/cterm/circRNA_oligos.csv',
                                          primer_file='./L_Seq_Lib2/cterm/circRNA_primers.tsv',
                                          gbl_file='./L_Seq_Lib2/cterm/circRNA_gblocks.tsv',
                                          gbl_large_file='./L_Seq_Lib2/cterm/circRNA_gblocks_large.tsv',
                                          amp_primer_key_file='./L_Seq_Lib2/cterm/circRNA_ampkey.tsv',
                                          breakpoint_file='./L_Seq_Lib2/cterm/circRNA_breakpoints.tsv',
                                          block_size_range=[180,198],
                                          primer_set= orthogonal_primers_remaining_iter3,
                                          validated_primer_set=validated_primer_combos_filtered,
                                          aa_start={                    })

Processing gene 1: ARF1
Gene has no paqcI or BsaI sites! Performing GoldenGate optimization...
Some regions are medium fidelity.




{'Optimum Breakpoints': [[0, 144, 579],
                         [0, 128, 294, 579],
                         [0, 285, 446, 579],
                         [0, 431, 579]],
 'Optimum Overlaps': [['CGTC', 'TCGT', 'GCAT'],
                      ['CGTC', 'CTTA', 'TGGA', 'GCAT'],
                      ['CGTC', 'TCTT', 'CTGG', 'GCAT'],
                      ['CGTC', 'GAGA', 'GCAT']],
 'Optimum Scores': [0.9690522381883248,
                    0.9455243983520154,
                    0.9469458864702252,
                    0.9684024282767125]}
Processing gene 2: EPM2A
Gene has no paqcI or BsaI sites! Performing GoldenGate optimization...
Some regions are medium fidelity.
{'Optimum Breakpoints': [[0, 142, 1029],
                         [0, 136, 301, 1029],
                         [0, 282, 450, 1029],
                         [0, 431, 600, 1029],
                         [0, 586, 747, 1029],
                         [0, 733, 901, 1029],
                         [0, 887, 1029]],
 'Optimum Overl



Processing gene 3: TANGO2
Gene has no paqcI or BsaI sites! Performing GoldenGate optimization...
Some regions are medium fidelity.
{'Optimum Breakpoints': [[0, 141, 864],
                         [0, 134, 289, 864],
                         [0, 280, 443, 864],
                         [0, 426, 587, 864],
                         [0, 576, 737, 864],
                         [0, 720, 864]],
 'Optimum Overlaps': [['CGTC', 'GGAA', 'GCAT'],
                      ['CGTC', 'TTCT', 'ACTT', 'GCAT'],
                      ['CGTC', 'GCGA', 'GAGC', 'GCAT'],
                      ['CGTC', 'ACTA', 'ATCG', 'GCAT'],
                      ['CGTC', 'AGGA', 'ACTA', 'GCAT'],
                      ['CGTC', 'ACGG', 'GCAT']],
 'Optimum Scores': [0.969089677906989,
                    0.9492248770241213,
                    0.9424220022428247,
                    0.9458902589775311,
                    0.9490765722912766,
                    0.9665528878438787]}
Processing gene 4: PRKAG2
Gene has no paqcI or 



Some regions are medium fidelity.
{'Optimum Breakpoints': [[0, 159, 1743],
                         [0, 143, 320, 1743],
                         [0, 305, 479, 1743],
                         [0, 467, 638, 1743],
                         [0, 623, 799, 1743],
                         [0, 787, 955, 1743],
                         [0, 941, 1117, 1743],
                         [0, 1101, 1275, 1743],
                         [0, 1267, 1438, 1743],
                         [0, 1422, 1599, 1743],
                         [0, 1586, 1743]],
 'Optimum Overlaps': [['CGTC', 'TCCT', 'GCAT'],
                      ['CGTC', 'TCCT', 'TCTC', 'GCAT'],
                      ['CGTC', 'AAGA', 'AGAA', 'GCAT'],
                      ['CGTC', 'TTCT', 'TCTT', 'GCAT'],
                      ['CGTC', 'CAGA', 'TTAC', 'GCAT'],
                      ['CGTC', 'AGAA', 'AAGT', 'GCAT'],
                      ['CGTC', 'GAGA', 'AGAT', 'GCAT'],
                      ['CGTC', 'TAGT', 'CCTT', 'GCAT'],
                     

  return THERMO_ANALYSIS.calcHeterodimer(


No non-specific primers detected
All mutagenic windows overlap!
228 oligos deleted due to errant restriction sites.
1481 oligos removed due to duplication.
ARF1_gene_ampF	GGCTACGGTCTCTCGTCCACCTGCCTAGATGGGGAACATCTTCGCCAAC	25nm	STD
 ARF1_gene_ampR	GGCTACGGTCTCTATGCCACCTGCCTAGCTTCTGGTTCCGGAGCTGATTG	25nm	STD
 ARF1_block1_s1_ampF	GGCTACGGTCTCTTCGTGACCACCATTCCCACC	25nm	STD
 ARF1_block2_s1_ampR	GGCTACGGTCTCTTAAGCTTGTAGAGGATCGTGGTTTTC	25nm	STD
 ARF1_block2_s2_ampF	GGCTACGGTCTCTTGGACAGCAATGACAGAGAGCG	25nm	STD
 ARF1_block3_s1_ampR	GGCTACGGTCTCTAAGATCAGGCCTTGTGTGTTCTG	25nm	STD
 ARF1_block3_s2_ampF	GGCTACGGTCTCTCTGGGGCTGCACTCACTACG	25nm	STD
 ARF1_block4_s1_ampR	GGCTACGGTCTCTTCTCGGCCGCATTCATGGC	25nm	STD
 EPM2A_gene_ampF	GGCTACGGTCTCTCGTCCACCTGCCTAGATGCGCTTCCGCTTTGGG	25nm	STD
 EPM2A_gene_ampR	GGCTACGGTCTCTATGCCACCTGCCTAGCAGGCTACACACAGAAGAACGAAC	25nm	STD
 EPM2A_block1_s1_ampF	GGCTACGGTCTCTTCCTGCTGGGACTGCTGC	25nm	STD
 EPM2A_block2_s1_ampR	GGCTACGGTCTCTAAGTCGTACAGCTCCTCTTGGTTC	25nm	STD
 EPM2A_block2_s2

In [43]:
# Subtract the primer pairs used in making the large C-term library from the remaining pairs to make the list for the SUFU

#1. Read the two primer tables
df_used = pd.read_csv(
    "L_Seq_Lib2/cterm/circRNA_ampkey.tsv",
    sep="\t",
    usecols=["Forward Primer", "Reverse Primer"]
)

df_all = orthogonal_primers_remaining_iter3  # keep every column

#2. Build helper columns to subracting 
df_used_norm = df_used.assign(
    FWD=df_used["Forward Primer"].str.strip().str.upper(),
    REV=df_used["Reverse Primer"].str.strip().str.upper()
)

df_all_norm = df_all.assign(
    FWD=df_all["Forward Primer"].str.strip().str.upper(),
    REV=df_all["Reverse Primer"].str.strip().str.upper()
)

#3. Identify pairs that are already used
used_index = df_used_norm.set_index(["FWD", "REV"]).index
mask       = ~df_all_norm.set_index(["FWD", "REV"]).index.isin(used_index)

#4. Keep the rows (and *all* original columns) that remain
orthogonal_primers_remaining_iter4 = df_all.loc[mask].reset_index(drop=True)

print(f"{len(orthogonal_primers_remaining_iter4)} primer pairs remain.")
orthogonal_primers_remaining_iter4.head()
orthogonal_primers_remaining_iter4.to_csv("orthogonal_primers_remaining_iter4.csv", index=False)


51 primer pairs remain.


In [56]:
# Make the library for C term
oligo_array,amp_primers,gblocks,amp_primer_dict,breakpoint_df = write_oligo_library({
                                            'SUFU':SUFU,
                                            #'STK11':STK11, -run separately
                                            },
                                          oligo_file='./L_Seq_Lib2/cterm/SUFU_circRNA_oligos.csv',
                                          primer_file='./L_Seq_Lib2/cterm/SUFU_circRNA_primers.tsv',
                                          gbl_file='./L_Seq_Lib2/cterm/SUFU_circRNA_gblocks.tsv',
                                          gbl_large_file='./L_Seq_Lib2/cterm/SUFU_circRNA_gblocks_large.tsv',
                                          amp_primer_key_file='./L_Seq_Lib2/cterm/SUFU_circRNA_ampkey.tsv',
                                          breakpoint_file='./L_Seq_Lib2/cterm/SUFU_circRNA_breakpoints.tsv',
                                          block_size_range=[168,180],
                                          primer_set= orthogonal_primers_remaining_iter4,
                                          validated_primer_set=validated_primer_combos_filtered,
                                          aa_start={                    })

Processing gene 1: SUFU
Gene has no paqcI or BsaI sites! Performing GoldenGate optimization...
Some regions are medium fidelity.
{'Optimum Breakpoints': [[0, 148, 1488],
                         [0, 135, 302, 1488],
                         [0, 279, 454, 1488],
                         [0, 432, 600, 1488],
                         [0, 584, 756, 1488],
                         [0, 742, 897, 1488],
                         [0, 890, 1050, 1488],
                         [0, 1041, 1200, 1488],
                         [0, 1185, 1356, 1488],
                         [0, 1341, 1488]],
 'Optimum Overlaps': [['CGTC', 'CCTT', 'GCAT'],
                      ['CGTC', 'GAGA', 'CTGA', 'GCAT'],
                      ['CGTC', 'ACTG', 'ACGA', 'GCAT'],
                      ['CGTC', 'CAGA', 'TAGT', 'GCAT'],
                      ['CGTC', 'CAGA', 'AGAT', 'GCAT'],
                      ['CGTC', 'AGAA', 'TCGG', 'GCAT'],
                      ['CGTC', 'ATCT', 'AAGA', 'GCAT'],
                      ['CGTC',



All oligos are below the maximum 250bp!
Running QC for primer specificity on WT oligos


  return THERMO_ANALYSIS.calcHeterodimer(


No non-specific primers detected
All mutagenic windows overlap!
106 oligos deleted due to errant restriction sites.
388 oligos removed due to duplication.
SUFU_gene_ampF	GGCTACGGTCTCTCGTCCACCTGCCTAGATGGCGGAGCTGCGGC	25nm	STD
 SUFU_gene_ampR	GGCTACGGTCTCTATGCCACCTGCCTAGGTGTAGCGGACTGTCGAACAC	25nm	STD
 SUFU_block1_s1_ampF	GGCTACGGTCTCTCCTTTACCCTGACCAGCCGAAC	25nm	STD
 SUFU_block2_s1_ampR	GGCTACGGTCTCTTCTCCGTAGATGGCGTGCAG	25nm	STD
 SUFU_block2_s2_ampF	GGCTACGGTCTCTCTGAGTGATCTCTATGGTGACAACAGAG	25nm	STD
 SUFU_block3_s1_ampR	GGCTACGGTCTCTCAGTGCTCGGGGATGTTAGC	25nm	STD
 SUFU_block3_s2_ampF	GGCTACGGTCTCTACGATACGTGTTCCAGTCAGAGAAC	25nm	STD
 SUFU_block4_s1_ampR	GGCTACGGTCTCTTCTGCGGGCCATGTTGGTG	25nm	STD
 SUFU_block4_s2_ampF	GGCTACGGTCTCTTAGTTACCTTCCTCCAGATCGTTGG	25nm	STD
 SUFU_block5_s1_ampR	GGCTACGGTCTCTTCTGCACGGGCTGCATCTG	25nm	STD
 SUFU_block5_s2_ampF	GGCTACGGTCTCTAGATCGATCCACACCTCCAAGAG	25nm	STD
 SUFU_block6_s1_ampR	GGCTACGGTCTCTTTCTCCCCTCCGCATGTCAG	25nm	STD
 SUFU_block6_s2_ampF	GGCTACGGTCTCTTCGGCA

In [58]:
# Subtract the primer pairs used in making the SUFU library from the remaining pairs to make the list for the STK11 library

#1. Read the two primer tables
df_used = pd.read_csv(
    "L_Seq_Lib2/cterm/SUFU_circRNA_ampkey.tsv",
    sep="\t",
    usecols=["Forward Primer", "Reverse Primer"]
)

df_all = orthogonal_primers_remaining_iter4  # keep every column

#2. Build helper columns to subracting 
df_used_norm = df_used.assign(
    FWD=df_used["Forward Primer"].str.strip().str.upper(),
    REV=df_used["Reverse Primer"].str.strip().str.upper()
)

df_all_norm = df_all.assign(
    FWD=df_all["Forward Primer"].str.strip().str.upper(),
    REV=df_all["Reverse Primer"].str.strip().str.upper()
)

#3. Identify pairs that are already used
used_index = df_used_norm.set_index(["FWD", "REV"]).index
mask       = ~df_all_norm.set_index(["FWD", "REV"]).index.isin(used_index)

#4. Keep the rows (and *all* original columns) that remain
orthogonal_primers_remaining_iter5 = df_all.loc[mask].reset_index(drop=True)

print(f"{len(orthogonal_primers_remaining_iter5)} primer pairs remain.")
orthogonal_primers_remaining_iter5.head()
orthogonal_primers_remaining_iter5.to_csv("orthogonal_primers_remaining_iter5.csv", index=False)


41 primer pairs remain.


In [76]:
# Make the library for C term
oligo_array,amp_primers,gblocks,amp_primer_dict,breakpoint_df = write_oligo_library({
                                            'STK11':STK11},
                                          oligo_file='./L_Seq_Lib2/cterm/STK11_circRNA_oligos.csv',
                                          primer_file='./L_Seq_Lib2/cterm/STK11_circRNA_primers.tsv',
                                          gbl_file='./L_Seq_Lib2/cterm/STK11_circRNA_gblocks.tsv',
                                          gbl_large_file='./L_Seq_Lib2/cterm/STK11_circRNA_gblocks_large.tsv',
                                          amp_primer_key_file='./L_Seq_Lib2/cterm/STK11_circRNA_ampkey.tsv',
                                          breakpoint_file='./L_Seq_Lib2/cterm/STK11_circRNA_breakpoints.tsv',
                                          block_size_range=[136,170],
                                          primer_set= orthogonal_primers_remaining_iter5,
                                          validated_primer_set=validated_primer_combos_filtered,
                                          aa_start={                    })

Processing gene 1: STK11
Gene has no paqcI or BsaI sites! Performing GoldenGate optimization...




Some regions are medium fidelity.
{'Optimum Breakpoints': [[0, 122, 1335],
                         [0, 104, 239, 1335],
                         [0, 226, 371, 1335],
                         [0, 351, 488, 1335],
                         [0, 480, 613, 1335],
                         [0, 594, 731, 1335],
                         [0, 726, 864, 1335],
                         [0, 849, 984, 1335],
                         [0, 966, 1110, 1335],
                         [0, 1089, 1226, 1335],
                         [0, 1217, 1335]],
 'Optimum Overlaps': [['CGTC', 'ATCT', 'GCAT'],
                      ['CGTC', 'ATCG', 'AGGA', 'GCAT'],
                      ['CGTC', 'GGAA', 'TACA', 'GCAT'],
                      ['CGTC', 'TCCA', 'TTCT', 'GCAT'],
                      ['CGTC', 'ACGG', 'CGAG', 'GCAT'],
                      ['CGTC', 'TCTC', 'ATCT', 'GCAT'],
                      ['CGTC', 'TGGA', 'TCTC', 'GCAT'],
                      ['CGTC', 'ACTG', 'TCCC', 'GCAT'],
                      ['C

  return THERMO_ANALYSIS.calcHeterodimer(


No non-specific primers detected
All mutagenic windows overlap!
77 oligos deleted due to errant restriction sites.
432 oligos removed due to duplication.
STK11_gene_ampF	GGCTACGGTCTCTCGTCCACCTGCCTAGATGGAGGTGGTGGACCCG	25nm	STD
 STK11_gene_ampR	GGCTACGGTCTCTATGCCACCTGCCTAGCTGCTGCTTGCAGGCCG	25nm	STD
 STK11_block1_s1_ampF	GGCTACGGTCTCTATCTACCAGCCGCGCCG	25nm	STD
 STK11_block2_s1_ampR	GGCTACGGTCTCTCGATGCGGTGGATGAACGTG	25nm	STD
 STK11_block2_s2_ampF	GGCTACGGTCTCTAGGAGGGCCGTCAAGATCC	25nm	STD
 STK11_block3_s1_ampR	GGCTACGGTCTCTTTCCGAGTCCAGCACCTCC	25nm	STD
 STK11_block3_s2_ampF	GGCTACGGTCTCTTACAACGAAGAGAAGCAGAAAATGTATATG	25nm	STD
 STK11_block4_s1_ampR	GGCTACGGTCTCTTGGATGACATTTTTGTGCCGTAACC	25nm	STD
 STK11_block4_s2_ampF	GGCTACGGTCTCTTTCTGTCAGCTGATTGACGGC	25nm	STD
 STK11_block5_s1_ampR	GGCTACGGTCTCTCCGTGGGCCTGGCACAC	25nm	STD
 STK11_block5_s2_ampF	GGCTACGGTCTCTCGAGGCACTGCACCCGTTC	25nm	STD
 STK11_block6_s1_ampR	GGCTACGGTCTCTGAGATTTTGAGGGTGCCACCG	25nm	STD
 STK11_block6_s2_ampF	GGCTACGGTCTCTATCTGGTCG

In [77]:
# Subtract the primer pairs used to make the list of remaining primer's for other's libraries 

#1. Read the two primer tables
df_used = pd.read_csv(
    "L_Seq_Lib2/cterm/STK11_circRNA_ampkey.tsv",
    sep="\t",
    usecols=["Forward Primer", "Reverse Primer"]
)

df_all = orthogonal_primers_remaining_iter5

#2. Build helper columns to subracting 
df_used_norm = df_used.assign(
    FWD=df_used["Forward Primer"].str.strip().str.upper(),
    REV=df_used["Reverse Primer"].str.strip().str.upper()
)

df_all_norm = df_all.assign(
    FWD=df_all["Forward Primer"].str.strip().str.upper(),
    REV=df_all["Reverse Primer"].str.strip().str.upper()
)

#3. Identify pairs that are already used
used_index = df_used_norm.set_index(["FWD", "REV"]).index
mask       = ~df_all_norm.set_index(["FWD", "REV"]).index.isin(used_index)

#4. Keep the rows (and *all* original columns) that remain
orthogonal_primers_remaining_iter6 = df_all.loc[mask].reset_index(drop=True)

print(f"{len(orthogonal_primers_remaining_iter6)} primer pairs remain.")
orthogonal_primers_remaining_iter6.head()


30 primer pairs remain.


Unnamed: 0.1,Unnamed: 0,Forward Name,Forward Primer,Reverse Name,Reverse Primer
0,27,B3,AATGCAAAGCTATTAGCGCG,A6,ACGTATGGGGAACACTACAC
1,28,D5,AGCTATGATCCCGGTGTAAC,D2,GGGTTGTCTCCTCTGATAGC
2,29,C8,CACTCGATAGGTACAACCGG,D4,ATCCAGGAGGTCTAGGAACC
3,30,B6,AGACATGGGATTGACCACAC,B11,ACCACAGGTCAAGATTCACG
4,32,G10,ACGATGGGGACATAGAACAC,C7,TCGAGACAAGAACGATTCCC


In [78]:
#write out to csv to be used with the other oligo sets
orthogonal_primers_remaining_iter6.to_csv("20250613_remaining_primer_pairs.csv", index=False)