This notebook contains code for creating a promoter library for promoter bashing.
It details how to create libraries of <300 nucleotides that can be ordered, as well as creating the primers for cloning these into vectors.

# get transcription factors and motifs.


## find transcription factors and their imputed motifs from plantregdb

In [None]:
# read in the plantregdb TFs in sorghum

# plant TF db description
df_tf_info = pd.read_csv('../data/plantregdb/tfs/Sbi_TF_binding_motifs_information.txt', delimiter='\t')
df_tf_info['origin'] = df_tf_info.apply(
    lambda r: r.Datasource_ID.split('(')[1][:-1],
    axis=1
)

# create df of tf motif
meme_dir = '../data/plantregdb/tfs/Sbi_TF_binding_motifs_individual/'
meme_files = [f for f in listdir(meme_dir) if f.endswith('.meme')]

tf_to_motif = {}
for f in meme_files:
    tf = f[:-5]
    motif =get_motif_from_meme_file(meme_dir + f)
    tf_to_motif[tf] = motif

print('unique motifs:', len(set(list(tf_to_motif.values())))) # there is only 242 unique motifs

df_motif_seqs = pd.DataFrame.from_dict(tf_to_motif, columns= ['motif_seq'], orient='index')
df_tf_db = df_tf_info.merge(df_motif_seqs, left_on='Gene_id', right_index=True)

# reduce set of motifs to motifs that contain other smaller motifs.
db_ms = set(list(tf_to_motif.values()))
db_ms_unique = reduce_motifs(db_ms)
small_list2 = reduce_motifs(db_ms_unique)
# create a dictionary of original motif to uber motif
om_to_um = {} # original motif to uber motif
for m in db_ms:
    ums = [k for k in db_ms_unique if m in k]
    om_to_um[m] = ums

df_tf_db['uber_motifs'] = df_tf_db.apply(lambda r: om_to_um[r.motif_seq], axis=1)

Unnamed: 0,Gene_id,Family,Matrix_id,Species,Method,Datasource,Datasource_ID
0,Sobic.001G002500,ERF,MP00374,Sorghum bicolor,DAP,PlantTFDB,transfer from AT3G23240(Arabidopsis thaliana)
1,Sobic.001G007200,HSF,MP00514,Sorghum bicolor,DAP,PlantTFDB,transfer from AT5G16820(Arabidopsis thaliana)
2,Sobic.001G026500,SBP,MP00633,Sorghum bicolor,PBM,PlantTFDB,transfer from PK06791.1(Cannabis sativa)
3,Sobic.001G034300,Dof,MP00295,Sorghum bicolor,DAP,PlantTFDB,transfer from AT2G37590(Arabidopsis thaliana)
4,Sobic.001G036800,AP2,MP00615,Sorghum bicolor,PBM,PlantTFDB,transfer from AT2G28550(Arabidopsis thaliana)
...,...,...,...,...,...,...,...
255,Sobic.010G215700,SBP,MP00090,Sorghum bicolor,SELEX,PlantTFDB,transfer from AT1G02065(Arabidopsis thaliana)
256,Sobic.010G236300,ARF,MP00461,Sorghum bicolor,DAP,PlantTFDB,transfer from AT4G30080(Arabidopsis thaliana)
257,Sobic.010G249200,GATA,MP00369,Sorghum bicolor,DAP,PlantTFDB,transfer from AT3G21175(Arabidopsis thaliana)
258,Sobic.010G254300,G2-like,MP00354,Sorghum bicolor,DAP,PlantTFDB,transfer from AT3G13040(Arabidopsis thaliana)


## find expression values of transcription factor binding sites

In [None]:
# prioritize based on expression of TF in sorghum leaf from: http://sorghum.riken.jp/Downloads.html

df_fpkm = pd.read_csv('../data/sorghum_riken/tf_fpkm.csv')
leaf_stringent = [
'Leaf', 'Keller-leaf','leaf_blade', 'leaf_flag_internode', 'leaf_lower',
       'leaf_lower_growing', 'leaf_lower_whorl', 'leaf_middle_whorl',
       'leaf_sheath_growing', 'leaf_upper', 'leaf_upper_growing',
       'leaf_upper_whorl'
]
df_fpkm['mean_fpkm'] = df_fpkm[leaf_stringent].apply(lambda r: sum(r.values)/len(leaf_stringent), axis=1)

df_tf_db= df_tf_db.merge(df_fpkm[['mean_fpkm', 'gene']], left_on = 'Gene_id', right_on = 'gene')

# select genes that have non zero fpkm in either of the leaf observations. 
#better to take too many than too little
# select genes that are not arabidopsis
df_db_select = df_tf_db.loc[
    (df_tf_db.mean_fpkm > 2) |
    (df_tf_db.origin != 'Arabidopsis thaliana')
]

# get unique motifs
uber_motifs_db = []
for um_list in df_db_select.uber_motifs:
    for um in um_list:
        uber_motifs_db.append(um)
uber_motfs_db_unique = set(uber_motifs_db)
print(len(uber_motfs_db_unique))

260

## TF motifs from DNase I hypersensitivity sites
from: https://academic.oup.com/plcell/article/31/10/2297/5985706


In [None]:
df_bur = pd.read_csv('../data/burgess2019/TPC2019-LSB-00078DR1-A_Supplemental_Data_set_9_p2.csv')
df_sb = df_bur.loc[df_bur['sample'] == 'sbwl']

# select motifs in upstream and 5'utr
df_sb_up = df_sb.loc[df_sb.genomic_feature.isin(['Upstream', "5'UTR"])]

# select significant motifs only
df_sb_up_sig = df_sb_up.loc[df_sb_up.q_value<0.1]

# convert all sequences to uppercase
df_sb_up_sig['seq_upper'] = [k.upper() for k in df_sb_up_sig.sequence]

# get a set of unique 'ubermotifs', ie. longer motifs that contain all the shorter ones.
burgess_ums = reduce_motifs(df_sb_up_sig.seq_upper)

# add a column of ubermotif

om_to_um = {} # original motif to uber motif
for m in df_sb_up_sig.seq_upper:
    ums = [k for k in burgess_ums if m in k]
    om_to_um[m] = ums

df_sb_up_sig['uber_motifs'] = df_sb_up_sig.apply(
    lambda r: om_to_um[r.seq_upper], 
    axis=1)

{"3'UTR", "5'UTR", 'Downstream', 'Exon/CD', 'Intron', 'Upstream'}

# get orthogonal primers for PCR amplification

In [None]:
# get primers that enable orthogonal amplification of the library
df_primers_subu = pd.read_csv('../data/ortho_primers/ysx008_supp_st_1.csv')
df_primers_choose = df_primers_subu[df_primers_subu['Keep primer in orthogonal set?'] == 'Yes'] 

# generate all motif sequences

In [32]:
all_motifs_to_insert = list(burgess_ums) + list(uber_motfs_db_unique)


n_to_m_ins = dict(zip(range(len(all_motifs_to_insert)), all_motifs_to_insert))
len(all_motifs_to_insert)

217

# generate mutations

using BSAI for all the inserts, 
for the backbone primeres: 
- use BSAI for all the synthetic GFP driving constructs (coded starting with 2)
- For full length native genes due to existing Type II restriction sites in the gene:
    - 3A SBPase: use BsmBI
    - 3B Raf1: use BsaI or AarI
    - 3C PsbS: use AarI


In [None]:
c=0
df_muts_psbsb, c = get_gene_library(
    dic_dic_gene_utr['psbs']['prime_5'], 
    dic_dic_gene_utr['psbs']['utr_5'], 
    dic_dic_gene_utr['psbs']['exon_1'], 
    c, 
    'psbs', 
    n_to_m_ins
)

df_muts_raf1, c = get_gene_library(
    dic_dic_gene_utr['raf1']['prime_5'], 
    dic_dic_gene_utr['raf1']['utr_5'], 
    dic_dic_gene_utr['raf1']['exon_1'], 
    c, 'raf1', 
    n_to_m_ins)

# these were ordered but fell out during cloning
df_muts_zmm28, c = get_gene_library(
    dic_dic_gene_utr['zmm28']['prime_5'], 
    dic_dic_gene_utr['zmm28']['utr_5'], 
    dic_dic_gene_utr['zmm28']['exon_1'], 
    c, 'zmm28', n_to_m_ins)

df_muts_sbpase, c = get_gene_library(
    dic_dic_gene_utr['sbpase']['prime_5'], 
    dic_dic_gene_utr['sbpase']['utr_5'], 
    dic_dic_gene_utr['sbpase']['exon_1'], 
    c, 'sbpase', n_to_m_ins)



current segment to mutate: GTCCGGAACCATTTGCTCTAAAGAACAGAATGGAACACCGGAATATCCAACGAATCCCTTGTAATTGTACCGCTATGAAATAGTAAATCCTCTTCTTCTTTTTTTTTGGCAAAGGGAATGAAAATAAATCTAACTTAGGCCTTGTTTAGTTGGCAAAAACTTTGGTTTCTGCCTACTGTAGCACTTTCGTTTTTATTTGA
len of current segment to mtuate 200
gene psbs, segment -1168, singles #: 600
gene psbs, segment-1168, 2bp dels #: 199
gene psbs, segment -1168, 12bp dels #: 189
gene psbs, segment -1168, abe #: 192
1
current segment to mutate: CAAACATTGTCCAATCATGGAGTAACTAGGCTCAAAAGATTCATCTCACAAATTACAGTTAAACTGTGCAATTAGTTTTTATTTTTATCTATATTTAATGCTCCATGCATGCGATCAAAGATTTGATGTGACGGAGAATCTTGAAAATTTTTGCGAACTAAACAAGGCCTTACAGCAACATTGCCACAGCGGGTTTTTTA
len of current segment to mtuate 200
gene psbs, segment -968, singles #: 600
gene psbs, segment-968, 2bp dels #: 199
gene psbs, segment -968, 12bp dels #: 189
gene psbs, segment -968, abe #: 188
2
current segment to mutate: CTTGGTGAAGAGTTTTCCTTCAAAATAGACCTTCAAGGTTCAGAACTTCAGATACAAAAATATACTTACTTTGCAGTGGAGGTTGAAATTTCTTAGCAAGATCCAATTCATCGACTTGA

# filtering down to get fewer insertions

In [None]:
n_muts_non_ins = (len(df_muts_sbpase.loc[~df_muts_sbpase['Unnamed: 0'].str.startswith('ins')]) + 
                    len(df_muts_zmm28.loc[~df_muts_zmm28['Unnamed: 0'].str.startswith('ins')]) + 
                    len(df_muts_raf1.loc[~df_muts_raf1['Unnamed: 0'].str.startswith('ins')]) + 
                  len(df_muts_psbsb.loc[~df_muts_psbsb['Unnamed: 0'].str.startswith('ins')])
                 )
n_muts_non_ins

26359

In [None]:
all_motifs_to_insert = list(burgess_ums) + list(uber_motfs_db_unique)
n_to_m_ins = dict(zip(range(len(all_motifs_to_insert)), all_motifs_to_insert))

In [None]:
# get the top35 most frequently occuring motifs from Burgess et al. and the rest of the motifs
uber_motifs_occurences = []
for n,r in df_sb_up_sig.iterrows():
    uber_motifs_occurences += r.uber_motifs

um_burgess_counts = sorted(Counter(uber_motifs_occurences).items(), key=lambda x: x[1], reverse=True)

um_burgess_all = um_burgess_counts[:35]
um_burgess_split = um_burgess_counts[35:]

m_all = [k[0] for k in um_burgess_all]
m_split = [k[0] for k in um_burgess_split] + list(uber_motfs_db_unique)

n_motif_all = [n for n,m in n_to_m_ins.items() if m in m_all]
n_motif_split = [n for n,m in n_to_m_ins.items() if m in m_split]


chunk_size = int(len(n_motif_split)/4)
chunked_list = [n_motif_split[i:i+chunk_size] for i in range(0, len(n_motif_split), chunk_size)]


df_filtered_psbs = filter_df_muts(df_muts_psbsb, n_motif_all + chunked_list[0])
df_filtered_raf1 = filter_df_muts(df_muts_raf1, n_motif_all + chunked_list[1])
df_filtered_sbpase = filter_df_muts(df_muts_sbpase, n_motif_all + chunked_list[3])

df_filtered_psbs['order_name'] = df_filtered_psbs['gene_name']  + df_filtered_psbs['Unnamed: 0'] 
df_filtered_raf1['order_name'] = df_filtered_raf1['gene_name']  + df_filtered_raf1['Unnamed: 0'] 
df_filtered_sbpase['order_name'] = df_filtered_sbpase['gene_name']  + df_filtered_sbpase['Unnamed: 0'] 

write_fasta_df_muts(df_filtered_psbs, 'df_filtered_psbs.fasta', fasta_name = 'order_name')
write_fasta_df_muts(df_filtered_raf1, 'df_filtered_raf1.fasta', fasta_name = 'order_name')
write_fasta_df_muts(df_filtered_sbpase, 'df_filtered_sbpase.fasta', fasta_name = 'order_name')

df_all_order = pd.concat([df_filtered_psbs[['order_name', 'seq_order']], 
          df_filtered_raf1[['order_name', 'seq_order']], 
          df_filtered_zmm28[['order_name', 'seq_order']],
          df_filtered_sbpase[['order_name', 'seq_order']]
                            ])
write_fasta_df_muts(df_all_order, 'df_all_order.fasta', fasta_name = 'order_name')

3034


35

# order primers to clone out backbone and sublibraries

In [7]:
df_filtered_psbs = pd.read_csv('./df_filtered_psbs.csv')
df_filtered_raf1 = pd.read_csv('./df_filtered_raf1.csv')
df_filtered_zmm28 = pd.read_csv('./df_filtered_zmm28.csv')
df_filtered_sbpase = pd.read_csv('./df_filtered_sbpase.csv')

In [8]:
df_all_order = pd.concat([df_filtered_psbs, 
          df_filtered_raf1, 
          df_filtered_zmm28,
          df_filtered_sbpase
                            ])
df_all_order

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,min_chunk,bsa1_pre,bsa1_post,prime_5_add,amp_fwd,amp_rev,primer_ids,segment_loc,seq_order,len_seq_order,gene_name
0,0,single_G-1168A,ATCCGGAACCATTTGCTCTAAAGAACAGAATGGAACACCGGAATAT...,GGTCTCCCATTG,CAAACAGAGACC,,AAACACGTGGCAAACATTCC,GTACTGTATGGCTCCGGTTT,1_2,-1168,AAACACGTGGCAAACATTCCGGTCTCCCATTGATCCGGAACCATTT...,264,psbs
1,1,single_G-1168C,CTCCGGAACCATTTGCTCTAAAGAACAGAATGGAACACCGGAATAT...,GGTCTCCCATTG,CAAACAGAGACC,,AAACACGTGGCAAACATTCC,GTACTGTATGGCTCCGGTTT,1_2,-1168,AAACACGTGGCAAACATTCCGGTCTCCCATTGCTCCGGAACCATTT...,264,psbs
2,2,single_G-1168T,TTCCGGAACCATTTGCTCTAAAGAACAGAATGGAACACCGGAATAT...,GGTCTCCCATTG,CAAACAGAGACC,,AAACACGTGGCAAACATTCC,GTACTGTATGGCTCCGGTTT,1_2,-1168,AAACACGTGGCAAACATTCCGGTCTCCCATTGTTCCGGAACCATTT...,264,psbs
3,3,single_T-1167A,GACCGGAACCATTTGCTCTAAAGAACAGAATGGAACACCGGAATAT...,GGTCTCCCATTG,CAAACAGAGACC,,AAACACGTGGCAAACATTCC,GTACTGTATGGCTCCGGTTT,1_2,-1168,AAACACGTGGCAAACATTCCGGTCTCCCATTGGACCGGAACCATTT...,264,psbs
4,4,single_T-1167C,GCCCGGAACCATTTGCTCTAAAGAACAGAATGGAACACCGGAATAT...,GGTCTCCCATTG,CAAACAGAGACC,,AAACACGTGGCAAACATTCC,GTACTGTATGGCTCCGGTTT,1_2,-1168,AAACACGTGGCAAACATTCCGGTCTCCCATTGGCCCGGAACCATTT...,264,psbs
...,...,...,...,...,...,...,...,...,...,...,...,...,...
13712,24668,ins_motif_m212_p195_rev,ACGACAATTAAATACCAGAACAGAGCCAAACAAACATCCTAGACCG...,GGTCTCGTATAT,CAAATGGAGACC,,AGGACACCAGACCAATGAAG,CGCTGATGGTAATTAGCCCT,62_63,-150,AGGACACCAGACCAATGAAGGGTCTCGTATATACGACAATTAAATA...,278,sbpase
13713,24669,ins_motif_m213_p195_fwd,ACGACAATTAAATACCAGAACAGAGCCAAACAAACATCCTAGACCG...,GGTCTCGTATAT,CAAATGGAGACC,,AGGACACCAGACCAATGAAG,CGCTGATGGTAATTAGCCCT,62_63,-150,AGGACACCAGACCAATGAAGGGTCTCGTATATACGACAATTAAATA...,272,sbpase
13714,24670,ins_motif_m213_p195_rev,ACGACAATTAAATACCAGAACAGAGCCAAACAAACATCCTAGACCG...,GGTCTCGTATAT,CAAATGGAGACC,,AGGACACCAGACCAATGAAG,CGCTGATGGTAATTAGCCCT,62_63,-150,AGGACACCAGACCAATGAAGGGTCTCGTATATACGACAATTAAATA...,272,sbpase
13715,24671,ins_motif_m214_p195_fwd,ACGACAATTAAATACCAGAACAGAGCCAAACAAACATCCTAGACCG...,GGTCTCGTATAT,CAAATGGAGACC,,AGGACACCAGACCAATGAAG,CGCTGATGGTAATTAGCCCT,62_63,-150,AGGACACCAGACCAATGAAGGGTCTCGTATATACGACAATTAAATA...,273,sbpase


In [None]:
dic_pc_num_to_seq = dict(zip(df_primers_choose['Primer id'], df_primers_choose['Sequence']))


In [None]:
df_primers_all = pd.DataFrame()

df_primers_psbs = get_primers_df(df_filtered_psbs, 'psbs')
df_primers_all = pd.concat([df_primers_all,df_primers_psbs],axis=0)
df_primers_raf1 = get_primers_df(df_filtered_raf1, 'raf1')
df_primers_all = pd.concat([df_primers_all,df_primers_raf1],axis=0)
df_primers_zmm28 = get_primers_df(df_filtered_zmm28, 'zmm28')
df_primers_all = pd.concat([df_primers_all,df_primers_zmm28],axis=0)
df_primers_sbpase = get_primers_df(df_filtered_sbpase, 'sbpase')
df_primers_all = pd.concat([df_primers_all,df_primers_sbpase],axis=0)


df_primers_all

Unnamed: 0,p1,p2,p1_seq,p2_seq,gene
0,1,2,AAACACGTGGCAAACATTCC,AAACCGGAGCCATACAGTAC,psbs
1,3,4,AAAGCACTCTTAGGCCTCTG,AAAGGGGCCGTCAATATCAG,psbs
2,5,6,AAATAAGACGACGACCCTCG,AACGATGATGCTCACTCTCG,psbs
3,11,12,AAGAATTACTGACCCCTCGG,AAGACGATCCGAGCCATTAC,psbs
4,13,14,AAGGAACTATGGCATCGAGC,AAGGACTGCATACCAGGTTG,psbs
5,15,16,AAGGATATGTAGACACCGCC,AAGGCCCAGAAGGATACAAC,psbs
6,17,18,AAGGCGCTCGGATAATACTC,AAGGTATGTATAGCGACCGC,psbs
0,20,21,AATAGGAACCTCTTACGCGG,AATATCACGCAAAAGCACCG,raf1
1,22,23,AATCAGTTTCTTTGGCAGCC,AATGCAAAGCTATTAGCGCG,raf1
2,24,25,AATGCGTCATTTTACACGGC,AATGTCCTTAGGCAGTCGTC,raf1


## get backbone vector primers to insert libraries at

In [None]:
# get backbone sequences
c = 0

###
c=0
d_bb_seq_psbs, c = get_matching_backbone_seqs_library(
    dic_dic_gene_utr['psbs']['prime_5'], 
    dic_dic_gene_utr['psbs']['utr_5'], 
    dic_dic_gene_utr['psbs']['first_10_aa'], 
    c, 
    'psbs'
)

d_bb_seq_raf1, c = get_matching_backbone_seqs_library(
    dic_dic_gene_utr['raf1']['prime_5'], 
    dic_dic_gene_utr['raf1']['utr_5'], 
    dic_dic_gene_utr['raf1']['first_10_aa'], 
    c, 'raf1')

d_bb_seq_zmm28, c = get_matching_backbone_seqs_library(
    dic_dic_gene_utr['zmm28']['prime_5'], 
    dic_dic_gene_utr['zmm28']['utr_5'], 
    dic_dic_gene_utr['zmm28']['first_10_aa'], 
    c, 'zmm28')

d_bb_seq_sbpase, c = get_matching_backbone_seqs_library(
    dic_dic_gene_utr['sbpase']['prime_5'], 
    dic_dic_gene_utr['sbpase']['utr_5'], 
    dic_dic_gene_utr['sbpase']['first_10_aa'], 
    c, 'sbpase')



psbs curr_primers 1 2
-1288 GTCCGGAACC
-1288 CAAACATTGT
1
psbs curr_primers 3 4
-1088 CAAACATTGT
-1088 CTTGGTGAAG
2
psbs curr_primers 5 6
-888 CTTGGTGAAG
-888 ACTTCATTCA
3
psbs curr_primers 11 12
-688 ACTTCATTCA
-688 CGGCGCGGCC
4
psbs curr_primers 13 14
-488 CGGCGCGGCC
-488 AGACACGGGA
5
doing last segment
rev_seq_indices -320 -390
GAAAGGGGGCATTGGGGTCACAGGCGCGGGCAGCGTGAAGCTGGAGTGGCGCCAACTGGAGGTGTCGATT
psbs curr_primers 15 16
-288 AGACACGGGA
-288 CGTGAGTAAA
6
raf1 curr_primers 20 21
-1170 AAAAGTTAGA
-1170 GGTGGTTATG
8
raf1 curr_primers 22 23
-970 GGTGGTTATG
-970 AAGTTGTAAG
9
raf1 curr_primers 24 25
-770 AAGTTGTAAG
-770 AACGATCGTA
10
raf1 curr_primers 26 27
-570 AACGATCGTA
-570 ATATAGGAAA
11
raf1 curr_primers 28 29
-370 ATATAGGAAA
-370 AGACTCTCAT
12
doing last segment
rev_seq_indices -320 -390
TCTCCACTCAACGGACGCTTCTCTTCTTCGAGGTTTGAATTTTCCTATATAAAAATTCAGATACGCAACT
raf1 curr_primers 30 31
-170 AGACTCTCAT
-170 AAAGGTTAAC
13
zmm28 curr_primers 34 35
-1247 TCATCAGTCA
-1247 GGCAGTGCTC
15
zmm28 

In [None]:
d_bb_seq_psbs_keep = get_tm_bb_primers(d_bb_seq_psbs)
d_bb_seq_raf1_keep = get_tm_bb_primers(d_bb_seq_raf1)
d_bb_seq_sbpase_keep = get_tm_bb_primers(d_bb_seq_sbpase)


1_2_rev 63.087897914902555
1_2_fwd 63.018726488303344
3_4_rev 63.02025148890124
3_4_fwd 63.467712419348175
5_6_rev 63.46038869723145
5_6_fwd 63.76186304611872
11_12_rev 63.05896958824479
11_12_fwd 63.7482917332502
13_14_rev 64.17936197199913
13_14_fwd 64.63413487862323
15_16_rev 63.64825209752297
15_16_fwd 63.37195169691586
17_18_rev 63.18999273666401
17_18_fwd 63.582471892771764
hit total length
20_21_rev 60.117187786263614
20_21_fwd 63.30455099155819
hit total length
22_23_rev 59.53147350054934
22_23_fwd 63.13819618962496
hit total length
24_25_rev 58.94575921483505
24_25_fwd 63.498914156880346
26_27_rev 63.07030072567778
26_27_fwd 63.34146386778747
28_29_rev 63.015222618389885
28_29_fwd 63.26979030259878
30_31_rev 63.23553213889329
30_31_fwd 63.50430935335055
32_33_rev 63.23553213889329
32_33_fwd 63.50259460612091
34_35_rev 63.68376149489325
34_35_fwd 63.43838197116378
36_37_rev 63.170871066519226
hit total length
36_37_fwd 62.46004492912077
38_39_rev 63.32604414810828
38_39_fwd 63.

In [None]:
# add aarI for PsbS
d_bb_seq_psbs_final = add_t2s_sites(d_bb_seq_psbs_keep, 'aar1')
df_bb_prims_all = pd.DataFrame.from_dict(d_bb_seq_psbs_final, orient='index')
df_bb_prims_all['gene'] = 'psbs'

# add bsaI for Raf1
d_bb_seq_raf1_final = add_t2s_sites(d_bb_seq_raf1_keep, 'bsa1')
df_bb_prims = pd.DataFrame.from_dict(d_bb_seq_raf1_final, orient='index')
df_bb_prims['gene'] = 'raf1'
df_bb_prims_all = pd.concat([df_bb_prims_all, df_bb_prims], axis=0)

# add bsmbI for SBPase
d_bb_seq_sbpase_final = add_t2s_sites(d_bb_seq_sbpase_keep, 'bsmb1')
df_bb_prims = pd.DataFrame.from_dict(d_bb_seq_sbpase_final, orient='index')
df_bb_prims['gene'] = 'sbpase'
df_bb_prims_all = pd.concat([df_bb_prims_all, df_bb_prims], axis=0)


# add an ID, and write to file
df_bb_prims_all['primer_n'] = df_bb_prims_all.index + '_'+df_bb_prims_all.gene +'_bb'
df_bb_prims_all = df_bb_prims_all.rename(columns={'0':'seq'})

df_bb_prims_fwd = df_bb_prims_all.loc[df_bb_prims_all.index.str.endswith('fwd')]
df_bb_prims_rev = df_bb_prims_all.loc[df_bb_prims_all.index.str.endswith('rev')]

df_bb_prims_fwd.to_csv('df_bb_prims_fwd.csv')
df_bb_prims_rev.to_csv('df_bb_prims_rev.csv')

# get primers for amplification of ordered sublibraries

In [146]:
df_primers_all['primers'] = df_primers_all['p1'].astype(str) + '_' + df_primers_all['p2'].astype(str)
df_primers_chunks_fwd = df_primers_all[['p1_seq', 'gene', 'primers']]
df_primers_chunks_rev = df_primers_all[['p2_seq', 'gene', 'primers']]
df_primers_chunks_rev['primer_name']= df_primers_chunks_rev['primers'] +df_primers_chunks_rev['gene'] +'_rev_chunk'
df_primers_chunks_fwd['primer_name']= df_primers_chunks_fwd['primers'] + df_primers_chunks_rev['gene']+'_fwd_chunk'
df_primers_chunks_fwd.to_csv('df_primers_chunks_fwd.csv')
df_primers_chunks_rev.to_csv('df_primers_chunks_rev.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


# create primers to make around the horn large deletions

In [None]:
prime5_full = dic_dic_gene_utr['raf1']['prime_5'] + dic_dic_gene_utr['raf1']['utr_5']
exon_1 = dic_dic_gene_utr['raf1']['exon_1']

prime5_exon1_full = prime5_full + exon_1

all_dic = {}
# make all 200 bp deletions
for gene, gene_dic in dic_dic_gene_utr.items():

    d_curr = get_deletions_primers(gene, 
                                   gene_to_fl_seq[gene],
                          gene_dic['prime_5'], 
                          gene_dic['utr_5'], 
                          gene_dic['exon_1'], 
                          chunk_size=200, max_len_primer = 90, min_tm = 57)
    all_dic.update(d_curr)

# make all 500 bp deletions
for gene, gene_dic in dic_dic_gene_utr.items():

    d_curr = get_deletions_primers(gene, 
                                   gene_to_fl_seq[gene],
                          gene_dic['prime_5'], 
                          gene_dic['utr_5'], 
                          gene_dic['exon_1'], 
                          chunk_size=500, max_len_primer = 90, min_tm = 57)
    all_dic.update(d_curr)

df_del = pd.DataFrame.from_dict(all_dic, orient='index')
df_del_f = df_del.loc[df_del.index.str[-1]=='f']
df_del_r = df_del.loc[df_del.index.str[-1]=='r']
df_del_f.to_csv('./del_f_primers_order.csv')
df_del_r.to_csv('./del_r_primers_order.csv')


psbs [('aar1', 'CACCTGC')]
198
raf1 [('bsa1', 'GGTCTC'), ('aar1', 'CACCTGC')]
1383
zmm28 [('bsa1', 'GGTCTC')]
185
sbpase [('bsmb1', 'CGTCTC')]
75
psbs [('aar1', 'CACCTGC')]
198
raf1 [('bsa1', 'GGTCTC'), ('aar1', 'CACCTGC')]
1383
zmm28 [('bsa1', 'GGTCTC')]
185
sbpase [('bsmb1', 'CGTCTC')]
75


{'g_psbs_p_-2198_dsize_200_re_aar1_f': 'gatacaCACCTGCaccaacacatgctaatgctaagatctctac',
 'g_psbs_p_-2198_dsize_200_re_aar1_r': 'GATACAcacctgcGCATGTGTctagaaatatgttgagcagattcattcctgcc',
 'g_psbs_p_-1998_dsize_200_re_aar1_f': 'gatacaCACCTGCgaggctacaatgctcttatttatttaaatatttgatttgc',
 'g_psbs_p_-1998_dsize_200_re_aar1_r': 'GATACAcacctgcCATTGTAGtggtatacataaagaatgcatacataaatagcctctagt',
 'g_psbs_p_-1798_dsize_200_re_aar1_f': 'gatacaCACCTGCggaaagcaagcatttgtatatggatcttatgc',
 'g_psbs_p_-1798_dsize_200_re_aar1_r': 'GATACAcacctgcTGCTTGCTcctcacagggccaatccacaagttgcttcg',
 'g_psbs_p_-1598_dsize_200_re_aar1_f': 'gatacaCACCTGCtctatcatgaacaaccagcacatggaccgt',
 'g_psbs_p_-1598_dsize_200_re_aar1_r': 'GATACAcacctgcGTTCATGAttccgttgtgggtggacccagaacacgtca',
 'g_psbs_p_-1398_dsize_200_re_aar1_f': 'gatacaCACCTGCgtttctgcctactgtagcactttcgttttt',
 'g_psbs_p_-1398_dsize_200_re_aar1_r': 'GATACAcacctgcGTAGGCAGtagagataaatatcaccatcagcacatcagtct',
 'g_psbs_p_-1198_dsize_200_re_aar1_f': 'gatacaCACCTGCaggccttacagcaacattgcc