In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
def reverse_complement(s):
    """Return the reverse complement of a DNA sequence.

    ``s`` is converted with ``str()`` first. Upper-case A/T/G/C are replaced by
    their complementary base; any other character is passed through unchanged.
    The complemented sequence is returned in reverse order.
    """
    pairing = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G'}
    return ''.join(pairing.get(base, base) for base in reversed(str(s)))

In [3]:
def get_ENSP_d():
    """Build a HUGO -> ENSP lookup from ``data/appris_clean_v3.txt``.

    The file is read as comma-separated text and must provide ``HUGO`` and
    ``ENSP`` columns.

    Returns:
        dict: maps each ``HUGO`` value to the ``ENSP`` value of the same row.
        When a ``HUGO`` value occurs on several rows, the last row wins.
    """
    appris = pd.read_table("data/appris_clean_v3.txt", sep=',')
    # Pair the two columns directly instead of materialising a row Series
    # twice per row with .iloc; later rows still override earlier ones.
    return dict(zip(appris['HUGO'], appris['ENSP']))

In [4]:
def get_HUGO_d():
    """Build an ENSP -> HUGO lookup from ``data/appris_clean_v3.txt``.

    The file is read as comma-separated text and must provide ``HUGO`` and
    ``ENSP`` columns.

    Returns:
        dict: maps each ``ENSP`` value to the ``HUGO`` value of the same row.
        When an ``ENSP`` value occurs on several rows, the last row wins.
    """
    appris = pd.read_table("data/appris_clean_v3.txt", sep=',')
    # Pair the two columns directly instead of materialising a row Series
    # twice per row with .iloc; later rows still override earlier ones.
    return dict(zip(appris['ENSP'], appris['HUGO']))

In [5]:
def get_combined_sgRNA(database, HUGO):
    """List every ``target_seq_wPAM`` entry of ``database`` for one gene.

    Args:
        database: DataFrame with ``ENSP`` and ``target_seq_wPAM`` columns.
        HUGO: key into the module-level ``ENSP_d`` lookup.

    Returns:
        list: the ``target_seq_wPAM`` values of all rows whose ``ENSP`` equals
        ``ENSP_d[HUGO]``, in row order (duplicates are kept).

    Raises:
        KeyError: if ``HUGO`` is not a key of ``ENSP_d``.
    """
    protein_id = ENSP_d[HUGO]
    is_target = database['ENSP'] == protein_id
    return list(database.loc[is_target, 'target_seq_wPAM'])

In [6]:
def get_unique_sgRNA(data_ABE, data_CBE, HUGO):
    """Collect the distinct guide sequences for one gene from both tables.

    Args:
        data_ABE: DataFrame passed to ``get_combined_sgRNA`` (ABE guides).
        data_CBE: DataFrame passed to ``get_combined_sgRNA`` (CBE guides).
        HUGO: gene key; must be present in the module-level ``ENSP_d``.

    Returns:
        pandas.DataFrame: one row per distinct ``target_seq_wPAM`` with the
        constant columns ``HUGO`` and ``ENSP``. Rows follow first-seen order
        (ABE guides first, then CBE guides not already listed).
    """
    sgRNA_ABE = get_combined_sgRNA(database=data_ABE, HUGO=HUGO)
    sgRNA_CBE = get_combined_sgRNA(database=data_CBE, HUGO=HUGO)

    # De-duplicate while preserving order. list(set(...)) yields an order that
    # depends on string-hash randomisation, so row order (and any names derived
    # from row position downstream) would differ between kernel sessions.
    unique_sgRNA_list = list(dict.fromkeys(sgRNA_ABE + sgRNA_CBE))

    return pd.DataFrame({
        'HUGO': HUGO,
        'ENSP': ENSP_d[HUGO],
        'target_seq_wPAM': unique_sgRNA_list,
    })

In [7]:
def get_sgRNA_main(sublib_ID, HUGO_input, data_ABE, data_CBE):
    """Assemble the orderable oligo table for every gene of one sublibrary.

    Args:
        sublib_ID: key into ``HUGO_input`` and index into the module-level
            ``fwd_primers`` / ``rev_primers`` lists.
        HUGO_input: mapping from sublibrary ID to an iterable of gene names.
        data_ABE, data_CBE: guide tables forwarded to ``get_unique_sgRNA``.

    Returns:
        pandas.DataFrame: the per-gene tables concatenated with ``pd.concat``
        (per-gene indices are kept as-is), each extended with the columns
        ``target_seq_woPAM``, ``oligo_order``, ``sublib_ID`` and
        ``unique_names``.
    """
    per_gene_tables = []
    for gene in HUGO_input[sublib_ID]:
        guides = get_unique_sgRNA(data_ABE, data_CBE, gene)

        # Drop the last four characters of each sequence to get the part
        # without PAM.
        guides['target_seq_woPAM'] = [seq[:-4] for seq in guides['target_seq_wPAM']]

        # To design sgRNA with BsmBI sites: primer + site-containing flank on
        # each side of the guide.
        left_flank = fwd_primers[sublib_ID] + "CGTCTCACACCG"
        right_flank = "GTTTCGAGACG" + rev_primers[sublib_ID]
        guides['oligo_order'] = left_flank + guides['target_seq_woPAM'] + right_flank
        guides['sublib_ID'] = sublib_ID

        # Name each guide "<gene>_<row number within the gene>".
        guides['unique_names'] = [gene + "_" + str(k) for k in range(guides.shape[0])]

        per_gene_tables.append(guides)

    return pd.concat(per_gene_tables)

In [8]:
def get_sgRNA_control(sublib_ID, top_N=510):
    """Build orderable oligos for the first ``top_N`` control sgRNAs.

    Reads ``data/control_sgRNA_table.txt`` (must provide an ``sgRNA`` column)
    and wraps each control guide in the same primer / BsmBI-site flanks that
    ``get_sgRNA_main`` uses.

    Args:
        sublib_ID: index into the module-level ``fwd_primers`` / ``rev_primers``.
        top_N: number of control rows to use (row labels ``0 .. top_N-1``).

    Returns:
        pandas.DataFrame: the selected control rows plus ``oligo_order`` and
        ``sublib_ID`` columns.

    Raises:
        KeyError: if the table has fewer than ``top_N`` rows.
    """
    control_sgRNA_df = pd.read_table("data/control_sgRNA_table.txt")

    # Select the rows once and work on an independent copy. Writing the new
    # columns through .loc[rows, col] on the full table creates NaN-padded
    # columns, which turns sublib_ID into a float whenever top_N is smaller
    # than the table; get_sgRNA_main emits it unchanged.
    # NOTE(review): label selection assumes the default 0..n-1 index that
    # read_table normally produces - confirm for this file.
    controls = control_sgRNA_df.loc[range(top_N), :].copy()

    controls['oligo_order'] = (
        fwd_primers[sublib_ID] + "CGTCTCACACCG"
        + controls['sgRNA']
        + "GTTTCGAGACG" + rev_primers[sublib_ID]
    )
    controls['sublib_ID'] = sublib_ID

    return controls

In [9]:
# Some oligos have BsmBI cutting sites.
def remove_bad_oligos(data):
    """Keep only the rows whose oligo contains exactly two BsmBI site hits.

    For every string in ``data['oligo_order']`` the non-overlapping occurrences
    of "CGTCTC" and "GAGACG" are counted and summed; rows with a total other
    than 2 are dropped.

    Args:
        data: DataFrame with an ``oligo_order`` column of strings.

    Returns:
        pandas.DataFrame: the rows of ``data`` that pass, original index kept.
    """
    bsmbi_sites = ("CGTCTC", "GAGACG")
    keep = [
        sum(seq.count(site) for site in bsmbi_sites) == 2
        for seq in data['oligo_order']
    ]
    return data.loc[keep, :]

In [10]:
def remove_T4(data_1):
    """Drop guides whose PAM-less sequence contains a "TTTT" run.

    The last four characters of each ``target_seq_wPAM`` value are ignored
    when searching for "TTTT".

    Args:
        data_1: DataFrame with a ``target_seq_wPAM`` column of strings. It is
            not modified.

    Returns:
        pandas.DataFrame: an independent copy of the surviving rows, re-indexed
        0..n-1.
    """
    # A positional boolean list works with any input index, so there is no
    # need to overwrite the caller's index in place.
    keep = ['TTTT' not in seq[:-4] for seq in data_1['target_seq_wPAM']]
    res = data_1.loc[keep, :].copy()
    res.index = list(range(res.shape[0]))
    return res

In [11]:
# Define the set of aa positions that are close to the ligandable Cys.
# Default: Cys +/- 3 amino acids (7-aa window to increase sgRNA coverage of the site).
def get_ID_set(Cys_df, flank=3):
    """Return the "<HUGO>_<position>" IDs surrounding every listed cysteine.

    Args:
        Cys_df: DataFrame with ``HUGO`` (gene name) and ``Ev87_Cys`` (integer
            residue position) columns. Any index is accepted.
        flank: number of positions included on each side of the cysteine;
            the default 3 gives a 7-residue window.

    Returns:
        set: ID strings for all windows; an empty set when ``Cys_df`` has no
        rows.
    """
    ID_set = set()
    # Iterate over the column values rather than .loc[i, ...] so the function
    # does not depend on a 0..n-1 index, and build the set once instead of
    # re-creating it inside the loop.
    for HUGO_temp, aa_center in zip(Cys_df['HUGO'], Cys_df['Ev87_Cys']):
        ID_set.update(
            HUGO_temp + '_' + str(aa)
            for aa in range(aa_center - flank, aa_center + flank + 1)
        )

    return ID_set

In [12]:
def clean_data_input_local_saturated(data, ID_set):
    """Keep the sgRNAs that target at least one position of interest.

    For each row, every entry of ``aa_pos_from_to`` contributes the ID
    "<HUGO>_<entry[0]>"; the row is kept when any of these IDs is in
    ``ID_set``.

    Args:
        data: DataFrame with ``HUGO`` and ``aa_pos_from_to`` columns, the
            latter holding a sequence of indexable entries per row.
        ID_set: container of "<HUGO>_<position>" strings.

    Returns:
        pandas.DataFrame: the matching rows of ``data``, original index kept.
    """
    def _hits_region(edits, gene):
        # Only the first element of each entry is used as the position.
        candidate_ids = [gene + '_' + str(edit[0]) for edit in edits]
        return any(candidate in ID_set for candidate in candidate_ids)

    keep = [
        _hits_region(edits, gene)
        for edits, gene in zip(data['aa_pos_from_to'], data['HUGO'])
    ]
    return data.loc[keep, :]

In [13]:
def clean_data_input_global(data, target_genes):
    """Filter a guide table down to usable guides for the requested genes.

    Steps: drop rows with an empty ``target_seq_wPAM``, add
    ``target_seq_woPAM`` (sequence minus its last four characters) and a
    leading ``HUGO`` column (looked up per row in the module-level
    ``HUGO_d``), keep only rows whose gene is in ``target_genes``, then pass
    the result through ``remove_T4``.

    Args:
        data: DataFrame with ``target_seq_wPAM`` and ``ENSP`` columns. It is
            not modified.
        target_genes: iterable of HUGO gene names to keep.

    Returns:
        pandas.DataFrame: the value returned by ``remove_T4``.

    Raises:
        KeyError: if any remaining row's ``ENSP`` is missing from ``HUGO_d``.
    """
    # Keep non-empty data ('' means no sgRNA was designed). Take an explicit
    # copy so the column additions below act on an independent frame instead
    # of a slice (avoids SettingWithCopyWarning / chained assignment).
    non_empty = data['target_seq_wPAM'] != ''
    data = data.loc[non_empty, :].copy()

    # Need to add HUGO and target_seq_woPAM columns
    data['target_seq_woPAM'] = [ss[:-4] for ss in data['target_seq_wPAM']]
    HUGO_list = [HUGO_d[ENSP] for ENSP in data['ENSP']]
    data.insert(0, 'HUGO', HUGO_list)

    # Set membership keeps the per-row test O(1) for long gene lists.
    wanted_genes = set(target_genes)
    overlap_sel = [(HUGO in wanted_genes) for HUGO in HUGO_list]

    return remove_T4(data.loc[overlap_sel, :])

In [14]:
# PCR primers, one pair per sublibrary; functions index both lists by sublib_ID.
fwd_primers = [
    "AGGCACTTGCTCGTACGACG",
    "GTGTAACCCGTAGGGCACCT",
    "CAGCGCCAATGGGCTTTCGA",
    "CTACAGGTACCGGTCCTGAG",
    "CATGTTGCCCTGAGGCACAG",
    "GGTCGTCGCATCACAATGCG",
]
rev_primers_rc = [
    "ATGTGGGCCCGGCACCTTAA",
    "GTCGAGAGCAGTCCTTCGAC",
    "AGCCGCTTAAGAGCCTGTCG",
    "GTACCTAGCGTGACGATCCG",
    "CCGTTAGGTCCCGAAAGGCT",
    "TCTCGAGCGCCAATGTGACG",
]
# The oligos use the reverse complement of each rev_primers_rc entry.
rev_primers = list(map(reverse_complement, rev_primers_rc))

# Gene-name <-> protein-ID lookups used as module-level globals by the
# functions above.
ENSP_d = get_ENSP_d()
HUGO_d = get_HUGO_d()

In [15]:
# Load the two guide tables (ABE and evoCDA) that the cells below filter.
# Downstream code reads their 'target_seq_wPAM', 'ENSP' and 'aa_pos_from_to' columns.
# NOTE(review): unpickling can execute arbitrary code - only load trusted files.
data_ABE_1 = pd.read_pickle("data/all_ABE.pkl")
data_CBE_1 = pd.read_pickle("data/all_evoCDA.pkl")

In [19]:
# For saturated profiling (Fig 1)
# Each sublibrary can be amplified by a unique pair of PCR primers.
# sublib_ID selects the primer pair (index into fwd_primers / rev_primers).
sublib_ID = 0
target_genes = ['EGFR'] # This is an example.

# Drop empty guides, annotate HUGO / PAM-less sequence, keep only the target
# genes and remove guides containing TTTT.
data_ABE_2 = clean_data_input_global(data=data_ABE_1, target_genes=target_genes)
data_CBE_2 = clean_data_input_global(data=data_CBE_1, target_genes=target_genes)

# get_sgRNA_main looks the gene list up under the sublibrary ID.
HUGO_input = {sublib_ID: target_genes}

# Build the oligos, then keep only those with exactly two BsmBI site hits.
result = get_sgRNA_main(sublib_ID=sublib_ID, HUGO_input=HUGO_input, data_ABE=data_ABE_2, data_CBE=data_CBE_2)
clean_result = remove_bad_oligos(result)

# Control oligos: rows 0..99 of the control table, filtered the same way.
control = get_sgRNA_control(sublib_ID=sublib_ID, top_N=100)
clean_control = remove_bad_oligos(control)

# Uncomment to export the tables.
#clean_result.to_csv('clean_result.csv')
#clean_control.to_csv('clean_control.csv')

In [17]:
# For local saturated editing (Fig 3)
# Use Cys +/- 3 amino acid (7-aa-window to increase sgRNA coverage of the site).
# NOTE: this cell reuses (overwrites) the variable names of the Fig 1 cell.
sublib_ID = 2

Cys_df = pd.read_csv("data/target_Cys.csv")
ID_set = get_ID_set(Cys_df)
# De-duplicate in file order. list(set(...)) would give a gene order that
# changes between kernel sessions (string-hash randomisation), and with it
# the row order of the exported library.
target_genes = Cys_df['HUGO'].drop_duplicates().tolist()

# Global clean-up first, then keep only guides hitting a position in ID_set.
data_ABE_2 = clean_data_input_global(data=data_ABE_1, target_genes=target_genes)
data_CBE_2 = clean_data_input_global(data=data_CBE_1, target_genes=target_genes)

data_ABE_3 = clean_data_input_local_saturated(data=data_ABE_2, ID_set=ID_set)
data_CBE_3 = clean_data_input_local_saturated(data=data_CBE_2, ID_set=ID_set)

# get_sgRNA_main looks the gene list up under the sublibrary ID.
HUGO_input = {sublib_ID: target_genes}

# Build the oligos, then keep only those with exactly two BsmBI site hits.
result = get_sgRNA_main(sublib_ID=sublib_ID, HUGO_input=HUGO_input, data_ABE=data_ABE_3, data_CBE=data_CBE_3)
clean_result = remove_bad_oligos(result)

# Control oligos: rows 0..69 of the control table, filtered the same way.
control = get_sgRNA_control(sublib_ID=sublib_ID, top_N=70)
clean_control = remove_bad_oligos(control)

# Uncomment to export the tables.
#clean_result.to_csv('clean_result.csv')
#clean_control.to_csv('clean_control.csv')