# Import packages

In [1]:
from detectDesign import *
from itertools import repeat
# Use seaborn's 'whitegrid' plot style. `sns` is not imported explicitly in this
# cell, so it presumably comes from the detectDesign star-import -- confirm.
sns.set_style('whitegrid')
%matplotlib inline

# Selecting targets for T and E 2

In [2]:
# ---------------------------------------------------------------------------
# Output
# ---------------------------------------------------------------------------
TARGET_CSV = True        # checked before the target-pair table is written to CSV
OFF_TARGET_CSV = True    # checked before the per-pair off-target tables are written to CSV
OUTPUT_FOLDER = '../results/F_tularensis/'  # prefix prepended to every CSV file name


# ---------------------------------------------------------------------------
# Target genome, name, and region
# ---------------------------------------------------------------------------
TARGET = '../data/sequences/off_target_genomes/Ftula_LSV_genomic.fa'
TARGET_NAME = 'F_tula'
# [low, high] bounds applied to each site's Start; the loading cell skips the
# region filter when this is None.
TARGET_REGION = [601621, 603620]


# ---------------------------------------------------------------------------
# Target-pair parameters
# ---------------------------------------------------------------------------
# PAMs accepted when searching the target for guide sites.
PAMS = ['NGG', 'NAG', 'NGA']

# Candidate pairs must have Shared_Strand equal to this value.
PAIR_SHARE_STRAND = True

# False -> keep pairs whose distance is exactly PAIR_DIST.
# True  -> keep pairs whose distance lies between MIN_DIST and MAX_DIST.
PAIR_RANGE = False
PAIR_DIST = 62
MIN_DIST = 23    # smallest acceptable distance between paired sites
MAX_DIST = 200   # largest acceptable distance between paired sites


# ---------------------------------------------------------------------------
# Off-target parameters
# ---------------------------------------------------------------------------
# When True the target genome itself is also scanned for off-targets.
INCLUDE_SELF = True

# Every off-target genome lives in the same folder; only the file name varies.
_OFF_TARGET_DIR = '../../Scratch/Joshua_Things/off_target_genomes/'
GENOMES = [
    _OFF_TARGET_DIR + file_name
    for file_name in ('genome1.gb', 'Integration Vector.gb', 'IP_193.gb')
]

# PAMs accepted when searching the off-target genomes.
OFF_TARGET_PAMS = ['NGG', 'NAG', 'NGA']

# Thresholds forwarded unchanged to off_target_analysis. Presumably the maximum
# whole-guide mismatches, the maximum seed mismatches, and the seed length --
# TODO confirm against detectDesign.
HAMMING_MAX, SEED_MAX, SEED_SIZE = 8, 3, 8

# Empty accumulators; not referenced by any cell visible in this notebook.
gene_table, target_table = [], []

## Finding potential binding sites in target

In [3]:
# Read the target sequence and enumerate its guide sites for every accepted PAM.
target_seq = read_seq_file(TARGET)
target_sites = find_guides_multiple_pams(target_seq, TARGET_NAME, PAMS)

# Optionally restrict the sites to those whose Start falls inside TARGET_REGION
# (Series.between includes both bounds by default). `is not None` is the
# identity test recommended for None; `!= None` goes through __ne__ instead.
if TARGET_REGION is not None:
    in_region = target_sites.Start.between(TARGET_REGION[0], TARGET_REGION[1])
    target_sites = target_sites[in_region]

## Finding pairs of target sites
Get the total number of proximal sites and extract the sequences corresponding to the ideal pair distance.

In [4]:
# Keep only the sites whose PAM ends in 'GG' (everything after the first base).
has_gg_pam = target_sites['PAM'].apply(lambda pam: pam[1:] == 'GG')
target_sites_gg = target_sites[has_gg_pam]

# Pair the remaining sites; MAX_DIST and MIN_DIST are passed straight through.
target_pairs = pair_sites(target_sites_gg, MAX_DIST, MIN_DIST)

# Annotate each pair with the absolute start-to-start distance and with whether
# both sites sit on the same strand.
target_pairs.loc[:, 'Pair_Dist'] = np.abs(target_pairs['Start_2'] - target_pairs['Start_1'])
target_pairs.loc[:, 'Shared_Strand'] = (target_pairs['Strand_2'] == target_pairs['Strand_1'])

# Signed check (Start_1 - Start_2), unlike the absolute Pair_Dist above.
# NOTE(review): no cell visible in this notebook reads exact_pairs -- confirm
# whether it is still needed.
exact_pairs = ((target_pairs.Start_1 - target_pairs.Start_2) == PAIR_DIST)

if TARGET_CSV:
    # The loading cell allows TARGET_REGION to be None, so only add the region
    # bounds to the file name when a region was actually given.
    if TARGET_REGION is not None:
        region_tag = '_' + str(TARGET_REGION[0]) + '_' + str(TARGET_REGION[1])
    else:
        region_tag = ''
    target_pairs.to_csv(OUTPUT_FOLDER + TARGET_NAME + region_tag + '.csv')


  curr = (i, coords[pos].get_value(i), coords[gen].get_value(i))


## For all possible pairs of guide RNAs, find potential off-targets in other genes and B. subtilis

In [5]:
# Distance filter: either a [MIN_DIST, MAX_DIST] window (both bounds included
# by Series.between) or an exact match on PAIR_DIST, selected by PAIR_RANGE.
if PAIR_RANGE:
    dist_cond = target_pairs.Pair_Dist.between(MIN_DIST, MAX_DIST)
else:
    dist_cond = target_pairs.Pair_Dist == PAIR_DIST

# Strand filter: Shared_Strand must equal the configured PAIR_SHARE_STRAND.
strand_cond = target_pairs.Shared_Strand == PAIR_SHARE_STRAND

# Candidate pairs are the rows that satisfy both filters.
candidate_table = target_pairs[dist_cond & strand_cond]

In [6]:
# Display the pairs that passed the distance and strand filters.
candidate_table 

Unnamed: 0,End_1,Genome_1,Guide_1,PAM_1,Start_1,Strand_1,End_2,Genome_2,Guide_2,PAM_2,Start_2,Strand_2,Pair_Dist,Shared_Strand
383,602749,F_tula,ATAGTAGCAACAGTAGATTT,AGG,602729,+,602687,F_tula,TTACAGATATTTTTTTATAA,TGG,602667,+,62,True
557,603060,F_tula,TAAAATTATTTCCGGATTGG,AGG,603040,+,602998,F_tula,AAAGCAAAAAATAATATAAA,AGG,602978,+,62,True
660,603201,F_tula,AATTGCGCGTAGTTTAGATA,TGG,603181,+,603139,F_tula,TTAGTCATAGATATCGGTGG,AGG,603119,+,62,True
695,603222,F_tula,GGGTTGTGTCGGAATGCAGA,AGG,603202,+,603160,F_tula,GGCTCAACAGAATTTGTAAT,TGG,603140,+,62,True
707,603273,F_tula,TGCTAATTTTCATGCAGCAG,CGG,603253,+,603211,F_tula,AGTTTAGATATGGGTTGTGT,CGG,603191,+,62,True
937,603580,F_tula,TCTGAGATGAGATTATCAAA,TGG,603560,+,603518,F_tula,GTGAGAGTGTCCTTGCTGGT,GGG,603498,+,62,True


# Off-target genomes analysis

In [7]:
# Build the list of genomes to scan in a local name rather than rebinding the
# GENOMES config constant. Re-running this cell therefore never prepends TARGET
# a second time, which would scan the target genome more than once.
genomes_to_scan = ([TARGET] if INCLUDE_SELF else []) + list(GENOMES)

# Pair every loaded genome with its file name (last '/'-separated path component).
genome_list = [(read_seq_file(path), path.split('/')[-1]) for path in genomes_to_scan]

# One off-target result per genome, in the same order as genome_list.
off_target_results = [
    off_target_analysis(candidate_table, genome, HAMMING_MAX, SEED_MAX, SEED_SIZE, OFF_TARGET_PAMS)
    for genome in genome_list
]

383 computing sites


  target = guides['One_Hot'].get_value(i)
  scores.append(score.assign(Target_Guide=guides['Guide'].get_value(i)))


4856 site
383 computing pairs
2391 pairs
557 computing sites


  curr = (i, coords[pos].get_value(i), coords[gen].get_value(i))
  target = guides['One_Hot'].get_value(i)
  scores.append(score.assign(Target_Guide=guides['Guide'].get_value(i)))


4822 site
557 computing pairs
2516 pairs
660 computing sites


  curr = (i, coords[pos].get_value(i), coords[gen].get_value(i))
  target = guides['One_Hot'].get_value(i)
  scores.append(score.assign(Target_Guide=guides['Guide'].get_value(i)))


1482 site
660 computing pairs
250 pairs
695 computing sites


  curr = (i, coords[pos].get_value(i), coords[gen].get_value(i))
  target = guides['One_Hot'].get_value(i)
  scores.append(score.assign(Target_Guide=guides['Guide'].get_value(i)))


989 site
695 computing pairs
92 pairs
707 computing sites


  curr = (i, coords[pos].get_value(i), coords[gen].get_value(i))
  target = guides['One_Hot'].get_value(i)
  scores.append(score.assign(Target_Guide=guides['Guide'].get_value(i)))


1670 site
707 computing pairs
282 pairs
937 computing sites


  curr = (i, coords[pos].get_value(i), coords[gen].get_value(i))
  target = guides['One_Hot'].get_value(i)
  scores.append(score.assign(Target_Guide=guides['Guide'].get_value(i)))


1548 site
937 computing pairs
228 pairs


  curr = (i, coords[pos].get_value(i), coords[gen].get_value(i))


383 computing sites


  target = guides['One_Hot'].get_value(i)
  scores.append(score.assign(Target_Guide=guides['Guide'].get_value(i)))


4734 site
383 computing pairs
1185 pairs
557 computing sites


  curr = (i, coords[pos].get_value(i), coords[gen].get_value(i))
  target = guides['One_Hot'].get_value(i)
  scores.append(score.assign(Target_Guide=guides['Guide'].get_value(i)))


6747 site
557 computing pairs
2393 pairs
660 computing sites


  curr = (i, coords[pos].get_value(i), coords[gen].get_value(i))
  target = guides['One_Hot'].get_value(i)
  scores.append(score.assign(Target_Guide=guides['Guide'].get_value(i)))


1958 site
660 computing pairs
170 pairs
695 computing sites


  curr = (i, coords[pos].get_value(i), coords[gen].get_value(i))
  target = guides['One_Hot'].get_value(i)
  scores.append(score.assign(Target_Guide=guides['Guide'].get_value(i)))


2545 site
695 computing pairs
245 pairs
707 computing sites


  curr = (i, coords[pos].get_value(i), coords[gen].get_value(i))
  target = guides['One_Hot'].get_value(i)
  scores.append(score.assign(Target_Guide=guides['Guide'].get_value(i)))


2627 site
707 computing pairs
261 pairs
937 computing sites


  curr = (i, coords[pos].get_value(i), coords[gen].get_value(i))
  target = guides['One_Hot'].get_value(i)
  scores.append(score.assign(Target_Guide=guides['Guide'].get_value(i)))


2685 site
937 computing pairs
298 pairs


  curr = (i, coords[pos].get_value(i), coords[gen].get_value(i))


383 computing sites
4 site
383 computing pairs
0 pairs
557 computing sites


  target = guides['One_Hot'].get_value(i)
  scores.append(score.assign(Target_Guide=guides['Guide'].get_value(i)))
  curr = (i, coords[pos].get_value(i), coords[gen].get_value(i))
  target = guides['One_Hot'].get_value(i)
  scores.append(score.assign(Target_Guide=guides['Guide'].get_value(i)))
  curr = (i, coords[pos].get_value(i), coords[gen].get_value(i))
  target = guides['One_Hot'].get_value(i)


9 site
557 computing pairs
0 pairs
660 computing sites
3 site
660 computing pairs
0 pairs
695 computing sites
2 site
695 computing pairs
0 pairs
707 computing sites


  scores.append(score.assign(Target_Guide=guides['Guide'].get_value(i)))
  curr = (i, coords[pos].get_value(i), coords[gen].get_value(i))
  target = guides['One_Hot'].get_value(i)
  scores.append(score.assign(Target_Guide=guides['Guide'].get_value(i)))
  curr = (i, coords[pos].get_value(i), coords[gen].get_value(i))
  target = guides['One_Hot'].get_value(i)
  scores.append(score.assign(Target_Guide=guides['Guide'].get_value(i)))


3 site
707 computing pairs
0 pairs
937 computing sites
5 site
937 computing pairs
0 pairs
383 computing sites


  curr = (i, coords[pos].get_value(i), coords[gen].get_value(i))
  target = guides['One_Hot'].get_value(i)
  scores.append(score.assign(Target_Guide=guides['Guide'].get_value(i)))
  curr = (i, coords[pos].get_value(i), coords[gen].get_value(i))
  target = guides['One_Hot'].get_value(i)
  scores.append(score.assign(Target_Guide=guides['Guide'].get_value(i)))
  curr = (i, coords[pos].get_value(i), coords[gen].get_value(i))
  target = guides['One_Hot'].get_value(i)
  scores.append(score.assign(Target_Guide=guides['Guide'].get_value(i)))
  curr = (i, coords[pos].get_value(i), coords[gen].get_value(i))
  target = guides['One_Hot'].get_value(i)
  scores.append(score.assign(Target_Guide=guides['Guide'].get_value(i)))


4 site
383 computing pairs
0 pairs
557 computing sites
9 site
557 computing pairs
0 pairs
660 computing sites
3 site
660 computing pairs
0 pairs
695 computing sites


  curr = (i, coords[pos].get_value(i), coords[gen].get_value(i))
  target = guides['One_Hot'].get_value(i)
  scores.append(score.assign(Target_Guide=guides['Guide'].get_value(i)))
  curr = (i, coords[pos].get_value(i), coords[gen].get_value(i))
  target = guides['One_Hot'].get_value(i)
  scores.append(score.assign(Target_Guide=guides['Guide'].get_value(i)))
  curr = (i, coords[pos].get_value(i), coords[gen].get_value(i))
  target = guides['One_Hot'].get_value(i)
  scores.append(score.assign(Target_Guide=guides['Guide'].get_value(i)))


2 site
695 computing pairs
0 pairs
707 computing sites
3 site
707 computing pairs
0 pairs
937 computing sites
5 site
937 computing pairs
0 pairs


  curr = (i, coords[pos].get_value(i), coords[gen].get_value(i))


In [8]:
# For every candidate pair, collect its off-target hits from all scanned genomes
# and, when OFF_TARGET_CSV is set, write them to one CSV file per pair.
for i in candidate_table.index:
    # Keep only the genomes that reported at least one hit for pair i.
    pair_i_pd_list = [
        genome_hits[i] for genome_hits in off_target_results if len(genome_hits[i]) > 0
    ]

    if not pair_i_pd_list:
        # pd.concat raises ValueError on an empty list; with no hits in any
        # genome there is nothing to combine or export for this pair.
        print(i, 'no off-target pairs found, skipping')
        continue

    pair_i_pd = pd.concat(pair_i_pd_list, axis=0)

    if OFF_TARGET_CSV:
        # Drop the seed one-hot columns, collapse rows that share the same
        # start/genome coordinates, then sort by ascending mismatch counts.
        pair_i_pd = (
            pair_i_pd
            .drop(columns=['Seed_One_Hot_1', 'Seed_One_Hot_2'])
            .drop_duplicates(['Start_1', 'Genome_1', 'Genome_2', 'Start_2'], keep='first')
            .sort_values(['Full_Mism_1', 'Seed_Mism_1', 'Full_Mism_2', 'Seed_Mism_2'], ascending=True)
        )
        pair_i_pd.to_csv(OUTPUT_FOLDER + TARGET_NAME + '_pair_' + str(i) + '.csv')