In [1]:
# test function to form the reverse complement of a DNA sequence
from sgRNAutils import revComp
revComp('ATG')

'CAT'

In [2]:
# test function to extract the DNA sequence at particular genomic coordinates of a particular chromosome
# to confirm look up the genome coordinates and the chromosome for drosophila melanogaster dm6 on the UCSC genome browser https://genome.ucsc.edu/cgi-bin/hgTracks?db=dm6
# below the link to to verify the results of the query below
# https://genome.ucsc.edu/cgi-bin/hgTracks?db=dm6&lastVirtModeType=default&lastVirtModeExtraState=&virtModeType=default&virtMode=0&nonVirtPosition=&position=chrX%3A20000080%2D20000090&hgsid=1877457346_JpaxsFWfPkr6i1rbo2jG4jhD73Fb
from sgRNAutils import refSeq
refSeqPerChromosome= refSeq()
refSeqPerChromosome['X'][20000080:20000090]

Seq('TGCTAGAGGG')

In [4]:
# test function to identify known transcripts of transcription factors and extract their genomic information
# this cell takes about 50s to run but you can avoid it by running the next cell and loading the output from this cell from MockMaterials
# to confirm the results look up the transcriptIDs below on the UCSC genome browser and double check the genomic information: https://genome.ucsc.edu/cgi-bin/hgTracks?db=dm6
# here an example for FBtr0070072: https://genome.ucsc.edu/cgi-bin/hgTracks?db=dm6&lastVirtModeType=default&lastVirtModeExtraState=&virtModeType=default&virtMode=0&nonVirtPosition=&position=chrX%3A370031%2D370947&hgsid=1877489778_pLlolcHmOnq6afrqpZAa7WNNrw7r
# you can also check the different transcripts belonging to the same gene in the UCSC genome browser
from sgRNAutils import make_dataframe_from_TFs_list
TFsdf = make_dataframe_from_TFs_list('inputfiles/mockMaterials/TFsTruncatedLong.xlsx', refSeqPerChromosome)
display(TFsdf)

Unnamed: 0,Gene_ID,Transcript_ID,Chromosome,Gene_Region,Start,Stop,Strand,Reference_Seq
0,FBgn0000022,FBtr0070072,X,start_codon,370094,370096,+,CTAATGAATAGATTGGTGTGTGATGTAGTGATCTAATATGGTGAAG...
1,FBgn0000022,FBtr0070072,X,stop_codon,370697,370699,+,GACATTGTGTCGTTCGTATGTCGCCCATTGAGACCGCCAATGGAGG...
2,FBgn0003963,FBtr0078063,2L,start_codon,524038,524040,+,CAGAAAGGGATGGGCACACTCGCCTGTTTCCTGCAGTCGAACAGAT...
3,FBgn0003963,FBtr0078063,2L,stop_codon,539954,539956,+,CCTGTAATTTTTCGATGAGGCCCAAAGTGGCCACGTGGCACCCAGA...
4,FBgn0003963,FBtr0310027,2L,start_codon,491316,491318,+,CACTGATAAAAAATACCCTAAATTATCGCAACGCAAATGGTAAAGT...
5,FBgn0003963,FBtr0310027,2L,stop_codon,539954,539956,+,CCTGTAATTTTTCGATGAGGCCCAAAGTGGCCACGTGGCACCCAGA...
6,FBgn0003963,FBtr0329895,2L,start_codon,524038,524040,+,CAGAAAGGGATGGGCACACTCGCCTGTTTCCTGCAGTCGAACAGAT...
7,FBgn0003963,FBtr0329895,2L,stop_codon,540017,540019,+,TTTTTTAATGTTCTGCACTTGCTTAATTACCAGCATGTACCATCTG...
8,FBgn0003963,FBtr0333012,2L,start_codon,525418,525420,+,ATCATTGGCCAAAAAAGTAGGAATTCTGGGAGGAGGGCATGCCAAA...
9,FBgn0003963,FBtr0333012,2L,stop_codon,539954,539956,+,CCTGTAATTTTTCGATGAGGCCCAAAGTGGCCACGTGGCACCCAGA...


In [3]:
# test function to extract 225 bp of the genome left and right of the start/stop codon of a transcript as homology arms (HA)
from sgRNAutils import make_homology_arm_fragments
import pandas as pd
TFsdf = pd.read_excel('inputfiles/mockMaterials/TFsdfTruncatedLong.xlsx')
TFsdf = make_homology_arm_fragments(TFsdf, refSeqPerChromosome)
TFsdf = TFsdf.reset_index()
del TFsdf["index"]

# for every transcript in the dataframe above ...
for index, row in TFsdf.iterrows():
    # ... check that the homology arms are 225 bp long and that the sequence between the homology arms is a start or a stop codon in forward or reverse complement
    print(refSeqPerChromosome[row["Chromosome"]][row['Start']-1:row['Stop']], len(TFsdf.iloc[0]["HAL"]), len(TFsdf.iloc[0]["HAR"]))
    # ... compare the region around the start/stop codon from the genome file to the end of HAL and the beginning of HAR that this function extracts
    print(refSeqPerChromosome[row["Chromosome"]][row['Start']-21:row['Stop']+20])
    print(TFsdf.iloc[index]["HAL"][-20:] + "XXX" + TFsdf.iloc[index]["HAR"][:20])

ATG 225 225
ACGTTATACTATCTCTTAAAATGGCTTTGGGCAGCGAAAATCA
ACGTTATACTATCTCTTAAAXXXGCTTTGGGCAGCGAAAATCA
TAA 225 225
CACTCTGGCAGGACGACCTGTAAAAAAACAGATCAAATCTTCA
CACTCTGGCAGGACGACCTGXXXAAAAACAGATCAAATCTTCA
ATG 225 225
ATACACATATAGGGAAAACAATGCTTAGCAGCAATACAAGAGG
ATACACATATAGGGAAAACAXXXCTTAGCAGCAATACAAGAGG
TGA 225 225
ACCTGCAGGAGGCGGCCATTTGAGAAAGCCAGCTGCGGTGGGA
ACCTGCAGGAGGCGGCCATTXXXGAAAGCCAGCTGCGGTGGGA
ATG 225 225
GTCCAGCAGCTGGACTCGAAATGTCACGACGCAAACAGTCCAA
GTCCAGCAGCTGGACTCGAAXXXTCACGACGCAAACAGTCCAA
TGA 225 225
ACCTGCAGGAGGCGGCCATTTGAGAAAGCCAGCTGCGGTGGGA
ACCTGCAGGAGGCGGCCATTXXXGAAAGCCAGCTGCGGTGGGA
ATG 225 225
ATACACATATAGGGAAAACAATGCTTAGCAGCAATACAAGAGG
ATACACATATAGGGAAAACAXXXCTTAGCAGCAATACAAGAGG
TAA 225 225
AGTGGTATTACACGTGCTTCTAAGCAGCAGCCCAACCCAATCG
AGTGGTATTACACGTGCTTCXXXGCAGCAGCCCAACCCAATCG
ATG 225 225
GTTCCGATACCGCAGAAGAGATGACCGTCGACAGCAGAGGTAA
GTTCCGATACCGCAGAAGAGXXXACCGTCGACAGCAGAGGTAA
TGA 225 225
ACCTGCAGGAGGCGGCCATTTGAGAAAGCCAGCTGCGGTGGGA
ACCTGCAGGAGGCGGCCATTXXXGAAAGCCAGCTGCGGTGGGA


DO NOT RUN THE NEXT CELL without reading this!
The next cell might take about 30 min to run. You can avoid it by running the cell after next cell and loading the output from this cell from MockMaterials.

In [10]:
# test function to identify sgRNAs within a given window around the start/stop codon of a transcript
# this cell takes 32 min to run, you can avoid it by running the next cell and loading the output from this cell from MockMaterials
from sgRNAutils import gRNA_stringencyIterator
import os

folder_name = "inputfiles/mockMaterials/sgRNAsdfs"
window = 21
for index, row in TFsdf.iterrows():
    result_df = gRNA_stringencyIterator(row, window, refSeqPerChromosome)
    display(result_df)
    # test whether every sgRNA identified is within the window of 21 bp around the start/stop codon
    # you can manually do this by comparing the genomic coordinates of the sgRNA (fmin and fmax) with the coordinates of the start/stop codon (Start and Stop)
    for i in range(len(result_df)):
        if result_df['fmin'].iloc[i] >= result_df["Start"].iloc[i] - window & result_df['fmax'].iloc[i] <= result_df["Stop"].iloc[i] + window:
            print("sgRNA within window")

stringency 0
stringency 1


Unnamed: 0,fmin,fmax,#chr,strand,sgRNA_sequence,Gene_ID,Transcript_ID,Chromosome,Gene_Region,Start,Stop,Strand,Reference_Seq,HAL,HAR
0,370082,370104,X,+,CTATCTCTTAAAATGGCTTTGGG,FBgn0000022,FBtr0070072,X,start_codon,370094,370096,+,CTAATGAATAGATTGGTGTGTGATGTAGTGATCTAATATGGTGAAG...,TACTACCTCTCTATTAAAATCAGAGAAAACACTCATCTCAAGAGAC...,GCTTTGGGCAGCGAAAATCACTCTGTTTTCAACGACGACGAGGAGT...


sgRNA within window
stringency 0


Unnamed: 0,fmin,fmax,#chr,strand,sgRNA_sequence,Gene_ID,Transcript_ID,Chromosome,Gene_Region,Start,Stop,Strand,Reference_Seq,HAL,HAR
0,524038,524060,2L,+,ATGCTTAGCAGCAATACAAGAGG,FBgn0003963,FBtr0078063,2L,start_codon,524038,524040,+,CAGAAAGGGATGGGCACACTCGCCTGTTTCCTGCAGTCGAACAGAT...,AAGAGCTACAATACGATACCGCACGTTCGTCATAACGAGAATTTGA...,CTTAGCAGCAATACAAGAGGTGAGAATTATTCATACACAGTTTTCT...


sgRNA within window
stringency 0
stringency 1


Unnamed: 0,fmin,fmax,#chr,strand,sgRNA_sequence,Gene_ID,Transcript_ID,Chromosome,Gene_Region,Start,Stop,Strand,Reference_Seq,HAL,HAR
0,9792987,9793009,2L,+,CACAGTTTACGCAGGTCCATGGG,FBgn0032150,FBtr0079917,2L,start_codon,9793004,9793006,-,AATTTAATTTTTTTTATAATTAATTTAGTGCTAATCTTTGAGCAGC...,AATCATTACAGATCATGGGCAGCTCCTCAGTAAGATTAAGTGCTAT...,GGGTTTTATTATTTAATTAATGTAAATAAACTGTAATGTTAATGTT...
1,9792986,9793008,2L,+,GCACAGTTTACGCAGGTCCATGG,FBgn0032150,FBtr0079917,2L,start_codon,9793004,9793006,-,AATTTAATTTTTTTTATAATTAATTTAGTGCTAATCTTTGAGCAGC...,AATCATTACAGATCATGGGCAGCTCCTCAGTAAGATTAAGTGCTAT...,GGGTTTTATTATTTAATTAATGTAAATAAACTGTAATGTTAATGTT...


sgRNA within window
sgRNA within window
stringency 0
stringency 1
stringency 2
stringency 3


Unnamed: 0,fmin,fmax,#chr,strand,sgRNA_sequence,Gene_ID,Transcript_ID,Chromosome,Gene_Region,Start,Stop,Strand,Reference_Seq,HAL,HAR
0,8362874,8362896,3R,+,CTGAAAATGTCGCCACCATCTGG,FBgn0037584,FBtr0301759,3R,start_codon,8362880,8362882,+,GATATACATCTATACATTATACGGATATTTATAGCATATACCTGCG...,AATATTTGTTGGCTGTTAGTATTAAATATATTCATATATTTATATT...,TCGCCACCATCTGGTGAATTCCGTTGCCGAGTTTGTCTTAAACAGG...


sgRNA within window
stringency 0
stringency 1
stringency 2


Unnamed: 0,fmin,fmax,#chr,strand,sgRNA_sequence,Gene_ID,Transcript_ID,Chromosome,Gene_Region,Start,Stop,Strand,Reference_Seq,HAL,HAR
0,18506503,18506525,3R,+,GAGTTCTTTTGCAGCATTTATGG,FBgn0004652,FBtr0083648,3R,start_codon,18506517,18506519,-,CCTAATGCAACTTATTGAATTTTCGAGTGCTTGGCTTAGTTGATTT...,GAAAAAAAAAAATCGAAATATGTCCAGTCGTACTCACGTATGAATG...,TTATGGGAAACTAAAACCGAGCGATTTTATGACCGCGACGATTCGA...
1,18506504,18506526,3R,+,AGTTCTTTTGCAGCATTTATGGG,FBgn0004652,FBtr0083648,3R,start_codon,18506517,18506519,-,CCTAATGCAACTTATTGAATTTTCGAGTGCTTGGCTTAGTTGATTT...,GAAAAAAAAAAATCGAAATATGTCCAGTCGTACTCACGTATGAATG...,TTATGGGAAACTAAAACCGAGCGATTTTATGACCGCGACGATTCGA...


sgRNA within window
sgRNA within window
stringency 0
stringency 1
stringency 2
stringency 3


Unnamed: 0,fmin,fmax,#chr,strand,sgRNA_sequence,Gene_ID,Transcript_ID,Chromosome,Gene_Region,Start,Stop,Strand,Reference_Seq,HAL,HAR
0,26005878,26005900,3R,-,CCATTTCCAGAACCATTTTGTTG,FBgn0002733,FBtr0084982,3R,start_codon,26005891,26005893,-,GGTCCAGCGGACAAAACGTTTCGGTTAGATAAGTTGTTAGCCGTTT...,CAGCTGCTTCTGGGCACGCAGCTTCTTCATGTGCTCCACGGTCAGC...,TTTGTTGTAGTTTGGTTTTGTTTTTTTTTTATTGGGGGTAAAGTAA...
1,26005884,26005906,3R,-,CCAGAACCATTTTGTTGTAGTTT,FBgn0002733,FBtr0084982,3R,start_codon,26005891,26005893,-,GGTCCAGCGGACAAAACGTTTCGGTTAGATAAGTTGTTAGCCGTTT...,CAGCTGCTTCTGGGCACGCAGCTTCTTCATGTGCTCCACGGTCAGC...,TTTGTTGTAGTTTGGTTTTGTTTTTTTTTTATTGGGGGTAAAGTAA...


sgRNA within window
sgRNA within window
stringency 0
stringency 1
stringency 2
stringency 3
stringency 4


Unnamed: 0,fmin,fmax,#chr,strand,sgRNA_sequence,Gene_ID,Transcript_ID,Chromosome,Gene_Region,Start,Stop,Strand,Reference_Seq,HAL,HAR


stringency 0
stringency 1


Unnamed: 0,fmin,fmax,#chr,strand,sgRNA_sequence,Gene_ID,Transcript_ID,Chromosome,Gene_Region,Start,Stop,Strand,Reference_Seq,HAL,HAR
0,8145930,8145952,X,-,CCAGGTGGGAGCCGCAATCATGC,FBgn0030008,FBtr0071155,X,start_codon,8145949,8145951,+,GTGCGTGTCAAATAAATTCGATGGAAACCCGTGAAACGAGGACCGA...,TTTTGTCACTAACATTTCAAAAGTAAATGAAATAATAATAAAGATG...,CTCAAGTCCCTCAAGCCGGACACGTACGCCGGAATGGCGAACGCCA...


sgRNA within window
stringency 0
stringency 1
stringency 2


Unnamed: 0,fmin,fmax,#chr,strand,sgRNA_sequence,Gene_ID,Transcript_ID,Chromosome,Gene_Region,Start,Stop,Strand,Reference_Seq,HAL,HAR
0,15138209,15138231,2R,-,CCTGCAATGGCCACACTAATACC,FBgn0015371,FBtr0111006,2R,start_codon,15138215,15138217,+,GTTGCCACCATGACTGCCACTGCTGCCGGAACTCGAGCTGGCCACC...,TCGTTAATTCAATTAATCAGTCTCTATTCTTTAAGTCTTTACAATT...,GCCACACTAATACCAGTGAACGGCGGCCATCCAGCAGCGAGCGGAC...
1,15138196,15138218,2R,+,GCAAAGTGAAGCACCTGCAATGG,FBgn0015371,FBtr0111006,2R,start_codon,15138215,15138217,+,GTTGCCACCATGACTGCCACTGCTGCCGGAACTCGAGCTGGCCACC...,TCGTTAATTCAATTAATCAGTCTCTATTCTTTAAGTCTTTACAATT...,GCCACACTAATACCAGTGAACGGCGGCCATCCAGCAGCGAGCGGAC...


sgRNA within window
sgRNA within window
stringency 0
stringency 1
stringency 2


Unnamed: 0,fmin,fmax,#chr,strand,sgRNA_sequence,Gene_ID,Transcript_ID,Chromosome,Gene_Region,Start,Stop,Strand,Reference_Seq,HAL,HAR
0,17757557,17757579,2R,-,CCAACTTGTTTTTGAACAATGCC,FBgn0022740,FBtr0089547,2R,start_codon,17757575,17757577,+,TTCTAGTGGGGGGCCCGGCTGAATGGCTGACTCCGGTCATGGGATG...,GAGAGCTGCAAACTGCGGAGCAGAATGCTCAAAAAGGAGGATCTCC...,CCACGCAAGAAGCGCAGTTCCAGCCCCACGGAGTTCTTTGATGACG...


sgRNA within window
stringency 0
stringency 1
stringency 2
stringency 3
stringency 4


Unnamed: 0,fmin,fmax,#chr,strand,sgRNA_sequence,Gene_ID,Transcript_ID,Chromosome,Gene_Region,Start,Stop,Strand,Reference_Seq,HAL,HAR


stringency 0


Unnamed: 0,fmin,fmax,#chr,strand,sgRNA_sequence,Gene_ID,Transcript_ID,Chromosome,Gene_Region,Start,Stop,Strand,Reference_Seq,HAL,HAR
0,30852536,30852558,3R,-,CCCACATCGGTATGCAGTCGTCG,FBgn0003720,FBtr0085709,3R,start_codon,30852547,30852549,+,TATGAGAAATCGTTGATTTCACTGATCGAGTATTGTCCAAAGCGCT...,GCCGGCGGTCCTCACAGCAGACAACACAACCCATCGTGATCTCAGC...,CAGTCGTCGGAGGGTTCACCAGACATGATGGATCAGAAATACAACA...


sgRNA within window
stringency 0
stringency 1


Unnamed: 0,fmin,fmax,#chr,strand,sgRNA_sequence,Gene_ID,Transcript_ID,Chromosome,Gene_Region,Start,Stop,Strand,Reference_Seq,HAL,HAR
0,11278450,11278472,2L,+,CATGCAGCTGCATCCCAATGCGG,FBgn0052831,FBtr0091683,2L,start_codon,11278451,11278453,+,TTAGCGGACCATTAGAAACACAATTGAGTTTGCATTGGGTGCTTTA...,GCGGGAGAGAGTCGAGAGTCATTCGTTTGTCTCACTCCTGCGAGTG...,CAGCTGCATCCCAATGCGGAGTCGCCATCGGGGTAAGTTGATTCTA...


sgRNA within window
stringency 0
stringency 1
stringency 2


Unnamed: 0,fmin,fmax,#chr,strand,sgRNA_sequence,Gene_ID,Transcript_ID,Chromosome,Gene_Region,Start,Stop,Strand,Reference_Seq,HAL,HAR
0,7492183,7492205,2R,+,TCTCCATGTCCAGATCCGAGTGG,FBgn0263240,FBtr0088984,2R,start_codon,7492187,7492189,-,TTTTATTTGGTTCAAAACAAAGTATGCATCATATTATTAATTTATT...,CTGACCCACCTCGAACCAAACAGGCGTGATGTCGCTGCGCTTCTGG...,GTCCAGATCCGAGTGGCCAATTTCCGATAGCCCGATAGCTAGTTAG...
1,7492180,7492202,2R,-,CCATCTCCATGTCCAGATCCGAG,FBgn0263240,FBtr0088984,2R,start_codon,7492187,7492189,-,TTTTATTTGGTTCAAAACAAAGTATGCATCATATTATTAATTTATT...,CTGACCCACCTCGAACCAAACAGGCGTGATGTCGCTGCGCTTCTGG...,GTCCAGATCCGAGTGGCCAATTTCCGATAGCCCGATAGCTAGTTAG...
2,7492186,7492208,2R,-,CCATGTCCAGATCCGAGTGGCCA,FBgn0263240,FBtr0088984,2R,start_codon,7492187,7492189,-,TTTTATTTGGTTCAAAACAAAGTATGCATCATATTATTAATTTATT...,CTGACCCACCTCGAACCAAACAGGCGTGATGTCGCTGCGCTTCTGG...,GTCCAGATCCGAGTGGCCAATTTCCGATAGCCCGATAGCTAGTTAG...


sgRNA within window
sgRNA within window
sgRNA within window
stringency 0
stringency 1


Unnamed: 0,fmin,fmax,#chr,strand,sgRNA_sequence,Gene_ID,Transcript_ID,Chromosome,Gene_Region,Start,Stop,Strand,Reference_Seq,HAL,HAR
0,361640,361662,3L,-,CCAAGGAAGGCACCCACGATGAG,FBgn0035137,FBtr0072548,3L,start_codon,361658,361660,+,CACCAAAATCAAGGTCCAAAGCACTAAAACGTGTGTTTTCCACATG...,TAACGCGACAGTGTGACCGAATATAGTTGTGCAAACACCGCTGCCA...,AGCGATGCGACCTATCCGGGATATTACGGCGGCAACCAGGTGAGCA...
1,361653,361675,3L,-,CCACGATGAGCGATGCGACCTAT,FBgn0035137,FBtr0072548,3L,start_codon,361658,361660,+,CACCAAAATCAAGGTCCAAAGCACTAAAACGTGTGTTTTCCACATG...,TAACGCGACAGTGTGACCGAATATAGTTGTGCAAACACCGCTGCCA...,AGCGATGCGACCTATCCGGGATATTACGGCGGCAACCAGGTGAGCA...


sgRNA within window
sgRNA within window
stringency 0
stringency 1
stringency 2


Unnamed: 0,fmin,fmax,#chr,strand,sgRNA_sequence,Gene_ID,Transcript_ID,Chromosome,Gene_Region,Start,Stop,Strand,Reference_Seq,HAL,HAR
0,8600563,8600585,2R,-,CCAGTAAAGCGCAGCAGCAATGA,FBgn0267792,FBtr0088692,2R,start_codon,8600582,8600584,+,GCCGCGGGTAAAGGCCTCTCACCAAACAAAGAAAATGTGACGAGCT...,GAAGCGCATTGTATTTGGCCACCATCGAGCGATGGCCACACTGCCA...,ATGCCGCTGCAGGAGTCGCCGTCGCCCAGTTGGCAGCTGGAGGACT...


sgRNA within window
stringency 0
stringency 1


Unnamed: 0,fmin,fmax,#chr,strand,sgRNA_sequence,Gene_ID,Transcript_ID,Chromosome,Gene_Region,Start,Stop,Strand,Reference_Seq,HAL,HAR
0,10533746,10533768,2R,-,CCATTCTGGTTTCTCGGGTGGTT,FBgn0283521,FBtr0343132,2R,start_codon,10533747,10533749,-,GGCACGATTATTACCGCTGCAGTTCTCCTTTTTCCTTTGAGCTCCT...,TGTATTTTTAGTGTTAACAGAGTTTTCCGAAACTATGAATTTGGTA...,TCTGGTTTCTCGGGTGGTTGGATGGGGTTGTTGGAAATGGATCCTG...
1,10533744,10533766,2R,+,ATCCATTCTGGTTTCTCGGGTGG,FBgn0283521,FBtr0343132,2R,start_codon,10533747,10533749,-,GGCACGATTATTACCGCTGCAGTTCTCCTTTTTCCTTTGAGCTCCT...,TGTATTTTTAGTGTTAACAGAGTTTTCCGAAACTATGAATTTGGTA...,TCTGGTTTCTCGGGTGGTTGGATGGGGTTGTTGGAAATGGATCCTG...


sgRNA within window
sgRNA within window
stringency 0
stringency 1
stringency 2


Unnamed: 0,fmin,fmax,#chr,strand,sgRNA_sequence,Gene_ID,Transcript_ID,Chromosome,Gene_Region,Start,Stop,Strand,Reference_Seq,HAL,HAR
0,18514698,18514720,X,+,CGGCTTGTTCATCTCGTGTGTGG,FBgn0030933,FBtr0074621,X,start_codon,18514707,18514709,-,CATCAGTAGACTTACATCAGCAAGAGATTAATATAAACAATTGCTT...,CATGCTCGATCCTTCCCGCCCTGCTCCTCCGCCTGCGCCAGCCGTG...,CTCGTGTGTGGTGTGCGTGTGTGTTTGTGTATGTGTAGCGTGTCTT...


sgRNA within window
stringency 0
stringency 1
stringency 2
stringency 3
stringency 4


Unnamed: 0,fmin,fmax,#chr,strand,sgRNA_sequence,Gene_ID,Transcript_ID,Chromosome,Gene_Region,Start,Stop,Strand,Reference_Seq,HAL,HAR


In [4]:
# to avoid the cell above but still manually check the results of the gRNA_stringencyIterator function, run this cell
# the gRNA_stringencyIterator is supposed to identify sgRNAs within a given window around the start/stop codon of a transcript
# to test the downstream functions we need to identifiy representatives for the cases and combinations that the downstream functions are supposed to handle
# there are 8 different combinations of start/stop codon - sgRNA strand - gene strand (see MutationLogic_ML_20231001.pptx)
# however, for two combinations each, gene and sgRNA have effectively the same relative position to each other 
# (e.g. the two combinations start codon - sgRNA strand (+) - gene strand (+) and stop codon - sgRNA strand (+) - gene strand (-))
# therefore, we have identified 4 combinations with unique relative positions to test the downstream pipeline
# further, we have identified representatives for the 3 possible sgRNA position cases relevant for the find_best_sgRNA function
# further, we have identified representatives for the 3 possible sgRNA position cases relevant for the find_best_mutation function
sgRNAdf1 = pd.read_excel('inputfiles/mockMaterials/sgRNAsdfs/FBgn0032150_FBtr0079917.xlsx') # PAM in start/stop, start codon - gene (-) - gRNA (+)
sgRNAdf2 = pd.read_excel('inputfiles/mockMaterials/sgRNAsdfs/FBgn0002733_FBtr0084982.xlsx') # max 15 bp overlap, start codon - gene (-) - gRNA (-)
sgRNAdf3 = pd.read_excel('inputfiles/mockMaterials/sgRNAsdfs/FBgn0003963_FBtr0078063.xlsx') # more than 15 bp overlap, PAM in CDS
sgRNAdf4 = pd.read_excel('inputfiles/mockMaterials/sgRNAsdfs/FBgn0000022_FBtr0070072.xlsx') # start codon - gene (+) - gRNA (+)
sgRNAdf5 = pd.read_excel('inputfiles/mockMaterials/sgRNAsdfs/FBgn0003720_FBtr0085709.xlsx') # start codon - gene (+) - gRNA (-)
sgRNAdf6 = pd.read_excel('inputfiles/mockMaterials/sgRNAsdfs/FBgn0052831_FBtr0091683.xlsx') # SRS in CDS (need to manually change "PAM in CDS" to False)
sgRNAdf7 = pd.read_excel('inputfiles/mockMaterials/sgRNAsdfs/FBgn0267792_FBtr0088692.xlsx') # PAM and SRS bot outside CDS

for i in [sgRNAdf1, sgRNAdf2, sgRNAdf3, sgRNAdf4, sgRNAdf5, sgRNAdf6, sgRNAdf7]:
    display(i)

Unnamed: 0,fmin,fmax,#chr,strand,sgRNA_sequence,Gene_ID,Transcript_ID,Chromosome,Gene_Region,Start,Stop,Strand,Reference_Seq,HAL,HAR
0,9792987,9793009,2L,+,CACAGTTTACGCAGGTCCATGGG,FBgn0032150,FBtr0079917,2L,start_codon,9793004,9793006,-,AATTTAATTTTTTTTATAATTAATTTAGTGCTAATCTTTGAGCAGC...,AATCATTACAGATCATGGGCAGCTCCTCAGTAAGATTAAGTGCTAT...,GGGTTTTATTATTTAATTAATGTAAATAAACTGTAATGTTAATGTT...
1,9792986,9793008,2L,+,GCACAGTTTACGCAGGTCCATGG,FBgn0032150,FBtr0079917,2L,start_codon,9793004,9793006,-,AATTTAATTTTTTTTATAATTAATTTAGTGCTAATCTTTGAGCAGC...,AATCATTACAGATCATGGGCAGCTCCTCAGTAAGATTAAGTGCTAT...,GGGTTTTATTATTTAATTAATGTAAATAAACTGTAATGTTAATGTT...


Unnamed: 0,fmin,fmax,#chr,strand,sgRNA_sequence,Gene_ID,Transcript_ID,Chromosome,Gene_Region,Start,Stop,Strand,Reference_Seq,HAL,HAR
0,26005878,26005900,3R,-,CCATTTCCAGAACCATTTTGTTG,FBgn0002733,FBtr0084982,3R,start_codon,26005891,26005893,-,GGTCCAGCGGACAAAACGTTTCGGTTAGATAAGTTGTTAGCCGTTT...,CAGCTGCTTCTGGGCACGCAGCTTCTTCATGTGCTCCACGGTCAGC...,TTTGTTGTAGTTTGGTTTTGTTTTTTTTTTATTGGGGGTAAAGTAA...
1,26005884,26005906,3R,-,CCAGAACCATTTTGTTGTAGTTT,FBgn0002733,FBtr0084982,3R,start_codon,26005891,26005893,-,GGTCCAGCGGACAAAACGTTTCGGTTAGATAAGTTGTTAGCCGTTT...,CAGCTGCTTCTGGGCACGCAGCTTCTTCATGTGCTCCACGGTCAGC...,TTTGTTGTAGTTTGGTTTTGTTTTTTTTTTATTGGGGGTAAAGTAA...


Unnamed: 0,fmin,fmax,#chr,strand,sgRNA_sequence,Gene_ID,Transcript_ID,Chromosome,Gene_Region,Start,Stop,Strand,Reference_Seq,HAL,HAR
0,524038,524060,2L,+,ATGCTTAGCAGCAATACAAGAGG,FBgn0003963,FBtr0078063,2L,start_codon,524038,524040,+,CAGAAAGGGATGGGCACACTCGCCTGTTTCCTGCAGTCGAACAGAT...,AAGAGCTACAATACGATACCGCACGTTCGTCATAACGAGAATTTGA...,CTTAGCAGCAATACAAGAGGTGAGAATTATTCATACACAGTTTTCT...


Unnamed: 0,fmin,fmax,#chr,strand,sgRNA_sequence,Gene_ID,Transcript_ID,Chromosome,Gene_Region,Start,Stop,Strand,Reference_Seq,HAL,HAR
0,370082,370104,X,+,CTATCTCTTAAAATGGCTTTGGG,FBgn0000022,FBtr0070072,X,start_codon,370094,370096,+,CTAATGAATAGATTGGTGTGTGATGTAGTGATCTAATATGGTGAAG...,TACTACCTCTCTATTAAAATCAGAGAAAACACTCATCTCAAGAGAC...,GCTTTGGGCAGCGAAAATCACTCTGTTTTCAACGACGACGAGGAGT...


Unnamed: 0,fmin,fmax,#chr,strand,sgRNA_sequence,Gene_ID,Transcript_ID,Chromosome,Gene_Region,Start,Stop,Strand,Reference_Seq,HAL,HAR
0,30852536,30852558,3R,-,CCCACATCGGTATGCAGTCGTCG,FBgn0003720,FBtr0085709,3R,start_codon,30852547,30852549,+,TATGAGAAATCGTTGATTTCACTGATCGAGTATTGTCCAAAGCGCT...,GCCGGCGGTCCTCACAGCAGACAACACAACCCATCGTGATCTCAGC...,CAGTCGTCGGAGGGTTCACCAGACATGATGGATCAGAAATACAACA...


Unnamed: 0,fmin,fmax,#chr,strand,sgRNA_sequence,Gene_ID,Transcript_ID,Chromosome,Gene_Region,Start,Stop,Strand,Reference_Seq,HAL,HAR
0,11278450,11278472,2L,+,CATGCAGCTGCATCCCAATGCGG,FBgn0052831,FBtr0091683,2L,start_codon,11278451,11278453,+,TTAGCGGACCATTAGAAACACAATTGAGTTTGCATTGGGTGCTTTA...,GCGGGAGAGAGTCGAGAGTCATTCGTTTGTCTCACTCCTGCGAGTG...,CAGCTGCATCCCAATGCGGAGTCGCCATCGGGGTAAGTTGATTCTA...


Unnamed: 0,fmin,fmax,#chr,strand,sgRNA_sequence,Gene_ID,Transcript_ID,Chromosome,Gene_Region,Start,Stop,Strand,Reference_Seq,HAL,HAR
0,8600563,8600585,2R,-,CCAGTAAAGCGCAGCAGCAATGA,FBgn0267792,FBtr0088692,2R,start_codon,8600582,8600584,+,GCCGCGGGTAAAGGCCTCTCACCAAACAAAGAAAATGTGACGAGCT...,GAAGCGCATTGTATTTGGCCACCATCGAGCGATGGCCACACTGCCA...,ATGCCGCTGCAGGAGTCGCCGTCGCCCAGTTGGCAGCTGGAGGACT...


In [5]:
# test the sgRNApositionCheck function which checks how the sgRNA is positioned relative to the start/stop codon
from sgRNAutils import sgRNApositionCheck

#Create list to store results in
positionCheckedList = []

#Derive min and max distance from stop position from window size
window = 21
minDistance = (window + 3) * -1 # leave a 3bp gap to account for start/stop site #TODO: explain this better
maxDistance = window

for sgRNAdf in [sgRNAdf1, sgRNAdf2, sgRNAdf3, sgRNAdf4, sgRNAdf5, sgRNAdf6, sgRNAdf7]:
    result_df = sgRNApositionCheck(sgRNAdf, minDistance, maxDistance)
    display(result_df)
    # check whether the sgRNA is positioned within the HA as expected
    for i in range(len(result_df)):
        positionScore = result_df.iloc[i]["positionScore"]
        # print positionScore which represents the distance between the end of the start/stop codon and the end of the sgRNA (see MutationLogic_ML_20231001.pptx)
        # note that the positionScore consequently represents the number of nucleotides in the HAR that are part of the sgRNA
        print(positionScore)
        # print the part of the sgRNA that is located in HAR (homology arm right)
        display(result_df.iloc[i]["HAR"][:positionScore])
        # print the part of the sgRNA that is the start/stop codon
        print(refSeqPerChromosome[result_df.iloc[i]["Chromosome"]][result_df.iloc[i]['Start']-1:result_df.iloc[i]['Stop']])
        positionScore = positionScore + 2 # currently the stop of the start/stop codon is zero, we want the stop of the HAL to be -1, 3 nucleotides from the start/stop codon are left out
        # TODO: explain this better
        positionScore = positionScore  -22 # currently we have the stop of the sgRNA, we want the start of the sgRNA, sgRNA is 23 nucleotides long and we want position inclusively
        # print the part of the sgRNA that is located in HAL (homology arm left)
        display(result_df.iloc[i]["HAL"][positionScore:])
    positionCheckedList.append(result_df)

Unnamed: 0,fmin,fmax,#chr,strand,sgRNA_sequence,Gene_ID,Transcript_ID,Chromosome,Gene_Region,Start,...,PAM_in_start/stop,max_15_bp_3’_overhang,PAM_in_CDS,PAM_outside_CDS,SRS_in_CDS,CDS_boundary,non_CDS_boundary,SRS_boundary,mutable_PAM,cut_site
0,9792987,9793009,2L,+,CACAGTTTACGCAGGTCCATGGG,FBgn0032150,FBtr0079917,2L,start_codon,9793004,...,False,True,False,True,True,"[-24, -23, -22, -21, -20, -19, -18, -17, -16, ...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[-8, -7, -6, -5, -4, -3]","[-1, 0]",-2
1,9792986,9793008,2L,+,GCACAGTTTACGCAGGTCCATGG,FBgn0032150,FBtr0079917,2L,start_codon,9793004,...,True,False,False,False,True,"[-24, -23, -22, -21, -20, -19, -18, -17, -16, ...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[-8, -7, -6, -5, -4, -3]","[-1, 0]",-3


3


'GGG'

CAT


'CACAGTTTACGCAGGTC'

2


'GG'

CAT


'GCACAGTTTACGCAGGTC'

Unnamed: 0,fmin,fmax,#chr,strand,sgRNA_sequence,Gene_ID,Transcript_ID,Chromosome,Gene_Region,Start,...,PAM_in_start/stop,max_15_bp_3’_overhang,PAM_in_CDS,PAM_outside_CDS,SRS_in_CDS,CDS_boundary,non_CDS_boundary,SRS_boundary,mutable_PAM,cut_site
0,26005878,26005900,3R,-,CCATTTCCAGAACCATTTTGTTG,FBgn0002733,FBtr0084982,3R,start_codon,26005891,...,False,True,True,False,True,"[-24, -23, -22, -21, -20, -19, -18, -17, -16, ...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[-19, -18, -17, -16, -15, -14]","[-22, -21, -20, -19, -18, -17, -16, -15, -14, ...",-10
1,26005884,26005906,3R,-,CCAGAACCATTTTGTTGTAGTTT,FBgn0002733,FBtr0084982,3R,start_codon,26005891,...,False,True,True,False,True,"[-24, -23, -22, -21, -20, -19, -18, -17, -16, ...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[-19, -18, -17, -16, -15, -14]","[-22, -21, -20, -19, -18, -17, -16, -15, -14, ...",-4


7


'TTTGTTG'

CAT


'CCATTTCCAGAAC'

13


'TTTGTTGTAGTTT'

CAT


'CCAGAAC'

Unnamed: 0,fmin,fmax,#chr,strand,sgRNA_sequence,Gene_ID,Transcript_ID,Chromosome,Gene_Region,Start,...,PAM_in_start/stop,max_15_bp_3’_overhang,PAM_in_CDS,PAM_outside_CDS,SRS_in_CDS,CDS_boundary,non_CDS_boundary,SRS_boundary,mutable_PAM,cut_site
0,524038,524060,2L,+,ATGCTTAGCAGCAATACAAGAGG,FBgn0003963,FBtr0078063,2L,start_codon,524038,...,False,False,True,False,True,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[-24, -23, -22, -21, -20, -19, -18, -17, -16, ...","[-8, -7, -6, -5, -4, -3]","[-1, 0]",15


20


'CTTAGCAGCAATACAAGAGG'

ATG


'AAGAGCTACAATACGATACCGCACGTTCGTCATAACGAGAATTTGAATAATTGTATTACAGTGAGTAGTGAGTCAAAGAAAATTCCCACGGGAAGTGACAACATAATTGCCGCTTAAGTACGGCCCAAACCGAAAGATAGAAATGTGCAAGTGACAAAGTGAAGGAGCCACCAGTAAACCTGGATTATTTGCCAAGTGGATTACAATACACATATAGGGAAAACA'

Unnamed: 0,fmin,fmax,#chr,strand,sgRNA_sequence,Gene_ID,Transcript_ID,Chromosome,Gene_Region,Start,...,PAM_in_start/stop,max_15_bp_3’_overhang,PAM_in_CDS,PAM_outside_CDS,SRS_in_CDS,CDS_boundary,non_CDS_boundary,SRS_boundary,mutable_PAM,cut_site
0,370082,370104,X,+,CTATCTCTTAAAATGGCTTTGGG,FBgn0000022,FBtr0070072,X,start_codon,370094,...,False,True,True,False,True,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[-24, -23, -22, -21, -20, -19, -18, -17, -16, ...","[-8, -7, -6, -5, -4, -3]","[-1, 0]",3


8


'GCTTTGGG'

ATG


'CTATCTCTTAAA'

Unnamed: 0,fmin,fmax,#chr,strand,sgRNA_sequence,Gene_ID,Transcript_ID,Chromosome,Gene_Region,Start,...,PAM_in_start/stop,max_15_bp_3’_overhang,PAM_in_CDS,PAM_outside_CDS,SRS_in_CDS,CDS_boundary,non_CDS_boundary,SRS_boundary,mutable_PAM,cut_site
0,30852536,30852558,3R,-,CCCACATCGGTATGCAGTCGTCG,FBgn0003720,FBtr0085709,3R,start_codon,30852547,...,False,True,False,True,False,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[-24, -23, -22, -21, -20, -19, -18, -17, -16, ...","[-19, -18, -17, -16, -15, -14]","[-22, -21]",-8


9


'CAGTCGTCG'

ATG


'CCCACATCGGT'

Unnamed: 0,fmin,fmax,#chr,strand,sgRNA_sequence,Gene_ID,Transcript_ID,Chromosome,Gene_Region,Start,...,PAM_in_start/stop,max_15_bp_3’_overhang,PAM_in_CDS,PAM_outside_CDS,SRS_in_CDS,CDS_boundary,non_CDS_boundary,SRS_boundary,mutable_PAM,cut_site
0,11278450,11278472,2L,+,CATGCAGCTGCATCCCAATGCGG,FBgn0052831,FBtr0091683,2L,start_codon,11278451,...,False,False,True,False,True,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[-24, -23, -22, -21, -20, -19, -18, -17, -16, ...","[-8, -7, -6, -5, -4, -3]","[-1, 0]",14


19


'CAGCTGCATCCCAATGCGG'

ATG


'C'

Unnamed: 0,fmin,fmax,#chr,strand,sgRNA_sequence,Gene_ID,Transcript_ID,Chromosome,Gene_Region,Start,...,PAM_in_start/stop,max_15_bp_3’_overhang,PAM_in_CDS,PAM_outside_CDS,SRS_in_CDS,CDS_boundary,non_CDS_boundary,SRS_boundary,mutable_PAM,cut_site
0,8600563,8600585,2R,-,CCAGTAAAGCGCAGCAGCAATGA,FBgn0267792,FBtr0088692,2R,start_codon,8600582,...,False,False,False,True,False,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[-24, -23, -22, -21, -20, -19, -18, -17, -16, ...","[-19, -18, -17, -16, -15, -14]","[-22, -21]",-16


1


'A'

ATG


'CCAGTAAAGCGCAGCAGCA'

In [6]:
# test checkCDSCutandOrder function selects a sgRNA that cuts in the CDS and/or as a lower priority an sgRNA that cuts the closest to the start/stop codon
from sgRNAutils import checkCDSCutandOrder
import pandas as pd
# test this for some of the TF transcripts with multiple sgRNAs
display(positionCheckedList[0])
display(positionCheckedList[1])
# we expect the second sgRNA in first dataframe to be selected since cuts inside the CDS (cut_site value is among the CDS_boundary values) and the first one cuts outside
# we expect the second sgRNA in the second dataframe to be selected since it is closer to the start/stop codon (4 is smaller than 10)
sgRNAdf1_checked = checkCDSCutandOrder(positionCheckedList[0])
sgRNAdf2_checked = checkCDSCutandOrder(positionCheckedList[1])
display(sgRNAdf1_checked)
display(sgRNAdf2_checked)

Unnamed: 0,fmin,fmax,#chr,strand,sgRNA_sequence,Gene_ID,Transcript_ID,Chromosome,Gene_Region,Start,...,PAM_in_start/stop,max_15_bp_3’_overhang,PAM_in_CDS,PAM_outside_CDS,SRS_in_CDS,CDS_boundary,non_CDS_boundary,SRS_boundary,mutable_PAM,cut_site
0,9792987,9793009,2L,+,CACAGTTTACGCAGGTCCATGGG,FBgn0032150,FBtr0079917,2L,start_codon,9793004,...,False,True,False,True,True,"[-24, -23, -22, -21, -20, -19, -18, -17, -16, ...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[-8, -7, -6, -5, -4, -3]","[-1, 0]",-2
1,9792986,9793008,2L,+,GCACAGTTTACGCAGGTCCATGG,FBgn0032150,FBtr0079917,2L,start_codon,9793004,...,True,False,False,False,True,"[-24, -23, -22, -21, -20, -19, -18, -17, -16, ...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[-8, -7, -6, -5, -4, -3]","[-1, 0]",-3


Unnamed: 0,fmin,fmax,#chr,strand,sgRNA_sequence,Gene_ID,Transcript_ID,Chromosome,Gene_Region,Start,...,PAM_in_start/stop,max_15_bp_3’_overhang,PAM_in_CDS,PAM_outside_CDS,SRS_in_CDS,CDS_boundary,non_CDS_boundary,SRS_boundary,mutable_PAM,cut_site
0,26005878,26005900,3R,-,CCATTTCCAGAACCATTTTGTTG,FBgn0002733,FBtr0084982,3R,start_codon,26005891,...,False,True,True,False,True,"[-24, -23, -22, -21, -20, -19, -18, -17, -16, ...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[-19, -18, -17, -16, -15, -14]","[-22, -21, -20, -19, -18, -17, -16, -15, -14, ...",-10
1,26005884,26005906,3R,-,CCAGAACCATTTTGTTGTAGTTT,FBgn0002733,FBtr0084982,3R,start_codon,26005891,...,False,True,True,False,True,"[-24, -23, -22, -21, -20, -19, -18, -17, -16, ...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[-19, -18, -17, -16, -15, -14]","[-22, -21, -20, -19, -18, -17, -16, -15, -14, ...",-4


'GCACAGTTTACGCAGGTCCATGG'

'CCAGAACCATTTTGTTGTAGTTT'

In [7]:
# test for the three different cases for identifying best sgRNA (e.g. PAM in start/stop, max 15 bp overlap, none of the above)
# we expect that for the first dataframe the sgRNA is selected that has the PAM in the start/stop codon, no mutation needed
# we expect that for the second dataframe the sgRNA is selected that is cuts the closest to the start/stop codon, since both candidate sgRNAs exhibit max 15bp overlap and cut within the CDS, no mutation needed
# we expect that for the third dataframe a mutation is needed since the PAM is not located in the start/stop codon and the overlap is more than 15 bp

from sgRNAutils import find_best_gRNA  
pd.set_option('display.max_columns', None)

bestsgRNAList = []

for sgRNAdf in positionCheckedList:
    # first display all sgRNAs within the given windown around the start/stop codon
    display(sgRNAdf)
    sgRNA, mutationNeeded = find_best_gRNA(sgRNAdf)
    print("mutation needed: ", mutationNeeded)
    winnerdf = sgRNAdf[sgRNAdf["sgRNA_sequence"] == sgRNA] 
    # next display the sgRNA that is identified as the best sgRNA
    display(winnerdf)
    bestsgRNAList.append(winnerdf)

Unnamed: 0,fmin,fmax,#chr,strand,sgRNA_sequence,Gene_ID,Transcript_ID,Chromosome,Gene_Region,Start,Stop,Strand,Reference_Seq,HAL,HAR,positionScore,CDS_side,non_CDS_side,PAM_in_start/stop,max_15_bp_3’_overhang,PAM_in_CDS,PAM_outside_CDS,SRS_in_CDS,CDS_boundary,non_CDS_boundary,SRS_boundary,mutable_PAM,cut_site,cutsite-CDSbound
0,9792987,9793009,2L,+,CACAGTTTACGCAGGTCCATGGG,FBgn0032150,FBtr0079917,2L,start_codon,9793004,9793006,-,AATTTAATTTTTTTTATAATTAATTTAGTGCTAATCTTTGAGCAGC...,AATCATTACAGATCATGGGCAGCTCCTCAGTAAGATTAAGTGCTAT...,GGGTTTTATTATTTAATTAATGTAAATAAACTGTAATGTTAATGTT...,3,HAL,HAR,False,True,False,True,True,"[-24, -23, -22, -21, -20, -19, -18, -17, -16, ...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[-8, -7, -6, -5, -4, -3]","[-1, 0]",-2,2
1,9792986,9793008,2L,+,GCACAGTTTACGCAGGTCCATGG,FBgn0032150,FBtr0079917,2L,start_codon,9793004,9793006,-,AATTTAATTTTTTTTATAATTAATTTAGTGCTAATCTTTGAGCAGC...,AATCATTACAGATCATGGGCAGCTCCTCAGTAAGATTAAGTGCTAT...,GGGTTTTATTATTTAATTAATGTAAATAAACTGTAATGTTAATGTT...,2,HAL,HAR,True,False,False,False,True,"[-24, -23, -22, -21, -20, -19, -18, -17, -16, ...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[-8, -7, -6, -5, -4, -3]","[-1, 0]",-3,3


mutation needed:  False


Unnamed: 0,fmin,fmax,#chr,strand,sgRNA_sequence,Gene_ID,Transcript_ID,Chromosome,Gene_Region,Start,Stop,Strand,Reference_Seq,HAL,HAR,positionScore,CDS_side,non_CDS_side,PAM_in_start/stop,max_15_bp_3’_overhang,PAM_in_CDS,PAM_outside_CDS,SRS_in_CDS,CDS_boundary,non_CDS_boundary,SRS_boundary,mutable_PAM,cut_site,cutsite-CDSbound
1,9792986,9793008,2L,+,GCACAGTTTACGCAGGTCCATGG,FBgn0032150,FBtr0079917,2L,start_codon,9793004,9793006,-,AATTTAATTTTTTTTATAATTAATTTAGTGCTAATCTTTGAGCAGC...,AATCATTACAGATCATGGGCAGCTCCTCAGTAAGATTAAGTGCTAT...,GGGTTTTATTATTTAATTAATGTAAATAAACTGTAATGTTAATGTT...,2,HAL,HAR,True,False,False,False,True,"[-24, -23, -22, -21, -20, -19, -18, -17, -16, ...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[-8, -7, -6, -5, -4, -3]","[-1, 0]",-3,3


Unnamed: 0,fmin,fmax,#chr,strand,sgRNA_sequence,Gene_ID,Transcript_ID,Chromosome,Gene_Region,Start,Stop,Strand,Reference_Seq,HAL,HAR,positionScore,CDS_side,non_CDS_side,PAM_in_start/stop,max_15_bp_3’_overhang,PAM_in_CDS,PAM_outside_CDS,SRS_in_CDS,CDS_boundary,non_CDS_boundary,SRS_boundary,mutable_PAM,cut_site,cutsite-CDSbound
0,26005878,26005900,3R,-,CCATTTCCAGAACCATTTTGTTG,FBgn0002733,FBtr0084982,3R,start_codon,26005891,26005893,-,GGTCCAGCGGACAAAACGTTTCGGTTAGATAAGTTGTTAGCCGTTT...,CAGCTGCTTCTGGGCACGCAGCTTCTTCATGTGCTCCACGGTCAGC...,TTTGTTGTAGTTTGGTTTTGTTTTTTTTTTATTGGGGGTAAAGTAA...,7,HAL,HAR,False,True,True,False,True,"[-24, -23, -22, -21, -20, -19, -18, -17, -16, ...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[-19, -18, -17, -16, -15, -14]","[-22, -21, -20, -19, -18, -17, -16, -15, -14, ...",-10,10
1,26005884,26005906,3R,-,CCAGAACCATTTTGTTGTAGTTT,FBgn0002733,FBtr0084982,3R,start_codon,26005891,26005893,-,GGTCCAGCGGACAAAACGTTTCGGTTAGATAAGTTGTTAGCCGTTT...,CAGCTGCTTCTGGGCACGCAGCTTCTTCATGTGCTCCACGGTCAGC...,TTTGTTGTAGTTTGGTTTTGTTTTTTTTTTATTGGGGGTAAAGTAA...,13,HAL,HAR,False,True,True,False,True,"[-24, -23, -22, -21, -20, -19, -18, -17, -16, ...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[-19, -18, -17, -16, -15, -14]","[-22, -21, -20, -19, -18, -17, -16, -15, -14, ...",-4,4


mutation needed:  False


Unnamed: 0,fmin,fmax,#chr,strand,sgRNA_sequence,Gene_ID,Transcript_ID,Chromosome,Gene_Region,Start,Stop,Strand,Reference_Seq,HAL,HAR,positionScore,CDS_side,non_CDS_side,PAM_in_start/stop,max_15_bp_3’_overhang,PAM_in_CDS,PAM_outside_CDS,SRS_in_CDS,CDS_boundary,non_CDS_boundary,SRS_boundary,mutable_PAM,cut_site,cutsite-CDSbound
1,26005884,26005906,3R,-,CCAGAACCATTTTGTTGTAGTTT,FBgn0002733,FBtr0084982,3R,start_codon,26005891,26005893,-,GGTCCAGCGGACAAAACGTTTCGGTTAGATAAGTTGTTAGCCGTTT...,CAGCTGCTTCTGGGCACGCAGCTTCTTCATGTGCTCCACGGTCAGC...,TTTGTTGTAGTTTGGTTTTGTTTTTTTTTTATTGGGGGTAAAGTAA...,13,HAL,HAR,False,True,True,False,True,"[-24, -23, -22, -21, -20, -19, -18, -17, -16, ...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[-19, -18, -17, -16, -15, -14]","[-22, -21, -20, -19, -18, -17, -16, -15, -14, ...",-4,4


Unnamed: 0,fmin,fmax,#chr,strand,sgRNA_sequence,Gene_ID,Transcript_ID,Chromosome,Gene_Region,Start,Stop,Strand,Reference_Seq,HAL,HAR,positionScore,CDS_side,non_CDS_side,PAM_in_start/stop,max_15_bp_3’_overhang,PAM_in_CDS,PAM_outside_CDS,SRS_in_CDS,CDS_boundary,non_CDS_boundary,SRS_boundary,mutable_PAM,cut_site
0,524038,524060,2L,+,ATGCTTAGCAGCAATACAAGAGG,FBgn0003963,FBtr0078063,2L,start_codon,524038,524040,+,CAGAAAGGGATGGGCACACTCGCCTGTTTCCTGCAGTCGAACAGAT...,AAGAGCTACAATACGATACCGCACGTTCGTCATAACGAGAATTTGA...,CTTAGCAGCAATACAAGAGGTGAGAATTATTCATACACAGTTTTCT...,20,HAR,HAL,False,False,True,False,True,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[-24, -23, -22, -21, -20, -19, -18, -17, -16, ...","[-8, -7, -6, -5, -4, -3]","[-1, 0]",15


mutation needed:  True


Unnamed: 0,fmin,fmax,#chr,strand,sgRNA_sequence,Gene_ID,Transcript_ID,Chromosome,Gene_Region,Start,Stop,Strand,Reference_Seq,HAL,HAR,positionScore,CDS_side,non_CDS_side,PAM_in_start/stop,max_15_bp_3’_overhang,PAM_in_CDS,PAM_outside_CDS,SRS_in_CDS,CDS_boundary,non_CDS_boundary,SRS_boundary,mutable_PAM,cut_site
0,524038,524060,2L,+,ATGCTTAGCAGCAATACAAGAGG,FBgn0003963,FBtr0078063,2L,start_codon,524038,524040,+,CAGAAAGGGATGGGCACACTCGCCTGTTTCCTGCAGTCGAACAGAT...,AAGAGCTACAATACGATACCGCACGTTCGTCATAACGAGAATTTGA...,CTTAGCAGCAATACAAGAGGTGAGAATTATTCATACACAGTTTTCT...,20,HAR,HAL,False,False,True,False,True,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[-24, -23, -22, -21, -20, -19, -18, -17, -16, ...","[-8, -7, -6, -5, -4, -3]","[-1, 0]",15


Unnamed: 0,fmin,fmax,#chr,strand,sgRNA_sequence,Gene_ID,Transcript_ID,Chromosome,Gene_Region,Start,Stop,Strand,Reference_Seq,HAL,HAR,positionScore,CDS_side,non_CDS_side,PAM_in_start/stop,max_15_bp_3’_overhang,PAM_in_CDS,PAM_outside_CDS,SRS_in_CDS,CDS_boundary,non_CDS_boundary,SRS_boundary,mutable_PAM,cut_site
0,370082,370104,X,+,CTATCTCTTAAAATGGCTTTGGG,FBgn0000022,FBtr0070072,X,start_codon,370094,370096,+,CTAATGAATAGATTGGTGTGTGATGTAGTGATCTAATATGGTGAAG...,TACTACCTCTCTATTAAAATCAGAGAAAACACTCATCTCAAGAGAC...,GCTTTGGGCAGCGAAAATCACTCTGTTTTCAACGACGACGAGGAGT...,8,HAR,HAL,False,True,True,False,True,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[-24, -23, -22, -21, -20, -19, -18, -17, -16, ...","[-8, -7, -6, -5, -4, -3]","[-1, 0]",3


mutation needed:  False


Unnamed: 0,fmin,fmax,#chr,strand,sgRNA_sequence,Gene_ID,Transcript_ID,Chromosome,Gene_Region,Start,Stop,Strand,Reference_Seq,HAL,HAR,positionScore,CDS_side,non_CDS_side,PAM_in_start/stop,max_15_bp_3’_overhang,PAM_in_CDS,PAM_outside_CDS,SRS_in_CDS,CDS_boundary,non_CDS_boundary,SRS_boundary,mutable_PAM,cut_site
0,370082,370104,X,+,CTATCTCTTAAAATGGCTTTGGG,FBgn0000022,FBtr0070072,X,start_codon,370094,370096,+,CTAATGAATAGATTGGTGTGTGATGTAGTGATCTAATATGGTGAAG...,TACTACCTCTCTATTAAAATCAGAGAAAACACTCATCTCAAGAGAC...,GCTTTGGGCAGCGAAAATCACTCTGTTTTCAACGACGACGAGGAGT...,8,HAR,HAL,False,True,True,False,True,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[-24, -23, -22, -21, -20, -19, -18, -17, -16, ...","[-8, -7, -6, -5, -4, -3]","[-1, 0]",3


Unnamed: 0,fmin,fmax,#chr,strand,sgRNA_sequence,Gene_ID,Transcript_ID,Chromosome,Gene_Region,Start,Stop,Strand,Reference_Seq,HAL,HAR,positionScore,CDS_side,non_CDS_side,PAM_in_start/stop,max_15_bp_3’_overhang,PAM_in_CDS,PAM_outside_CDS,SRS_in_CDS,CDS_boundary,non_CDS_boundary,SRS_boundary,mutable_PAM,cut_site
0,30852536,30852558,3R,-,CCCACATCGGTATGCAGTCGTCG,FBgn0003720,FBtr0085709,3R,start_codon,30852547,30852549,+,TATGAGAAATCGTTGATTTCACTGATCGAGTATTGTCCAAAGCGCT...,GCCGGCGGTCCTCACAGCAGACAACACAACCCATCGTGATCTCAGC...,CAGTCGTCGGAGGGTTCACCAGACATGATGGATCAGAAATACAACA...,9,HAR,HAL,False,True,False,True,False,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[-24, -23, -22, -21, -20, -19, -18, -17, -16, ...","[-19, -18, -17, -16, -15, -14]","[-22, -21]",-8


mutation needed:  False


Unnamed: 0,fmin,fmax,#chr,strand,sgRNA_sequence,Gene_ID,Transcript_ID,Chromosome,Gene_Region,Start,Stop,Strand,Reference_Seq,HAL,HAR,positionScore,CDS_side,non_CDS_side,PAM_in_start/stop,max_15_bp_3’_overhang,PAM_in_CDS,PAM_outside_CDS,SRS_in_CDS,CDS_boundary,non_CDS_boundary,SRS_boundary,mutable_PAM,cut_site
0,30852536,30852558,3R,-,CCCACATCGGTATGCAGTCGTCG,FBgn0003720,FBtr0085709,3R,start_codon,30852547,30852549,+,TATGAGAAATCGTTGATTTCACTGATCGAGTATTGTCCAAAGCGCT...,GCCGGCGGTCCTCACAGCAGACAACACAACCCATCGTGATCTCAGC...,CAGTCGTCGGAGGGTTCACCAGACATGATGGATCAGAAATACAACA...,9,HAR,HAL,False,True,False,True,False,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[-24, -23, -22, -21, -20, -19, -18, -17, -16, ...","[-19, -18, -17, -16, -15, -14]","[-22, -21]",-8


Unnamed: 0,fmin,fmax,#chr,strand,sgRNA_sequence,Gene_ID,Transcript_ID,Chromosome,Gene_Region,Start,Stop,Strand,Reference_Seq,HAL,HAR,positionScore,CDS_side,non_CDS_side,PAM_in_start/stop,max_15_bp_3’_overhang,PAM_in_CDS,PAM_outside_CDS,SRS_in_CDS,CDS_boundary,non_CDS_boundary,SRS_boundary,mutable_PAM,cut_site
0,11278450,11278472,2L,+,CATGCAGCTGCATCCCAATGCGG,FBgn0052831,FBtr0091683,2L,start_codon,11278451,11278453,+,TTAGCGGACCATTAGAAACACAATTGAGTTTGCATTGGGTGCTTTA...,GCGGGAGAGAGTCGAGAGTCATTCGTTTGTCTCACTCCTGCGAGTG...,CAGCTGCATCCCAATGCGGAGTCGCCATCGGGGTAAGTTGATTCTA...,19,HAR,HAL,False,False,True,False,True,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[-24, -23, -22, -21, -20, -19, -18, -17, -16, ...","[-8, -7, -6, -5, -4, -3]","[-1, 0]",14


mutation needed:  True


Unnamed: 0,fmin,fmax,#chr,strand,sgRNA_sequence,Gene_ID,Transcript_ID,Chromosome,Gene_Region,Start,Stop,Strand,Reference_Seq,HAL,HAR,positionScore,CDS_side,non_CDS_side,PAM_in_start/stop,max_15_bp_3’_overhang,PAM_in_CDS,PAM_outside_CDS,SRS_in_CDS,CDS_boundary,non_CDS_boundary,SRS_boundary,mutable_PAM,cut_site
0,11278450,11278472,2L,+,CATGCAGCTGCATCCCAATGCGG,FBgn0052831,FBtr0091683,2L,start_codon,11278451,11278453,+,TTAGCGGACCATTAGAAACACAATTGAGTTTGCATTGGGTGCTTTA...,GCGGGAGAGAGTCGAGAGTCATTCGTTTGTCTCACTCCTGCGAGTG...,CAGCTGCATCCCAATGCGGAGTCGCCATCGGGGTAAGTTGATTCTA...,19,HAR,HAL,False,False,True,False,True,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[-24, -23, -22, -21, -20, -19, -18, -17, -16, ...","[-8, -7, -6, -5, -4, -3]","[-1, 0]",14


Unnamed: 0,fmin,fmax,#chr,strand,sgRNA_sequence,Gene_ID,Transcript_ID,Chromosome,Gene_Region,Start,Stop,Strand,Reference_Seq,HAL,HAR,positionScore,CDS_side,non_CDS_side,PAM_in_start/stop,max_15_bp_3’_overhang,PAM_in_CDS,PAM_outside_CDS,SRS_in_CDS,CDS_boundary,non_CDS_boundary,SRS_boundary,mutable_PAM,cut_site
0,8600563,8600585,2R,-,CCAGTAAAGCGCAGCAGCAATGA,FBgn0267792,FBtr0088692,2R,start_codon,8600582,8600584,+,GCCGCGGGTAAAGGCCTCTCACCAAACAAAGAAAATGTGACGAGCT...,GAAGCGCATTGTATTTGGCCACCATCGAGCGATGGCCACACTGCCA...,ATGCCGCTGCAGGAGTCGCCGTCGCCCAGTTGGCAGCTGGAGGACT...,1,HAR,HAL,False,False,False,True,False,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[-24, -23, -22, -21, -20, -19, -18, -17, -16, ...","[-19, -18, -17, -16, -15, -14]","[-22, -21]",-16


mutation needed:  True


Unnamed: 0,fmin,fmax,#chr,strand,sgRNA_sequence,Gene_ID,Transcript_ID,Chromosome,Gene_Region,Start,Stop,Strand,Reference_Seq,HAL,HAR,positionScore,CDS_side,non_CDS_side,PAM_in_start/stop,max_15_bp_3’_overhang,PAM_in_CDS,PAM_outside_CDS,SRS_in_CDS,CDS_boundary,non_CDS_boundary,SRS_boundary,mutable_PAM,cut_site
0,8600563,8600585,2R,-,CCAGTAAAGCGCAGCAGCAATGA,FBgn0267792,FBtr0088692,2R,start_codon,8600582,8600584,+,GCCGCGGGTAAAGGCCTCTCACCAAACAAAGAAAATGTGACGAGCT...,GAAGCGCATTGTATTTGGCCACCATCGAGCGATGGCCACACTGCCA...,ATGCCGCTGCAGGAGTCGCCGTCGCCCAGTTGGCAGCTGGAGGACT...,1,HAR,HAL,False,False,False,True,False,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[-24, -23, -22, -21, -20, -19, -18, -17, -16, ...","[-19, -18, -17, -16, -15, -14]","[-22, -21]",-16


In [8]:
# test the function that extracts the sequence of the drosophila genome that is in the CDS of a given transcript and within the given window and chops it into codons 
from sgRNAutils import codonFragmenter
pd.set_option('display.max_column', None)

# create an empty list to store the chopped CDSs and the adjusted dataframes
CDS_list = []
df_adjusted_list = []

for bestsgRNA in bestsgRNAList:
    CDS, df_adjusted = codonFragmenter(bestsgRNA.iloc[0], bestsgRNA.iloc[0]["CDS_side"], bestsgRNA.iloc[0]["CDS_boundary"])
    CDS_list.append(CDS)
    df_adjusted_list.append(df_adjusted)
    # now display the chopped up CDS
    display(CDS)

['CGC', 'TGC', 'CTG', 'AAA', 'CGT', 'CTG', 'GAC']

['TCC', 'ATG', 'GAG', 'ATG', 'GAA', 'CTG', 'GTT']

['CTT', 'AGC', 'AGC', 'AAT', 'ACA', 'AGA', 'GGT']

['GCT', 'TTG', 'GGC', 'AGC', 'GAA', 'AAT', 'CAC']

['CAG', 'TCG', 'TCG', 'GAG', 'GGT', 'TCA', 'CCA']

['CAG', 'CTG', 'CAT', 'CCC', 'AAT', 'GCG', 'GAG']

['ATG', 'CCG', 'CTG', 'CAG', 'GAG', 'TCG', 'CCG']

In [9]:
# test the function that recombines the codons again and fuses the CDS with the rest of the HA
from sgRNAutils import codonReverseFragmenter

for i, sgRNA in enumerate(bestsgRNAList):
    # retrieve the HA which contains the CDS before the chopping 
    HA_before_chopping = sgRNA[f"{sgRNA["CDS_side"].iloc[0]}"].iloc[0]
    df_after_recombining = codonReverseFragmenter(CDS_list[i], df_adjusted_list[i], df_adjusted_list[i]["CDS_side"], df_adjusted_list[i]["CDS_boundary"])
    HA_after_recombining = df_after_recombining[df_after_recombining["CDS_side"]]
    # display HA before chopping and after chopping and test whether they are the same
    # you can also check manually whether they are the same
    display(HA_before_chopping)
    display(HA_after_recombining)
    # display the length of the HA after recombining, should be again 225 bp
    print(len(HA_after_recombining))
    if HA_after_recombining == HA_before_chopping:
        print("HA before chopping and HA after recombining are the same. All is well")
    else: print("we have a problem")

'AATCATTACAGATCATGGGCAGCTCCTCAGTAAGATTAAGTGCTATGTTCTGTTGAATATTATACGTATTCACTTATGTTTAAGTCATTTCGAAGGTCTTACCCAGCCGAACATGGCTTCTATCAGCTGGAGGACTGTCTGATCCGAACTGCCACCGCTCCTGTGCTCCCGTATATCCACCGAAGCGCTGTCCACGAAGCAAATGCGGCACAGTTTACGCAGGTC'

'AATCATTACAGATCATGGGCAGCTCCTCAGTAAGATTAAGTGCTATGTTCTGTTGAATATTATACGTATTCACTTATGTTTAAGTCATTTCGAAGGTCTTACCCAGCCGAACATGGCTTCTATCAGCTGGAGGACTGTCTGATCCGAACTGCCACCGCTCCTGTGCTCCCGTATATCCACCGAAGCGCTGTCCACGAAGCAAATGCGGCACAGTTTACGCAGGTC'

225
HA before chopping and HA after recombining are the same. All is well


'CAGCTGCTTCTGGGCACGCAGCTTCTTCATGTGCTCCACGGTCAGCTCCAGGATGTCGGCCTTCTCCAGCCGGGTGATGTGCTCGCCCTCCTGGGTCAGGCACTCCACCATGATGTCCTTGAGCTCGTCCAGGCACTTGTTAATCCTGGCACGGCGCTTACGCTCCAGCATGGGCTTCATCACCTTGCGGTACTGATAGGTCTTGGACATCTCCATTTCCAGAAC'

'CAGCTGCTTCTGGGCACGCAGCTTCTTCATGTGCTCCACGGTCAGCTCCAGGATGTCGGCCTTCTCCAGCCGGGTGATGTGCTCGCCCTCCTGGGTCAGGCACTCCACCATGATGTCCTTGAGCTCGTCCAGGCACTTGTTAATCCTGGCACGGCGCTTACGCTCCAGCATGGGCTTCATCACCTTGCGGTACTGATAGGTCTTGGACATCTCCATTTCCAGAAC'

225
HA before chopping and HA after recombining are the same. All is well


'CTTAGCAGCAATACAAGAGGTGAGAATTATTCATACACAGTTTTCTATGAAACACAATATGCTTGAATATGCTTTTAATAAGAAAAGCAAACTAGCAATTCTATCTGATTTAGATGTAAACAAATTATTCATTCTATTTGGAATGTTTGGATAATTATGTTATATTAAATTTGAGATCTAAAGTGAATGTATCATTTGTTGTTGATATTGATTGTTGAATATATG'

'CTTAGCAGCAATACAAGAGGTGAGAATTATTCATACACAGTTTTCTATGAAACACAATATGCTTGAATATGCTTTTAATAAGAAAAGCAAACTAGCAATTCTATCTGATTTAGATGTAAACAAATTATTCATTCTATTTGGAATGTTTGGATAATTATGTTATATTAAATTTGAGATCTAAAGTGAATGTATCATTTGTTGTTGATATTGATTGTTGAATATATG'

225
HA before chopping and HA after recombining are the same. All is well


'GCTTTGGGCAGCGAAAATCACTCTGTTTTCAACGACGACGAGGAGTCATCTTCGGCCTTTAATGGACCCTCTGTTATCCGGAGAAATGCCCGGGAACGCAACCGCGTAAAGCAGGTCAACAATGGCTTCAGCCAACTACGACAACATATCCCTGCGGCCGTAATAGCCGATTTAAGCAATGGTCGCCGGGGAATTGGTCCCGGCGCCAATAAAAAACTGAGCAAA'

'GCTTTGGGCAGCGAAAATCACTCTGTTTTCAACGACGACGAGGAGTCATCTTCGGCCTTTAATGGACCCTCTGTTATCCGGAGAAATGCCCGGGAACGCAACCGCGTAAAGCAGGTCAACAATGGCTTCAGCCAACTACGACAACATATCCCTGCGGCCGTAATAGCCGATTTAAGCAATGGTCGCCGGGGAATTGGTCCCGGCGCCAATAAAAAACTGAGCAAA'

225
HA before chopping and HA after recombining are the same. All is well


'CAGTCGTCGGAGGGTTCACCAGACATGATGGATCAGAAATACAACAGCGTGCGTCTTTCGCCAGCGGCATCGAGTAAGTATAGCAAAAAAAAAAATCAGATAGCAATGAAAAAAGAAGAAAACAATCTTCCTAATATATAAAATTAAGACAATCCTTTACTTATACCAACTTTCCCCCCTTCTTGAATTTCCAGGTCGCATTCTATACCATGTGCCCTGCAAAGT'

'CAGTCGTCGGAGGGTTCACCAGACATGATGGATCAGAAATACAACAGCGTGCGTCTTTCGCCAGCGGCATCGAGTAAGTATAGCAAAAAAAAAAATCAGATAGCAATGAAAAAAGAAGAAAACAATCTTCCTAATATATAAAATTAAGACAATCCTTTACTTATACCAACTTTCCCCCCTTCTTGAATTTCCAGGTCGCATTCTATACCATGTGCCCTGCAAAGT'

225
HA before chopping and HA after recombining are the same. All is well


'CAGCTGCATCCCAATGCGGAGTCGCCATCGGGGTAAGTTGATTCTATGCGGTGGAGTCGCGAGTTTTGTGGACACTAATCGGAGTGTTTCTGTGTGCCTTGTTCCCTCGCCACGGCTGCAGGAAGTGGACGGACGAGGAGATCGACATGCTGCACTCGTCCATAATGCGCTTCTCCGACGACCTGACCAAGATCAGCCTAAGCATCAAGAACCGCACCGTGTAAG'

'CAGCTGCATCCCAATGCGGAGTCGCCATCGGGGTAAGTTGATTCTATGCGGTGGAGTCGCGAGTTTTGTGGACACTAATCGGAGTGTTTCTGTGTGCCTTGTTCCCTCGCCACGGCTGCAGGAAGTGGACGGACGAGGAGATCGACATGCTGCACTCGTCCATAATGCGCTTCTCCGACGACCTGACCAAGATCAGCCTAAGCATCAAGAACCGCACCGTGTAAG'

225
HA before chopping and HA after recombining are the same. All is well


'ATGCCGCTGCAGGAGTCGCCGTCGCCCAGTTGGCAGCTGGAGGACTACGCTTTCTTCCGCAAGTGCGGGGAAATCACCGTCTCGCCGGACGTGCAGAGCTTCGGCTTCAACTGCGCCTTCTGTCCGGCGATCTGCCTGCAGTTCTCCGTGTTCATGGACCACATCCGGGTGCAGCACACGCAGGACGTGAGCCGGCGCTACGAGACTGAGGAGGAGAGGTGTCCG'

'ATGCCGCTGCAGGAGTCGCCGTCGCCCAGTTGGCAGCTGGAGGACTACGCTTTCTTCCGCAAGTGCGGGGAAATCACCGTCTCGCCGGACGTGCAGAGCTTCGGCTTCAACTGCGCCTTCTGTCCGGCGATCTGCCTGCAGTTCTCCGTGTTCATGGACCACATCCGGGTGCAGCACACGCAGGACGTGAGCCGGCGCTACGAGACTGAGGAGGAGAGGTGTCCG'

225
HA before chopping and HA after recombining are the same. All is well


In [10]:
# test function to identify a synonymous codons that changes a given nucleotide but does not change the encoded amino acid
from sgRNAutils import find_synonymous_codons
# try to mutate first, second and third nucleotide of the codon "ATG"
# note, its unlikely to find synonymous mutations if the nucleotide you wish to mutate is NOT located in the wobble position (e.g. nucleotide 3 of the codon)
codon = "GCT"
synonymous_codon_1 = find_synonymous_codons(codon, 1)
synonymous_codon_2 = find_synonymous_codons(codon, 2)
synonymous_codon_3 = find_synonymous_codons(codon, 3)
print(synonymous_codon_1)
print(synonymous_codon_2)
print(synonymous_codon_3)

[]
[]
['GCC', 'GCA', 'GCG']


In [11]:
# test find_best_mutation function which mutates the HA in a way that it cannot be cut by the sgRNA
# test case where PAM is in CDS, sixth dataframe in bestsgRNAList
from sgRNAutils import find_best_mutation
pd.set_option('display.max_columns', None)
display(bestsgRNAList[5])
bestsgRNAList[5]["mutated?"] = False
CDSside = bestsgRNAList[5]["CDS_side"]
mutated_sgRNAdf6 = find_best_mutation(bestsgRNAList[5].iloc[0])
# compare the HA wich contains the CDS before and after mutation
# check whether position of mutation is in the PAM by comparing the HA with the sgRNA
print(CDSside.iloc[0])
display(bestsgRNAList[5].iloc[0][CDSside].iloc[0])
display(mutated_sgRNAdf6[CDSside].iloc[0])
display(mutated_sgRNAdf6["sgRNA_sequence"])

Unnamed: 0,fmin,fmax,#chr,strand,sgRNA_sequence,Gene_ID,Transcript_ID,Chromosome,Gene_Region,Start,Stop,Strand,Reference_Seq,HAL,HAR,positionScore,CDS_side,non_CDS_side,PAM_in_start/stop,max_15_bp_3’_overhang,PAM_in_CDS,PAM_outside_CDS,SRS_in_CDS,CDS_boundary,non_CDS_boundary,SRS_boundary,mutable_PAM,cut_site
0,11278450,11278472,2L,+,CATGCAGCTGCATCCCAATGCGG,FBgn0052831,FBtr0091683,2L,start_codon,11278451,11278453,+,TTAGCGGACCATTAGAAACACAATTGAGTTTGCATTGGGTGCTTTA...,GCGGGAGAGAGTCGAGAGTCATTCGTTTGTCTCACTCCTGCGAGTG...,CAGCTGCATCCCAATGCGGAGTCGCCATCGGGGTAAGTTGATTCTA...,19,HAR,HAL,False,False,True,False,True,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[-24, -23, -22, -21, -20, -19, -18, -17, -16, ...","[-8, -7, -6, -5, -4, -3]","[-1, 0]",14


HAR


'CAGCTGCATCCCAATGCGGAGTCGCCATCGGGGTAAGTTGATTCTATGCGGTGGAGTCGCGAGTTTTGTGGACACTAATCGGAGTGTTTCTGTGTGCCTTGTTCCCTCGCCACGGCTGCAGGAAGTGGACGGACGAGGAGATCGACATGCTGCACTCGTCCATAATGCGCTTCTCCGACGACCTGACCAAGATCAGCCTAAGCATCAAGAACCGCACCGTGTAAG'

'CAGCTGCATCCCAATGCTGAGTCGCCATCGGGGTAAGTTGATTCTATGCGGTGGAGTCGCGAGTTTTGTGGACACTAATCGGAGTGTTTCTGTGTGCCTTGTTCCCTCGCCACGGCTGCAGGAAGTGGACGGACGAGGAGATCGACATGCTGCACTCGTCCATAATGCGCTTCTCCGACGACCTGACCAAGATCAGCCTAAGCATCAAGAACCGCACCGTGTAAG'

'CATGCAGCTGCATCCCAATGCGG'

In [12]:
# test find_best_mutation function which mutates the HA in a way that it cannot be cut by the sgRNA
# test case where PAM is in CDS, third dataframe in bestsgRNAList
pd.set_option('display.max_columns', None)
display(bestsgRNAList[2])
bestsgRNAList[2]["mutated?"] = False
CDSside = bestsgRNAList[2]["CDS_side"]
mutated_sgRNAdf3 = find_best_mutation(bestsgRNAList[2].iloc[0])
# compare the HA wich contains the CDS before and after mutation
# check position of mutation by comparing the HA with the sgRNA
# no synoynmous mutations in the PAM were possible (likely because not in wobble)
# therefore the SRS was mutated
print(CDSside.iloc[0])
display(bestsgRNAList[2].iloc[0][CDSside].iloc[0])
display(mutated_sgRNAdf3[CDSside].iloc[0])
display(mutated_sgRNAdf3["sgRNA_sequence"])


Unnamed: 0,fmin,fmax,#chr,strand,sgRNA_sequence,Gene_ID,Transcript_ID,Chromosome,Gene_Region,Start,Stop,Strand,Reference_Seq,HAL,HAR,positionScore,CDS_side,non_CDS_side,PAM_in_start/stop,max_15_bp_3’_overhang,PAM_in_CDS,PAM_outside_CDS,SRS_in_CDS,CDS_boundary,non_CDS_boundary,SRS_boundary,mutable_PAM,cut_site
0,524038,524060,2L,+,ATGCTTAGCAGCAATACAAGAGG,FBgn0003963,FBtr0078063,2L,start_codon,524038,524040,+,CAGAAAGGGATGGGCACACTCGCCTGTTTCCTGCAGTCGAACAGAT...,AAGAGCTACAATACGATACCGCACGTTCGTCATAACGAGAATTTGA...,CTTAGCAGCAATACAAGAGGTGAGAATTATTCATACACAGTTTTCT...,20,HAR,HAL,False,False,True,False,True,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[-24, -23, -22, -21, -20, -19, -18, -17, -16, ...","[-8, -7, -6, -5, -4, -3]","[-1, 0]",15


HAR


'CTTAGCAGCAATACAAGAGGTGAGAATTATTCATACACAGTTTTCTATGAAACACAATATGCTTGAATATGCTTTTAATAAGAAAAGCAAACTAGCAATTCTATCTGATTTAGATGTAAACAAATTATTCATTCTATTTGGAATGTTTGGATAATTATGTTATATTAAATTTGAGATCTAAAGTGAATGTATCATTTGTTGTTGATATTGATTGTTGAATATATG'

'CTTAGCAGCAACACTAGAGGTGAGAATTATTCATACACAGTTTTCTATGAAACACAATATGCTTGAATATGCTTTTAATAAGAAAAGCAAACTAGCAATTCTATCTGATTTAGATGTAAACAAATTATTCATTCTATTTGGAATGTTTGGATAATTATGTTATATTAAATTTGAGATCTAAAGTGAATGTATCATTTGTTGTTGATATTGATTGTTGAATATATG'

'ATGCTTAGCAGCAATACAAGAGG'

In [13]:
# test find_best_mutation function which mutates the HA in a way that it cannot be cut by the sgRNA
# test case where neither PAM nor SRS is in CDS, seventh dataframe in bestsgRNAList
# we expect that the PAM is simply mutated without respecting synonymous mutations
pd.set_option('display.max_columns', None)
display(bestsgRNAList[6])
bestsgRNAList[6]["mutated?"] = False
nonCDSside = bestsgRNAList[6]["non_CDS_side"]
mutated_sgRNAdf7 = find_best_mutation(bestsgRNAList[6].iloc[0])
# compare the HA wich does NOT contain the CDS before and after mutation
# check position of mutation by comparing the HA with the sgRNA
print(nonCDSside.iloc[0])
display(bestsgRNAList[6].iloc[0][nonCDSside].iloc[0])
display(mutated_sgRNAdf7[nonCDSside].iloc[0])
display(mutated_sgRNAdf7["sgRNA_sequence"])


Unnamed: 0,fmin,fmax,#chr,strand,sgRNA_sequence,Gene_ID,Transcript_ID,Chromosome,Gene_Region,Start,Stop,Strand,Reference_Seq,HAL,HAR,positionScore,CDS_side,non_CDS_side,PAM_in_start/stop,max_15_bp_3’_overhang,PAM_in_CDS,PAM_outside_CDS,SRS_in_CDS,CDS_boundary,non_CDS_boundary,SRS_boundary,mutable_PAM,cut_site
0,8600563,8600585,2R,-,CCAGTAAAGCGCAGCAGCAATGA,FBgn0267792,FBtr0088692,2R,start_codon,8600582,8600584,+,GCCGCGGGTAAAGGCCTCTCACCAAACAAAGAAAATGTGACGAGCT...,GAAGCGCATTGTATTTGGCCACCATCGAGCGATGGCCACACTGCCA...,ATGCCGCTGCAGGAGTCGCCGTCGCCCAGTTGGCAGCTGGAGGACT...,1,HAR,HAL,False,False,False,True,False,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[-24, -23, -22, -21, -20, -19, -18, -17, -16, ...","[-19, -18, -17, -16, -15, -14]","[-22, -21]",-16


HAL


'GAAGCGCATTGTATTTGGCCACCATCGAGCGATGGCCACACTGCCACTGCCTCGGCCACACTGCAGTTCGGATTTGCCGGTTGTTTGGATCGCAGATTGAATCGGAAAGCCAGCGGCGTTGGTTCGGAAAAGTGCAGTCTCCGCGTCGCGAGTGCGCCAATAACGAGAAATAACGAGAATTGAAAAAGTTCCTTAAGAGGTGGAGTCCAGTAAAGCGCAGCAGCA'

'GAAGCGCATTGTATTTGGCCACCATCGAGCGATGGCCACACTGCCACTGCCTCGGCCACACTGCAGTTCGGATTTGCCGGTTGTTTGGATCGCAGATTGAATCGGAAAGCCAGCGGCGTTGGTTCGGAAAAGTGCAGTCTCCGCGTCGCGAGTGCGCCAATAACGAGAAATAACGAGAATTGAAAAAGTTCCTTAAGAGGTGGAGTCAAGTAAAGCGCAGCAGCA'

'CCAGTAAAGCGCAGCAGCAATGA'