In [1]:
window = 42

In [2]:
# test function to form the reverse complement of a DNA sequence
from sgRNAutils import revComp
revComp('ATG')

'CAT'

In [3]:
# test function to extract the DNA sequence at particular genomic coordinates of a particular chromosome
# to confirm look up the genome coordinates and the chromosome for drosophila melanogaster dm6 on the UCSC genome browser https://genome.ucsc.edu/cgi-bin/hgTracks?db=dm6
# below the link to to verify the results of the query below
# https://genome.ucsc.edu/cgi-bin/hgTracks?db=dm6&lastVirtModeType=default&lastVirtModeExtraState=&virtModeType=default&virtMode=0&nonVirtPosition=&position=chrX%3A20000080%2D20000090&hgsid=1877457346_JpaxsFWfPkr6i1rbo2jG4jhD73Fb
from sgRNAutils import refSeq
refSeqPerChromosome= refSeq()
refSeqPerChromosome['X'][20000080:20000090]

Seq('TGCTAGAGGG')

In [4]:
# test function to identify known transcripts of transcription factors and extract their genomic information
# this cell takes about 60s to run but you can avoid it by running the next cell and loading the output from this cell from MockMaterials
# to confirm the results look up the Transcript_IDs below on the UCSC genome browser and double check the genomic information: https://genome.ucsc.edu/cgi-bin/hgTracks?db=dm6
# here an example for FBtr0070072: https://genome.ucsc.edu/cgi-bin/hgTracks?db=dm6&lastVirtModeType=default&lastVirtModeExtraState=&virtModeType=default&virtMode=0&nonVirtPosition=&position=chrX%3A370031%2D370947&hgsid=1877489778_pLlolcHmOnq6afrqpZAa7WNNrw7r
# you can also check if the different transcripts belong to the same gene in the UCSC genome browser
from sgRNAutils import make_dataframe_from_TFs_list
TFsdf = make_dataframe_from_TFs_list('inputfiles/mockMaterials/TFsTruncatedLong.xlsx', refSeqPerChromosome)
display(TFsdf)

Unnamed: 0,Gene_ID,Transcript_ID,Chromosome,Gene_Region,Start,Stop,Strand,Reference_Seq
0,FBgn0000022,FBtr0070072,X,start_codon,370094,370096,+,CTAATGAATAGATTGGTGTGTGATGTAGTGATCTAATATGGTGAAG...
1,FBgn0000022,FBtr0070072,X,stop_codon,370697,370699,+,GACATTGTGTCGTTCGTATGTCGCCCATTGAGACCGCCAATGGAGG...
2,FBgn0003963,FBtr0078063,2L,start_codon,524038,524040,+,CAGAAAGGGATGGGCACACTCGCCTGTTTCCTGCAGTCGAACAGAT...
3,FBgn0003963,FBtr0078063,2L,stop_codon,539954,539956,+,CCTGTAATTTTTCGATGAGGCCCAAAGTGGCCACGTGGCACCCAGA...
4,FBgn0003963,FBtr0310027,2L,start_codon,491316,491318,+,CACTGATAAAAAATACCCTAAATTATCGCAACGCAAATGGTAAAGT...
5,FBgn0003963,FBtr0310027,2L,stop_codon,539954,539956,+,CCTGTAATTTTTCGATGAGGCCCAAAGTGGCCACGTGGCACCCAGA...
6,FBgn0003963,FBtr0329895,2L,start_codon,524038,524040,+,CAGAAAGGGATGGGCACACTCGCCTGTTTCCTGCAGTCGAACAGAT...
7,FBgn0003963,FBtr0329895,2L,stop_codon,540017,540019,+,TTTTTTAATGTTCTGCACTTGCTTAATTACCAGCATGTACCATCTG...
8,FBgn0003963,FBtr0333012,2L,start_codon,525418,525420,+,ATCATTGGCCAAAAAAGTAGGAATTCTGGGAGGAGGGCATGCCAAA...
9,FBgn0003963,FBtr0333012,2L,stop_codon,539954,539956,+,CCTGTAATTTTTCGATGAGGCCCAAAGTGGCCACGTGGCACCCAGA...


In [5]:
# test function to extract 225 bp of the genome left and right of the start/stop codon of a transcript as homology arms (HA)
# to check whether the identified HAs are correct, you can align them to the dm6 drosophila melanogaster genome using the 
# UCSC BLAT Search: https://genome.ucsc.edu/cgi-bin/hgBlat?hgsid=1885485482_QUkcgOVglCJ2ybPDnj4fC1f6KdNR&command=start
# here as an example the HA within window of FBtr0070072 aligned: https://genome.ucsc.edu/cgi-bin/hgTracks?db=dm6&lastVirtModeType=default&lastVirtModeExtraState=&virtModeType=default&virtMode=0&nonVirtPosition=&position=chrX%3A370074%2D370116&hgsid=1885485482_QUkcgOVglCJ2ybPDnj4fC1f6KdNR
from sgRNAutils import make_homology_arm_fragments
import pandas as pd
TFsdf = pd.read_excel('inputfiles/mockMaterials/TFsdfTruncatedLong.xlsx')
TFsdf = make_homology_arm_fragments(TFsdf, refSeqPerChromosome)
TFsdf = TFsdf.reset_index()
del TFsdf["index"]

# for every transcript in the dataframe above ...
for index, row in TFsdf.iterrows():
    # ... check that the HAs are 225 bp long and that the sequence between the HAs is a start or a stop codon in forward or reverse complement
    # stop codons are TAA, TAG and TGA and their reverse complements are TTA, CTA and TCA
    # start codons are ATG and its reverse complement is CAT
    print(refSeqPerChromosome[row["Chromosome"]][row['Start']-1:row['Stop']], len(TFsdf.iloc[0]["HAL"]), len(TFsdf.iloc[0]["HAR"]))
    # ... compare the region around the start/stop codon from the genome file to the end of HAL and the beginning of HAR that this function extracts
    # both sequences should be equal except for the "XXX" as placeholder for the start/stop codon in the HA sequence
    print(refSeqPerChromosome[row["Chromosome"]][row['Start']-21:row['Stop']+20])
    print(TFsdf.iloc[index]["HAL"][-20:] + "XXX" + TFsdf.iloc[index]["HAR"][:20])

ATG 225 225
ACGTTATACTATCTCTTAAAATGGCTTTGGGCAGCGAAAATCA
ACGTTATACTATCTCTTAAAXXXGCTTTGGGCAGCGAAAATCA
TAA 225 225
CACTCTGGCAGGACGACCTGTAAAAAAACAGATCAAATCTTCA
CACTCTGGCAGGACGACCTGXXXAAAAACAGATCAAATCTTCA
ATG 225 225
ATACACATATAGGGAAAACAATGCTTAGCAGCAATACAAGAGG
ATACACATATAGGGAAAACAXXXCTTAGCAGCAATACAAGAGG
TGA 225 225
ACCTGCAGGAGGCGGCCATTTGAGAAAGCCAGCTGCGGTGGGA
ACCTGCAGGAGGCGGCCATTXXXGAAAGCCAGCTGCGGTGGGA
ATG 225 225
GTCCAGCAGCTGGACTCGAAATGTCACGACGCAAACAGTCCAA
GTCCAGCAGCTGGACTCGAAXXXTCACGACGCAAACAGTCCAA
TGA 225 225
ACCTGCAGGAGGCGGCCATTTGAGAAAGCCAGCTGCGGTGGGA
ACCTGCAGGAGGCGGCCATTXXXGAAAGCCAGCTGCGGTGGGA
ATG 225 225
ATACACATATAGGGAAAACAATGCTTAGCAGCAATACAAGAGG
ATACACATATAGGGAAAACAXXXCTTAGCAGCAATACAAGAGG
TAA 225 225
AGTGGTATTACACGTGCTTCTAAGCAGCAGCCCAACCCAATCG
AGTGGTATTACACGTGCTTCXXXGCAGCAGCCCAACCCAATCG
ATG 225 225
GTTCCGATACCGCAGAAGAGATGACCGTCGACAGCAGAGGTAA
GTTCCGATACCGCAGAAGAGXXXACCGTCGACAGCAGAGGTAA
TGA 225 225
ACCTGCAGGAGGCGGCCATTTGAGAAAGCCAGCTGCGGTGGGA
ACCTGCAGGAGGCGGCCATTXXXGAAAGCCAGCTGCGGTGGGA


DO NOT RUN THE NEXT CELL without reading this!
The next cell might take about 30 min to run. You can avoid it by running the cell after the next cell and loading the output from this cell from MockMaterials.

In [6]:
# test function to identify sgRNAs within a given window around the start/stop codon of a transcript
# this cell takes 32 min to run, you can avoid it by running the next cell and loading the output from this cell from MockMaterials
from sgRNAutils import gRNA_stringencyIterator
import os

for index, row in TFsdf.iterrows():
    result_df = gRNA_stringencyIterator(row, window, refSeqPerChromosome)
    display(result_df)
    # test whether every sgRNA identified is within the window of 42 bp around the start/stop codon
    # you can manually do this by comparing the genomic coordinates of the sgRNA (fmin and fmax) with the coordinates of the start/stop codon (Start and Stop)
    for i in range(len(result_df)):
        if result_df['fmin'].iloc[i] >= result_df["Start"].iloc[i] - window & result_df['fmax'].iloc[i] <= result_df["Stop"].iloc[i] + window:
            print("sgRNA within window")

stringency 0
stringency 1


Unnamed: 0,fmin,fmax,#chr,strand,sgRNA_sequence,Gene_ID,Transcript_ID,Chromosome,Gene_Region,Start,Stop,Strand,Reference_Seq,HAL,HAR
0,370082,370104,X,+,CTATCTCTTAAAATGGCTTTGGG,FBgn0000022,FBtr0070072,X,start_codon,370094,370096,+,CTAATGAATAGATTGGTGTGTGATGTAGTGATCTAATATGGTGAAG...,TACTACCTCTCTATTAAAATCAGAGAAAACACTCATCTCAAGAGAC...,GCTTTGGGCAGCGAAAATCACTCTGTTTTCAACGACGACGAGGAGT...


sgRNA within window


KeyboardInterrupt: 

In [8]:
# to avoid the cell above but still check the results of the gRNA_stringencyIterator function, run this cell
# the gRNA_stringencyIterator() function is supposed to identify sgRNAs within a given window around the start/stop codon of a transcript
# to test the downstream functions, we need to identifiy representatives for conditions A-G and the sgRNA/start/stop codon orientations 
# (see MutationLogic_ML_20240124.pptx)
# there are 8 different combinations of sgRNA/start/stop codon orientations
# however, for two combinations each, the sgRNA and the start/stop codon have effectively the same relative position to each other 
# (e.g. the two combinations start codon - sgRNA strand (+) - gene strand (+) and stop codon - sgRNA strand (+) - gene strand (-))
# therefore, we have identified 4 combinations of sgRNA/start/stop codon orientations with unique relative positions to test the downstream pipeline
# further, we have identified representatives to test conditions A-D, which are relevant for the find_best_sgRNA function
# further, we have identified representatives to test conditions E-G, which are relevant for the find_best_mutation function
sgRNAdf1 = pd.read_excel('inputfiles/mockMaterials/sgRNAsdfs/window_42_bp/FBgn0032150_FBtr0079917.xlsx') # PAM in start/stop, start codon - gene (-) - gRNA (+)
sgRNAdf2 = pd.read_excel('inputfiles/mockMaterials/sgRNAsdfs/window_42_bp/FBgn0002733_FBtr0084982.xlsx') # max 15 bp overlap, start codon - gene (-) - gRNA (-)
sgRNAdf3 = pd.read_excel('inputfiles/mockMaterials/sgRNAsdfs/window_42_bp/FBgn0003963_FBtr0078063.xlsx') # more than 15 bp overlap, PAM in CDS
sgRNAdf4 = pd.read_excel('inputfiles/mockMaterials/sgRNAsdfs/window_42_bp/FBgn0000022_FBtr0070072.xlsx') # start codon - gene (+) - gRNA (+)
sgRNAdf5 = pd.read_excel('inputfiles/mockMaterials/sgRNAsdfs/window_42_bp/FBgn0003720_FBtr0085709.xlsx') # start codon - gene (+) - gRNA (-)
sgRNAdf6 = pd.read_excel('inputfiles/mockMaterials/sgRNAsdfs/window_42_bp/FBgn0052831_FBtr0091683.xlsx') # SRS in CDS (need to manually change "PAM in CDS" to False)
sgRNAdf7 = pd.read_excel('inputfiles/mockMaterials/sgRNAsdfs/window_42_bp/FBgn0267792_FBtr0088692.xlsx') # PAM and SRS bot outside CDS

for i in [sgRNAdf1, sgRNAdf2, sgRNAdf3, sgRNAdf4, sgRNAdf5, sgRNAdf6, sgRNAdf7]:
    display(i)

Unnamed: 0,fmin,fmax,#chr,strand,sgRNA_sequence,Gene_ID,Transcript_ID,Chromosome,Gene_Region,Start,Stop,Strand,Reference_Seq,HAL,HAR
0,9791954,9791976,2L,-,CCAATGTAACTATGTAATATAAT,FBgn0032150,FBtr0079917,2L,stop_codon,9791982,9791984,-,AAAATCTTCCCGAATATTATGCATGTAAAAAAGGAAATGGCTAAGT...,TCTCCTTGATAGTTTTGTATGGGCCAGCGAGGTACCAGTTTGAGCA...,TGAGCGCGGATTATCCCGATCCTGACTGGCTGGCCTGTAATCCAGC...
1,9791971,9791993,2L,+,TATAATCTTTATTATGAGCGCGG,FBgn0032150,FBtr0079917,2L,stop_codon,9791982,9791984,-,AAAATCTTCCCGAATATTATGCATGTAAAAAAGGAAATGGCTAAGT...,TCTCCTTGATAGTTTTGTATGGGCCAGCGAGGTACCAGTTTGAGCA...,TGAGCGCGGATTATCCCGATCCTGACTGGCTGGCCTGTAATCCAGC...


Unnamed: 0,fmin,fmax,#chr,strand,sgRNA_sequence,Gene_ID,Transcript_ID,Chromosome,Gene_Region,Start,Stop,Strand,Reference_Seq,HAL,HAR
0,26005304,26005326,3R,+,GATCACCAGGGACGCCACATGGG,FBgn0002733,FBtr0084982,3R,stop_codon,26005306,26005308,-,TCCCCAAGCGAACCCAAAACCCCCAACCGAACTGCAGCTGATCCCC...,CTTTAAGAAGTGAGCAGCAGCCATCTCTATAGCCGAAATTTGAGGC...,CCAGGGACGCCACATGGGGCCAGAGGTGGAGCTGGCCTCGCTGGGC...
1,26005272,26005294,3R,+,CGATTCAGGTGAATATCATCCGG,FBgn0002733,FBtr0084982,3R,stop_codon,26005306,26005308,-,TCCCCAAGCGAACCCAAAACCCCCAACCGAACTGCAGCTGATCCCC...,CTTTAAGAAGTGAGCAGCAGCCATCTCTATAGCCGAAATTTGAGGC...,CCAGGGACGCCACATGGGGCCAGAGGTGGAGCTGGCCTCGCTGGGC...
2,26005292,26005314,3R,+,CGGAGTCCAGTTGATCACCAGGG,FBgn0002733,FBtr0084982,3R,stop_codon,26005306,26005308,-,TCCCCAAGCGAACCCAAAACCCCCAACCGAACTGCAGCTGATCCCC...,CTTTAAGAAGTGAGCAGCAGCCATCTCTATAGCCGAAATTTGAGGC...,CCAGGGACGCCACATGGGGCCAGAGGTGGAGCTGGCCTCGCTGGGC...
3,26005305,26005327,3R,+,ATCACCAGGGACGCCACATGGGG,FBgn0002733,FBtr0084982,3R,stop_codon,26005306,26005308,-,TCCCCAAGCGAACCCAAAACCCCCAACCGAACTGCAGCTGATCCCC...,CTTTAAGAAGTGAGCAGCAGCCATCTCTATAGCCGAAATTTGAGGC...,CCAGGGACGCCACATGGGGCCAGAGGTGGAGCTGGCCTCGCTGGGC...
4,26005312,26005334,3R,+,GGGACGCCACATGGGGCCAGAGG,FBgn0002733,FBtr0084982,3R,stop_codon,26005306,26005308,-,TCCCCAAGCGAACCCAAAACCCCCAACCGAACTGCAGCTGATCCCC...,CTTTAAGAAGTGAGCAGCAGCCATCTCTATAGCCGAAATTTGAGGC...,CCAGGGACGCCACATGGGGCCAGAGGTGGAGCTGGCCTCGCTGGGC...


Unnamed: 0,fmin,fmax,#chr,strand,sgRNA_sequence,Gene_ID,Transcript_ID,Chromosome,Gene_Region,Start,Stop,Strand,Reference_Seq,HAL,HAR
0,539956,539978,2L,+,AGAAAGCCAGCTGCGGTGGGAGG,FBgn0003963,FBtr0078063,2L,stop_codon,539954,539956,+,CCTGTAATTTTTCGATGAGGCCCAAAGTGGCCACGTGGCACCCAGA...,ATGAAGAAGTACTGCTCAACCTGTGACATATCCTTCAATTACGTGA...,GAAAGCCAGCTGCGGTGGGAGGCGCAGCGCAAGCAGCTTGAGTGGT...


Unnamed: 0,fmin,fmax,#chr,strand,sgRNA_sequence,Gene_ID,Transcript_ID,Chromosome,Gene_Region,Start,Stop,Strand,Reference_Seq,HAL,HAR
0,370693,370715,X,-,CCTGTAAAAAAACAGATCAAATC,FBgn0000022,FBtr0070072,X,stop_codon,370697,370699,+,GACATTGTGTCGTTCGTATGTCGCCCATTGAGACCGCCAATGGAGG...,CACCAAGAGTTGCAGTTGCAATCTCCAACTGGCAGCACAAGTTCCT...,AAAAACAGATCAAATCTTCAGCTATTGCTAGTCGCACCCAACCATA...


Unnamed: 0,fmin,fmax,#chr,strand,sgRNA_sequence,Gene_ID,Transcript_ID,Chromosome,Gene_Region,Start,Stop,Strand,Reference_Seq,HAL,HAR
0,30853998,30854020,3R,-,CCGACATGTACAGTCAGCGCAAG,FBgn0003720,FBtr0085709,3R,stop_codon,30854024,30854026,+,ACAGAATGATGAGGATTAGGCCAACACACACATCCATCGAAAGGAC...,TCGGGAAAAGTGGCGGCCATGCACAACGATGCCCGGAGTGCGCTGC...,AAAGTATGTAGAGCCTAGACTAATCGCCGCACTCGAAGTGCCTTCC...
1,30854040,30854062,3R,-,CCTAGACTAATCGCCGCACTCGA,FBgn0003720,FBtr0085709,3R,stop_codon,30854024,30854026,+,ACAGAATGATGAGGATTAGGCCAACACACACATCCATCGAAAGGAC...,TCGGGAAAAGTGGCGGCCATGCACAACGATGCCCGGAGTGCGCTGC...,AAAGTATGTAGAGCCTAGACTAATCGCCGCACTCGAAGTGCCTTCC...


Unnamed: 0,fmin,fmax,#chr,strand,sgRNA_sequence,Gene_ID,Transcript_ID,Chromosome,Gene_Region,Start,Stop,Strand,Reference_Seq,HAL,HAR
0,11279466,11279488,2L,+,TTCGAGCCCTCGGAACGGGCTGG,FBgn0052831,FBtr0091683,2L,stop_codon,11279461,11279463,+,TAATTTTTGAACACAGTTACATAATATATTTGTTTCTTGTAAGCTT...,TCGCCCACGGTGGTGATTGCCGACGATGTGGTTAGCACCTCCAATC...,GGTTCGAGCCCTCGGAACGGGCTGGACTTCTTCTACTAGATCTCCG...


Unnamed: 0,fmin,fmax,#chr,strand,sgRNA_sequence,Gene_ID,Transcript_ID,Chromosome,Gene_Region,Start,Stop,Strand,Reference_Seq,HAL,HAR
0,8610891,8610913,2R,-,CCTTCAGCCACTCGAAGTTTTTG,FBgn0267792,FBtr0088692,2R,stop_codon,8610886,8610888,+,AATATTTTTCTTTTAAAATGAAACTTTCCCAAACTTTTAGTTTTTT...,CGCTTCACCGATCACATGAACTCCCACTGGGCGATCCGCAAGCACC...,AACCTTCAGCCACTCGAAGTTTTTGCTCTGCACTTTATAATATTTA...


In [9]:
# test the sgRNApositionCheck function which checks how the sgRNA is positioned relative to the start/stop codon
from sgRNAutils import sgRNApositionCheck

#Create list to store results in
positionCheckedList = []

#Derive min and max distance from start/stop codon from window size
# since the stop of the start/stop codon is zero, the minDistance has to be negative (left of the start/stop codon) 
# and shifted 3 nt to the left to account for the start/stop codon (see MutationLogic_ML_20240124.pptx)
minDistance = (window + 3) * -1
maxDistance = window

for sgRNAdf in [sgRNAdf1, sgRNAdf2, sgRNAdf3, sgRNAdf4, sgRNAdf5, sgRNAdf6, sgRNAdf7]:
    result_df = sgRNApositionCheck(sgRNAdf, minDistance, maxDistance)
    display(result_df)
    # check whether the sgRNA is positioned within the HA as expected
    for i in range(len(result_df)):
        positionScore = result_df.iloc[i]["positionScore"]
        # print positionScore which represents the distance between the end of the start/stop codon and the end of the sgRNA (see MutationLogic_ML_20240124.pptx)
        # note that the positionScore consequently represents the number of nts in the HAR that are part of the sgRNA
        print(positionScore)
        # check if sRNA is only in the HAR
        if positionScore >=20:
            print("sgRNA only in HAR")
            display(result_df.iloc[i]["HAR"][positionScore-23:positionScore])
            # print the part of the sgRNA that is the start/stop codon
            print(refSeqPerChromosome[result_df.iloc[i]["Chromosome"]][result_df.iloc[i]['Start']-1:result_df.iloc[i]['Stop']])
        elif positionScore <= 0:
            print("sgRNA only in HAL")
            # print the part of the sgRNA that is the start/stop codon
            print(refSeqPerChromosome[result_df.iloc[i]["Chromosome"]][result_df.iloc[i]['Start']-1:result_df.iloc[i]['Stop']])
            positionScore = positionScore + 2
            positionScore = positionScore  -22
            # print the only part of the sgRNA that is located in HAL (homology arm left)
            display(result_df.iloc[i]["HAL"][positionScore:positionScore+23])
        else: # print both HAL and HAR
            # print the part of the sgRNA that is located in HAR (homology arm right)
            display(result_df.iloc[i]["HAR"][:positionScore])
            # print the part of the sgRNA that is the start/stop codon
            print(refSeqPerChromosome[result_df.iloc[i]["Chromosome"]][result_df.iloc[i]['Start']-1:result_df.iloc[i]['Stop']])
            positionScore = positionScore + 2 
            positionScore = positionScore  -22
            display(result_df.iloc[i]["HAL"][positionScore:])
    positionCheckedList.append(result_df)

Unnamed: 0,fmin,fmax,#chr,strand,sgRNA_sequence,Gene_ID,Transcript_ID,Chromosome,Gene_Region,Start,...,PAM_in_start/stop,max_15_bp_3’_overhang,PAM_in_CDS,PAM_outside_CDS,SRS_in_CDS,CDS_boundary,non_CDS_boundary,SRS_boundary,mutable_PAM,cut_site
0,9791954,9791976,2L,-,CCAATGTAACTATGTAATATAAT,FBgn0032150,FBtr0079917,2L,stop_codon,9791982,...,False,False,False,True,False,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[-45, -44, -43, -42, -41, -40, -39, -38, -37, ...","[-19, -18, -17, -16, -15, -14]","[-22, -21]",-25
1,9791971,9791993,2L,+,TATAATCTTTATTATGAGCGCGG,FBgn0032150,FBtr0079917,2L,stop_codon,9791982,...,False,True,True,False,True,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[-45, -44, -43, -42, -41, -40, -39, -38, -37, ...","[-8, -7, -6, -5, -4, -3]","[-1, 0]",4


-8
sgRNA only in HAL
TTA


'CCAATGTAACTATGTAATATAAT'

9


'TGAGCGCGG'

TTA


'TATAATCTTTA'

Unnamed: 0,fmin,fmax,#chr,strand,sgRNA_sequence,Gene_ID,Transcript_ID,Chromosome,Gene_Region,Start,...,PAM_in_start/stop,max_15_bp_3’_overhang,PAM_in_CDS,PAM_outside_CDS,SRS_in_CDS,CDS_boundary,non_CDS_boundary,SRS_boundary,mutable_PAM,cut_site
0,26005304,26005326,3R,+,GATCACCAGGGACGCCACATGGG,FBgn0002733,FBtr0084982,3R,stop_codon,26005306,...,False,False,True,False,True,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[-45, -44, -43, -42, -41, -40, -39, -38, -37, ...","[-8, -7, -6, -5, -4, -3]","[-1, 0]",13
1,26005272,26005294,3R,+,CGATTCAGGTGAATATCATCCGG,FBgn0002733,FBtr0084982,3R,stop_codon,26005306,...,False,False,False,True,False,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[-45, -44, -43, -42, -41, -40, -39, -38, -37, ...","[-8, -7, -6, -5, -4, -3]","[-1, 0]",-19
2,26005292,26005314,3R,+,CGGAGTCCAGTTGATCACCAGGG,FBgn0002733,FBtr0084982,3R,stop_codon,26005306,...,False,True,True,False,True,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[-45, -44, -43, -42, -41, -40, -39, -38, -37, ...","[-8, -7, -6, -5, -4, -3]","[-1, 0]",1
3,26005305,26005327,3R,+,ATCACCAGGGACGCCACATGGGG,FBgn0002733,FBtr0084982,3R,stop_codon,26005306,...,False,False,True,False,True,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[-45, -44, -43, -42, -41, -40, -39, -38, -37, ...","[-8, -7, -6, -5, -4, -3]","[-1, 0]",14
4,26005312,26005334,3R,+,GGGACGCCACATGGGGCCAGAGG,FBgn0002733,FBtr0084982,3R,stop_codon,26005306,...,False,False,True,False,True,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[-45, -44, -43, -42, -41, -40, -39, -38, -37, ...","[-8, -7, -6, -5, -4, -3]","[-1, 0]",21


18


'CCAGGGACGCCACATGGG'

TCA


'GA'

-14
sgRNA only in HAL
TCA


'CGATTCAGGTGAATATCATCCGG'

6


'CCAGGG'

TCA


'CGGAGTCCAGTTGA'

19


'CCAGGGACGCCACATGGGG'

TCA


'A'

26
sgRNA only in HAR


'GGGACGCCACATGGGGCCAGAGG'

TCA


Unnamed: 0,fmin,fmax,#chr,strand,sgRNA_sequence,Gene_ID,Transcript_ID,Chromosome,Gene_Region,Start,...,PAM_in_start/stop,max_15_bp_3’_overhang,PAM_in_CDS,PAM_outside_CDS,SRS_in_CDS,CDS_boundary,non_CDS_boundary,SRS_boundary,mutable_PAM,cut_site
0,539956,539978,2L,+,AGAAAGCCAGCTGCGGTGGGAGG,FBgn0003963,FBtr0078063,2L,stop_codon,539954,...,False,False,False,True,False,"[-45, -44, -43, -42, -41, -40, -39, -38, -37, ...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[-8, -7, -6, -5, -4, -3]","[-1, 0]",17


22
sgRNA only in HAR


''

TGA


Unnamed: 0,fmin,fmax,#chr,strand,sgRNA_sequence,Gene_ID,Transcript_ID,Chromosome,Gene_Region,Start,...,PAM_in_start/stop,max_15_bp_3’_overhang,PAM_in_CDS,PAM_outside_CDS,SRS_in_CDS,CDS_boundary,non_CDS_boundary,SRS_boundary,mutable_PAM,cut_site
0,370693,370715,X,-,CCTGTAAAAAAACAGATCAAATC,FBgn0000022,FBtr0070072,X,stop_codon,370697,...,False,True,True,False,True,"[-45, -44, -43, -42, -41, -40, -39, -38, -37, ...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[-19, -18, -17, -16, -15, -14]","[-22, -21, -20, -19, -18, -17, -16, -15, -14, ...",-1


16


'AAAAACAGATCAAATC'

TAA


'CCTG'

Unnamed: 0,fmin,fmax,#chr,strand,sgRNA_sequence,Gene_ID,Transcript_ID,Chromosome,Gene_Region,Start,...,PAM_in_start/stop,max_15_bp_3’_overhang,PAM_in_CDS,PAM_outside_CDS,SRS_in_CDS,CDS_boundary,non_CDS_boundary,SRS_boundary,mutable_PAM,cut_site
0,30853998,30854020,3R,-,CCGACATGTACAGTCAGCGCAAG,FBgn0003720,FBtr0085709,3R,stop_codon,30854024,...,False,False,True,False,True,"[-45, -44, -43, -42, -41, -40, -39, -38, -37, ...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[-19, -18, -17, -16, -15, -14]","[-22, -21, -20, -19, -18, -17, -16, -15, -14, ...",-23
1,30854040,30854062,3R,-,CCTAGACTAATCGCCGCACTCGA,FBgn0003720,FBtr0085709,3R,stop_codon,30854024,...,False,False,False,True,False,"[-45, -44, -43, -42, -41, -40, -39, -38, -37, ...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[-19, -18, -17, -16, -15, -14]","[-22, -21, -20, -19, -18, -17, -16, -15, -14, ...",19


-6
sgRNA only in HAL
TGA


'CCGACATGTACAGTCAGCGCAAG'

36
sgRNA only in HAR


'CCTAGACTAATCGCCGCACTCGA'

TGA


Unnamed: 0,fmin,fmax,#chr,strand,sgRNA_sequence,Gene_ID,Transcript_ID,Chromosome,Gene_Region,Start,...,PAM_in_start/stop,max_15_bp_3’_overhang,PAM_in_CDS,PAM_outside_CDS,SRS_in_CDS,CDS_boundary,non_CDS_boundary,SRS_boundary,mutable_PAM,cut_site
0,11279466,11279488,2L,+,TTCGAGCCCTCGGAACGGGCTGG,FBgn0052831,FBtr0091683,2L,stop_codon,11279461,...,False,False,False,True,False,"[-45, -44, -43, -42, -41, -40, -39, -38, -37, ...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[-8, -7, -6, -5, -4, -3]","[-1, 0]",20


25
sgRNA only in HAR


'TTCGAGCCCTCGGAACGGGCTGG'

TGA


Unnamed: 0,fmin,fmax,#chr,strand,sgRNA_sequence,Gene_ID,Transcript_ID,Chromosome,Gene_Region,Start,...,PAM_in_start/stop,max_15_bp_3’_overhang,PAM_in_CDS,PAM_outside_CDS,SRS_in_CDS,CDS_boundary,non_CDS_boundary,SRS_boundary,mutable_PAM,cut_site
0,8610891,8610913,2R,-,CCTTCAGCCACTCGAAGTTTTTG,FBgn0267792,FBtr0088692,2R,stop_codon,8610886,...,False,False,False,True,False,"[-45, -44, -43, -42, -41, -40, -39, -38, -37, ...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[-19, -18, -17, -16, -15, -14]","[-22, -21, -20, -19, -18, -17, -16, -15, -14, ...",8


25
sgRNA only in HAR


'CCTTCAGCCACTCGAAGTTTTTG'

TAG


In [10]:
# test the checkCDSCutandOrder function which selects a sgRNA that cuts in the CDS 
# and/or as a lower priority an sgRNA that cuts the closest to the start/stop codon (condition D, see MutationLogic_ML_20240124.pptx)
from sgRNAutils import checkCDSCutandOrder
# test this for some of the TF transcripts with multiple sgRNAs
display(positionCheckedList[0])
display(positionCheckedList[1])
# we expect the second sgRNA in first dataframe to be selected since it cuts inside the CDS (cut_site value is among the CDS_boundary values) and the first one cuts outside
# we expect the third sgRNA in the second dataframe to be selected since it is part of the CDS and closer to the start/stop codon (1 is smaller than 13, 14 and 21)
sgRNAdf1_checked = checkCDSCutandOrder(positionCheckedList[0])
sgRNAdf2_checked = checkCDSCutandOrder(positionCheckedList[1])
display(sgRNAdf1_checked)
display(sgRNAdf2_checked)

Unnamed: 0,fmin,fmax,#chr,strand,sgRNA_sequence,Gene_ID,Transcript_ID,Chromosome,Gene_Region,Start,...,PAM_in_start/stop,max_15_bp_3’_overhang,PAM_in_CDS,PAM_outside_CDS,SRS_in_CDS,CDS_boundary,non_CDS_boundary,SRS_boundary,mutable_PAM,cut_site
0,9791954,9791976,2L,-,CCAATGTAACTATGTAATATAAT,FBgn0032150,FBtr0079917,2L,stop_codon,9791982,...,False,False,False,True,False,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[-45, -44, -43, -42, -41, -40, -39, -38, -37, ...","[-19, -18, -17, -16, -15, -14]","[-22, -21]",-25
1,9791971,9791993,2L,+,TATAATCTTTATTATGAGCGCGG,FBgn0032150,FBtr0079917,2L,stop_codon,9791982,...,False,True,True,False,True,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[-45, -44, -43, -42, -41, -40, -39, -38, -37, ...","[-8, -7, -6, -5, -4, -3]","[-1, 0]",4


Unnamed: 0,fmin,fmax,#chr,strand,sgRNA_sequence,Gene_ID,Transcript_ID,Chromosome,Gene_Region,Start,...,PAM_in_start/stop,max_15_bp_3’_overhang,PAM_in_CDS,PAM_outside_CDS,SRS_in_CDS,CDS_boundary,non_CDS_boundary,SRS_boundary,mutable_PAM,cut_site
0,26005304,26005326,3R,+,GATCACCAGGGACGCCACATGGG,FBgn0002733,FBtr0084982,3R,stop_codon,26005306,...,False,False,True,False,True,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[-45, -44, -43, -42, -41, -40, -39, -38, -37, ...","[-8, -7, -6, -5, -4, -3]","[-1, 0]",13
1,26005272,26005294,3R,+,CGATTCAGGTGAATATCATCCGG,FBgn0002733,FBtr0084982,3R,stop_codon,26005306,...,False,False,False,True,False,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[-45, -44, -43, -42, -41, -40, -39, -38, -37, ...","[-8, -7, -6, -5, -4, -3]","[-1, 0]",-19
2,26005292,26005314,3R,+,CGGAGTCCAGTTGATCACCAGGG,FBgn0002733,FBtr0084982,3R,stop_codon,26005306,...,False,True,True,False,True,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[-45, -44, -43, -42, -41, -40, -39, -38, -37, ...","[-8, -7, -6, -5, -4, -3]","[-1, 0]",1
3,26005305,26005327,3R,+,ATCACCAGGGACGCCACATGGGG,FBgn0002733,FBtr0084982,3R,stop_codon,26005306,...,False,False,True,False,True,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[-45, -44, -43, -42, -41, -40, -39, -38, -37, ...","[-8, -7, -6, -5, -4, -3]","[-1, 0]",14
4,26005312,26005334,3R,+,GGGACGCCACATGGGGCCAGAGG,FBgn0002733,FBtr0084982,3R,stop_codon,26005306,...,False,False,True,False,True,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[-45, -44, -43, -42, -41, -40, -39, -38, -37, ...","[-8, -7, -6, -5, -4, -3]","[-1, 0]",21


'TATAATCTTTATTATGAGCGCGG'

'CGGAGTCCAGTTGATCACCAGGG'

In [11]:
# test condition A-C (see MutationLogic_ML_20240124.pptx) for identifying the best sgRNA (e.g. PAM in start/stop, max 15 bp overlap, none of the above)
# we expect that for the first dataframe the sgRNA is selected that has max 15 bp 3' overlap, no mutation needed
# we expect that for the third dataframe a mutation is needed since the PAM is not located in the start/stop codon and the 3' overlap is more than 15 bp

from sgRNAutils import find_best_gRNA  
pd.set_option('display.max_columns', None)

bestsgRNAList = []

for sgRNAdf in positionCheckedList:
    # first display all sgRNAs within the given windown around the start/stop codon
    display(sgRNAdf)
    sgRNA, mutationNeeded = find_best_gRNA(sgRNAdf)
    print("mutation needed: ", mutationNeeded)
    winnerdf = sgRNAdf[sgRNAdf["sgRNA_sequence"] == sgRNA] 
    # next display the sgRNA that is identified as the best sgRNA
    display(winnerdf)
    bestsgRNAList.append(winnerdf)

Unnamed: 0,fmin,fmax,#chr,strand,sgRNA_sequence,Gene_ID,Transcript_ID,Chromosome,Gene_Region,Start,Stop,Strand,Reference_Seq,HAL,HAR,positionScore,CDS_side,non_CDS_side,PAM_in_start/stop,max_15_bp_3’_overhang,PAM_in_CDS,PAM_outside_CDS,SRS_in_CDS,CDS_boundary,non_CDS_boundary,SRS_boundary,mutable_PAM,cut_site,cutsite-CDSbound
0,9791954,9791976,2L,-,CCAATGTAACTATGTAATATAAT,FBgn0032150,FBtr0079917,2L,stop_codon,9791982,9791984,-,AAAATCTTCCCGAATATTATGCATGTAAAAAAGGAAATGGCTAAGT...,TCTCCTTGATAGTTTTGTATGGGCCAGCGAGGTACCAGTTTGAGCA...,TGAGCGCGGATTATCCCGATCCTGACTGGCTGGCCTGTAATCCAGC...,-8,HAR,HAL,False,False,False,True,False,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[-45, -44, -43, -42, -41, -40, -39, -38, -37, ...","[-19, -18, -17, -16, -15, -14]","[-22, -21]",-25,25
1,9791971,9791993,2L,+,TATAATCTTTATTATGAGCGCGG,FBgn0032150,FBtr0079917,2L,stop_codon,9791982,9791984,-,AAAATCTTCCCGAATATTATGCATGTAAAAAAGGAAATGGCTAAGT...,TCTCCTTGATAGTTTTGTATGGGCCAGCGAGGTACCAGTTTGAGCA...,TGAGCGCGGATTATCCCGATCCTGACTGGCTGGCCTGTAATCCAGC...,9,HAR,HAL,False,True,True,False,True,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[-45, -44, -43, -42, -41, -40, -39, -38, -37, ...","[-8, -7, -6, -5, -4, -3]","[-1, 0]",4,4


mutation needed:  False


Unnamed: 0,fmin,fmax,#chr,strand,sgRNA_sequence,Gene_ID,Transcript_ID,Chromosome,Gene_Region,Start,Stop,Strand,Reference_Seq,HAL,HAR,positionScore,CDS_side,non_CDS_side,PAM_in_start/stop,max_15_bp_3’_overhang,PAM_in_CDS,PAM_outside_CDS,SRS_in_CDS,CDS_boundary,non_CDS_boundary,SRS_boundary,mutable_PAM,cut_site,cutsite-CDSbound
1,9791971,9791993,2L,+,TATAATCTTTATTATGAGCGCGG,FBgn0032150,FBtr0079917,2L,stop_codon,9791982,9791984,-,AAAATCTTCCCGAATATTATGCATGTAAAAAAGGAAATGGCTAAGT...,TCTCCTTGATAGTTTTGTATGGGCCAGCGAGGTACCAGTTTGAGCA...,TGAGCGCGGATTATCCCGATCCTGACTGGCTGGCCTGTAATCCAGC...,9,HAR,HAL,False,True,True,False,True,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[-45, -44, -43, -42, -41, -40, -39, -38, -37, ...","[-8, -7, -6, -5, -4, -3]","[-1, 0]",4,4


Unnamed: 0,fmin,fmax,#chr,strand,sgRNA_sequence,Gene_ID,Transcript_ID,Chromosome,Gene_Region,Start,Stop,Strand,Reference_Seq,HAL,HAR,positionScore,CDS_side,non_CDS_side,PAM_in_start/stop,max_15_bp_3’_overhang,PAM_in_CDS,PAM_outside_CDS,SRS_in_CDS,CDS_boundary,non_CDS_boundary,SRS_boundary,mutable_PAM,cut_site,cutsite-CDSbound
0,26005304,26005326,3R,+,GATCACCAGGGACGCCACATGGG,FBgn0002733,FBtr0084982,3R,stop_codon,26005306,26005308,-,TCCCCAAGCGAACCCAAAACCCCCAACCGAACTGCAGCTGATCCCC...,CTTTAAGAAGTGAGCAGCAGCCATCTCTATAGCCGAAATTTGAGGC...,CCAGGGACGCCACATGGGGCCAGAGGTGGAGCTGGCCTCGCTGGGC...,18,HAR,HAL,False,False,True,False,True,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[-45, -44, -43, -42, -41, -40, -39, -38, -37, ...","[-8, -7, -6, -5, -4, -3]","[-1, 0]",13,13
1,26005272,26005294,3R,+,CGATTCAGGTGAATATCATCCGG,FBgn0002733,FBtr0084982,3R,stop_codon,26005306,26005308,-,TCCCCAAGCGAACCCAAAACCCCCAACCGAACTGCAGCTGATCCCC...,CTTTAAGAAGTGAGCAGCAGCCATCTCTATAGCCGAAATTTGAGGC...,CCAGGGACGCCACATGGGGCCAGAGGTGGAGCTGGCCTCGCTGGGC...,-14,HAR,HAL,False,False,False,True,False,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[-45, -44, -43, -42, -41, -40, -39, -38, -37, ...","[-8, -7, -6, -5, -4, -3]","[-1, 0]",-19,19
2,26005292,26005314,3R,+,CGGAGTCCAGTTGATCACCAGGG,FBgn0002733,FBtr0084982,3R,stop_codon,26005306,26005308,-,TCCCCAAGCGAACCCAAAACCCCCAACCGAACTGCAGCTGATCCCC...,CTTTAAGAAGTGAGCAGCAGCCATCTCTATAGCCGAAATTTGAGGC...,CCAGGGACGCCACATGGGGCCAGAGGTGGAGCTGGCCTCGCTGGGC...,6,HAR,HAL,False,True,True,False,True,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[-45, -44, -43, -42, -41, -40, -39, -38, -37, ...","[-8, -7, -6, -5, -4, -3]","[-1, 0]",1,1
3,26005305,26005327,3R,+,ATCACCAGGGACGCCACATGGGG,FBgn0002733,FBtr0084982,3R,stop_codon,26005306,26005308,-,TCCCCAAGCGAACCCAAAACCCCCAACCGAACTGCAGCTGATCCCC...,CTTTAAGAAGTGAGCAGCAGCCATCTCTATAGCCGAAATTTGAGGC...,CCAGGGACGCCACATGGGGCCAGAGGTGGAGCTGGCCTCGCTGGGC...,19,HAR,HAL,False,False,True,False,True,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[-45, -44, -43, -42, -41, -40, -39, -38, -37, ...","[-8, -7, -6, -5, -4, -3]","[-1, 0]",14,14
4,26005312,26005334,3R,+,GGGACGCCACATGGGGCCAGAGG,FBgn0002733,FBtr0084982,3R,stop_codon,26005306,26005308,-,TCCCCAAGCGAACCCAAAACCCCCAACCGAACTGCAGCTGATCCCC...,CTTTAAGAAGTGAGCAGCAGCCATCTCTATAGCCGAAATTTGAGGC...,CCAGGGACGCCACATGGGGCCAGAGGTGGAGCTGGCCTCGCTGGGC...,26,HAR,HAL,False,False,True,False,True,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[-45, -44, -43, -42, -41, -40, -39, -38, -37, ...","[-8, -7, -6, -5, -4, -3]","[-1, 0]",21,21


mutation needed:  False


Unnamed: 0,fmin,fmax,#chr,strand,sgRNA_sequence,Gene_ID,Transcript_ID,Chromosome,Gene_Region,Start,Stop,Strand,Reference_Seq,HAL,HAR,positionScore,CDS_side,non_CDS_side,PAM_in_start/stop,max_15_bp_3’_overhang,PAM_in_CDS,PAM_outside_CDS,SRS_in_CDS,CDS_boundary,non_CDS_boundary,SRS_boundary,mutable_PAM,cut_site,cutsite-CDSbound
2,26005292,26005314,3R,+,CGGAGTCCAGTTGATCACCAGGG,FBgn0002733,FBtr0084982,3R,stop_codon,26005306,26005308,-,TCCCCAAGCGAACCCAAAACCCCCAACCGAACTGCAGCTGATCCCC...,CTTTAAGAAGTGAGCAGCAGCCATCTCTATAGCCGAAATTTGAGGC...,CCAGGGACGCCACATGGGGCCAGAGGTGGAGCTGGCCTCGCTGGGC...,6,HAR,HAL,False,True,True,False,True,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[-45, -44, -43, -42, -41, -40, -39, -38, -37, ...","[-8, -7, -6, -5, -4, -3]","[-1, 0]",1,1


Unnamed: 0,fmin,fmax,#chr,strand,sgRNA_sequence,Gene_ID,Transcript_ID,Chromosome,Gene_Region,Start,Stop,Strand,Reference_Seq,HAL,HAR,positionScore,CDS_side,non_CDS_side,PAM_in_start/stop,max_15_bp_3’_overhang,PAM_in_CDS,PAM_outside_CDS,SRS_in_CDS,CDS_boundary,non_CDS_boundary,SRS_boundary,mutable_PAM,cut_site
0,539956,539978,2L,+,AGAAAGCCAGCTGCGGTGGGAGG,FBgn0003963,FBtr0078063,2L,stop_codon,539954,539956,+,CCTGTAATTTTTCGATGAGGCCCAAAGTGGCCACGTGGCACCCAGA...,ATGAAGAAGTACTGCTCAACCTGTGACATATCCTTCAATTACGTGA...,GAAAGCCAGCTGCGGTGGGAGGCGCAGCGCAAGCAGCTTGAGTGGT...,22,HAL,HAR,False,False,False,True,False,"[-45, -44, -43, -42, -41, -40, -39, -38, -37, ...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[-8, -7, -6, -5, -4, -3]","[-1, 0]",17


mutation needed:  True


Unnamed: 0,fmin,fmax,#chr,strand,sgRNA_sequence,Gene_ID,Transcript_ID,Chromosome,Gene_Region,Start,Stop,Strand,Reference_Seq,HAL,HAR,positionScore,CDS_side,non_CDS_side,PAM_in_start/stop,max_15_bp_3’_overhang,PAM_in_CDS,PAM_outside_CDS,SRS_in_CDS,CDS_boundary,non_CDS_boundary,SRS_boundary,mutable_PAM,cut_site
0,539956,539978,2L,+,AGAAAGCCAGCTGCGGTGGGAGG,FBgn0003963,FBtr0078063,2L,stop_codon,539954,539956,+,CCTGTAATTTTTCGATGAGGCCCAAAGTGGCCACGTGGCACCCAGA...,ATGAAGAAGTACTGCTCAACCTGTGACATATCCTTCAATTACGTGA...,GAAAGCCAGCTGCGGTGGGAGGCGCAGCGCAAGCAGCTTGAGTGGT...,22,HAL,HAR,False,False,False,True,False,"[-45, -44, -43, -42, -41, -40, -39, -38, -37, ...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[-8, -7, -6, -5, -4, -3]","[-1, 0]",17


Unnamed: 0,fmin,fmax,#chr,strand,sgRNA_sequence,Gene_ID,Transcript_ID,Chromosome,Gene_Region,Start,Stop,Strand,Reference_Seq,HAL,HAR,positionScore,CDS_side,non_CDS_side,PAM_in_start/stop,max_15_bp_3’_overhang,PAM_in_CDS,PAM_outside_CDS,SRS_in_CDS,CDS_boundary,non_CDS_boundary,SRS_boundary,mutable_PAM,cut_site
0,370693,370715,X,-,CCTGTAAAAAAACAGATCAAATC,FBgn0000022,FBtr0070072,X,stop_codon,370697,370699,+,GACATTGTGTCGTTCGTATGTCGCCCATTGAGACCGCCAATGGAGG...,CACCAAGAGTTGCAGTTGCAATCTCCAACTGGCAGCACAAGTTCCT...,AAAAACAGATCAAATCTTCAGCTATTGCTAGTCGCACCCAACCATA...,16,HAL,HAR,False,True,True,False,True,"[-45, -44, -43, -42, -41, -40, -39, -38, -37, ...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[-19, -18, -17, -16, -15, -14]","[-22, -21, -20, -19, -18, -17, -16, -15, -14, ...",-1


mutation needed:  False


Unnamed: 0,fmin,fmax,#chr,strand,sgRNA_sequence,Gene_ID,Transcript_ID,Chromosome,Gene_Region,Start,Stop,Strand,Reference_Seq,HAL,HAR,positionScore,CDS_side,non_CDS_side,PAM_in_start/stop,max_15_bp_3’_overhang,PAM_in_CDS,PAM_outside_CDS,SRS_in_CDS,CDS_boundary,non_CDS_boundary,SRS_boundary,mutable_PAM,cut_site
0,370693,370715,X,-,CCTGTAAAAAAACAGATCAAATC,FBgn0000022,FBtr0070072,X,stop_codon,370697,370699,+,GACATTGTGTCGTTCGTATGTCGCCCATTGAGACCGCCAATGGAGG...,CACCAAGAGTTGCAGTTGCAATCTCCAACTGGCAGCACAAGTTCCT...,AAAAACAGATCAAATCTTCAGCTATTGCTAGTCGCACCCAACCATA...,16,HAL,HAR,False,True,True,False,True,"[-45, -44, -43, -42, -41, -40, -39, -38, -37, ...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[-19, -18, -17, -16, -15, -14]","[-22, -21, -20, -19, -18, -17, -16, -15, -14, ...",-1


Unnamed: 0,fmin,fmax,#chr,strand,sgRNA_sequence,Gene_ID,Transcript_ID,Chromosome,Gene_Region,Start,Stop,Strand,Reference_Seq,HAL,HAR,positionScore,CDS_side,non_CDS_side,PAM_in_start/stop,max_15_bp_3’_overhang,PAM_in_CDS,PAM_outside_CDS,SRS_in_CDS,CDS_boundary,non_CDS_boundary,SRS_boundary,mutable_PAM,cut_site
0,30853998,30854020,3R,-,CCGACATGTACAGTCAGCGCAAG,FBgn0003720,FBtr0085709,3R,stop_codon,30854024,30854026,+,ACAGAATGATGAGGATTAGGCCAACACACACATCCATCGAAAGGAC...,TCGGGAAAAGTGGCGGCCATGCACAACGATGCCCGGAGTGCGCTGC...,AAAGTATGTAGAGCCTAGACTAATCGCCGCACTCGAAGTGCCTTCC...,-6,HAL,HAR,False,False,True,False,True,"[-45, -44, -43, -42, -41, -40, -39, -38, -37, ...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[-19, -18, -17, -16, -15, -14]","[-22, -21, -20, -19, -18, -17, -16, -15, -14, ...",-23
1,30854040,30854062,3R,-,CCTAGACTAATCGCCGCACTCGA,FBgn0003720,FBtr0085709,3R,stop_codon,30854024,30854026,+,ACAGAATGATGAGGATTAGGCCAACACACACATCCATCGAAAGGAC...,TCGGGAAAAGTGGCGGCCATGCACAACGATGCCCGGAGTGCGCTGC...,AAAGTATGTAGAGCCTAGACTAATCGCCGCACTCGAAGTGCCTTCC...,36,HAL,HAR,False,False,False,True,False,"[-45, -44, -43, -42, -41, -40, -39, -38, -37, ...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[-19, -18, -17, -16, -15, -14]","[-22, -21, -20, -19, -18, -17, -16, -15, -14, ...",19


mutation needed:  True


Unnamed: 0,fmin,fmax,#chr,strand,sgRNA_sequence,Gene_ID,Transcript_ID,Chromosome,Gene_Region,Start,Stop,Strand,Reference_Seq,HAL,HAR,positionScore,CDS_side,non_CDS_side,PAM_in_start/stop,max_15_bp_3’_overhang,PAM_in_CDS,PAM_outside_CDS,SRS_in_CDS,CDS_boundary,non_CDS_boundary,SRS_boundary,mutable_PAM,cut_site,cutsite-CDSbound
0,30853998,30854020,3R,-,CCGACATGTACAGTCAGCGCAAG,FBgn0003720,FBtr0085709,3R,stop_codon,30854024,30854026,+,ACAGAATGATGAGGATTAGGCCAACACACACATCCATCGAAAGGAC...,TCGGGAAAAGTGGCGGCCATGCACAACGATGCCCGGAGTGCGCTGC...,AAAGTATGTAGAGCCTAGACTAATCGCCGCACTCGAAGTGCCTTCC...,-6,HAL,HAR,False,False,True,False,True,"[-45, -44, -43, -42, -41, -40, -39, -38, -37, ...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[-19, -18, -17, -16, -15, -14]","[-22, -21, -20, -19, -18, -17, -16, -15, -14, ...",-23,23


Unnamed: 0,fmin,fmax,#chr,strand,sgRNA_sequence,Gene_ID,Transcript_ID,Chromosome,Gene_Region,Start,Stop,Strand,Reference_Seq,HAL,HAR,positionScore,CDS_side,non_CDS_side,PAM_in_start/stop,max_15_bp_3’_overhang,PAM_in_CDS,PAM_outside_CDS,SRS_in_CDS,CDS_boundary,non_CDS_boundary,SRS_boundary,mutable_PAM,cut_site
0,11279466,11279488,2L,+,TTCGAGCCCTCGGAACGGGCTGG,FBgn0052831,FBtr0091683,2L,stop_codon,11279461,11279463,+,TAATTTTTGAACACAGTTACATAATATATTTGTTTCTTGTAAGCTT...,TCGCCCACGGTGGTGATTGCCGACGATGTGGTTAGCACCTCCAATC...,GGTTCGAGCCCTCGGAACGGGCTGGACTTCTTCTACTAGATCTCCG...,25,HAL,HAR,False,False,False,True,False,"[-45, -44, -43, -42, -41, -40, -39, -38, -37, ...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[-8, -7, -6, -5, -4, -3]","[-1, 0]",20


mutation needed:  True


Unnamed: 0,fmin,fmax,#chr,strand,sgRNA_sequence,Gene_ID,Transcript_ID,Chromosome,Gene_Region,Start,Stop,Strand,Reference_Seq,HAL,HAR,positionScore,CDS_side,non_CDS_side,PAM_in_start/stop,max_15_bp_3’_overhang,PAM_in_CDS,PAM_outside_CDS,SRS_in_CDS,CDS_boundary,non_CDS_boundary,SRS_boundary,mutable_PAM,cut_site
0,11279466,11279488,2L,+,TTCGAGCCCTCGGAACGGGCTGG,FBgn0052831,FBtr0091683,2L,stop_codon,11279461,11279463,+,TAATTTTTGAACACAGTTACATAATATATTTGTTTCTTGTAAGCTT...,TCGCCCACGGTGGTGATTGCCGACGATGTGGTTAGCACCTCCAATC...,GGTTCGAGCCCTCGGAACGGGCTGGACTTCTTCTACTAGATCTCCG...,25,HAL,HAR,False,False,False,True,False,"[-45, -44, -43, -42, -41, -40, -39, -38, -37, ...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[-8, -7, -6, -5, -4, -3]","[-1, 0]",20


Unnamed: 0,fmin,fmax,#chr,strand,sgRNA_sequence,Gene_ID,Transcript_ID,Chromosome,Gene_Region,Start,Stop,Strand,Reference_Seq,HAL,HAR,positionScore,CDS_side,non_CDS_side,PAM_in_start/stop,max_15_bp_3’_overhang,PAM_in_CDS,PAM_outside_CDS,SRS_in_CDS,CDS_boundary,non_CDS_boundary,SRS_boundary,mutable_PAM,cut_site
0,8610891,8610913,2R,-,CCTTCAGCCACTCGAAGTTTTTG,FBgn0267792,FBtr0088692,2R,stop_codon,8610886,8610888,+,AATATTTTTCTTTTAAAATGAAACTTTCCCAAACTTTTAGTTTTTT...,CGCTTCACCGATCACATGAACTCCCACTGGGCGATCCGCAAGCACC...,AACCTTCAGCCACTCGAAGTTTTTGCTCTGCACTTTATAATATTTA...,25,HAL,HAR,False,False,False,True,False,"[-45, -44, -43, -42, -41, -40, -39, -38, -37, ...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[-19, -18, -17, -16, -15, -14]","[-22, -21, -20, -19, -18, -17, -16, -15, -14, ...",8


mutation needed:  True


Unnamed: 0,fmin,fmax,#chr,strand,sgRNA_sequence,Gene_ID,Transcript_ID,Chromosome,Gene_Region,Start,Stop,Strand,Reference_Seq,HAL,HAR,positionScore,CDS_side,non_CDS_side,PAM_in_start/stop,max_15_bp_3’_overhang,PAM_in_CDS,PAM_outside_CDS,SRS_in_CDS,CDS_boundary,non_CDS_boundary,SRS_boundary,mutable_PAM,cut_site
0,8610891,8610913,2R,-,CCTTCAGCCACTCGAAGTTTTTG,FBgn0267792,FBtr0088692,2R,stop_codon,8610886,8610888,+,AATATTTTTCTTTTAAAATGAAACTTTCCCAAACTTTTAGTTTTTT...,CGCTTCACCGATCACATGAACTCCCACTGGGCGATCCGCAAGCACC...,AACCTTCAGCCACTCGAAGTTTTTGCTCTGCACTTTATAATATTTA...,25,HAL,HAR,False,False,False,True,False,"[-45, -44, -43, -42, -41, -40, -39, -38, -37, ...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[-19, -18, -17, -16, -15, -14]","[-22, -21, -20, -19, -18, -17, -16, -15, -14, ...",8


In [12]:
# test the function that extracts the sequence of the drosophila genome that is in the CDS of a given transcript and within the given window and chops it into codons 
from sgRNAutils import codonFragmenter
pd.set_option('display.max_column', None)

# create an empty list to store the chopped CDSs and the adjusted dataframes
CDS_list = []
df_adjusted_list = []

for bestsgRNA in bestsgRNAList:
    CDS, df_adjusted = codonFragmenter(bestsgRNA.iloc[0], bestsgRNA.iloc[0]["CDS_side"], bestsgRNA.iloc[0]["CDS_boundary"])
    CDS_list.append(CDS)
    df_adjusted_list.append(df_adjusted)
    # now display the chopped up CDS
    display(CDS)

['TCA',
 'CGC',
 'CCG',
 'AAT',
 'GAT',
 'CGG',
 'GAT',
 'CAG',
 'AGT',
 'GCC',
 'CCA',
 'AGG',
 'TAC',
 'GAT']

['TGG',
 'CCC',
 'CGT',
 'TGG',
 'ATG',
 'CCC',
 'GGC',
 'TCT',
 'ACC',
 'TCC',
 'AGC',
 'GCC',
 'GAG',
 'AGC']

['GGC',
 'CAG',
 'CAG',
 'AAG',
 'AAC',
 'AAG',
 'GAA',
 'AAC',
 'CTG',
 'CAG',
 'GAG',
 'GCG',
 'GCC',
 'ATT']

['GAG',
 'GAC',
 'ATC',
 'CTC',
 'GAC',
 'TAT',
 'ATA',
 'TCA',
 'CTC',
 'TGG',
 'CAG',
 'GAC',
 'GAC',
 'CTG']

['ATT',
 'GTG',
 'CGC',
 'CTC',
 'ATC',
 'TCC',
 'GAC',
 'ATG',
 'TAC',
 'AGT',
 'CAG',
 'CGC',
 'AAG',
 'ATC']

['GAG',
 'GTG',
 'GTC',
 'AAG',
 'CTG',
 'GAT',
 'TTT',
 'GTC',
 'AGC',
 'GAG',
 'GAG',
 'GTG',
 'GCC',
 'GGG']

['AAA',
 'AAG',
 'CGA',
 'GAG',
 'CAC',
 'AGC',
 'TCC',
 'GTG',
 'GGG',
 'CAA',
 'GCG',
 'GGC',
 'GGC',
 'AAG']

In [13]:
# test the function that recombines the codons again and fuses the CDS with the rest of the HA
from sgRNAutils import codonReverseFragmenter

for i, sgRNA in enumerate(bestsgRNAList):
    # retrieve the HA which contains the CDS before the chopping 
    HA_before_chopping = sgRNA[f"{sgRNA["CDS_side"].iloc[0]}"].iloc[0]
    df_after_recombining = codonReverseFragmenter(CDS_list[i], df_adjusted_list[i], df_adjusted_list[i]["CDS_side"], df_adjusted_list[i]["CDS_boundary"])
    HA_after_recombining = df_after_recombining[df_after_recombining["CDS_side"]]
    # display HA before chopping and after chopping and test whether they are the same
    # you can also check manually whether they are the same
    display(HA_before_chopping)
    display(HA_after_recombining)
    # display the length of the HA after recombining, should be again 225 bp
    print(len(HA_after_recombining))
    if HA_after_recombining == HA_before_chopping:
        print("HA before chopping and HA after recombining are the same. All is well")
    else: print("we have a problem")

'TGAGCGCGGATTATCCCGATCCTGACTGGCTGGCCTGTAATCCAGCAGTGCCACCTGTGCGGGCAGCTGAACTCCATGAATTTTCTTTAAGTGAGCCCTCAGATTGCTACTGGTACTAAAGTTGTTGGAGCAGACAACGCACTGGTGATGCGGATGCTTCTGCCGAATATGCTCATTGAGATCGCGAACCTTGGCGAAGGCGGTGGTGCAGTACTTGCAGACGTG'

'TGAGCGCGGATTATCCCGATCCTGACTGGCTGGCCTGTAATCCAGCAGTGCCACCTGTGCGGGCAGCTGAACTCCATGAATTTTCTTTAAGTGAGCCCTCAGATTGCTACTGGTACTAAAGTTGTTGGAGCAGACAACGCACTGGTGATGCGGATGCTTCTGCCGAATATGCTCATTGAGATCGCGAACCTTGGCGAAGGCGGTGGTGCAGTACTTGCAGACGTG'

225
HA before chopping and HA after recombining are the same. All is well


'CCAGGGACGCCACATGGGGCCAGAGGTGGAGCTGGCCTCGCTGGGCGCCGGGCTGCAGGCTCCGCTCTCCAAGCTGTCGCATTCCGAGGGAGGTGGAGTGACCATAGCTTGATCCTCTACTGGAGCTTGAAGAGGCACTCCAATGGGCAGCGAGGGAACCACCACTTGCAGGTAGTTGAGACGATGGCCAAGGTGACTCATCAGCTGGGTGCCGAGATCCACACT'

'CCAGGGACGCCACATGGGGCCAGAGGTGGAGCTGGCCTCGCTGGGCGCCGGGCTGCAGGCTCCGCTCTCCAAGCTGTCGCATTCCGAGGGAGGTGGAGTGACCATAGCTTGATCCTCTACTGGAGCTTGAAGAGGCACTCCAATGGGCAGCGAGGGAACCACCACTTGCAGGTAGTTGAGACGATGGCCAAGGTGACTCATCAGCTGGGTGCCGAGATCCACACT'

225
HA before chopping and HA after recombining are the same. All is well


'ATGAAGAAGTACTGCTCAACCTGTGACATATCCTTCAATTACGTGAAGACCTACCTGGCTCACAAGCAGTTCTACTGCAAAAACAAGCCCATTCGCCCCGAGGCCAGCGACAGCCCCAGTCCGAATCACCTGGGCGGAGGCGTTGCCGTGGGCCTGGGCATTGGCGGCCTGGTCGGCGGACACGGCCAGCAGAAGAACAAGGAAAACCTGCAGGAGGCGGCCATT'

'ATGAAGAAGTACTGCTCAACCTGTGACATATCCTTCAATTACGTGAAGACCTACCTGGCTCACAAGCAGTTCTACTGCAAAAACAAGCCCATTCGCCCCGAGGCCAGCGACAGCCCCAGTCCGAATCACCTGGGCGGAGGCGTTGCCGTGGGCCTGGGCATTGGCGGCCTGGTCGGCGGACACGGCCAGCAGAAGAACAAGGAAAACCTGCAGGAGGCGGCCATT'

225
HA before chopping and HA after recombining are the same. All is well


'CACCAAGAGTTGCAGTTGCAATCTCCAACTGGCAGCACAAGTTCCTGCAACAGCATTAGCTCTTATTGCAAGCCAGCAACATCGACGATTCCGGGAGCAACACCTCCTAACAATTTTCATACCAAGTTGGAAGCCAGTTTTGAAGACTACCGTAACAATTCCTGCAGTTCTGGTACTGAAGATGAGGACATCCTCGACTATATATCACTCTGGCAGGACGACCTG'

'CACCAAGAGTTGCAGTTGCAATCTCCAACTGGCAGCACAAGTTCCTGCAACAGCATTAGCTCTTATTGCAAGCCAGCAACATCGACGATTCCGGGAGCAACACCTCCTAACAATTTTCATACCAAGTTGGAAGCCAGTTTTGAAGACTACCGTAACAATTCCTGCAGTTCTGGTACTGAAGATGAGGACATCCTCGACTATATATCACTCTGGCAGGACGACCTG'

225
HA before chopping and HA after recombining are the same. All is well


'TCGGGAAAAGTGGCGGCCATGCACAACGATGCCCGGAGTGCGCTGCACAACTACATCCAGAGGACCCATCCCTCGCAGCCCATGCGATTCCAGACGCTCTTGGGCGTGGTGCAGCTGATGCACAAGGTCTCAAGCTTCACCATCGAGGAGCTGTTCTTCCGAAAGACCATCGGCGACATCACCATTGTGCGCCTCATCTCCGACATGTACAGTCAGCGCAAGATC'

'TCGGGAAAAGTGGCGGCCATGCACAACGATGCCCGGAGTGCGCTGCACAACTACATCCAGAGGACCCATCCCTCGCAGCCCATGCGATTCCAGACGCTCTTGGGCGTGGTGCAGCTGATGCACAAGGTCTCAAGCTTCACCATCGAGGAGCTGTTCTTCCGAAAGACCATCGGCGACATCACCATTGTGCGCCTCATCTCCGACATGTACAGTCAGCGCAAGATC'

225
HA before chopping and HA after recombining are the same. All is well


'TCGCCCACGGTGGTGATTGCCGACGATGTGGTTAGCACCTCCAATCCTCCAGCGGCAGCGGCGACAGTAGGATCGCCAGCAGGGGCGCCAGCAGGCTTAAAGTGCGGACCAGACGTGTCCATGACCCTCAATCGCATCAATGTCCAGGAGAACGAGGTGGACGTGGAGGAGTGCCTGCCCGCAGAGGTGGTCAAGCTGGATTTTGTCAGCGAGGAGGTGGCCGGG'

'TCGCCCACGGTGGTGATTGCCGACGATGTGGTTAGCACCTCCAATCCTCCAGCGGCAGCGGCGACAGTAGGATCGCCAGCAGGGGCGCCAGCAGGCTTAAAGTGCGGACCAGACGTGTCCATGACCCTCAATCGCATCAATGTCCAGGAGAACGAGGTGGACGTGGAGGAGTGCCTGCCCGCAGAGGTGGTCAAGCTGGATTTTGTCAGCGAGGAGGTGGCCGGG'

225
HA before chopping and HA after recombining are the same. All is well


'CGCTTCACCGATCACATGAACTCCCACTGGGCGATCCGCAAGCACCTGTGCACGGTGTGCGGCAAGACCTTCACCACGTACGGCAACCTCAAGAAGCACACGGAGCTGCATTTGGCGGTGAAGAAGTACAAGTGCGGCACGTGCGGCAAGCGATTTGCACAATTCGCCAGTCTTCGATGGCACAAAAAGCGAGAGCACAGCTCCGTGGGGCAAGCGGGCGGCAAG'

'CGCTTCACCGATCACATGAACTCCCACTGGGCGATCCGCAAGCACCTGTGCACGGTGTGCGGCAAGACCTTCACCACGTACGGCAACCTCAAGAAGCACACGGAGCTGCATTTGGCGGTGAAGAAGTACAAGTGCGGCACGTGCGGCAAGCGATTTGCACAATTCGCCAGTCTTCGATGGCACAAAAAGCGAGAGCACAGCTCCGTGGGGCAAGCGGGCGGCAAG'

225
HA before chopping and HA after recombining are the same. All is well


In [14]:
# test function to identify synonymous codons that changes a given nucleotide but does not change the encoded amino acid
from sgRNAutils import find_synonymous_codons
# try to mutate first, second and third nucleotide of the codon "ATG"
# note, its unlikely to find synonymous mutations if the nucleotide you wish to mutate is NOT located in the wobble position (e.g. nucleotide 3 of the codon)
codon = "GCT"
synonymous_codon_1 = find_synonymous_codons(codon, 1)
synonymous_codon_2 = find_synonymous_codons(codon, 2)
synonymous_codon_3 = find_synonymous_codons(codon, 3)
print(synonymous_codon_1)
print(synonymous_codon_2)
print(synonymous_codon_3)

[]
[]
['GCC', 'GCA', 'GCG']


In [15]:
# test find_best_mutation function which mutates the HA in a way that it cannot be cut by the sgRNA
# test case where PAM is in CDS
from sgRNAutils import find_best_mutation
pd.set_option('display.max_columns', None)
display(bestsgRNAList[4])
bestsgRNAList[4]["mutated?"] = False
bestsgRNAList[4]["PAM_in_CDS"] = False
CDSside = bestsgRNAList[4]["CDS_side"]
mutated_sgRNAdf6 = find_best_mutation(bestsgRNAList[4].iloc[0])
# compare the HA wich contains the CDS before and after mutation
# PAM was not possible to synonymously mutate, therefore SRS was mutated, only one mutation possible
print(CDSside.iloc[0])
display(bestsgRNAList[4].iloc[0][CDSside].iloc[0])
display(mutated_sgRNAdf6[CDSside].iloc[0])
display(mutated_sgRNAdf6["sgRNA_sequence"])

Unnamed: 0,fmin,fmax,#chr,strand,sgRNA_sequence,Gene_ID,Transcript_ID,Chromosome,Gene_Region,Start,Stop,Strand,Reference_Seq,HAL,HAR,positionScore,CDS_side,non_CDS_side,PAM_in_start/stop,max_15_bp_3’_overhang,PAM_in_CDS,PAM_outside_CDS,SRS_in_CDS,CDS_boundary,non_CDS_boundary,SRS_boundary,mutable_PAM,cut_site,cutsite-CDSbound
0,30853998,30854020,3R,-,CCGACATGTACAGTCAGCGCAAG,FBgn0003720,FBtr0085709,3R,stop_codon,30854024,30854026,+,ACAGAATGATGAGGATTAGGCCAACACACACATCCATCGAAAGGAC...,TCGGGAAAAGTGGCGGCCATGCACAACGATGCCCGGAGTGCGCTGC...,AAAGTATGTAGAGCCTAGACTAATCGCCGCACTCGAAGTGCCTTCC...,-6,HAL,HAR,False,False,True,False,True,"[-45, -44, -43, -42, -41, -40, -39, -38, -37, ...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[-19, -18, -17, -16, -15, -14]","[-22, -21, -20, -19, -18, -17, -16, -15, -14, ...",-23,23


HAL


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bestsgRNAList[4]["mutated?"] = False
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bestsgRNAList[4]["PAM_in_CDS"] = False


'TCGGGAAAAGTGGCGGCCATGCACAACGATGCCCGGAGTGCGCTGCACAACTACATCCAGAGGACCCATCCCTCGCAGCCCATGCGATTCCAGACGCTCTTGGGCGTGGTGCAGCTGATGCACAAGGTCTCAAGCTTCACCATCGAGGAGCTGTTCTTCCGAAAGACCATCGGCGACATCACCATTGTGCGCCTCATCTCCGACATGTACAGTCAGCGCAAGATC'

'TCGGGAAAAGTGGCGGCCATGCACAACGATGCCCGGAGTGCGCTGCACAACTACATCCAGAGGACCCATCCCTCGCAGCCCATGCGATTCCAGACGCTCTTGGGCGTGGTGCAGCTGATGCACAAGGTCTCAAGCTTCACCATCGAGGAGCTGTTCTTCCGAAAGACCATCGGCGACATCACCATTGTGCGCCTCATCTCCGATATGTACAGTCAGCGCAAGATC'

'CCGACATGTACAGTCAGCGCAAG'

In [16]:
# test find_best_mutation function which mutates the HA in a way that it cannot be cut by the sgRNA
# test case where PAM is outside the CDS
from sgRNAutils import find_best_mutation
pd.set_option('display.max_columns', None)
display(bestsgRNAList[6])
bestsgRNAList[6]["mutated?"] = False
nonCDSside = bestsgRNAList[6]["non_CDS_side"]
mutated_sgRNAdf6 = find_best_mutation(bestsgRNAList[6].iloc[0])
# compare the HA wich does not contain the CDS before and after mutation
# check whether position of mutation is in the PAM by comparing the HA with the sgRNA
print(nonCDSside.iloc[0])
display(bestsgRNAList[6].iloc[0][nonCDSside].iloc[0])
display(mutated_sgRNAdf6[nonCDSside].iloc[0])
display(mutated_sgRNAdf6["sgRNA_sequence"])

Unnamed: 0,fmin,fmax,#chr,strand,sgRNA_sequence,Gene_ID,Transcript_ID,Chromosome,Gene_Region,Start,Stop,Strand,Reference_Seq,HAL,HAR,positionScore,CDS_side,non_CDS_side,PAM_in_start/stop,max_15_bp_3’_overhang,PAM_in_CDS,PAM_outside_CDS,SRS_in_CDS,CDS_boundary,non_CDS_boundary,SRS_boundary,mutable_PAM,cut_site
0,8610891,8610913,2R,-,CCTTCAGCCACTCGAAGTTTTTG,FBgn0267792,FBtr0088692,2R,stop_codon,8610886,8610888,+,AATATTTTTCTTTTAAAATGAAACTTTCCCAAACTTTTAGTTTTTT...,CGCTTCACCGATCACATGAACTCCCACTGGGCGATCCGCAAGCACC...,AACCTTCAGCCACTCGAAGTTTTTGCTCTGCACTTTATAATATTTA...,25,HAL,HAR,False,False,False,True,False,"[-45, -44, -43, -42, -41, -40, -39, -38, -37, ...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[-19, -18, -17, -16, -15, -14]","[-22, -21, -20, -19, -18, -17, -16, -15, -14, ...",8


HAR


'AACCTTCAGCCACTCGAAGTTTTTGCTCTGCACTTTATAATATTTACAAAGCAATCGTAGGGCGTTAGATGTGTGTCCGTGTACTTAAAGTTATTTTGCCGTCTAGCATATACCAATCATATCGTTAATCTAGCACTAATGTAAGTAATGTATTTATATGCTATTTAAGAATGTCTGATAAATGGCTACAAGTGTTGCCTTTTGCGTTCTGTAAGTCTTATCGTC'

'AAACTTCAGCCACTCGAAGTTTTTGCTCTGCACTTTATAATATTTACAAAGCAATCGTAGGGCGTTAGATGTGTGTCCGTGTACTTAAAGTTATTTTGCCGTCTAGCATATACCAATCATATCGTTAATCTAGCACTAATGTAAGTAATGTATTTATATGCTATTTAAGAATGTCTGATAAATGGCTACAAGTGTTGCCTTTTGCGTTCTGTAAGTCTTATCGTC'

'CCTTCAGCCACTCGAAGTTTTTG'