In [29]:
#Example params:
import pandas as pd

tfsdict = {
        'Gene_ID': 'FBgn0004652',
        'Transcript_ID': 'FBtr0083651', 
        'Chromosome': '3R', 
        'Gene_Region': 'stop_codon', 
        'Start': 18426145, 
        'Stop': 18426147, 
        'Strand': '-', 
        'Reference_Seq': 'TTGATCGTAGGACAC',
        'upstreamHA': 'CTTGATCGTAGGACACCTGCACAG',
        'downstreamHA': 'CCTTGATCGTAGGACACCTGCACA',
        "start/stop":"start",#is it N or C termini -> do we need to look at start or stop codon for teh cut 
        'genome_start_codon_pos':400, 
        'genome_stop_codon_pos':700, # or only 1 of those 
        'strand_type':'+',
        "sgRNA_list_positions":[[401,425],[456,467],[478,489],[395,415]],#those wil be as genome positions -assumptions - the coordinates correspond to the 1st and last bp of the strand to which the gsRNA will be complementary to
        "sgRNA_list_values":["AAGCGACTA","AAAAAAAATAAAAA","ATATATTTTTTTTTTAAAAA","AGCGCGAAATAATA"],
        "sgRNA_strand" = ['-', '+', '-']
}

tfsDF = pd.DataFrame.to_dict(tfsdict)

tfsDF = {'Gene_ID': 'FBgn0004652',
                    'Transcript_ID': 'FBtr0083651', 
                    'Chromosome': '3R', 
                    'Gene_Region': 'stop_codon', 
                    'Start': 18426145, 
                    'Stop': 18426147, 
                    'Strand': '-', 
                    'Reference_Seq': 'TTGATCGTAGGACACCTGCACAGATGCTTGATCGTAGGACACCTGCACAGATGCCTTGATCGTAGGACACCTGCACAGATGCTTGATCGTAGGACACCTGCACAGATGCTTGATCGTAGGACACCTGCACAGATGC', 
                    'upstreamHA': 'CTTGATCGTAGGACACCTGCACAG',
                    'downstreamHA': 'CCTTGATCGTAGGACACCTGCACA'
                }
        


SyntaxError: invalid syntax (2689803032.py, line 13)

In [79]:
def revComp(inputSeq):
  """
  This function takes an input sequence and returns the reverse complement.

  Input: inputSeq in str format
  Output: revComp in str format

  """
  complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}
  
  revComp = ""
  for base in inputSeq[::-1]:
    revComp += complement[(base.upper())]

  return revComp

In [68]:
def sgRNACataloguer(df):
    """
    Given filtered sgRNAs for a start/stop site in the 'df' format, will create a 'catalogue' dataframe containing 
    information about the last G position and whether the sgRNAs in the list meet certain conditions.

    params: df: a dataframe for one start/stop site of the format: 
            df= {"start/stop":"start", #is it N or C termini -> do we need to look at start or stop codon for teh cut 
            'genome_start_codon_pos':403, 
            'genome_stop_codon_pos':406, # only one of these - the other is n/a
            'strand_type':'+',
            "sgRNA_list_positions":[[401,425],[403, 427]],#those wil be as genome positions -assumptions - the coordinates correspond to the 1st and last bp of the strand to which the gsRNA will be complementary to
            "sgRNA_list_values":["AAGCGACTA", "CCTGTAA"],
            "sgRNA_strand" : ['-', '+']}
    
    output: dataframe with columns "start/stop", "genome_start_codon_pos", "genome_stop_codon_pos",
        "strand_type", "sgRNA_list_values", "sgRNA_strand", "sgRNA_fmax", "positionScore", "PAM_in_start/stop",
        "<15_bp3’_overhang", "PAM_in_CDS", "PAM_outside_CDS", "Cutsite_in_CDS", "relative_position_score", "sgRNA_mutable_bases"
    
    """
    import pandas as pd

    #Making 'df' into an sgRNA catalogue dataframe. This makes it easier to subset sgRNAs by condition.
    sgRNACatalogue = pd.DataFrame.from_dict(df)
    #keep only fmax to simplify indexing later
    for ind, sgRNA in sgRNACatalogue.iterrows():
        sgRNACatalogue.at[ind, "sgRNA_fmax"] = sgRNACatalogue["sgRNA_list_positions"][ind][1] #split to fmax and fmin, delete []
    sgRNACatalogue = sgRNACatalogue.drop(columns=["sgRNA_list_positions"]) #delete the old position value to simplify dataframe (optional)
    sgRNACatalogue["sgRNA_fmax"] = sgRNACatalogue["sgRNA_fmax"].astype(int) #convert to int

    #Adding position scores (fmax - stop)
    sgRNACatalogue["positionScore"] = sgRNACatalogue["sgRNA_fmax"] - sgRNACatalogue["genome_stop_codon_pos"]
                                                                         
    #Dataframe containing parameter ranges to interpret the positon score, based on gene strand, sgRNA strand, and start/stop
    positionScoreParameters = pd.read_excel("inputfiles/fmaxStopScore.xlsx")

    #Per parameter, append the sgRNACatalogue with a TRUE/FALSE value per condition.
    #Also translate the column of the positionScoreParameters df indicating how many sgRNA bases within 6bp of PAM ccan be mutated inside CDS.
    #Lastly, translate positionScore into the relative position of the last 'G' or 'C' of PAM.
    booleanColumns = ["PAM_in_start/stop", "<15_bp3’_overhang", "PAM_in_CDS", "PAM_outside_CDS", "Cutsite_in_CDS"]
    positionColumns = ["relative_position_score", "sgRNA_mutable_bases"]
    sgRNACatalogue = sgRNACatalogue.reindex(columns = sgRNACatalogue.columns.tolist() + booleanColumns + positionColumns)
    
    for ind, sgRNA in sgRNACatalogue.iterrows():

        #Extract the appropriate parameter row per sgRNA
        conditions = positionScoreParameters.loc[(positionScoreParameters["start/stop"] == sgRNA['start/stop']) & (positionScoreParameters["strand_type"] == sgRNA['strand_type']) & (positionScoreParameters["sgRNA_strand"] == sgRNA['sgRNA_strand'])]
        conditions = conditions.reset_index(drop = True)

        #Per column, input true/false as to whether the position score meets that condition
        for col in booleanColumns:
            colValue = conditions.at[0,col] #extract parameter range values from dataframe
            #Process the value into a range (in format list [min, max])
            #If the values should be 'more than' or 'less than', 25 is used as a max or -25 as min because distances cannot be more than 20
            if ">" in colValue: #could simplify this further by just defining all as ranges in excel
                minMax = [int(colValue[1:]), 25]
            elif "<" in colValue:
                minMax = [-25,int(colValue[1:])]
            elif ":" in colValue:
                min, max = colValue.split(":")
                minMax = [int(min), int(max)]
            else:
                print("Incorrect format of range value. Verify inputs.")
            
            #Into the output dataframe, print true/false as to whether the positionScore has met the condition for that column
            sgRNACatalogue.at[ind, col] = bool(sgRNACatalogue.at[ind, "positionScore"] in range(minMax[0], minMax[1]))
       
        #Calculate PAM last G or C position using positionScore. Add this to the sgRNA catalogue.
        sgRNACatalogue.at[ind, "relative_position_score"] = sgRNACatalogue.at[ind, "positionScore"] + int(conditions.at[0, "relative_position_score"])

        #Calculate how many bases of the sgRNA recognition site are in the CDS and can therefore be mutated.
        #This is given in the column "sgRNA_mutable_bases", in a list of the format [x,y,z,...] indicating the position score for which 1-6 bases can be mutated.
        #Accounting for the case where there is more than one positionScore for which 6bp can be mutated, this translates to:
        mutateBases = [1,2,3,4,5,6,6,6,6,6,6,6,6,6,6,6,6]
        if str(sgRNACatalogue.at[ind, "positionScore"]) in conditions.at[0, "sgRNA_mutable_bases"]:
            index = conditions.at[0, "sgRNA_mutable_bases"].index(str(sgRNACatalogue.at[ind, "positionScore"]))
            sgRNACatalogue.at[ind, "sgRNA_mutable_bases"] = str(mutateBases[index])
        else:
            sgRNACatalogue.at[ind, "sgRNA_mutable_bases"] = False
    
    sgRNACatalogue["relative_position_score"] = sgRNACatalogue["relative_position_score"].astype(int) #convert to int

    return sgRNACatalogue

In [75]:
sgRNACatalogue = sgRNACataloguer(df)
display(sgRNACatalogue)


Unnamed: 0,start/stop,genome_start_codon_pos,genome_stop_codon_pos,strand_type,sgRNA_list_values,sgRNA_strand,sgRNA_fmax,positionScore,PAM_in_start/stop,<15_bp3’_overhang,PAM_in_CDS,PAM_outside_CDS,Cutsite_in_CDS,relative_position_score,sgRNA_mutable_bases
0,start,403,406,+,AAGCGACTA,-,425,19,True,True,False,False,True,19,6
1,start,403,406,+,CCTGTAA,+,427,21,False,False,True,False,True,44,False


In [77]:
def checkCDSCutandOrder(sgRNACatalogue):
    """
    Given an sgRNACatalogue or subset, will calculate firstly rows where the sgRNA cuts inside CDS. If multiple, selects that which cuts closest to start/stop.
    If none, selects closest cut sgRNA that is outside CDS.
    
    """
    #C. Check cut site in CDS
    conditionC = sgRNACatalogue[sgRNACatalogue["Cutsite_in_CDS"] == True]
    conditionCclosestCut = conditionC.sort_values('positionScore') #This value is the absolute value of positionScore, so indicates those that cut closest to start/stop

    if len(conditionCclosestCut) > 0: #cuts in CDS, closest cut (C1, C2)
        conditionCclosestCut = conditionCclosestCut.reset_index(drop = True) #reset index
        winnersgRNA = conditionCclosestCut.at[0, "sgRNA_sequence"]
        winnerFound = True
    else: #no sgRNAs cut in CDS, select closest that still met condition B (C3)
        nonCDSclosestCut = sgRNACatalogue.sort_values('PAMRelativeEnd')
        winnersgRNA = nonCDSclosestCut.at[0, "sgRNA_sequence"]
        winnerFound = True

    return winnersgRNA, winnerFound

def find_best_gRNA(df):
    """
    params: df={
            "start/stop":"start",#is it N or C termini -> do we need to look at start or stop codon for teh cut 
            'genome_start_codon_pos':400, 
            'genome_stop_codon_pos':700, # or only 1 of those 
            'strand_type':'+',
            "sgRNA_list_positions":[[401,425],[456,467],[478,489],[395,415]],#those wil be as genome positions -assumptions - the coordinates correspond to the 1st and last bp of the strand to which the gsRNA will be complementary to
            "sgRNA_list_values":["AAGCGACTA","AAAAAAAATAAAAA","ATATATTTTTTTTTTAAAAA","AGCGCGAAATAATA"]
        }
    
    """
    #Score the sgRNAs for this site
    sgRNACatalogue = sgRNACataloguer(df)

    #Set up the winning guide RNA
    winnerFound = False
    mutationNeeded = False
    winnersgRNA = "" #Could update this to be more information about the guide e.g. strand, position

    #If there is no guide RNAs at this site, flag this. The resulting sgRNA will be an empty string.
    if len(sgRNACatalogue) == 0:
        print(f"No guide RNAs were found for the start/stop site beginning at position {df['genome_start_codon_pos']}")
        winnerFound = True #To avoid looping through other conditions if there are no sgRNAs

    #A. Ideal condition - PAM in start/stop
    if winnerFound == False:
        conditionA = sgRNACatalogue[sgRNACatalogue["PAM_in_start/stop"] == True] #This is the subset df for which PAM is in the start/stop
        if len(conditionA) > 0: #if there is one or more sgRNAs for this condition, select the first as the winner
            conditionA = conditionA.reset_index(drop = True) #reset index
            winnersgRNA = conditionA.at[0, "sgRNA_list_values"]
            winnerFound = True

    #B. sgRNA overhang is less than 15bp
    if winnerFound == False:
        conditionB = sgRNACatalogue[sgRNACatalogue["<15_bp3’_overhang"] == True]
        if len(conditionB) == 1:
            winnersgRNA = conditionB
            winnerFound = True
        elif len(conditionB) > 1: #more than one, select in CDS preferencially, and closest cut
            winnersgRNA, winnerFound = checkCDSCutandOrder(conditionB)    

    #D. sgRNA overhang is more than 15bp, need to mutate
    if winnerFound == False:
        if len(sgRNACatalogue) == 1:
            winnersgRNA = sgRNACatalogue.at[0, "sgRNA_list_values"]
        else: #more than 1 sgRNA
            winnersgRNA, winnerFound = checkCDSCutandOrder(sgRNACatalogue)            

        mutationNeeded = True

    return winnersgRNA, mutationNeeded, sgRNACatalogue[sgRNACatalogue["sgRNA_list_values"] == winnersgRNA] #could adapt return to be just the true/false for mutation and the catalogue
    #output dictionary with one set of values

In [78]:
winnersgRNA, mutationNeeded, sgRNACatalogue = find_best_gRNA(df)

('AAGCGACTA',
 False,
   start/stop  genome_start_codon_pos  genome_stop_codon_pos strand_type  \
 0      start                     403                    406           +   
 
   sgRNA_list_values sgRNA_strand  sgRNA_fmax  positionScore PAM_in_start/stop  \
 0         AAGCGACTA            -         425             19              True   
 
   <15_bp3’_overhang PAM_in_CDS PAM_outside_CDS Cutsite_in_CDS  \
 0              True      False           False           True   
 
    relative_position_score sgRNA_mutable_bases  
 0                       19                   6  )

In [None]:
#Before mutating, iterate through stringency files

#This should be specified in the runner. Also in the runner, pass winner sgRNA catalogue to mutator only where mutationNeeded == True.
#Make sure to re-index the dataframe so that the one row can be indexed as '0'.

In [104]:
def codonFragmenter(winner_sgRNACatalogue, tfsDF):
    """
    For the start/stop site, will create a list of codons in the appropriate range where sgRNAs might be found (start/stop ±20 bp).
    If gene is on - strand, these will be revComp codons.
    """
    
    #Extract HA sequences - just the 21bp near start/stop
    HAL = tfsDF["upstreamHA"][-21:]
    HAR = tfsDF["downstreamHA"][0:21]

    #Start codon list
    codonList = []

    #Add HAL codons
    for codonBase1 in range(0, len(HAL), 3):
        codonList.append(HAL[codonBase1:codonBase1+3])

    #add start/stop - this is added as 'ATG' (even if stop) but will never be mutated.
    codonList.append('ATG')

    #Add HAR codons
    for codonBase1 in range(0, len(HAR), 3):
        codonList.append(HAR[codonBase1:codonBase1+3])
    
    #Define gene strand:
    if winner_sgRNACatalogue["strand_type"] == '-': #If on the minus strand, take revComp per codon
        for ind, codon in enumerate(codonList):
            codonList[ind] = revComp(codon)

    return codonList

def codonReverseFragmenter(mutableCodons, tfsDF, winner_sgRNACatalogue):
    """
    """
    HAL = ""
    HAR = ""

    #return to + strand if the gene is on -
    if winner_sgRNACatalogue["strand_type"] == '-': #If on the minus strand, take revComp per codon
        for ind, codon in enumerate(mutableCodons):
            mutableCodons[ind] = revComp(codon)
    
    #replace HAL
    for codon in range (0, 7):
        HAL += codon

    #replace HAR
    for codon in range (8, 15):
        HAR += codon

    #check this indexing
    tfsDF["upstreamHA"] = tfsDF["upstreamHA"][:-21] + HAL
    tfsDF["downstreamHA"] = HAR + tfsDF["downstreamHA"][21:]

In [74]:
def mutator(winner_sgRNACatalogue, tfsDF):
    """
    In the case where a fragment or primer needs to be mutated, will mutate in CDS (preferably PAM, if not in the sgRNA). 
    If not possible, will mutate PAM outside of CDS to NTG/CTN.
    
    params:
        sequenceToMutate: the homology arm or primer fragment that will be mutated.
        winner_sgRNACatalogue: sgRNACatalogue in format as above, with only the row for the winner sgRNA selected.
    """

    import pandas as pd

    #Mutated starts as false
    mutated = False

    #Codon fragmenter - codons are now in mutable format in a list, from leftmost of HAL to right-most of HAR. RevComp if gene is -
    mutableCodons = codonFragmenter(winner_sgRNACatalogue, tfsDF)

    ## Coordinate information per codon
    if winner_sgRNACatalogue.at[0, "strand_type"] == '+':
        codonCoordinates = pd.read_excel("inputfiles/fmaxStopScore.xlsx", sheet_name="CodonCoordinatePlus", index_col= 0)
    elif winner_sgRNACatalogue.at[0, "strand_type"] == '-':
        codonCoordinates = pd.read_excel("inputfiles/fmaxStopScore.xlsx", sheet_name="CodonCoordinateMinus", index_col= 0)

    #PAM last G coordinate
    PAMcoordinate = winner_sgRNACatalogue.at[0, 'relative_position_score']

    #Direction to move away from PAM
    if winner_sgRNACatalogue.at[0, 'sgRNA_strand'] == '+':
        direction = 'decreasing'
    else:
        direction = 'increasing'

    #Coordinates for bases: PAM1, PAM2, PAM3, sgRNA1, sgRNA2, sgRNA3, sgRNA4, sgRNA5, sgRNA6
    mutationCoordinates = []
    if direction == 'decreasing':
        for shift in range(0, 8):
            mutationCoordinates.append(PAMcoordinate - shift)
    elif direction == 'increasing':
        for shift in range(0, 8):
            mutationCoordinates.append(PAMcoordinate + shift)

    #A. Mutate PAM in CDS
    if sgRNACatalogue["PAM_in_CDS"] == True:
        mutationIndex = 0
        while mutated == False:
            PAMcoordinate = mutationCoordinates[mutationIndex]
            codon = mutableCodons[codonCoordinates.at[PAMcoordinate, 'codon']] #This is the codon that this base is in
            base = codonCoordinates.at[PAMcoordinate, 'base']
            potentialCodons = find_synonymous_codons(querycodon = codon, baseToChange = base) #base is the base number within the codon, 1-3
            if potentialCodons > 0:
                mutableCodons[codon] = potentialCodons[0]
                mutated = True
            else:
                mutationIndex +=1
            if mutationIndex == 3:
                break
    
    #B. Mutate sgRNA in CDS
    #Check if mutation was successful in A, then check there are mutable bases of the sgRNA in CDS 
    if mutated == False and sgRNACatalogue["PAM_in_CDS"] != False:
        mutate = sgRNACatalogue["sgRNA_mutable_bases"] #This is the number of bases (1-6) of the sgRNA that we can mutate
        sgMutatedBases = 0
        baseIndex = 3 #this is the first index in mutationCoordinates that is part of the sgRNA rather than PAM
        moveIndex = 0 #this is to keep track of how many bases we try to mutate, as this should be capped at 'mutate' distance
        while moveIndex < mutate: #mutate a max of 2 bases, and can only cover bases within the 'mutate' range
            coordinate = mutableCodons[baseIndex + moveIndex]
            codon = mutableCodons[codonCoordinates.at[coordinate, 'codon']] #This is the codon that this base is in
            base = codonCoordinates.at[coordinate, 'base']
            potentialCodons = find_synonymous_codons(querycodon = codon, baseToChange = base) #base is the base number within the codon, 1-3
            if potentialCodons > 0: #if mutation is possible, accept this mutation
                mutableCodons[codon] = potentialCodons[0]
                sgMutatedBases += 1 #add 1 to the mutated bases count
            moveIndex += 1 #move to the next base
            if sgMutatedBases == 2: #if we've made two mutations, set mutation check to true and exit loop
                mutated == True
                break

    #Mutate PAM outside CDS - to NTG or CTN
    if mutated == False:
        PAM2pos = mutationCoordinates[1] #this is the relative position of the second G/C of PAM
        codon = mutableCodons[codonCoordinates.at[PAM2pos, 'codon']] #This is the codon that this base is in
        base = codonCoordinates.at[PAMcoordinate, 'base']
        newCodon = codon[:base-1] + 'T' + codon[base:] #add a 'T' where the old base was 
        mutableCodons[PAM2pos] = newCodon

    #Replace the potentially mutated homology arms
    mutableCodons = codonReverseFragmenter(mutableCodons, tfsDF, winner_sgRNACatalogue)

    #Indicate if mutation has occurred
    tfsDF['mutated'] = mutated

    return tfsDF


SyntaxError: unmatched ')' (203083978.py, line 1)

In [110]:
import pandas as pd
codonCoordinates = pd.read_excel("inputfiles/fmaxStopScore.xlsx", sheet_name="CodonCoordinateMinus", index_col= 0)
display(codonCoordinates)


Unnamed: 0_level_0,Codon,Base
Relative_coordinate_in_mutable_area,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0,2
1,0,1
2,1,3
3,1,2
4,1,1
5,2,3
6,2,2
7,2,1
8,3,3
9,3,2


In [23]:
def find_synonymous_codons(query_codon, base_to_change, codon_table_excel = "inputfiles/codon_table.xlsx"):

    '''
    Uses the amino acids table to select codons that encode for the same amino acid as the query codon. Will mutate at the specified base only.

    Params:
        query_codon: string, codon to select synonymous codons for
        base_to_change: the base within the codon that needs to change. Value from 1-3.
        codon_table_excel: string, path to an excel file that lists per codon which amino acid that codon encodes.
    
    Returns:
        synonymous_codons: list of strings, each string is a codon that encodes for the same amino acid as the query codon.
    
    '''

    import pandas as pd

    synonymous_codons = []

    codon_table = pd.read_excel(codon_table_excel)

    #Extract amino acid given codon
    amino_acid_query = codon_table[codon_table['codon'] == query_codon].iloc[0]["amino_acid"]

    #Subset df for other rows corresponding to this amino acid
    same_aa_df=codon_table[codon_table["amino_acid"] == amino_acid_query]

    #Convert the codons to a list
    codon_list = same_aa_df["codon"].values.tolist()

    #Keep only codons where the indicated base has changed
    for codon in codon_list:
        if codon[base_to_change-1] == query_codon[base_to_change-1]:
            codon_list.remove(codon)

    return codon_list


In [None]:
def sgRNACataloguer(df):
    """
    Given filtered sgRNAs for a start/stop site in the 'df' format, will create a 'catalogue' dataframe containing 
    information about the last G position and whether the sgRNAs in the list meet certain conditions.

    params: df: a dataframe for one start/stop site of the format: 
            df= {"start/stop":"start", #is it N or C termini -> do we need to look at start or stop codon for teh cut 
            'genome_start_codon_pos':403, 
            'genome_stop_codon_pos':406, # only one of these - the other is n/a
            'strand_type':'+',
            "sgRNA_list_positions":[[401,425],[403, 427]],#those wil be as genome positions -assumptions - the coordinates correspond to the 1st and last bp of the strand to which the gsRNA will be complementary to
            "sgRNA_list_values":["AAGCGACTA", "CCTGTAA"],
            "sgRNA_strand" : ['-', '+']}
    
    output: dataframe with columns "start/stop", "genome_start_codon_pos", "genome_stop_codon_pos",
        "strand_type", "sgRNA_list_values", "sgRNA_strand", "sgRNA_fmax", "positionScore", "PAM_in_start/stop",
        "<15_bp3’_overhang", "PAM_in_CDS", "PAM_outside_CDS", "Cutsite_in_CDS", "relative_position_score", "sgRNA_mutable_bases"
    
    """
    import pandas as pd

    #Making 'df' into an sgRNA catalogue dataframe. This makes it easier to subset sgRNAs by condition.
    sgRNACatalogue = pd.DataFrame.from_dict(df)
    #keep only fmax to simplify indexing later
    for ind, sgRNA in sgRNACatalogue.iterrows():
        sgRNACatalogue.at[ind, "sgRNA_fmax"] = sgRNACatalogue["sgRNA_list_positions"][ind][1] #split to fmax and fmin, delete []
    sgRNACatalogue = sgRNACatalogue.drop(columns=["sgRNA_list_positions"]) #delete the old position value to simplify dataframe (optional)
    sgRNACatalogue["sgRNA_fmax"] = sgRNACatalogue["sgRNA_fmax"].astype(int) #convert to int

    #Adding position scores (fmax - stop)
    sgRNACatalogue["positionScore"] = sgRNACatalogue["sgRNA_fmax"] - sgRNACatalogue["genome_stop_codon_pos"]
                                                                         
    #Dataframe containing parameter ranges to interpret the positon score, based on gene strand, sgRNA strand, and start/stop
    positionScoreParameters = pd.read_excel("inputfiles/fmaxStopScore.xlsx")

    #Per parameter, append the sgRNACatalogue with a TRUE/FALSE value per condition.
    #Also translate the column of the positionScoreParameters df indicating how many sgRNA bases within 6bp of PAM ccan be mutated inside CDS.
    #Lastly, translate positionScore into the relative position of the last 'G' or 'C' of PAM.
    booleanColumns = ["PAM_in_start/stop", "<15_bp3’_overhang", "PAM_in_CDS", "PAM_outside_CDS", "Cutsite_in_CDS"]
    positionColumns = ["relative_position_score", "sgRNA_mutable_bases"]
    sgRNACatalogue = sgRNACatalogue.reindex(columns = sgRNACatalogue.columns.tolist() + booleanColumns + positionColumns)
    
    for ind, sgRNA in sgRNACatalogue.iterrows():

        #Extract the appropriate parameter row per sgRNA
        conditions = positionScoreParameters.loc[(positionScoreParameters["start/stop"] == sgRNA['start/stop']) & (positionScoreParameters["strand_type"] == sgRNA['strand_type']) & (positionScoreParameters["sgRNA_strand"] == sgRNA['sgRNA_strand'])]
        conditions = conditions.reset_index(drop = True)

        #Per column, input true/false as to whether the position score meets that condition
        for col in booleanColumns:
            colValue = conditions.at[0,col] #extract parameter range values from dataframe
            #Process the value into a range (in format list [min, max])
            #If the values should be 'more than' or 'less than', 25 is used as a max or -25 as min because distances cannot be more than 20
            if ">" in colValue: #could simplify this further by just defining all as ranges in excel
                minMax = [int(colValue[1:]), 25]
            elif "<" in colValue:
                minMax = [-25,int(colValue[1:])]
            elif ":" in colValue:
                min, max = colValue.split(":")
                minMax = [int(min), int(max)]
            else:
                print("Incorrect format of range value. Verify inputs.")
            
            #Into the output dataframe, print true/false as to whether the positionScore has met the condition for that column
            sgRNACatalogue.at[ind, col] = bool(sgRNACatalogue.at[ind, "positionScore"] in range(minMax[0], minMax[1]))
       
        #Calculate PAM last G or C position using positionScore. Add this to the sgRNA catalogue.
        sgRNACatalogue.at[ind, "relative_position_score"] = sgRNACatalogue.at[ind, "positionScore"] + int(conditions.at[0, "relative_position_score"])

        #Calculate how many bases of the sgRNA recognition site are in the CDS and can therefore be mutated.
        #This is given in the column "sgRNA_mutable_bases", in a list of the format [x,y,z,...] indicating the position score for which 1-6 bases can be mutated.
        #Accounting for the case where there is more than one positionScore for which 6bp can be mutated, this translates to:
        mutateBases = [1,2,3,4,5,6,6,6,6,6,6,6,6,6,6,6,6]
        if str(sgRNACatalogue.at[ind, "positionScore"]) in conditions.at[0, "sgRNA_mutable_bases"]:
            index = conditions.at[0, "sgRNA_mutable_bases"].index(str(sgRNACatalogue.at[ind, "positionScore"]))
            sgRNACatalogue.at[ind, "sgRNA_mutable_bases"] = str(mutateBases[index])
        else:
            sgRNACatalogue.at[ind, "sgRNA_mutable_bases"] = False
    
    sgRNACatalogue["relative_position_score"] = sgRNACatalogue["relative_position_score"].astype(int) #convert to int

    return sgRNACatalogue

In [2]:
a = ["ATG", "ATA"]

if (x[:-1] for x in a.any()) == "A":
    print("A")
    #a.remove[x]



AttributeError: 'list' object has no attribute 'any'

In [4]:
codon = "ATA"
a.remove(codon)


In [5]:
print(a)


['ATG']


In [None]:
#Free variables

#refSeqPerChromosome
from Bio import SeqIO

refSeqPerChromosome = {}

for seq in SeqIO.parse(open("inputfiles/dmel-all-chromosome-r6.48.fasta"), 'fasta'):
    refSeqPerChromosome[seq.id] = seq.seq 

#TFsdf
TFsdf = make_dataframe_from_TFs_list("inputfiles/TFs.xlsx", refSeqPerChromosome)
display(TFsdf)

"""
#singletfsDF = pandas series for one row extracted from tfsDF
for index, row in TFsdf.iterrows():
    if index == 0:
        singletfsDF = row
        print(singletfsDF)
        
#gRNAFileAnnotation = dataframe of gRNAs as extracted from original gff file
import pandas as pd
gRNAFileAnnotation = pd.read_csv("inputfiles/Hu.2019.8.28.sgRNA_designs/1to3NonCdsOffTarget_low_stringency.gff", sep = "\t", index_col = False)
gRNAFileAnnotation = gRNAFileAnnotation.assign(target_site_variation= "")
index = 0 #TODO@Marina improve this code
for attribute in gRNAFileAnnotation['attributes']:
    fullatt = (attribute).split(";")
    gRNAFileAnnotation.at[index,"target_site_variation"] = fullatt[8]
    index+=1

print(type(gRNAFileAnnotation))

#GenomeCoordinatesFiltered = filtered gRNAs for 20bp region, no target site variation, and with start/stop info from tfsDF
GenomeCoordinatesFiltered = gRNAFileAnnotation[(gRNAFileAnnotation['target_site_variation'] == "target_site_variation=none observed") & 
                                                (gRNAFileAnnotation['#chr'] == singletfsDF["Chromosome"]) & 
                                                (gRNAFileAnnotation['fmin']-1 >= singletfsDF["Start"] - 20)]
for (columnName, columnData) in singletfsDF.items():
       GenomeCoordinatesFiltered = GenomeCoordinatesFiltered.assign(**{columnName: columnData})
display(GenomeCoordinatesFiltered)
"""
#filteredgRNAs - theoretical sgRNAs for one site
filteredgRNAs = pd.read_excel("inputfiles/mockMaterials/filtered_gRNAs.xlsx", index_col= 0)

display(filteredgRNAs)

sgRNAdf = sgRNApositionCheck(filteredgRNAs)

display(sgRNAdf)

In [None]:
def workingsgRNArunner(refSeqPerChromosome = refSeqPerChrom, TFsdf = TFsDF):
    """
    Check functions so far work with the proposed changes.
    """
    import pandas as pd
    
    #set up the output DF - this is the winning sgRNA per site in TFsdf
    TFsdfWinnersandMutated = pd.DataFrame(columns=["fmin", "fmax", "#chr", "strand", "sgRNA_sequence", "Gene_ID",
                                                   "Transcript_ID", "Chromosome", "Gene_Region", "Start", "Stop",
                                                   "Strand", "Reference_Seq", "upstreamHA", "downstreamHA", "positionScore",
                                                    "PAM_in_start/stop", "<15_bp3’_overhang", "PAM_in_CDS", "PAM_outside_CDS",
                                                    "CDS_boundary", "lastG", "cutSite", "mutated"])
    
    #Per row of this dataframe, select a guideRNA
    for ind, row in TFsdf.iterrows():
        print(f"Selecting guide RNA for TFsdf row {ind}")

        filtered_sgRNA = gRNA_stringencyIterator(row, refSeqPerChromosome)

        #Score the sgRNAs for this site
        sgRNAdf = sgRNApositionCheck(filtered_sgRNA)

        #Add a column to establish whether mutation has occurred. This will be set to 'True' in the mutator function and starts as False by default.
        sgRNAdf['mutated'] = False

        #If no sgRNAs were found at any stringency
        if len(sgRNAdf) == 0:
            print(f"No sgRNAs found at all stringencies for {ind}.")
            print(sgRNAdf)

            #Just input the information about this site into the final DF. The columns about the guideRNA will be filled with 'NaN', indicating no guideRNA could be found.
            for col in list(row.index):
                TFsdfWinnersandMutated.at[ind, col] = row[col]
            display(TFsdfWinnersandMutated)
        else: #If we have at least one sgRNA
            #Select winner
            winnersgRNA, mutationNeeded = find_best_gRNA(sgRNAdf) #winnersgRNA is a string of the winning sequence, mutationNeeded is a bool variable
            winnerdf = sgRNAdf[sgRNAdf["sgRNA_sequence"] == winnersgRNA] #This is the dataframe row for the winning sgRNA from the original sgRNA dataframe
            winnerdf = winnerdf.loc[0] #This is the pandas series for the same information
            display(winnerdf)
            if mutationNeeded == True: #run mutator if indicated
                winnerdf = mutator(winnerdf) #this will mutate HAL or HAR as needed and return the original DF with mutated sequences, and the mutated column set to True
        
            #Add the winning sgRNA into the output df for this TF start/stop site
            TFsdfWinnersandMutated.loc[ind] = winnerdf
            display(TFsdfWinnersandMutated)
    
        #Return dataframe as an excel file (This should be unindented one, but while testing I'd like to see the output file updated after each row of the TFsdf)
        TFsdfWinnersandMutated.to_excel("outputFiles/winningsgRNAs.xlsx")

    return TFsdfWinnersandMutated
