In [None]:
# default_exp core

# core

> Core functions for analyzing CRISPR base editor screens

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
#export
import re
import numpy as np

def get_most_severe_mutation_type(mut_types):
    """
    Annotate the most severe mutation type of the
    predicted mutation types for a guide sequence

    Severity order (most to least severe):
    Nonsense > Splice site > Missense > Intron > Silent > UTR > Flank

    mut_types: string | predicted mutation categories for one guide; None or a float (e.g. NaN) means the value is missing
    returns: string | most severe mutation bin, 'No edits' for a missing value, or None if no known category is present
    """
    # Missing cells arrive as float NaN (or None) when read through pandas.
    # isinstance also accepts float subclasses such as numpy.float64, which an
    # exact type comparison would let through to the `in` checks below.
    if mut_types is None or isinstance(mut_types, float):
        return 'No edits'

    # (bin label, keywords that map to that bin), ordered from most to least severe
    severity_order = (
        ('Nonsense', ('Nonsense',)),
        ('Splice site', ('Splice-acceptor', 'Splice-donor')),
        ('Missense', ('Missense',)),
        ('Intron', ('Intron',)),
        ('Silent', ('Silent',)),
        ('UTR', ('UTR',)),
        ('Flank', ('Flank',)),
    )
    for mutation_bin, keywords in severity_order:
        if any(keyword in mut_types for keyword in keywords):
            return mutation_bin

    # No recognized category: keep returning None, as callers may rely on it
    return None
    
def get_residues(predicted_aa_edits):
    """
    Get the residue positions of all amino acid
    predicted edits for a guide sequence

    predicted_aa_edits: string | predicted amino acid edits separated by ';' or ',' (e.g. 'Glu7Lys, Arg8Lys'); None or a float (e.g. NaN) means the value is missing
    returns: string | one entry per edit, each followed by ';': the residue number, or 'non-coding' for '(NC)' edits; '' if there are no edits
    """
    # Missing cells arrive as float NaN (or None) when read through pandas
    if predicted_aa_edits is None or isinstance(predicted_aa_edits, float):
        return ''

    entries = []
    for edit in re.split(';|,', predicted_aa_edits):
        edit = edit.strip()
        if edit == '':
            # Empty fragments come from doubled or trailing separators
            continue
        if edit == '(NC)':
            entries.append('non-coding')
            continue
        # The residue position is the first run of digits in the edit (e.g. 'Glu7Lys' -> '7')
        position = re.search(r'\d+', edit)
        if position is not None:
            entries.append(position.group())
        # Tokens without any digits carry no residue position and are skipped

    # Every entry is terminated by ';' (including the last one)
    return ''.join(entry + ';' for entry in entries)

def get_median_residues(mutation_bin, predicted_aa_edits):
    """
    Median residue position of a guide's predicted
    amino acid edits

    mutation_bin: string | most severe mutation bin of the guide
    predicted_aa_edits: string | predicted amino acid edits of the guide
    returns: float | median residue position; NaN unless the bin is Missense, Nonsense or Silent and at least one residue position is present
    """
    tokens = get_residues(predicted_aa_edits).split(';')

    # Only coding-sequence bins have a meaningful residue position
    if mutation_bin not in ('Missense', 'Nonsense', 'Silent'):
        return np.nan

    # Drop the empty trailing token and any non-coding placeholders
    positions = [int(tok) for tok in tokens if tok != '' and tok != 'non-coding']
    return np.median(positions) if positions else np.nan

To demonstrate the use of these functions, we will first load a pre-designed base editor tiling library with guides tiling the transcript ENST00000380152 of BRCA2. These guides are annotated with predicted edits using the C>T base editor with an editing window of nucleotides 4-8.

In [None]:
import pandas as pd

# Load the tab-separated guide design table from a relative path
design_df = pd.read_csv('sample_input/crisprbe-guides.txt', sep='\t')
# Preview the first rows of the table
design_df.head()

Unnamed: 0,Input,CRISPR Enzyme,Edit Type,Edit Window,Target Assembly,Target Genome Sequence,Target Gene ID,Target Gene Symbol,Target Gene Strand,Target Transcript ID,...,PAM Sequence,sgRNA Target Sequence Start Pos. (global),sgRNA Orientation,Nucleotide Edits (global),Guide Edits,Nucleotide Edits,Amino Acid Edits,Mutation Category,Constraint Violations,Note
0,ENST00000380152,SpyoCas9,C-T,4..8,GRCh38 (9606),NC_000013.11,ENSG00000139618,BRCA2,+,ENST00000380152.8,...,TGG,32316449,sense,,,,,,,
1,ENST00000380152,SpyoCas9,C-T,4..8,GRCh38 (9606),NC_000013.11,ENSG00000139618,BRCA2,+,ENST00000380152.8,...,AGG,32316462,sense,32316465C>T,C_4,5C>T,Pro2Leu,Missense,,
2,ENST00000380152,SpyoCas9,C-T,4..8,GRCh38 (9606),NC_000013.11,ENSG00000139618,BRCA2,+,ENST00000380152.8,...,AGG,32316467,antisense,"32316479G>A;32316481G>A, 32316483G>A","C_8_6, C_4","19G>A;21G>A, 23G>A","Glu7Lys, Arg8Lys","Missense, Missense",,
3,ENST00000380152,SpyoCas9,C-T,4..8,GRCh38 (9606),NC_000013.11,ENSG00000139618,BRCA2,+,ENST00000380152.8,...,TGG,32316477,antisense,,,,,,,
4,ENST00000380152,SpyoCas9,C-T,4..8,GRCh38 (9606),NC_000013.11,ENSG00000139618,BRCA2,+,ENST00000380152.8,...,TGG,32316488,antisense,,,,,,,


# Assign severe mutation bin

As noted in the "Mutation Category" column, each guide is predicted to make one or more types of mutations if Cs are present in the editing window. We can then annotate each guide with the most severe mutation bin in the order Nonsense > Splice site > Missense > Intron > Silent > UTR > Flank. Guides with no predicted edits are labeled "No edits".

In [None]:
# Bin every guide by the most severe of its predicted mutation categories
design_df['Mutation Bin'] = design_df['Mutation Category'].map(get_most_severe_mutation_type)
design_df.head()[['sgRNA Target Sequence','Mutation Category','Mutation Bin']]

Unnamed: 0,sgRNA Target Sequence,Mutation Category,Mutation Bin
0,TCGTAGGTAAAAATGCCTAT,,No edits
1,TGCCTATTGGATCCAAAGAG,Missense,Missense
2,GGCCTCTCTTTGGATCCAAT,"Missense, Missense",Missense
3,AAAAAATGTTGGCCTCTCTT,,No edits
4,TTAAAAATTTCAAAAAATGT,,No edits


# Calculate median residue
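We can then get the median residue position of the predicted amino acid edits for each guide. Only guides in the Missense, Nonsense, or Silent bins receive a value; all other guides get NaN.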

In [None]:
# Median residue position per guide, computed from its mutation bin and amino acid edits
design_df['Median Residue'] = [
    get_median_residues(mutation_bin, aa_edits)
    for mutation_bin, aa_edits in zip(design_df['Mutation Bin'], design_df['Amino Acid Edits'])
]
design_df.head(15)[['sgRNA Target Sequence','Amino Acid Edits','Mutation Category','Mutation Bin','Median Residue']]

Unnamed: 0,sgRNA Target Sequence,Amino Acid Edits,Mutation Category,Mutation Bin,Median Residue
0,TCGTAGGTAAAAATGCCTAT,,,No edits,
1,TGCCTATTGGATCCAAAGAG,Pro2Leu,Missense,Missense,2.0
2,GGCCTCTCTTTGGATCCAAT,"Glu7Lys, Arg8Lys","Missense, Missense",Missense,7.5
3,AAAAAATGTTGGCCTCTCTT,,,No edits,
4,TTAAAAATTTCAAAAAATGT,,,No edits,
5,AAGACACGCTGCAACAAAGC,"Thr17Ile, Arg18Cys","Missense, Missense",Missense,17.5
6,TTTTTTTTTTAAATAGATTT,,,No edits,
7,TAGGACCAATAAGTCTTAAT,Pro26Leu,Missense,Missense,26.0
8,TCAAACCAATTAAGACTTAT,Trp31Ter,Nonsense,Nonsense,31.0
9,GCAGGTTCAGAATTATAGGG,Glu45Lys,Missense,Missense,45.0
