# BCL2 Read Counts Processing

The purpose of this notebook is to process the BCL2 tiling data from Supplementary Data 7 into annotated files with lognorms, LFCs, and z-scores.

This notebook was also used to process the BRCA1 tiling data from Supplementary Data 5 and 6.

This notebook describes and performs the calculations used to "preprocess" all BCL2 data, starting from raw read counts and ending with LFC values. The following steps are performed for each screen:

- Calculate lognorms
- Filter out any sgRNAs for which the lognorm of pDNA is > 3 SD below the mean (low pDNA abundance)
- Filter out any sgRNAs with > 5 Match Bin I off-targets
    - Note: this filter only applies to the WT library (BRCA1 only), which was the only library designed without these guides prefiltered.
- Calculate log-fold-change from pDNA or un-treated arm
- Average log-fold-change across replicates of the same condition
- Calculate z-score for averaged log-fold-change
- Merge data with annotations file
- Calculate Mutation bin, Residues, and Median Residues



Writes two files per screen: (1) an unfiltered lognorm file and (2) a filtered LFC file with annotations, averaged replicates, and z-scores (for use in all downstream calculations).

In [1]:
import pandas as pd
from poola import core as pool
import seaborn as sns
import gpplot
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import re

In [2]:
import sys
# Record the interpreter and package versions used to produce the outputs below.
print('Python version: ' + sys.version)
modules = ['pandas','poola','seaborn','gpplot','matplotlib','numpy']
for module in modules:
    # Look the module up explicitly instead of relying on a bare `except:`, which would
    # also swallow unrelated errors and mislabel a module that was never imported.
    loaded_module = sys.modules.get(module)
    if loaded_module is None:
        print(module + ' has not been imported')
    elif not hasattr(loaded_module, '__version__'):
        print(module + ' has no __version__ attribute')
    else:
        print(module + ' ' + str(loaded_module.__version__))

Python version: 3.8.5 (default, Sep  4 2020, 02:22:02) 
[Clang 10.0.0 ]
pandas 1.1.3
poola 0.0.13
seaborn 0.11.0
gpplot 0.5.0
matplotlib 3.3.2
numpy 1.20.3


## Functions

In [3]:
def get_z_score(data,col,control_category,control_col):
    """Add a z-score column for `col`, standardised against a set of control rows.

    Control rows are those whose `control_col` value contains `control_category`
    (matched with `Series.str.contains`, i.e. as a regular expression by default).
    The mean and standard deviation (pandas default, sample SD) of `col` over the
    control rows are used to standardise every row.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to annotate; modified in place.
    col : str
        Name of the numeric column to standardise.
    control_category : str
        Pattern identifying control rows.
    control_col : str
        Name of the string column searched for `control_category`.

    Returns
    -------
    pandas.DataFrame
        The same `data` object, with a new column named ``col + ';z-score'``.
    """
    # na=False: a row with a missing label is simply not a control. Without it the mask
    # contains NaN and boolean indexing with .loc raises.
    is_control = data[control_col].str.contains(control_category, na=False)
    control_values = data.loc[is_control, col]
    mean = control_values.mean()
    std = control_values.std()
    data[str(col + ';z-score')] = (data[col] - mean) / std
    return data


def GetMostSevereMutationType(string):
    """Collapse a 'Mutation category' annotation into its single most severe bin.

    A float input (presumably the NaN pandas uses for a missing annotation) is
    reported as 'No edits'. Otherwise the bins are tested in decreasing order of
    severity and the first one whose keyword occurs in `string` is returned:
    Nonsense > Splice site > Missense > Intron > Silent > UTR.

    Returns None when `string` matches none of the known keywords.
    """
    if type(string) is float:
        return 'No edits'

    # (keywords to look for, bin label), ordered from most to least severe.
    severity_order = (
        (('Nonsense',), 'Nonsense'),
        (('Splice-acceptor', 'Splice-donor'), 'Splice site'),
        (('Missense',), 'Missense'),
        (('Intron',), 'Intron'),
        (('Silent',), 'Silent'),
        (('UTR',), 'UTR'),
    )
    for keywords, bin_label in severity_order:
        if any(keyword in string for keyword in keywords):
            return bin_label
    return None
   
    
def GetResidues(string):
    """Reduce an 'Amino acid edits' annotation to a ';'-terminated list of residue tokens.

    The annotation is split on ';' and ','. Each non-empty edit becomes one token
    followed by ';':
      - edits starting with 'Exon' become 'intron'
      - the edit 'utr' stays 'utr'
      - any other edit keeps only its digit characters

    A float input (presumably NaN for a missing annotation) yields an empty string.
    """
    if type(string) is float:
        return ''

    tokens = []
    for raw_edit in re.split(';|,', string):
        edit = raw_edit.strip()
        if edit.startswith('Exon'):
            tokens.append('intron')
        elif edit == 'utr':
            tokens.append('utr')
        elif edit != '':
            # Keep only the digits of the edit, dropping every other character.
            tokens.append(''.join(ch for ch in edit if ch.isdigit()))
    return ''.join(token + ';' for token in tokens)


'''
GetMedianResidues_v2 is updated so that sgRNAs that are binned as "Missense" but contain
intronic (not splice site!) edits still get a "median residue" and can appear on a protein plot. This
is consistent with the ordering of GetMostSevereMutationType, in which Missense > Intron.

Note that sgRNAs with 'Mutation bin' == 'Splice site' still do NOT receive a "median residue," consistent
with Splice site > Missense in mutation bin ordering.
'''
    
def GetMedianResidues_v2(row):
    """Return the median residue number for one annotated sgRNA row, or NaN.

    `row` must provide 'Residues' (a ';'-separated string) and 'Mutation bin'.
    Rows binned as 'Splice site' or 'UTR' always get NaN. For every other bin the
    'intron', 'utr' and empty tokens are ignored and the median of the remaining
    integer residues is returned; NaN if none remain.
    """
    tokens = row['Residues'].split(';')
    if row['Mutation bin'] in ('Splice site', 'UTR'):
        return np.nan
    positions = [int(token) for token in tokens if token not in ('', 'intron', 'utr')]
    return np.median(positions) if positions else np.nan
    

    
def getAddedAnnotations(df):
    """Add 'Mutation bin', 'Residues' and 'Median Residues' columns to `df` in place.

    `df` must contain 'Mutation category', 'Amino acid edits' and 'Gene symbol'.
    After the per-guide bins are derived, control guides get their 'Mutation bin'
    overwritten based on 'Gene symbol':
      - contains 'NO_SITE'            -> 'Non-targeting control'
      - contains 'ONE_NON-GENE_SITE'  -> 'Targeting control'
      - one of the pan-lethal genes   -> 'Pan-lethal'

    Returns None; the DataFrame passed in is modified.
    """
    df['Mutation bin'] = df['Mutation category'].apply(GetMostSevereMutationType)
    df['Residues'] = df['Amino acid edits'].apply(GetResidues)
    # 'Median Residues' depends on the two columns created above.
    df['Median Residues'] = df.apply(GetMedianResidues_v2,axis=1)

    gene_symbol = df['Gene symbol']
    # na=False: a guide with a missing gene symbol is not a control. Without it the mask
    # contains NaN and boolean indexing with .loc raises.
    #Add "Non-targeting control" annotation for NO_SITE Constructs
    df.loc[gene_symbol.str.contains('NO_SITE', na=False),'Mutation bin'] = 'Non-targeting control'
    #Add "Targeting control" annotation for ONE_NON-GENE_SITE Constructs
    df.loc[gene_symbol.str.contains('ONE_NON-GENE_SITE', na=False),'Mutation bin'] = 'Targeting control'
    #Add "pan-lethal control" annotation
    panlethal_list = ['EEF2','HNRNPU','KPNB1','PELP1','RPS20','SF3B1','SNRPD1','TFRC']
    df.loc[gene_symbol.isin(panlethal_list),'Mutation bin'] = 'Pan-lethal'
    
    
def processData(df,replicates,annotations,zscore_controls,zscore_control_col,Match_bin_filter=False):
    """Process raw read counts for one screen into lognorms and an annotated LFC table.

    Steps:
      1. Lognormalise pDNA and the six MOLM13 columns.
      2. Filter guides on pDNA lognorm with ``z_low=-3`` (low pDNA abundance).
      3. Optionally keep only guides whose 'Match Bin I counts' annotation is 0-5.
      4. Compute log-fold changes: Dropout replicates vs pDNA, Venetoclax replicates
         vs the matching Dropout replicate.
      5. Average the replicates of each condition into '<condition>_RepABC'.
      6. Merge with `annotations` on 'sgRNA sequence'.
      7. Add the z-score of 'MOLM13_Venetoclax_RepABC' and the mutation annotations.

    Parameters
    ----------
    df : pandas.DataFrame
        Read counts with 'sgRNA sequence', 'pDNA' and the MOLM13 Dropout/Venetoclax
        RepA-C columns.
    replicates : int
        Number of replicates per condition. Only 3 is supported.
    annotations : pandas.DataFrame
        Library annotation keyed by 'sgRNA sequence'.
    zscore_controls : str
        Pattern identifying the control rows used for the z-score.
    zscore_control_col : str
        Column searched for `zscore_controls`.
    Match_bin_filter : bool, default False
        When True, apply step 3.

    Returns
    -------
    (lognorm, lfc) : tuple of pandas.DataFrame
        The unfiltered lognorm table and the filtered, annotated LFC table.

    Raises
    ------
    ValueError
        If `replicates` is not 3 (previously this surfaced as an UnboundLocalError).
    """
    if replicates != 3:
        raise ValueError('processData only supports replicates == 3 (RepA, RepB, RepC); got '
                         + repr(replicates))

    dropout_reps = ['MOLM13_Dropout_RepA', 'MOLM13_Dropout_RepB', 'MOLM13_Dropout_RepC']
    venetoclax_reps = ['MOLM13_Venetoclax_RepA', 'MOLM13_Venetoclax_RepB', 'MOLM13_Venetoclax_RepC']
    cols = ['pDNA'] + dropout_reps + venetoclax_reps

    #Calculate lognorms
    lognorm = pool.lognorm_columns(reads_df=df, columns=cols)
    #Filter lognorms
    filtered_lognorm = pool.filter_pdna(lognorm_df=lognorm, pdna_cols=['pDNA'], z_low=-3)
    pdna_filtered = lognorm.shape[0] - filtered_lognorm.shape[0]
    print('Filtered ' + str(pdna_filtered) + ' guides due to low pDNA abundance')

    if Match_bin_filter:
        # NOTE(review): the counts are compared as strings; this assumes the
        # 'Match Bin I counts' column is read as text - confirm against the annotation file.
        counts = ['0','1','2','3','4','5']
        allowed_guides = annotations.loc[annotations['Match Bin I counts'].isin(counts), 'sgRNA sequence']
        n_before_match_bin = filtered_lognorm.shape[0]
        # Filter by membership and keep the 'sgRNA sequence' column: it is the key for the
        # annotation merge below, so dropping it here would make that merge fail.
        filtered_lognorm = filtered_lognorm[filtered_lognorm['sgRNA sequence'].isin(allowed_guides)]
        print('Filtered ' + str(n_before_match_bin - filtered_lognorm.shape[0]) + ' additional guides due a Match Bin I count >5.')

    #calculate log-fold changes
    # Dropout arms are referenced to pDNA; each Venetoclax arm to its matching Dropout replicate.
    ref_map = {dropout_col: 'pDNA' for dropout_col in dropout_reps}
    ref_map.update(zip(venetoclax_reps, dropout_reps))
    lfc = pool.calculate_lfcs(filtered_lognorm, ref_map=ref_map)

    #average lfcs for each conditions
    # Use the explicit replicate columns rather than substring matching on column names,
    # so an already-added average column can never be picked up by mistake.
    for condition, replicate_cols in (('MOLM13_Dropout', dropout_reps),
                                      ('MOLM13_Venetoclax', venetoclax_reps)):
        lfc[condition + '_RepABC'] = lfc[replicate_cols].mean(axis = 1)

    #merge with annotations
    lfc = lfc.merge(annotations,on='sgRNA sequence')
    #calculate z-score
    get_z_score(lfc,'MOLM13_Venetoclax_RepABC',zscore_controls,zscore_control_col)
    getAddedAnnotations(lfc)
    return lognorm, lfc

## NG

### CBE

In [4]:
# Read counts for the NG CBE screen: skip the first two rows of the sheet and assign
# explicit column names.
ng_cbe = pd.read_excel(
    'Supplementary_Data7_v2.xlsx',
    sheet_name="NG CBE",
    skiprows=2,
    names=[
        'sgRNA sequence', 'pDNA',
        'MOLM13_Dropout_RepA', 'MOLM13_Dropout_RepB', 'MOLM13_Dropout_RepC',
        'MOLM13_Venetoclax_RepA', 'MOLM13_Venetoclax_RepB', 'MOLM13_Venetoclax_RepC',
    ],
)

In [5]:
# Library annotation for the NG CBE guides.
ng_cbe_annot = pd.read_excel('Supplementary_Data7_v2.xlsx', sheet_name="NG CBE Library Annotation")

In [6]:
# Process the NG CBE screen; z-scores use rows whose 'Gene symbol' contains
# 'ONE_NON-GENE_SITE' as controls.
ng_cbe_lognorm, lfc_ng_cbe = processData(
    df=ng_cbe,
    replicates=3,
    annotations=ng_cbe_annot,
    zscore_controls='ONE_NON-GENE_SITE',
    zscore_control_col='Gene symbol',
)

Filtered 1 guides due to low pDNA abundance


In [7]:
# Save the unfiltered lognorms and the filtered, annotated LFC table for NG CBE.
ng_cbe_lognorm.to_csv(path_or_buf='BCL2-NG-CBE_lognorm.csv', index=False)
lfc_ng_cbe.to_csv(path_or_buf='BCL2-NG-CBE_LFC.csv', index=False)

### ABE

In [8]:
# Read counts for the NG ABE screen: skip the first two rows of the sheet and assign
# explicit column names.
ng_abe = pd.read_excel(
    'Supplementary_Data7_v2.xlsx',
    sheet_name="NG ABE",
    skiprows=2,
    names=[
        'sgRNA sequence', 'pDNA',
        'MOLM13_Dropout_RepA', 'MOLM13_Dropout_RepB', 'MOLM13_Dropout_RepC',
        'MOLM13_Venetoclax_RepA', 'MOLM13_Venetoclax_RepB', 'MOLM13_Venetoclax_RepC',
    ],
)

In [9]:
# Library annotation for the NG ABE guides.
ng_abe_annot = pd.read_excel('Supplementary_Data7_v2.xlsx', sheet_name="NG ABE Library Annotation")

In [10]:
# Process the NG ABE screen; z-scores use rows whose 'Gene symbol' contains
# 'ONE_NON-GENE_SITE' as controls.
ng_abe_lognorm, lfc_ng_abe = processData(
    df=ng_abe,
    replicates=3,
    annotations=ng_abe_annot,
    zscore_controls='ONE_NON-GENE_SITE',
    zscore_control_col='Gene symbol',
)

Filtered 3 guides due to low pDNA abundance


In [11]:
# Save the unfiltered lognorms and the filtered, annotated LFC table for NG ABE.
ng_abe_lognorm.to_csv(path_or_buf='BCL2-NG-ABE_lognorm.csv', index=False)
lfc_ng_abe.to_csv(path_or_buf='BCL2-NG-ABE_LFC.csv', index=False)