#### This notebook analyzes mutation-dependent synthetic lethality using the siRNA dataset from the Broad Institute. Data resources are from https://depmap.org/portal/download/. The following datasets are used in this pipeline:
1. DEMETER2 Data v6
2. CCLE_mutations.csv from DepMap Public 20Q3
3. sample_info.csv from DepMap Public 20Q3

In [1]:
import sys
import matplotlib.pyplot as plt
import pandas as pd
import scipy
from scipy import stats 
import numpy as np
import json
import statsmodels.stats.multitest as multi
import matplotlib.pyplot as plt
import math
import ipywidgets as widgets
import plotly
import plotly.express as px
from google.cloud import bigquery

In [2]:
def Cohen_dist(x,y):
    """Return Cohen's d effect size between two samples.

    d = (mean(x) - mean(y)) / s, where s is the pooled standard deviation
    built from the unbiased (n - 1) sample variances of both groups.

    Parameters
    ----------
    x, y : array-like of float
        Observations of the two groups. NaN values are not removed here;
        callers are expected to filter them beforehand.

    Returns
    -------
    float
        Positive when mean(x) > mean(y). The result is nan/inf (with a numpy
        RuntimeWarning) when the pooled deviation is zero or undefined.
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    n1 = len(x)
    n2 = len(y)

    mean_x = np.mean(x)
    mean_y = np.mean(y)

    # The sum of squared deviations equals (n - 1) * sample variance. Pooling
    # these sums avoids pairing the (n - 1) weights with the population
    # variance returned by np.std's default ddof=0, which shrinks s and
    # inflates |d|.
    ss_x = np.sum((x - mean_x) ** 2)
    ss_y = np.sum((y - mean_y) ** 2)
    s = np.sqrt((ss_x + ss_y) / (n1 + n2 - 2))

    d = (mean_x - mean_y) / s
    return(d)
 

In [3]:
#!gcloud auth application-default login

In [4]:
#%load_ext google.cloud.bigquery

In [5]:
# Base locations: where the raw inputs live and where results are meant to go.
directories = dict(
    input_dir="~/Documents/ISB/KG",
    output_dir="../Output_SL/",
)

# Input files, given relative to directories["input_dir"].
input_data = dict(
    input_mut="Depmap_Crispr_data/20Q3/CCLE_mutations.csv",
    input_depmap="Depmap_RNAi_data/D2_combined_gene_dep_scores.csv",
    input_sample_info="Depmap_Crispr_data/20Q3/sample_info_Depmap.csv",
)

In [6]:
# All three inputs are resolved against the configured input directory.
input_root = directories['input_dir'] + '/'

# Mutation table; this file is parsed as tab-separated despite its .csv name.
Mut_mat = pd.read_csv(input_root + input_data['input_mut'], sep='\t')

# Dependency scores; the unnamed first column is used as the row index.
Depmap_matrix = pd.read_csv(input_root + input_data['input_depmap'], index_col="Unnamed: 0")

# Sample annotation table, later used to translate cell-line names to IDs.
sample_info = pd.read_csv(input_root + input_data['input_sample_info'])


  interactivity=interactivity, compiler=compiler, result=result)


#### Overview of the datasets
There are siRNA knockdown effects for 712 cell lines in the DEMETER2 v6 dataset.
708 of the 712 cell lines were found in the sample info from the DepMap 20Q3 dataset.
The four exceptions are:
1. AZ521_STOMACH was labeled as AZ521_SMALL_INTESTINE, with an ACH ID of ACH-001015
2. GISTT1_GASTROINTESTINAL_TRACT was labeled as GISTT1_STOMACH, with an ACH ID of ACH-002332
3. MB157_BREAST was not found
4. SW527_BREAST was labeled as SW527_LARGE_INTESTINE in the DepMap 20Q3 dataset, which is inconsistent with the annotation in DEMETER2.
For the following analysis, we excluded these four cell lines with ambiguous annotations from the statistical models because of the mismatches between the data sources.

#### Generating the sample dictionary for the CCLE_ID to the ACH_ID 

In [7]:
# Lookup from the value in the third column of sample_info to the value in its
# first column (presumably CCLE name -> DepMap/ACH ID; confirm against the
# file header). When a key occurs more than once, the last row wins.
sample_map = dict(zip(sample_info.iloc[:, 2], sample_info.iloc[:, 0]))


#### Check the matched samples and select the matched samples

In [8]:
# Keep the dependency-matrix columns that have an entry in sample_map and
# report (print) every column name that does not.
Matched_cellLines = []
for column_name in Depmap_matrix.columns:
    if column_name in sample_map:
        Matched_cellLines.append(column_name)
        continue
    print(column_name)

AZ521_STOMACH
GISTT1_GASTROINTESTINAL_TRACT
MB157_BREAST
SW527_BREAST


In [9]:
# Restrict the dependency matrix to the columns that could be matched.
Depmap_matrix_sele = Depmap_matrix[Matched_cellLines]

#### Rename the matched names

In [10]:
# Translate every matched column name into its ID from sample_map. All names in
# Matched_cellLines are known to be keys of sample_map, so no "missing" branch
# is needed here.
ACH_ID_list = [sample_map[cell_line_name] for cell_line_name in Matched_cellLines]

# Rebuild the selection from the untouched Depmap_matrix before relabelling so
# this cell can be re-run safely. Relabelling the existing frame in place would
# fail on a second run, because its columns would already be IDs rather than
# the original names.
Depmap_matrix_sele = Depmap_matrix.loc[:, Matched_cellLines].copy()
Depmap_matrix_sele.columns = ACH_ID_list

#### Select the samples with both mutation data and knockdown data
**TODO:** 674 cell lines have both mutation data and gene knockdown data. It is still unclear whether the remaining cell lines carry no alterations or were not profiled for mutations at all.

In [11]:
# IDs that appear both in the mutation table and among the renamed dependency
# columns. Sorting gives a stable sample order (and therefore a stable column
# order downstream); a bare set would yield a different order in every kernel
# because of string-hash randomisation.
has_knockdown = Mut_mat['DepMap_ID'].isin(ACH_ID_list)
Samples_with_mut_kd = sorted(set(Mut_mat.loc[has_knockdown, 'DepMap_ID']))
print(len(Samples_with_mut_kd))

674


#### Select the gene knockdown results with gene mutation data

In [12]:
# Keep only the dependency columns of samples that also have mutation records.
Depmap_matrix_sele = Depmap_matrix_sele[Samples_with_mut_kd]

##### Format the dependency map and rename the genes

In [13]:
# Samples become rows and knocked-down genes become columns.
Depmap_matrix_transpose = Depmap_matrix_sele.T

# Column labels are cut at the first ' (' so that only the leading part of
# each label (the text before the parenthesised suffix) is kept.
gene_names_old = list(Depmap_matrix_transpose.columns.values)
gene_names_new = [label.partition(' (')[0] for label in gene_names_old]
Depmap_matrix_transpose.columns = gene_names_new


#### Select the mutation categories

In [14]:
# Show every variant class present in the mutation table for reference.
print(list(set(Mut_mat['Variant_Classification'])))

# Variant classes that count as a mutation in the analysis below. Each class
# is listed exactly once (the earlier list repeated 'Nonstop_Mutation' and
# 'Stop_Codon_Del'); membership tests via .isin() are unaffected.
selected_variants = ['Splice_Site',
                     'Frame_Shift_Del',
                     'Frame_Shift_Ins',
                     'Nonstop_Mutation',
                     'In_Frame_Del',
                     'In_Frame_Ins',
                     'Missense_Mutation',
                     'Nonsense_Mutation',
                     'Start_Codon_Del',
                     'Start_Codon_Ins',
                     'Start_Codon_SNP',
                     'Stop_Codon_Del',
                     'Stop_Codon_Ins',
                     'De_novo_Start_OutOfFrame']

[nan, 'IGR', 'Nonsense_Mutation', "5'Flank", "3'UTR", 'Stop_Codon_Del', 'De_novo_Start_OutOfFrame', 'Frame_Shift_Ins', 'Stop_Codon_Ins', 'Missense_Mutation', 'In_Frame_Del', 'In_Frame_Ins', 'Start_Codon_Del', 'Nonstop_Mutation', 'Frame_Shift_Del', 'Intron', "5'UTR", 'Splice_Site', 'Start_Codon_SNP', 'Silent', 'Start_Codon_Ins']


In [15]:
# Samples_with_mut_kd was derived from the 'DepMap_ID' column, so the sample
# filter must use that same column. Matching the IDs against
# 'Tumor_Sample_Barcode' only works if both columns happen to hold identical
# values.
Mut_mat_sele1 = Mut_mat.loc[Mut_mat['DepMap_ID'].isin(Samples_with_mut_kd)]

# Keep only the variant classes chosen in selected_variants.
Mut_mat_sele2 = Mut_mat_sele1.loc[Mut_mat_sele1['Variant_Classification'].isin(selected_variants)]


#### Input mutated genes

In [16]:
# Gene symbols to test, taken from the 'HGNC_gene_symbol' column of the
# driver-gene table.
mut_gene = pd.read_csv("../Github_version/SyntheticLethality/data/CancerDriverGenes.csv")['HGNC_gene_symbol'].tolist()


In [17]:
# Rows whose gene symbol is in the gene list, reduced to (gene, sample ID).
is_listed_gene = Mut_mat_sele2['Hugo_Symbol'].isin(mut_gene)
Mut_mat_sele3 = Mut_mat_sele2.loc[is_listed_gene, ['Hugo_Symbol', 'DepMap_ID']]


#### T-test and effect size estimation between the mutated group and the wt-group

In [18]:
# For every gene in mut_gene, compare the knockdown scores of each knocked-down
# gene between samples that carry a selected mutation in that gene ("mutated")
# and all remaining samples ("wild type"):
#   * p-value from a two-sample t-test (stats.ttest_ind),
#   * effect size from Cohen_dist,
#   * Benjamini-Hochberg FDR, corrected separately within each mutated gene.
# The six result lists are filled in lock-step: one entry per tested pair.
# (A rank-sum test, stats.ranksums(D_mut_new, D_wt_new), was considered as an
# alternative to the t-test.)
Gene_mut_list = []
Gene_kd_list = []
p_list = []
es_list = []
size_mut = []
FDR_List = []

kd_genes = list(Depmap_matrix_sele.index.values)

for Gene in mut_gene:
    print(Gene)
    p_list_curr = []

    # A sample can have several rows for the same gene (one per mutation).
    # De-duplicate so that each sample contributes exactly one observation;
    # otherwise its score would be repeated in the mutated group, distorting
    # the group size, the t-test and the effect size.
    gene_rows = Mut_mat_sele3['Hugo_Symbol'] == Gene
    Mut_group = list(Mut_mat_sele3.loc[gene_rows, 'DepMap_ID'].unique())
    mutated_samples = set(Mut_group)
    # Ordered comprehension instead of a set difference keeps the wild-type
    # sample order stable between runs.
    WT_group = [sample for sample in Samples_with_mut_kd if sample not in mutated_samples]

    # With two or fewer mutated samples no row can pass the size filter below.
    if len(Mut_group) <= 2:
        continue

    # Column selection does not depend on the knocked-down gene, so do it once
    # per mutated gene instead of once per row.
    mut_values = Depmap_matrix_sele.loc[:, Mut_group].values
    wt_values = Depmap_matrix_sele.loc[:, WT_group].values

    for row_pos, Gene_kd in enumerate(kd_genes):
        D_mut_new = mut_values[row_pos]
        D_mut_new = D_mut_new[~np.isnan(D_mut_new)]

        D_wt_new = wt_values[row_pos]
        D_wt_new = D_wt_new[~np.isnan(D_wt_new)]

        # Require at least three mutated values (as before) and at least two
        # wild-type values. An empty or single-value wild-type group can yield
        # a NaN p-value, and one NaN makes every BH-adjusted value of this
        # gene NaN.
        if len(D_mut_new) > 2 and len(D_wt_new) > 1:
            size_mut.append(len(D_mut_new))
            Sci_test = stats.ttest_ind(D_mut_new, D_wt_new, nan_policy = 'omit')
            pvalue = Sci_test[1]
            p_list_curr.append(pvalue)
            Size_effect = Cohen_dist(D_mut_new, D_wt_new)
            es_list.append(Size_effect)
            Gene_mut_list.append(Gene)
            Gene_kd_list.append(Gene_kd)

    if len(p_list_curr) > 0:
        p_list.extend(p_list_curr)
        FDR_List_table = multi.multipletests(p_list_curr, alpha=0.05, method='fdr_bh', is_sorted=False)[1]
        FDR_List.extend(FDR_List_table)
        


BRCA2


In [19]:
# Sanity check: both lists should report the same number of tested pairs.
for collected in (p_list, es_list):
    print(len(collected))

17309
17309


In [20]:
# Assemble one row per tested (mutated gene, knocked-down gene) pair.
result_columns = dict(
    Gene_mut=Gene_mut_list,
    Gene_kd=Gene_kd_list,
    Mutated_samples=size_mut,
    pvalue=p_list,
    ES=es_list,
    FDR=FDR_List,
)
result = pd.DataFrame(result_columns)

#### Result_output

In [21]:
# Write the table ordered by ascending p-value into the working directory.
result_by_pvalue = result.sort_values(by='pvalue')
result_by_pvalue.to_csv("driver_gene_mutation_dependency_siRNA_panCancer_withFDR.csv")