### Title: Mutation-dependent synthetic lethal pipeline -- tumor type comparison
<font color='blue'> Author:</font>  Guangrong Qin<br/>

<font color='blue'> Contact:</font>  gqin@systemsbiology.org<br/>

<font color='blue'> Description:</font> This notebook is used to identify which gene knockouts or gene knockdowns show sensitivity to the mutation of a given gene, or of a group of genes, in different tumor types <br/>

<font color='blue'>Citations:</font> The functional screening data and omics data for cell lines are from the DepMap and CCLE projects of the Broad Institute (DepMap Public 20Q3). If you use this Jupyter notebook or the data used in it, please cite the following papers:<br/>

Bahar Tercan, Guangrong Qin, Taek-Kyun Kim, Boris Aguilar, Christopher J. Kemp, Nyasha Chambwe, Ilya Shmulevich. SL-Cloud: A Computational Resource to Support Synthetic Lethal Interaction Discovery. BioRxiv 2021.09.18.459450; doi: https://doi.org/10.1101/2021.09.18.459450

For this DepMap release:
DepMap, Broad (2020): DepMap 20Q3 Public. figshare. Dataset doi:10.6084/m9.figshare.11791698.v2.

For CRISPR datasets:
Robin M. Meyers, Jordan G. Bryan, James M. McFarland, Barbara A. Weir, ... David E. Root, William C. Hahn, Aviad Tsherniak. Computational correction of copy number effect improves specificity of CRISPR-Cas9 essentiality screens in cancer cells. Nature Genetics 2017 October 49:1779–1784. doi:10.1038/ng.3984. PMID: 29083409

Dempster, J. M., Rossen, J., Kazachkova, M., Pan, J., Kugener, G., Root, D. E., & Tsherniak, A. (2019). Extracting Biological Insights from the Project Achilles Genome-Scale CRISPR Screens in Cancer Cell Lines. BioRxiv, 720243.

For omics datasets:
Mahmoud Ghandi, Franklin W. Huang, Judit Jané-Valbuena, Gregory V. Kryukov, ... Todd R. Golub, Levi A. Garraway & William R. Sellers. 2019. Next-generation characterization of the Cancer Cell Line Encyclopedia. Nature 569, 503–508 (2019).PMID: 31068700


In [None]:
#Check the required libraries
try:
    from google.cloud import bigquery
    print("module 'google-cloud-bigquery' is installed")
except ModuleNotFoundError:
    !pip install google-cloud-bigquery
    from google.cloud import bigquery

try:
    import ipywidgets as widgets
    print("module 'ipywidgets' is installed")
except ModuleNotFoundError:
    !pip install ipywidgets
    import ipywidgets as widgets

try:
    import pyarrow
    print("module 'pyarrow' is installed")
except ModuleNotFoundError:
    !pip install pyarrow
    import pyarrow

try:
    import pandas as pd
    print("module 'pandas' is installed")
except ModuleNotFoundError:
    !pip install pandas
    import pandas as pd

try:
    import numpy as np
    print("module 'numpy' is installed")
except ModuleNotFoundError:
    !pip install numpy
    import numpy as np

try:
    from scipy import stats    
    print("module 'scipy' is installed")
except ModuleNotFoundError:
    !pip install scipy
    from scipy import stats    

try:
    import statsmodels.stats.multitest as multi   
    print("module 'statsmodels' is installed")
except ModuleNotFoundError:
    !pip install statsmodels
    import statsmodels.stats.multitest as multi

try:
    import seaborn as sns
    print("module 'seaborn' is installed")
except ModuleNotFoundError:
    !pip install seaborn
    import seaborn as sns
    

try:
    import matplotlib.pyplot as plt
    print("module 'matplotlib' is installed")
except ModuleNotFoundError:
    !pip install matplotlib
    import matplotlib.pyplot as plt
            
import sys
sys.path.append('../Scripts/')
import MDSLP

In [None]:
# Interactive step: runs the gcloud CLI login flow for application-default credentials.
# NOTE(review): presumably these are the credentials the BigQuery client picks up later — confirm.
!gcloud auth application-default login

In [None]:
# A Google Cloud project is required to query the data in the BigQuery tables.
# Replace the value below with your own project ID to query the datasets on ISB-CGC.
project_id = 'syntheticlethality'
client = bigquery.Client(project=project_id)

In [None]:
# Load the three input tables used by the pipeline. This step may take a while to run.
Mut_mat = MDSLP.get_ccle_mutation_data(project_id) # Mutation table for the CCLE cell lines (version: DepMap 20Q3)
Demeter_data = MDSLP.get_demeter_shRNA_data(project_id) # shRNA-based gene knockdown effects from the DepMap project (DEMETER2)
Depmap_matrix = MDSLP.get_depmap_crispr_data(project_id) # CRISPR-based gene knockout effects from the DepMap project (version: DepMap 20Q3)

In [None]:
# Screening dataset to analyse; only two options are available: "shRNA" or "Crispr".
Data_source: str = "shRNA"
# Gene symbols of the mutated genes of interest.
Gene_list: list = ['BRCA2']


In [None]:
# ID mapping between the CCLE annotation and the input gene symbols.
# NOTE(review): the later cells visible in this notebook pass Gene_list (not
# Gene_list_matched) to the pipeline — confirm whether the matched symbols should be used.
id_mapping, Gene_list_matched = MDSLP.GeneSymbol_standardization(Gene_list,project_id)


In [None]:
#### Selection of tumor types for comparison
# Fetch the disease annotation of the DepMap cell lines from BigQuery.
query = ''' 
SELECT DepMap_ID, primary_disease,TCGA_subtype
FROM `syntheticlethality.DepMap_public_20Q3.sample_info_Depmap_withTCGA_labels` 
'''
sample_info = client.query(query).result().to_dataframe()

# Keep only cancer cell lines that carry a primary disease label.
non_cancer_labels = ['Non-Cancerous', 'Unknown', 'Engineered', 'Immortalized']
pancancer_cls = sample_info.loc[~sample_info['primary_disease'].isin(non_cancer_labels)]
pancancer_cls = pancancer_cls.loc[pancancer_cls['primary_disease'].notna()]

# Unique disease labels offered in the selector. Missing values are dropped
# explicitly (rather than via filter(None.__ne__, ...), which depends on the
# truthiness of NotImplemented) and the labels are sorted so the widget
# options appear in the same order on every run.
TCGA_list = sorted(pancancer_cls['primary_disease'].dropna().unique())

# Multi-select widget; the chosen values are read from tumor_type.value in later cells.
tumor_type = widgets.SelectMultiple(
    options=['pancancer'] + TCGA_list,
    value=[],
    description='Tumor type',
    disabled=False
)
display(tumor_type)

In [None]:
#Test whether two genes are SL pairs in different tumor types using the shRNA dataset
Gene_list = ['ARID1A'] #Genes mutated
Data_source = "shRNA"

# Run the pipeline once per selected tumor type and keep the ARID1B rows.
# The per-tumor frames are collected and concatenated once at the end, which
# avoids repeatedly concatenating onto an empty DataFrame inside the loop.
shRNA_hits = []
for tumor in tumor_type.value:
    print(tumor)
    result_shRNA = MDSLP.Mutational_based_SL_pipeline([tumor], Gene_list, Mut_mat, Demeter_data, Data_source,project_id)
    if result_shRNA.shape[0] > 0:
        result_shRNA_ARID1B = result_shRNA.loc[result_shRNA['Gene_kd_symbol'] =='ARID1B'] #Genes being knockdown
        shRNA_hits.append(result_shRNA_ARID1B)

# Fall back to an empty frame when no tumor type was selected or produced results.
pan_cancer_result = pd.concat(shRNA_hits) if shRNA_hits else pd.DataFrame()

In [None]:
# Tag every row with the screen it came from (scalar is broadcast to all rows).
pan_cancer_result['source'] = 'MDSLP-shRNA'

In [None]:
# Display the shRNA-based results collected above.
pan_cancer_result

In [None]:
#Test whether two genes are SL pairs in different tumor types using the CRISPR dataset

Gene_list = ['ARID1A'] #Genes mutated
Data_source = "Crispr"

# Run the pipeline once per selected tumor type and keep the ARID1B rows.
# The per-tumor frames are collected and concatenated once at the end, which
# avoids repeatedly concatenating onto an empty DataFrame inside the loop.
crispr_hits = []
for tumor in tumor_type.value:
    print(tumor)
    result_crispr = MDSLP.Mutational_based_SL_pipeline([tumor], Gene_list, Mut_mat, Depmap_matrix, Data_source,project_id)
    if result_crispr.shape[0] > 0:
        result_crispr_ARID1B = result_crispr.loc[result_crispr['Gene_kd_symbol'] =='ARID1B'] #Genes being knockout
        crispr_hits.append(result_crispr_ARID1B)

# Fall back to an empty frame when no tumor type was selected or produced results.
pan_cancer_result_crispr = pd.concat(crispr_hits) if crispr_hits else pd.DataFrame()
            

In [None]:
# Tag every row with the screen it came from (scalar is broadcast to all rows).
pan_cancer_result_crispr['source'] = 'MDSLP-CRISPR'

In [None]:
# Stack the CRISPR rows on top of the shRNA rows; the original row indices are kept.
result = pd.concat([pan_cancer_result_crispr,pan_cancer_result])

In [None]:
# Negative log10-transformed FDR. Base 10 is used so that the values are on the
# same scale as the significance threshold drawn in the FDR plot
# (1.301029996 = -log10(0.05)); the natural log would put the bars and the
# threshold on different scales.
result['-log(FDR)'] = -np.log10(result['FDR_all_exp'])

In [None]:
# Display the combined CRISPR and shRNA results.
result

In [None]:
# Uncomment the next line to write the results to a CSV file.
#result.to_csv("tumor_specific_analysis_ARID1A_ARID1B.csv")

In [None]:
#Plot the results of the gene pairs in different tumor types. Effect size is shown in the figure below.

import seaborn as sns
import matplotlib.pyplot as plt
plt.figure(figsize = [4,4], dpi = 300)

# One fixed colour per screen type. A mapping keeps the colours stable no
# matter how many rows each source contributes or in which order they appear,
# instead of building a palette list whose length depends on the row count.
clrs = {'MDSLP-CRISPR': '#5477b4', 'MDSLP-shRNA': '#dc895a'}

ax = sns.barplot(x="ES", y="Tumor_type", hue="source", data=result,
                 orient = 'h',
                 palette = clrs)

ax.set_xlabel('Effect size (Mut - WT)', fontsize=14)
ax.set_ylabel('', fontsize=0) # hide the y axis label
ax.set(xlim=(-2, 0))

# Position the legend and set its text size in one call; creating a new legend
# after styling the old one would discard the font size.
ax.legend(loc='lower left', fontsize=8)


In [None]:
#Plot the results of the gene pairs in different tumor types. negative log-transformed FDR is shown in the figure below.

# One fixed colour per screen type. A mapping keeps the colours stable no
# matter how many rows each source contributes or in which order they appear,
# instead of building a palette list whose length depends on the row count.
clrs = {'MDSLP-CRISPR': '#5477b4', 'MDSLP-shRNA': '#dc895a'}

plt.figure(figsize = [4,4], dpi = 300)
ax1 = sns.barplot(x="-log(FDR)", y="Tumor_type", hue="source",
                  data=result,
                  orient = 'h',
                  palette = clrs)
ax1.set_xlabel('Statistical Significance', fontsize=14) #-1 * log (FDR)
ax1.set_ylabel('', fontsize=0) #ignore the y axis label

# Significance threshold: -log10(0.05) = 1.301029996. axvline spans the full
# height of the axes, so the line covers every tumor type without changing
# the y-axis limits.
fdr_cutoff = 0.05
ax1.axvline(-np.log10(fdr_cutoff), color='k', lw=0.5)

# Position the legend and set its text size in one call; creating a new legend
# after styling the old one would discard the font size.
ax1.legend(loc='lower right', fontsize=8)


In [None]:
# Show the CRISPR-based rows only, ordered from the smallest to the largest FDR.
crispr_only = result['source'] == 'MDSLP-CRISPR'
result[crispr_only].sort_values('FDR_all_exp')

In [None]:
## End analysis