In [1]:
import pandas as pd 
import numpy as np
from scipy.stats import chisquare, chi2_contingency
import matplotlib.pyplot as plt

## GI Scores
Load data from [Mapping the Genetic Landscape of Human Cells](https://www.sciencedirect.com/science/article/pii/S0092867418307359?via%3Dihub#mmc2) paper. 
> [**Table S5.**](https://www.sciencedirect.com/science/article/pii/S0092867418307359?via%3Dihub#mmc5)

In [2]:
# read data from file 
myfile = ("CRISPRi_Mapping_paper/Table_S5.xlsx") 
xl = pd.ExcelFile(myfile)
# print ('Sheets:')
# print (xl.sheet_names)
# make data frame from each Excel sheet
sheets = {sheet: xl.parse(sheet) for sheet in xl.sheet_names}
# Read data from *gene GI scores sheet*:
Gene1 = np.array(sheets['gene GI scores and correlations']['Unnamed: 0'][3:])
Gene2 = np.array(sheets['gene GI scores and correlations']['Unnamed: 1'][3:])
GIdataset = {
    'Gene1': Gene1, 'Gene2': Gene2, 'GenePairs': np.array(["_".join(i) for i in zip (Gene1, Gene2)]),
    'K562': np.array(sheets['gene GI scores and correlations']['K562.4'][3:]),     # gene1 <-> gene2 GI scores
    'Jurkat': np.array(sheets['gene GI scores and correlations']['Jurkat.4'][3:])  # gene1 <-> gene2 GI scores
}

In [3]:
SL_thr = 1 # Synthetic-Lethal threshold
SLdataset = {'K562':{'GI': GIdataset['K562'][GIdataset['K562'] > SL_thr],
                     'GenePair': GIdataset['GenePairs'][GIdataset['K562'] > SL_thr],
                     'Gene1': GIdataset['Gene1'][GIdataset['K562'] > SL_thr],
                     'Gene2': GIdataset['Gene2'][GIdataset['K562'] > SL_thr]},
             'Jurkat':{'GI': GIdataset['Jurkat'][GIdataset['Jurkat'] > SL_thr],
                     'GenePair': GIdataset['GenePairs'][GIdataset['Jurkat'] > SL_thr],
                     'Gene1': GIdataset['Gene1'][GIdataset['Jurkat'] > SL_thr],
                     'Gene2': GIdataset['Gene2'][GIdataset['Jurkat'] > SL_thr]}}
# function to select GI data based on cell-line and gene-pair 1 or 2
def filter_dataset(dataset):
    def cell_gene_sorter(dataset,cell,gene):
        output = []
        for sl in set(SLdataset[cell][gene]):
            for i, g in enumerate(dataset['Genes']):
                if g == sl:
                    Q1 = np.quantile(dataset['RNA-Seq'][i], 0.25)
                    Q3 = np.quantile(dataset['RNA-Seq'][i], 0.75)
                    is_low = 1*(dataset['RNA-Seq'][i] <= Q1)    # Low expression threshold
                    is_high = 1*(dataset['RNA-Seq'][i] >= Q3)   # High expression threshold
                    output.append([g, is_low, is_high])
        return output 
    return {
        'K562_Gene1': cell_gene_sorter(dataset,'K562','Gene1'),
        'K562_Gene2': cell_gene_sorter(dataset,'K562','Gene2'),
        'Jurkat_Gene1': cell_gene_sorter(dataset,'Jurkat','Gene1'),
        'Jurkat_Gene2': cell_gene_sorter(dataset,'Jurkat','Gene2')
    }

Read combined study clinical data (downloaded from cBioPortal):

In [4]:
clinical_data = pd.read_csv('cBioPortal/combined_study_clinical_data.csv')
# function to add_survival_data 
def add_survival_data(dataset):
    survival_status = []
    survival_months = []
    for i,sam in enumerate(clinical_data['Sample ID']):
        for dat in dataset['sample_ids']:
            if sam == dat:
                survival_status.append(clinical_data['Overall Survival Status'][i])
                survival_months.append(clinical_data['Overall Survival (Months)'][i])

    dataset['survival_status'] = survival_status
    dataset['survival_months'] = survival_months
    return dataset

# Expression patterns 

## 1st
- ### [Acute Myeloid Leukemia (OHSU, Nature 2018)](https://www.cbioportal.org/study?id=aml_ohsu_2018)

In [5]:
# print meta data
filepath = 'cBioPortal/aml_ohsu_2018/meta_study.txt'
with open(filepath) as fp:
    print( fp.read() )

type_of_cancer: aml
cancer_study_identifier: aml_ohsu_2018
name: Acute Myeloid Leukemia (OHSU, Nature 2018)
description: Whole-exome sequencing of 672 acute myeloid leukemia samples (with 454 matched normals) from the Beat AML program.
citation: Tyner et al. Nature 2018
pmid: 30333627
short_name: AML (OHSU)
groups: PUBLIC


In [6]:
# ls cBioPortal/aml_ohsu_2018

In [23]:
# read all RNA-Seq data 
filepath = 'cBioPortal/aml_ohsu_2018/data_RNA_Seq_mRNA_median_Zscores.txt'
with open(filepath) as fp:
    lines = [l.split('\t') for l in fp.readlines()]
    aml_ohsu_dataset = {
        'sample_ids': lines[0][2:],
        'Genes': [d[0] for d in lines[1:]], 
        'RNA-Seq': [np.array(d[2:], dtype = float) for d in lines[1:]]
    }
# subset RNA-Seq data to GI gene-pairs 
data = filter_dataset(aml_ohsu_dataset)

### Chi-square Test

For K562 GI pairs:

In [24]:
K562_test = {}
for g1,g2 in zip(data['K562_Gene1'],data['K562_Gene2']):
    if g1[0] != g2[0]:
        g_key = "_".join([g1[0], g2[0]])
        Obs = [[sum((g1[1]) * (g2[1])), # "low_low"
                sum((g1[2]) * (g2[1]))], #"high_low"
               [sum((g1[1]) * (g2[2])), #"low_high"
                sum((g1[2]) * (g2[2])) # "high_high"
                ]]
        Pval = list(chi2_contingency(Obs))[1]
        Exp = list(chi2_contingency(Obs))[3]
        K562_test[g_key] = {'Obs': Obs, 'Exp': Exp,
                            'Obs/Exp': Obs/Exp, 'Pvalue': Pval}

Low-Low gene pairs
$$Obs/Exp < 1$$

In [25]:
for t in K562_test:
    if (K562_test[t]['Obs/Exp'][0][0] < 1) & (K562_test[t]['Pvalue'] < 5e-14):
        print (t, '>> Obs:', K562_test[t]['Obs'], '>> Pvalue:', K562_test[t]['Pvalue'])
        print ('\tObs/Exp [low_low, high_low]:\t', [round(x, 4) for x in K562_test[t]['Obs/Exp'][0]])
        print ('\tObs/Exp [low_high, high_high]:\t', [round(x, 4) for x in K562_test[t]['Obs/Exp'][1]], '\n')

GOLT1B_CSTF3 >> Obs: [[2, 77], [56, 3]] >> Pvalue: 9.869438986774179e-27
	Obs/Exp [low_low, high_low]:	 [0.0602, 1.6813]
	Obs/Exp [low_high, high_high]:	 [2.2583, 0.0877] 

CSTF3_PSMD6 >> Obs: [[6, 47], [59, 13]] >> Pvalue: 2.359608202145314e-14
	Obs/Exp [low_low, high_low]:	 [0.2177, 1.8475]
	Obs/Exp [low_high, high_high]:	 [1.5759, 0.3762] 

SNIP1_TCOF1 >> Obs: [[8, 60], [48, 7]] >> Pvalue: 2.867723837359165e-16
	Obs/Exp [low_low, high_low]:	 [0.2584, 1.6198]
	Obs/Exp [low_high, high_high]:	 [1.9169, 0.2336] 

NAMPT_FARSA >> Obs: [[6, 61], [64, 4]] >> Pvalue: 2.26264038448349e-22
	Obs/Exp [low_low, high_low]:	 [0.1727, 1.8909]
	Obs/Exp [low_high, high_high]:	 [1.8151, 0.1222] 

RNF20_MED9 >> Obs: [[3, 51], [70, 4]] >> Pvalue: 5.673730615089704e-23
	Obs/Exp [low_low, high_low]:	 [0.0974, 2.198]
	Obs/Exp [low_high, high_high]:	 [1.6586, 0.1258] 

MED9_DDX46 >> Obs: [[4, 61], [52, 11]] >> Pvalue: 1.4492577701452405e-17
	Obs/Exp [low_low, high_low]:	 [0.1407, 1.6684]
	Obs/Exp [low_high, 

For Jurkat GI pairs:

In [26]:
Jurkat_test = {}
for g1,g2 in zip(data['Jurkat_Gene1'],data['Jurkat_Gene2']):
    if g1[0] != g2[0]:
        g_key = "_".join([g1[0], g2[0]])
        Obs = [[sum((g1[1]) * (g2[1])), # "low_low"
                sum((g1[2]) * (g2[1]))], #"high_low"
               [sum((g1[1]) * (g2[2])), #"low_high"
                sum((g1[2]) * (g2[2])) # "high_high"
                ]]
        Pval = list(chi2_contingency(Obs))[1]
        Exp = list(chi2_contingency(Obs))[3]
        Jurkat_test[g_key] = {'Obs': Obs, 'Exp': Exp,
                              'Obs/Exp': Obs/Exp, 'Pvalue': Pval}

Low-Low gene pairs
$$Obs/Exp < 1$$

In [28]:
for t in Jurkat_test:
    if (Jurkat_test[t]['Obs/Exp'][0][0] < 1) & (Jurkat_test[t]['Pvalue'] < 5e-13):
        print (t, '>> Obs:', Jurkat_test[t]['Obs'], '>> Pvalue:', Jurkat_test[t]['Pvalue'])
        print ('\tObs/Exp [low_low, high_low]:\t', [round(x, 4) for x in Jurkat_test[t]['Obs/Exp'][0]])
        print ('\tObs/Exp [low_high, high_high]:\t', [round(x, 4) for x in Jurkat_test[t]['Obs/Exp'][1]], '\n')

FAM208A_DCTN4 >> Obs: [[3, 46], [77, 7]] >> Pvalue: 1.4769061163465606e-21
	Obs/Exp [low_low, high_low]:	 [0.1018, 2.3558]
	Obs/Exp [low_high, high_high]:	 [1.524, 0.2091] 

DNTTIP2_MRPS18B >> Obs: [[1, 61], [57, 6]] >> Pvalue: 1.3533253127841908e-22
	Obs/Exp [low_low, high_low]:	 [0.0348, 1.8356]
	Obs/Exp [low_high, high_high]:	 [1.9499, 0.1777] 

MTPAP_FAM50A >> Obs: [[7, 45], [54, 10]] >> Pvalue: 1.1723388697309157e-13
	Obs/Exp [low_low, high_low]:	 [0.256, 1.8252]
	Obs/Exp [low_high, high_high]:	 [1.6045, 0.3295] 

FARSA_NAMPT >> Obs: [[6, 64], [61, 4]] >> Pvalue: 2.26264038448349e-22
	Obs/Exp [low_low, high_low]:	 [0.1727, 1.8151]
	Obs/Exp [low_high, high_high]:	 [1.8909, 0.1222] 

MRPL18_SSBP1 >> Obs: [[7, 37], [59, 7]] >> Pvalue: 5.980051547119442e-14
	Obs/Exp [low_low, high_low]:	 [0.2652, 2.1023]
	Obs/Exp [low_high, high_high]:	 [1.4899, 0.2652] 

YTHDC1_DDX46 >> Obs: [[8, 74], [46, 6]] >> Pvalue: 7.258214646670845e-19
	Obs/Exp [low_low, high_low]:	 [0.2421, 1.5116]
	Obs/Exp [

### Survival analysis 

In [12]:
# https://plot.ly/python/v3/ipython-notebooks/survival-analysis-r-vs-python/

In [13]:
survival_data = add_survival_data(aml_ohsu_dataset)

# 2nd
- ### [Pediatric Acute Lymphoid Leukemia - Phase II (TARGET, 2018)](https://www.cbioportal.org/study?id=all_phase2_target_2018_pub)

In [14]:
# meta data
filepath = 'cBioPortal/all_phase2_target_2018_pub/meta_study.txt'
with open(filepath) as fp:
    print( fp.read() )

type_of_cancer: bll
cancer_study_identifier: all_phase2_target_2018_pub
name: Pediatric Acute Lymphoid Leukemia - Phase II (TARGET, 2018)
short_name: ALL-Phase II (TARGET, 2018)
description: Comprehensive profiling of ALL Phase 2 samples. <p>TARGET data is intended exclusively for biomedical research using pediatric data (i.e., the research objectives cannot be accomplished using data from adults) that focus on the development of more effective treatments, diagnostic tests, or prognostic markers for childhood cancers. Moreover, TARGET data can be used for research relevant to the biology, causes, treatment and late complications of treatment of pediatric cancers, but is not intended for the sole purposes of methods and/or tool development (please see <a href="https://ocg.cancer.gov/programs/target/using-target-data">Using TARGET Data</a> section of the OCG website). If you are interested in using TARGET data for publication or other research purposes, you must follow the <a href="https

In [15]:
# ls cBioPortal/all_phase2_target_2018_pub

In [16]:
# read all RNA-Seq data 
filepath = 'cBioPortal/all_phase2_target_2018_pub/data_RNA_Seq_mRNA_median_Zscores.txt'
with open(filepath) as fp:
    lines = [l.split('\t') for l in fp.readlines()]
# switch NAs -> 0s
    for i,l in enumerate(lines):
        if 'NA' in l:
            n = len(l[2:])
            l = l[0:2]
            for z in [0]*n:
                l.append(z)
        lines[i] = l
    all_phase2 = {
        'sample_ids': lines[0][2:],
        'Genes': [d[0] for d in lines[1:]], 
        'RNA-Seq': [np.array(d[2:], dtype = float) for d in lines[1:]]
    }
# subset RNA-Seq data to GI gene-pairs 
data = filter_dataset(all_phase2)

### Chi-square Test

For K562 GI pairs:

In [17]:
K562_test = {}
for g1,g2 in zip(data['K562_Gene1'],data['K562_Gene2']):
    if g1[0] != g2[0]:
        g_key = "_".join([g1[0], g2[0]])
        Obs = [[sum((g1[1]) * (g2[1])), # "low_low"
                sum((g1[2]) * (g2[1]))], #"high_low"
               [sum((g1[1]) * (g2[2])), #"low_high"
                sum((g1[2]) * (g2[2])) # "high_high"
                ]]
        Pval = list(chi2_contingency(Obs))[1]
        Exp = list(chi2_contingency(Obs))[3]
        K562_test[g_key] = {'Obs': Obs, 'Exp': Exp,
                            'Obs/Exp': Obs/Exp, 'Pvalue': Pval}

Low-Low gene pairs
$$Obs/Exp < 1$$

In [22]:
for t in K562_test:
    if (K562_test[t]['Obs/Exp'][0][0] < 1) & (K562_test[t]['Pvalue'] < 5e-12):
        print (t, '>> Obs:', K562_test[t]['Obs'], '>> Pvalue:', K562_test[t]['Pvalue'])
        print ('\tObs/Exp [low_low, high_low]:\t', [round(x, 4) for x in K562_test[t]['Obs/Exp'][0]])
        print ('\tObs/Exp [low_high, high_high]:\t', [round(x, 4) for x in K562_test[t]['Obs/Exp'][1]], '\n')

GTPBP4_MRPS11 >> Obs: [[2, 29], [30, 0]] >> Pvalue: 1.6914330122079393e-12
	Obs/Exp [low_low, high_low]:	 [0.123, 1.9677]
	Obs/Exp [low_high, high_high]:	 [1.9062, 0.0] 

FH_SNIP1 >> Obs: [[0, 31], [31, 1]] >> Pvalue: 1.0286130192631635e-13
	Obs/Exp [low_low, high_low]:	 [0.0, 1.9688]
	Obs/Exp [low_high, high_high]:	 [1.9688, 0.0615] 

PNISR_COPE >> Obs: [[0, 22], [37, 2]] >> Pvalue: 2.3733651789734316e-12
	Obs/Exp [low_low, high_low]:	 [0.0, 2.5417]
	Obs/Exp [low_high, high_high]:	 [1.5641, 0.1303] 

ASNA1_PNN >> Obs: [[2, 40], [28, 2]] >> Pvalue: 3.513344076855311e-13
	Obs/Exp [low_low, high_low]:	 [0.1143, 1.6327]
	Obs/Exp [low_high, high_high]:	 [2.24, 0.1143] 

CDC27_NUTF2 >> Obs: [[1, 33], [32, 1]] >> Pvalue: 9.181537170200546e-14
	Obs/Exp [low_low, high_low]:	 [0.0597, 1.9126]
	Obs/Exp [low_high, high_high]:	 [1.9688, 0.0597] 

CCDC84_MRPL37 >> Obs: [[2, 25], [33, 0]] >> Pvalue: 3.0742381196899276e-12
	Obs/Exp [low_low, high_low]:	 [0.127, 2.2222]
	Obs/Exp [low_high, high_high]:

For Jurkat GI pairs:

In [19]:
Jurkat_test = {}
for g1,g2 in zip(data['Jurkat_Gene1'],data['Jurkat_Gene2']):
    if g1[0] != g2[0]:
        g_key = "_".join([g1[0], g2[0]])
        Obs = [[sum((g1[1]) * (g2[1])), # "low_low"
                sum((g1[2]) * (g2[1]))], #"high_low"
               [sum((g1[1]) * (g2[2])), #"low_high"
                sum((g1[2]) * (g2[2])) # "high_high"
                ]]
        Pval = list(chi2_contingency(Obs))[1]
        Exp = list(chi2_contingency(Obs))[3]
        Jurkat_test[g_key] = {'Obs': Obs, 'Exp': Exp,
                              'Obs/Exp': Obs/Exp, 'Pvalue': Pval}

Low-Low gene pairs
$$Obs/Exp < 1$$

In [20]:
for t in Jurkat_test:
    if (Jurkat_test[t]['Obs/Exp'][0][0] < 1) & (Jurkat_test[t]['Pvalue'] < 5e-10):
        print (t, '>> Obs:', Jurkat_test[t]['Obs'], '>> Pvalue:', Jurkat_test[t]['Pvalue'])
        print ('\tObs/Exp [low_low, high_low]:\t', [round(x, 4) for x in Jurkat_test[t]['Obs/Exp'][0]])
        print ('\tObs/Exp [low_high, high_high]:\t', [round(x, 4) for x in Jurkat_test[t]['Obs/Exp'][1]], '\n')

EIF6_RBM17 >> Obs: [[1, 35], [29, 1]] >> Pvalue: 1.5908223345963228e-13
	Obs/Exp [low_low, high_low]:	 [0.0611, 1.7824]
	Obs/Exp [low_high, high_high]:	 [2.1267, 0.0611] 

EFR3A_MRPL11 >> Obs: [[0, 27], [36, 0]] >> Pvalue: 1.5905368276201003e-14
	Obs/Exp [low_low, high_low]:	 [0.0, 2.3333]
	Obs/Exp [low_high, high_high]:	 [1.75, 0.0] 

POLR2B_SNRPC >> Obs: [[0, 25], [35, 2]] >> Pvalue: 1.1767932226554716e-12
	Obs/Exp [low_low, high_low]:	 [0.0, 2.2963]
	Obs/Exp [low_high, high_high]:	 [1.6757, 0.1241] 

SDHC_CDK12 >> Obs: [[4, 26], [32, 2]] >> Pvalue: 4.1406823675848444e-10
	Obs/Exp [low_low, high_low]:	 [0.237, 1.981]
	Obs/Exp [low_high, high_high]:	 [1.6732, 0.1345] 

OPA1_RPL32 >> Obs: [[1, 22], [33, 2]] >> Pvalue: 6.548899066609702e-11
	Obs/Exp [low_low, high_low]:	 [0.0742, 2.3116]
	Obs/Exp [low_high, high_high]:	 [1.6084, 0.1381] 

RPS27A_PAXBP1 >> Obs: [[1, 36], [25, 2]] >> Pvalue: 3.0929771409731258e-12
	Obs/Exp [low_low, high_low]:	 [0.0665, 1.6387]
	Obs/Exp [low_high, high_hi

In [21]:
### Other folders:
# aml_target_2018_pub
# laml_tcga
# laml_tcga_pan_can_atlas_2018
# laml_tcga_pub