In [1]:
import pandas as pd 
import numpy as np
from scipy.stats import chisquare
import matplotlib.pyplot as plt

## GI Scores
Load data from the [Mapping the Genetic Landscape of Human Cells](https://www.sciencedirect.com/science/article/pii/S0092867418307359?via%3Dihub#mmc2) paper. 
> [**Table S5.**](https://www.sciencedirect.com/science/article/pii/S0092867418307359?via%3Dihub#mmc5)

In [2]:
# read data from file 
myfile = ("CRISPRi_Mapping_paper/Table_S5.xlsx") 
# Open the workbook in a context manager so the underlying file handle is
# released as soon as every sheet has been parsed.
with pd.ExcelFile(myfile) as xl:
    # make data frame from each Excel sheet
    sheets = {sheet: xl.parse(sheet) for sheet in xl.sheet_names}

# Read data from the gene GI scores sheet.
gi_sheet = sheets['gene GI scores and correlations']
# The first three rows are dropped ([3:]) -- presumably extra header rows of
# the spreadsheet; TODO confirm against the workbook layout.
Gene1 = np.array(gi_sheet['Unnamed: 0'][3:])
Gene2 = np.array(gi_sheet['Unnamed: 1'][3:])
GIdataset = {
    'Gene1': Gene1,
    'Gene2': Gene2,
    # "<gene1>_<gene2>" label for every row
    'GenePairs': np.array(["_".join(pair) for pair in zip(Gene1, Gene2)]),
    'K562': np.array(gi_sheet['K562.4'][3:]),     # gene1 <-> gene2 GI scores
    'Jurkat': np.array(gi_sheet['Jurkat.4'][3:])  # gene1 <-> gene2 GI scores
}

In [3]:
SL_thr = 1 # Synthetic-Lethal threshold

# For each cell line keep only the rows whose GI score exceeds SL_thr, and
# store the score together with the matching gene-pair label and gene names.
SLdataset = {}
for cell_line in ('K562', 'Jurkat'):
    above_thr = GIdataset[cell_line] > SL_thr   # boolean row mask, computed once
    SLdataset[cell_line] = {
        'GI': GIdataset[cell_line][above_thr],
        'GenePair': GIdataset['GenePairs'][above_thr],
        'Gene1': GIdataset['Gene1'][above_thr],
        'Gene2': GIdataset['Gene2'][above_thr],
    }
# function to select GI data based on cell-line and gene-pair 1 or 2
def cell_gene_sorter(dataset,cell,gene):
    """Collect low-expression indicators for the genes of one SL gene column.

    Parameters
    ----------
    dataset : dict
        Must provide 'Genes' (sequence of gene names) and 'RNA-Seq' (one
        numeric array per gene, in the same order as 'Genes').
    cell : str
        Key into the module-level ``SLdataset`` (e.g. 'K562').
    gene : str
        Which gene column of ``SLdataset[cell]`` to use (e.g. 'Gene1').

    Returns
    -------
    list
        One ``[gene_name, is_low]`` entry per row of ``dataset`` whose gene
        name occurs in ``SLdataset[cell][gene]``; ``is_low`` is an integer
        array holding 1 where the expression value is below 0 and 0 elsewhere.

    Notes
    -----
    Genes are de-duplicated and genes absent from ``dataset`` are skipped, so
    the i-th entries returned for 'Gene1' and 'Gene2' are NOT guaranteed to
    belong to the same gene pair.
    """
    # Index the expression rows by gene name once instead of rescanning the
    # whole gene list for every SL gene.
    rows_by_gene = {}
    for i, g in enumerate(dataset['Genes']):
        rows_by_gene.setdefault(g, []).append(i)

    output = []
    # dict.fromkeys de-duplicates while keeping first-occurrence order, which
    # makes the result reproducible (set iteration order of strings is not).
    for sl in dict.fromkeys(SLdataset[cell][gene]):
        for i in rows_by_gene.get(sl, ()):
            is_low = 1*(dataset['RNA-Seq'][i] < 0)
            output.append([dataset['Genes'][i], is_low])
    return output
# 
def filter_dataset(dataset):
    """Run cell_gene_sorter for every cell-line / gene-column combination.

    Returns a dict keyed '<cell>_<gene>' for the cell lines K562 and Jurkat
    and the gene columns Gene1 and Gene2; each value is whatever
    cell_gene_sorter returns for that combination.
    """
    return {
        "_".join((cell, gene)): cell_gene_sorter(dataset, cell, gene)
        for cell in ('K562', 'Jurkat')
        for gene in ('Gene1', 'Gene2')
    }



# Expression patterns 

Read combined study clinical data (downloaded from cBioPortal):

In [4]:
# Combined-study clinical table; add_survival_data reads its 'Sample ID',
# 'Overall Survival Status' and 'Overall Survival (Months)' columns.
clinical_data = pd.read_csv('cBioPortal/combined_study_clinical_data.csv')
# function to add_survival_data 
def add_survival_data(dataset):
    """Attach overall-survival information to an expression dataset.

    Looks every entry of ``dataset['sample_ids']`` up in the module-level
    ``clinical_data`` table (column 'Sample ID') and stores two new lists in
    ``dataset``:

    - 'survival_status': value of 'Overall Survival Status'
    - 'survival_months': value of 'Overall Survival (Months)'

    Both lists have exactly one entry per sample ID, in the same order as
    ``dataset['sample_ids']``, so they line up with the expression columns.
    Samples without a clinical record get ``None`` / ``nan``. When a sample ID
    occurs more than once in the clinical table, its first row is used.

    The dataset is modified in place and also returned.
    """
    # Build the lookup once instead of scanning all samples for every
    # clinical row.
    survival_by_sample = {}
    for sam, status, months in zip(clinical_data['Sample ID'],
                                   clinical_data['Overall Survival Status'],
                                   clinical_data['Overall Survival (Months)']):
        survival_by_sample.setdefault(sam, (status, months))

    not_found = (None, np.nan)
    matched = [survival_by_sample.get(sam, not_found) for sam in dataset['sample_ids']]

    dataset['survival_status'] = [status for status, _ in matched]
    dataset['survival_months'] = [months for _, months in matched]
    return dataset

## 1st
- ### [Acute Myeloid Leukemia (OHSU, Nature 2018)](https://www.cbioportal.org/study?id=aml_ohsu_2018)

In [5]:
# Show the study's meta_study.txt file verbatim.
filepath = 'cBioPortal/aml_ohsu_2018/meta_study.txt'
with open(filepath) as meta_file:
    meta_text = meta_file.read()
print(meta_text)

type_of_cancer: aml
cancer_study_identifier: aml_ohsu_2018
name: Acute Myeloid Leukemia (OHSU, Nature 2018)
description: Whole-exome sequencing of 672 acute myeloid leukemia samples (with 454 matched normals) from the Beat AML program.
citation: Tyner et al. Nature 2018
pmid: 30333627
short_name: AML (OHSU)
groups: PUBLIC


In [6]:
# ls cBioPortal/aml_ohsu_2018

In [None]:
# read all RNA-Seq data 
filepath = 'cBioPortal/aml_ohsu_2018/data_RNA_Seq_mRNA_median_Zscores.txt'
with open(filepath) as fp:
    # Strip the line terminator before splitting: otherwise the last column
    # keeps a trailing '\n', so the last sample ID never equals the ID stored
    # in the clinical table. Blank lines are skipped.
    lines = [line.rstrip('\n').split('\t') for line in fp if line.strip()]

# First row is the header; in every row the first two columns are skipped and
# the remaining ones are the per-sample values.
OHSUdataset = {
    'sample_ids': lines[0][2:],
    'Genes': [d[0] for d in lines[1:]],
    'RNA-Seq': [np.array(d[2:], dtype = float) for d in lines[1:]]
}
# subset RNA-Seq data to GI gene-pairs 
data = filter_dataset(OHSUdataset)

### Chi-square Test

In [None]:
# data['K562_Gene1'] and data['K562_Gene2'] are de-duplicated and filtered
# independently, so their i-th entries do not belong to the same gene pair.
# Look the indicators up by gene name and walk the real SL pairs instead.
low_by_gene = {}
for gene_name, is_low in data['K562_Gene1'] + data['K562_Gene2']:
    low_by_gene.setdefault(gene_name, is_low)  # first expression row wins

test = {}
for gene1, gene2 in zip(SLdataset['K562']['Gene1'], SLdataset['K562']['Gene2']):
    if gene1 not in low_by_gene or gene2 not in low_by_gene:
        continue  # no expression data for at least one gene of the pair
    low1, low2 = low_by_gene[gene1], low_by_gene[gene2]
    g_key = "_".join([gene1, gene2])
    # Number of samples in each of the four low/high combinations.
    test[g_key] = {
        "low_low": int(np.sum(low1 * low2)),
        "high_low": int(np.sum((low1 == 0) * low2)),
        "low_high": int(np.sum(low1 * (low2 == 0))),
        "high_high": int(np.sum((low1 == 0) * (low2 == 0)))
    }
    # NOTE(review): without f_exp, chisquare compares the four counts with
    # equal expected frequencies; that is not a test of independence between
    # the two genes (scipy.stats.chi2_contingency would be) -- confirm intent.
    test[g_key]['Pvalue'] = list(chisquare(list(test[g_key].values())))[1]

In [None]:
# Report pairs with a very small p-value that have at least one sample in
# which both genes are low, together with the number of samples.
for pair_name, counts in test.items():
    if counts['Pvalue'] < 1e-15 and counts['low_low'] > 0:
        n_samples = len(data['K562_Gene1'][0][1])
        print(pair_name, '\t', '# of low_low:', counts['low_low'], ' from ', n_samples)

### Survival analysis 

In [None]:
# https://plot.ly/python/v3/ipython-notebooks/survival-analysis-r-vs-python/

In [None]:
# Adds the 'survival_status' and 'survival_months' entries to the OHSU dataset
# (add_survival_data updates the dict in place and returns the same object).
OHSUdataset = add_survival_data(OHSUdataset)

## 2nd
- ### [Pediatric Acute Lymphoid Leukemia - Phase II (TARGET, 2018)](https://www.cbioportal.org/study?id=all_phase2_target_2018_pub)

In [None]:
# Show the study's meta_study.txt file verbatim.
filepath = 'cBioPortal/all_phase2_target_2018_pub/meta_study.txt'
with open(filepath) as meta_file:
    meta_text = meta_file.read()
print(meta_text)

In [None]:
# ls cBioPortal/all_phase2_target_2018_pub

In [None]:
# read all RNA-Seq data 
filepath = 'cBioPortal/all_phase2_target_2018_pub/data_RNA_Seq_mRNA_median_Zscores.txt'
with open(filepath) as fp:
    # Strip the line terminator before splitting: otherwise the last column
    # keeps a trailing '\n', so the last sample ID is wrong and an 'NA' in the
    # last column is not recognised. Blank lines are skipped.
    lines = [line.rstrip('\n').split('\t') for line in fp if line.strip()]

# switch NAs -> 0s
# Only the missing entries are replaced; the measured values of a row that
# contains some NAs are kept instead of zeroing the whole row.
for row in lines[1:]:
    row[2:] = ['0' if value == 'NA' else value for value in row[2:]]

# First row is the header; in every row the first two columns are skipped and
# the remaining ones are the per-sample values.
all_phase2 = {
    'sample_ids': lines[0][2:],
    'Genes': [d[0] for d in lines[1:]],
    'RNA-Seq': [np.array(d[2:], dtype = float) for d in lines[1:]]
}
# subset RNA-Seq data to GI gene-pairs 
data = filter_dataset(all_phase2)

### Chi-square Test

In [None]:
# data['K562_Gene1'] and data['K562_Gene2'] are de-duplicated and filtered
# independently, so their i-th entries do not belong to the same gene pair.
# Look the indicators up by gene name and walk the real SL pairs instead.
low_by_gene = {}
for gene_name, is_low in data['K562_Gene1'] + data['K562_Gene2']:
    low_by_gene.setdefault(gene_name, is_low)  # first expression row wins

test = {}
for gene1, gene2 in zip(SLdataset['K562']['Gene1'], SLdataset['K562']['Gene2']):
    if gene1 not in low_by_gene or gene2 not in low_by_gene:
        continue  # no expression data for at least one gene of the pair
    low1, low2 = low_by_gene[gene1], low_by_gene[gene2]
    g_key = "_".join([gene1, gene2])
    # Number of samples in each of the four low/high combinations.
    test[g_key] = {
        "low_low": int(np.sum(low1 * low2)),
        "high_low": int(np.sum((low1 == 0) * low2)),
        "low_high": int(np.sum(low1 * (low2 == 0))),
        "high_high": int(np.sum((low1 == 0) * (low2 == 0)))
    }
    # NOTE(review): without f_exp, chisquare compares the four counts with
    # equal expected frequencies; that is not a test of independence between
    # the two genes (scipy.stats.chi2_contingency would be) -- confirm intent.
    test[g_key]['Pvalue'] = list(chisquare(list(test[g_key].values())))[1]

In [None]:
# Report pairs with a very small p-value that have at least one sample in
# which both genes are low, together with the number of samples.
for pair_name, counts in test.items():
    if counts['Pvalue'] < 1e-15 and counts['low_low'] > 0:
        n_samples = len(data['K562_Gene1'][0][1])
        print(pair_name, '\t', '# of low_low:', counts['low_low'], ' from ', n_samples)

In [None]:
### Other folders:
# aml_target_2018_pub
# laml_tcga
# laml_tcga_pan_can_atlas_2018
# laml_tcga_pub