In [1]:
import numpy as np
import pandas as pd 
import lifelines as ll
import matplotlib.pyplot as plt
import plotly as py
import plotly.tools as tls   
from time import time
from scipy.stats import chisquare, chi2_contingency

First, we define the helper functions used throughout the rest of the analysis:

## GI Scores
Load data from [Mapping the Genetic Landscape of Human Cells](https://www.sciencedirect.com/science/article/pii/S0092867418307359?via%3Dihub#mmc2) paper. 
> [**Table S5.**](https://www.sciencedirect.com/science/article/pii/S0092867418307359?via%3Dihub#mmc5)

In [283]:
def get_SLdataset(SL_thr = 1, filepath = "CRISPRi_Mapping_paper/Table_S5.xlsx"):
    """Load gene-pair GI scores from Table S5 and keep the pairs above a threshold.

    Parameters
    ----------
    SL_thr : float, default 1
        A gene pair is kept when its GI score is strictly greater than this value.
    filepath : str, default "CRISPRi_Mapping_paper/Table_S5.xlsx"
        Excel workbook containing the 'gene GI scores and correlations' sheet.

    Returns
    -------
    dict
        ``{'K562': DataFrame, 'Jurkat': DataFrame}``. Each frame has the columns
        ``Gene1``, ``Gene2`` and the cell-line score column, restricted to pairs
        with a non-missing score above ``SL_thr`` and ``Gene1 != Gene2``.

    Side effects
    ------------
    Prints, per cell line, the share and the count of retained pairs.
    """
    # Only one sheet is needed, so parse just that one; the context manager
    # closes the workbook handle once it has been read.
    with pd.ExcelFile(filepath) as xl:
        GIsheet = xl.parse('gene GI scores and correlations')

    # The first three rows of the sheet are skipped
    # (presumably sub-header rows -- TODO confirm against the workbook).
    body = GIsheet.iloc[3:]
    raw_data = pd.DataFrame(data = {
        'Gene1': body['Unnamed: 0'].to_numpy(),
        'Gene2': body['Unnamed: 1'].to_numpy(),
        # g1 <-> g2 GI scores, converted to numbers so that the threshold
        # comparison does not run on an object-dtype column.
        'K562': pd.to_numeric(body['K562.4']).to_numpy(),
        'Jurkat': pd.to_numeric(body['Jurkat.4']).to_numpy(),
    })

    def _select(cell_line, other):
        """Return (pairs with a score, pairs passing the threshold) for one cell line."""
        scored = raw_data.drop(columns=other).dropna()
        keep = (scored[cell_line] > SL_thr) & (scored['Gene1'] != scored['Gene2'])
        return scored, scored[keep].reset_index(drop=True)

    def _percent(part, whole):
        """Percentage of `whole` covered by `part`; 0.0 when `whole` is empty."""
        return round(100 * len(part) / len(whole), 2) if len(whole) else 0.0

    raw_K562, K562 = _select('K562', 'Jurkat')
    raw_Jurkat, Jurkat = _select('Jurkat', 'K562')

    print (f'K562: {_percent(K562, raw_K562)}%', end = '\t')
    print (f'{len(K562)} SLs from {len(raw_K562)} unique gene pairs')
    print (f'Jurkat: {_percent(Jurkat, raw_Jurkat)}%', end = '\t')
    print (f'{len(Jurkat)} SLs from {len(raw_Jurkat)} unique gene pairs ')
    return {'K562':K562, 'Jurkat':Jurkat}

## Read expression data


In [284]:
# make final data set
def make_study_dataset(filepath, get_RNASeq = False):
    """Turn a tab-separated expression table into per-gene low/high calls.

    The file is read as: one header row, then one row per gene. The first
    column is used as the gene label, the second column is ignored, and the
    remaining columns are the per-sample expression values.

    For every gene the 25% and 75% quantiles of its own values are computed;
    a sample is coded -1 when its value is <= the 25% quantile, +1 when it is
    >= the 75% quantile, and 0 otherwise (both flags set also gives 0).

    Parameters
    ----------
    filepath : str
        Path of the tab-separated expression file.
    get_RNASeq : bool, default False
        When true, the numeric expression vectors are returned as well.

    Returns
    -------
    pandas.DataFrame
        Genes as index, sample labels as columns, values in {-1, 0, 1}.
    (list of numpy.ndarray, pandas.DataFrame)
        Returned instead when ``get_RNASeq`` is true.

    Side effects
    ------------
    Prints the elapsed time.
    """
    t0 = time()
    with open(filepath) as fp:
        # Drop the line terminator before splitting: otherwise the last sample
        # label keeps a trailing '\n' and a trailing 'NA' is read as 'NA\n',
        # which is not recognised as missing and cannot be parsed as a float.
        # Blank lines are skipped.
        lines = [raw.rstrip('\n').split('\t') for raw in fp if raw.strip()]
    header, records = lines[0], lines[1:]

    RNA_seq = []
    for fields in records:
        values = fields[2:]
        # switch NAs -> 0s
        # NOTE: a gene with at least one missing value has ALL of its values
        # set to 0 (the original behaviour, kept on purpose). Only the value
        # columns are inspected, so an 'NA' identifier does not blank the gene.
        if 'NA' in values:
            values = [0] * len(values)
        RNA_seq.append(np.array(values, dtype = float))

    calls = []
    for profile in RNA_seq:
        q1, q3 = np.quantile(profile, [0.25, 0.75])   # low / high expression thresholds
        calls.append((profile >= q3).astype(int) - (profile <= q1).astype(int))

    data = pd.DataFrame(data = calls,
                        index = [fields[0] for fields in records],
                        columns = header[2:])
    print("done in %fs" % (time() - t0))
    if get_RNASeq:
        return RNA_seq, data
    return data

## Statistical Test 

0       1.21985
1       1.26319
2       1.28139
3       1.15275
4        2.1395
         ...   
9766    1.14273
9767    1.37603
9768    1.37749
9769    3.14675
9770    2.57599
Name: Jurkat, Length: 9771, dtype: object

In [474]:
def SL_stat_test(data, SLdata, cell_line = 'K562'):
    """Chi-square test of low/high co-expression for every gene pair of a cell line.

    For each pair (Gene1, Gene2) a 2x2 table of sample counts is built from
    the sign of the entries in ``data`` (< 0 is "low", > 0 is "high"):

        [[low_low,  high_low ],
         [low_high, high_high]]

    where the first word refers to Gene1 and the second to Gene2. The table is
    passed to ``scipy.stats.chi2_contingency`` with its default settings
    (which, per the SciPy documentation, include Yates' continuity correction
    for 2x2 tables).

    Parameters
    ----------
    data : pandas.DataFrame
        Genes as index, samples as columns; negative entries mark low
        expression and positive entries mark high expression.
    SLdata : dict
        Maps a cell-line name to a DataFrame with the columns ``Gene1``,
        ``Gene2`` and a score column named after the cell line.
    cell_line : str, default 'K562'
        Key of ``SLdata`` (and name of its score column) to test.

    Returns
    -------
    pandas.DataFrame
        One row per pair whose two genes are both present in ``data.index``,
        with the columns Gene1, Gene2, GI_Score, the four observed counts,
        the four expected counts, Pval and Chi2. When the test cannot be
        computed for a table (``chi2_contingency`` raises ``ValueError``, e.g.
        because an expected frequency is zero), the expected counts, Pval
        and Chi2 of that row are NaN.

    Side effects
    ------------
    Prints the elapsed time.
    """
    t0 = time()
    pairs = SLdata[cell_line]
    # filter out missing gene names between GI and expression studies
    in_study = pairs.Gene1.isin(data.index) & pairs.Gene2.isin(data.index)
    pairs = pairs[in_study].reset_index(drop=True)

    # Each gene usually takes part in many pairs: compute its low/high masks once.
    flag_cache = {}

    def _flags(gene):
        """Return the (is_low, is_high) boolean arrays of one gene."""
        if gene not in flag_cache:
            row = data.loc[gene]
            flag_cache[gene] = (np.asarray(row < 0), np.asarray(row > 0))
        return flag_cache[gene]

    def _count(mask):
        return int(np.count_nonzero(mask))

    rows = []
    for g1, g2, score in zip(pairs.Gene1, pairs.Gene2, pairs[cell_line]):
        low1, high1 = _flags(g1)
        low2, high2 = _flags(g2)
        obs = [[_count(low1 & low2), _count(high1 & low2)],     # low_low,  high_low
               [_count(low1 & high2), _count(high1 & high2)]]   # low_high, high_high
        # do chi-square test:
        try:
            chi2, pval, _dof, expected = chi2_contingency(obs)
            exp = list(np.ravel(expected))   # row-major, same order as the observed counts
        except ValueError:
            # Degenerate table: record the pair instead of aborting the whole run.
            chi2 = pval = np.nan
            exp = [np.nan] * 4
        rows.append([g1, g2, score, *obs[0], *obs[1], *exp, pval, chi2])

    # make the test result dataframe:
    test_res = pd.DataFrame(rows, columns=[
        'Gene1', 'Gene2', 'GI_Score',
        'Obs_Low_Low', 'Obs_High_Low', 'Obs_Low_High', 'Obs_High_High',
        'Exp_Low_Low' ,'Exp_High_Low', 'Exp_Low_High', 'Exp_High_High', 'Pval', 'Chi2'])
    print("done in %fs" % (time() - t0))
    return test_res

In [320]:
    # NOTE(review): this cell contains only an indented fragment. The names it
    # uses (tmp, Obs, Exp, Pval, Chi2) are not defined at notebook level in the
    # visible cells, and the statement mirrors the result-frame construction
    # inside SL_stat_test minus the GI-score column -- it looks like a leftover
    # copy of an earlier version. Confirm and remove.
    test_res = pd.DataFrame(data= np.concatenate((
        np.stack(( tmp.Gene1 ,tmp.Gene2), axis = 1),
        np.array(Obs), np.array(Exp), 
        np.stack(( Pval, Chi2), axis = 1),
        ), axis = 1), columns=[
        'Gene1', 'Gene2', 
        'Obs_Low_Low', 'Obs_High_Low', 'Obs_Low_High', 'Obs_High_High',
        'Exp_Low_Low' ,'Exp_High_Low', 'Exp_Low_High', 'Exp_High_High', 'Pval', 'Chi2'])

## Survival analysis 
Read combined study clinical data (downloaded from cBioPortal):

In [94]:
def SL_survival(data, test):
    """Build, for each gene pair, a per-sample table of low-expression flags and survival.

    Parameters
    ----------
    data : dict
        Must provide ``data['raw']['Genes']`` (gene names),
        ``data['raw']['RNA-Seq']`` (indexable by gene position, giving that
        gene's per-sample expression values) and ``data['raw']['sample_ids']``
        (sample labels in the same order as the expression values).
        NOTE(review): this layout differs from what make_study_dataset
        returns -- confirm how callers build it.
    test : iterable of str
        Gene pairs written as ``'GENE1_GENE2'``.

    Returns
    -------
    list of pandas.DataFrame
        One frame per pair with the columns ``sample_ids``, ``'<g1> is low'``,
        ``'<g2> is low'``, ``Status`` and ``Months``. A flag is 1 when the
        sample's value is <= the gene's 25% quantile. Samples absent from the
        clinical table are skipped and rows with missing values are dropped.

    Raises
    ------
    IndexError
        If a gene of a pair is not listed in ``data['raw']['Genes']``.

    Side effects
    ------------
    Reads 'cBioPortal/combined_study_clinical_data.csv' and prints the
    elapsed time.
    """
    t0 = time()
    clinical_data = pd.read_csv('cBioPortal/combined_study_clinical_data.csv')

    # Index the clinical table once; the first row of each sample wins, which
    # matches taking `.values[0]` of a per-sample filter.
    clinical_by_sample = {}
    for sam, status, months in zip(clinical_data['Sample ID'],
                                   clinical_data['Overall Survival Status'],
                                   clinical_data['Overall Survival (Months)']):
        clinical_by_sample.setdefault(sam, (status, months))

    raw = data['raw']
    sample_ids = raw['sample_ids']

    def _expression_of(gene):
        """Return the per-sample expression vector of the first gene named `gene`."""
        matches = [i for i, g in enumerate(raw['Genes']) if g == gene]
        if not matches:
            raise IndexError(f"gene {gene!r} not found in data['raw']['Genes']")
        return np.asarray(raw['RNA-Seq'][matches[0]])

    dfs = []
    for t in test:
        g1, g2 = t.split('_')
        expr1 = _expression_of(g1)
        expr2 = _expression_of(g2)
        # The same vector is used for the threshold and for the per-sample
        # comparison, so sample p is always read from the gene's own row.
        low1 = expr1 <= np.quantile(expr1, 0.25)   # Gene 1 threshold
        low2 = expr2 <= np.quantile(expr2, 0.25)   # Gene 2 threshold

        survival_data = []
        for p, sam in enumerate(sample_ids):
            if sam in clinical_by_sample:
                status, months = clinical_by_sample[sam]
                survival_data.append([sam, int(low1[p]), int(low2[p]), status, months])
        dfs.append(
            pd.DataFrame(survival_data, columns=['sample_ids', g1 + ' is low', g2 + ' is low',
                                                 'Status', 'Months']).dropna()
        )
    print("done in %fs" % (time() - t0))
    return dfs

# Investigate expression patterns 

## 1st
- ### [Acute Myeloid Leukemia (OHSU, Nature 2018)](https://www.cbioportal.org/study?id=aml_ohsu_2018)

In [8]:
# ls cBioPortal/aml_ohsu_2018

In [271]:
# Print the study's metadata file (meta_study.txt) so the cohort description
# is visible right next to the analysis.
with open('cBioPortal/aml_ohsu_2018/meta_study.txt') as fp:
    print( fp.read() )

type_of_cancer: aml
cancer_study_identifier: aml_ohsu_2018
name: Acute Myeloid Leukemia (OHSU, Nature 2018)
description: Whole-exome sequencing of 672 acute myeloid leukemia samples (with 454 matched normals) from the Beat AML program.
citation: Tyner et al. Nature 2018
pmid: 30333627
short_name: AML (OHSU)
groups: PUBLIC


In [272]:
# Gene pairs above the default GI-score threshold, one frame per cell line.
SLdata = get_SLdataset()
# Per-gene low/high expression calls (-1 / 0 / +1) for this study.
data = make_study_dataset('cBioPortal/aml_ohsu_2018/data_RNA_Seq_mRNA_median_Zscores.txt')

done in 15.731779s


### Chi-square Test

Low-Low gene pairs for **K562** GI pairs:

In [467]:
# Chi-square test on every K562 gene pair whose two genes are both present in `data`.
test = SL_stat_test(data, SLdata, cell_line = 'K562')

done in 134.052290s


> $\left(Obs/Exp\right)_{\text{Low-Low}} < \left(Obs/Exp\right)_{\text{High-High}}$

In [473]:
# Pairs whose Low/Low cell is less enriched (Obs/Exp) than their High/High cell
# and whose chi-square p-value is below 1e-10, ranked by GI score.
# SL_stat_test names the score column 'GI_Score' for every cell line, so the
# sort key must be 'GI_Score' rather than the cell-line name.
test[
    ((test.Obs_Low_Low / test.Exp_Low_Low) < (test.Obs_High_High / test.Exp_High_High))
    & (test.Pval < 1e-10)
].sort_values('GI_Score', ascending = False)

Unnamed: 0,Gene1,Gene2,K562,Obs_Low_Low,Obs_High_Low,Obs_Low_High,Obs_High_High,Exp_Low_Low,Exp_High_Low,Exp_Low_High,Exp_High_High,Pval,Chi2
10662,LEO1,MRPL36,5.69817,72,1,1,70,37.0069,35.9931,35.9931,35.0069,1.32063e-30,132.248
10680,LEO1,MRPS5,5.40063,60,16,7,40,41.3984,34.6016,25.6016,21.3984,1.52926e-11,45.4963
10675,LEO1,MRPS23,5.28603,55,13,6,43,35.453,32.547,25.547,23.453,9.00831e-13,51.0491
1224,ATP5A1,NDUFB1,5.25815,53,6,11,46,32.5517,26.4483,31.4483,25.5517,9.35713e-14,55.4976
16661,RPL23,RPL35,4.9084,63,9,5,61,35.4783,36.5217,32.5217,33.4783,3.24159e-20,84.8361
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9732,GTPBP4,NOP58,1.00064,60,14,4,46,38.1935,35.8065,25.8065,24.1935,5.94653e-15,60.9192
7934,EMG1,MRPL46,1.00054,63,6,9,57,36.8,32.2,35.2,30.8,7.34535e-19,78.6686
4771,CNOT1,DARS,1.00026,60,9,13,50,38.1591,30.8409,34.8409,28.1591,7.44205e-14,55.9478
13527,MZT1,PGK1,1.00019,57,15,7,48,36.2835,35.7165,27.7165,27.2835,4.45065e-13,52.4335


In [433]:
# Same Obs/Exp comparison as above, additionally requiring more Low/Low than
# High/Low samples, ranked from the most to the least significant pair.
test[
    (test.Obs_Low_Low/ test.Exp_Low_Low < test.Obs_High_High/test.Exp_High_High) & 
#     (test.Obs_High_Low > 3) & (test.Obs_Low_High > 3) &
    (test.Obs_Low_Low > test.Obs_High_Low) &
    (test.Pval < 1e-10)
].sort_values('Pval')

Unnamed: 0,Gene1,Gene2,Obs_Low_Low,Obs_High_Low,Obs_Low_High,Obs_High_High,Exp_Low_Low,Exp_High_Low,Exp_Low_High,Exp_High_High,Pval,Chi2
3886,CDCA8,CDK1,98,0,0,90,51.0851,46.9149,46.9149,43.0851,6.44217e-42,184.014
16725,RPL3,RPL5,94,0,0,86,49.0889,44.9111,44.9111,41.0889,3.59495e-40,176.014
10459,KIF11,SGOL1,91,0,0,88,46.2626,44.7374,44.7374,43.2626,5.92338e-40,175.021
4266,CENPM,RAD51,92,0,0,86,47.5506,44.4494,44.4494,41.5506,9.80978e-40,174.018
3903,CDCA8,KIF14,92,0,0,86,47.5506,44.4494,44.4494,41.5506,9.80978e-40,174.018
...,...,...,...,...,...,...,...,...,...,...,...,...
13911,NDUFA8,TMEM261,59,16,8,39,41.1885,33.8115,25.8115,21.1885,9.63616e-11,41.8939
8995,GEMIN4,PDCD7,49,9,14,46,30.9661,27.0339,32.0339,27.9661,9.65069e-11,41.891
14876,PDSS2,TFB1M,56,9,16,43,37.7419,27.2581,34.2581,24.7419,9.72974e-11,41.875
14628,NUTF2,TUBB,55,19,9,48,36.1527,37.8473,27.8473,29.1527,9.90243e-11,41.8406


Low-Low gene pairs for **Jurkat** GI pairs:

In [383]:
# Same test for the Jurkat gene pairs; note that this rebinds `test`,
# replacing the K562 result computed earlier.
test = SL_stat_test(data, SLdata, cell_line = 'Jurkat')

done in 125.003775s


> $\left(Obs/Exp\right)_{\text{Low-Low}} < \left(Obs/Exp\right)_{\text{High-High}}$

### Survival analysis 

https://plot.ly/python/v3/ipython-notebooks/survival-analysis-r-vs-python/

## 2nd
- ### [Pediatric Acute Lymphoid Leukemia - Phase II (TARGET, 2018)](https://www.cbioportal.org/study?id=all_phase2_target_2018_pub)

In [None]:
# Print the study's metadata file (meta_study.txt).
# Note: this rebinds the notebook-level names `filepath` and `fp`.
filepath = 'cBioPortal/all_phase2_target_2018_pub/meta_study.txt'
with open(filepath) as fp:
    print( fp.read() )

In [None]:
# ls cBioPortal/all_phase2_target_2018_pub

In [61]:
# Per-gene low/high expression calls for the second study; this rebinds `data`,
# replacing the frame loaded for the first study.
data = make_study_dataset('cBioPortal/all_phase2_target_2018_pub/data_RNA_Seq_mRNA_median_Zscores.txt')

### Chi-square Test

Low-Low gene pairs for **K562** GI pairs:
$$Obs/Exp < 1$$

In [62]:
# NOTE(review): `GI_stat_test` is not defined in any visible cell, and `data`
# is bound above to the DataFrame returned by make_study_dataset, so
# `data['GI']` would be a column lookup for a sample named 'GI'. This cell
# appears to target an earlier API; the test function defined in this notebook
# is SL_stat_test(data, SLdata, cell_line=...) -- confirm and port.
# As written, the loop expects `test` to map a label to a dict with the keys
# 'Obs', 'Pvalue' and 'Obs/Exp' (a 2x2 nested sequence).
cell = 'K562'
test = GI_stat_test(data['GI'], cell, t_pval = 5e-14)
for t in test:
    print (t, '>> Obs:', test[t]['Obs'], '>> Pvalue:', test[t]['Pvalue'])
    print ('\tObs/Exp [low_low, high_low]:\t', [round(x, 4) for x in test[t]['Obs/Exp'][0]])
    print ('\tObs/Exp [low_high, high_high]:\t', [round(x, 4) for x in test[t]['Obs/Exp'][1]], '\n')

Low-Low gene pairs for **Jurkat** GI pairs:
$$Obs/Exp < 1$$

In [None]:
# NOTE(review): `GI_stat_test` is not defined in any visible cell, and `data`
# is bound above to the DataFrame returned by make_study_dataset, so
# `data['GI']` would be a column lookup for a sample named 'GI'. This cell
# appears to target an earlier API; the test function defined in this notebook
# is SL_stat_test(data, SLdata, cell_line=...) -- confirm and port.
# Apart from the cell-line name, this cell duplicates the K562 one; a helper
# taking the cell line as a parameter would remove the copy.
cell = 'Jurkat'
test = GI_stat_test(data['GI'], cell, t_pval = 5e-14)
for t in test:
    print (t, '>> Obs:', test[t]['Obs'], '>> Pvalue:', test[t]['Pvalue'])
    print ('\tObs/Exp [low_low, high_low]:\t', [round(x, 4) for x in test[t]['Obs/Exp'][0]])
    print ('\tObs/Exp [low_high, high_high]:\t', [round(x, 4) for x in test[t]['Obs/Exp'][1]], '\n')

In [None]:
### Other folders:
# aml_target_2018_pub
# laml_tcga
# laml_tcga_pan_can_atlas_2018
# laml_tcga_pub