## Univariate correlation analysis

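This notebook scores the univariate association between individual gene expression features and the mutation status of a single gene (set by `gene` below). It loads the expression and mutation label data, standardizes the expression columns, keeps the `mad_threshold` genes with the highest mean absolute deviation, and then tests each remaining gene against the mutation labels with `f_classif` (a one-way ANOVA F-test).

TODO: describe the results and their interpretation.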

In [1]:
import sys
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler

import pancancer_evaluation.config as cfg
import pancancer_evaluation.utilities.data_utilities as du

%load_ext autoreload
%autoreload 2

In [2]:
# gene whose column of the mutation matrix is used as the binary label
gene = 'TP53'
# number of top-ranked genes (by mean absolute deviation) to keep as features
mad_threshold = 100

### Load expression data and mutation label data

In [3]:
print('Loading gene label data...', file=sys.stderr)
genes_df = du.load_top_50()
sample_info_df = du.load_sample_info(verbose=True)

# load_pancancer_data hands back five dataframes bundled in a tuple;
# keep the tuple around and also bind each member to its own name
pancancer_data = du.load_pancancer_data(verbose=True)
sample_freeze_df, mutation_df, copy_loss_df, copy_gain_df, mut_burden_df = (
    pancancer_data
)

rnaseq_df = du.load_expression_data(verbose=True)

# z-score every expression column, writing the scaled values back into
# the same dataframe so its index and column labels are preserved
print('Standardizing columns of expression data...', file=sys.stderr)
rnaseq_df[rnaseq_df.columns] = StandardScaler().fit_transform(
    rnaseq_df[rnaseq_df.columns]
)

Loading gene label data...
Loading sample info...
Loading pan-cancer data from cached pickle file...
Loading gene expression data...
Standardizing columns of expression data...


In [4]:
# sanity check: dimensions and top-left 5x5 corner of the standardized
# expression dataframe
print(rnaseq_df.shape)
rnaseq_df.iloc[:5, :5]

(11060, 16148)


Unnamed: 0_level_0,1,10,100,1000,10000
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
TCGA-02-0047-01,-0.1441,-0.13645,-0.207065,1.049402,0.644625
TCGA-02-0055-01,-0.124925,-0.197893,-0.132694,0.704438,0.154763
TCGA-02-2483-01,-0.133543,-0.174587,-0.103291,1.47342,0.669303
TCGA-02-2485-01,-0.147052,-0.072888,-0.213119,4.405612,11.503035
TCGA-02-2486-01,-0.145321,-0.181076,-0.147395,1.013468,0.117745


In [5]:
# preview the per-sample annotations (barcodes, DISEASE, SUBTYPE) that are
# merged into the label dataframe below
sample_freeze_df.head()

Unnamed: 0,PATIENT_BARCODE,SAMPLE_BARCODE,DISEASE,SUBTYPE
0,TCGA-OR-A5J1,TCGA-OR-A5J1-01,ACC,Not_Applicable
1,TCGA-OR-A5J2,TCGA-OR-A5J2-01,ACC,Not_Applicable
2,TCGA-OR-A5J3,TCGA-OR-A5J3-01,ACC,Not_Applicable
3,TCGA-OR-A5J5,TCGA-OR-A5J5-01,ACC,Not_Applicable
4,TCGA-OR-A5J6,TCGA-OR-A5J6-01,ACC,Not_Applicable


In [6]:
# preview the top-left 5x5 corner of the mutation matrix; the column for
# `gene` is used as the label below
mutation_df.iloc[:5, :5]

Unnamed: 0_level_0,5S_rRNA,A1BG,A1CF,A2M,A2ML1
SAMPLE_BARCODE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
TCGA-02-0047-01,0,0,0,0,0
TCGA-02-0055-01,0,0,0,0,0
TCGA-02-2483-01,0,0,0,0,0
TCGA-02-2485-01,0,0,0,0,0
TCGA-02-2486-01,0,0,0,0,0


In [7]:
# Build the label dataframe: mutation status for `gene`, joined with the
# cancer type / subtype annotations and indexed by sample barcode.
y_df = (
    mutation_df[[gene]]
    .merge(sample_freeze_df, left_index=True, right_on='SAMPLE_BARCODE')
    .drop(columns='PATIENT_BARCODE')
    .set_index('SAMPLE_BARCODE')
    .rename(columns={
        gene: 'status',
        'DISEASE': 'cancer_type',
        'SUBTYPE': 'subtype',
    })
)
print(y_df.shape)
y_df.head()

(9074, 3)


Unnamed: 0_level_0,status,cancer_type,subtype
SAMPLE_BARCODE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
TCGA-02-0047-01,0,GBM,IDHwt
TCGA-02-0055-01,1,GBM,IDHwt
TCGA-02-2483-01,1,GBM,IDHmut-non-codel
TCGA-02-2485-01,1,GBM,IDHwt
TCGA-02-2486-01,0,GBM,IDHwt


In [8]:
# Align expression rows to the labeled samples, in the same order as y_df.
X_df = rnaseq_df.reindex(index=y_df.index)
print(X_df.shape)
# reindex fills labels missing from rnaseq_df with NaN, so print the total
# NaN count as a check
print(X_df.isna().sum().sum())
X_df.iloc[:5, :5]

(9074, 16148)
0


Unnamed: 0_level_0,1,10,100,1000,10000
SAMPLE_BARCODE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
TCGA-02-0047-01,-0.1441,-0.13645,-0.207065,1.049402,0.644625
TCGA-02-0055-01,-0.124925,-0.197893,-0.132694,0.704438,0.154763
TCGA-02-2483-01,-0.133543,-0.174587,-0.103291,1.47342,0.669303
TCGA-02-2485-01,-0.147052,-0.072888,-0.213119,4.405612,11.503035
TCGA-02-2486-01,-0.145321,-0.181076,-0.147395,1.013468,0.117745


### Subset genes by mean absolute deviation

In [9]:
# first subset by MAD
# DataFrame.mad() was deprecated in pandas 1.5 and removed in pandas 2.0, so
# compute the per-gene mean absolute deviation explicitly. This matches what
# .mad(axis=0) computed: the mean of |x - column mean| over each column.
# The result is an unnamed Series, so reset_index() still yields the columns
# 'index' and 0, which are renamed in a later cell.
# NOTE(review): rnaseq_df has already been standardized above, so this ranks
# genes by the MAD of z-scored values -- confirm this is intended.
mad_genes_df = (
    (rnaseq_df - rnaseq_df.mean(axis=0))
    .abs()
    .mean(axis=0)
    .sort_values(ascending=False)
    .reset_index()
)
mad_genes_df.head()

Unnamed: 0,index,0
0,126695,0.82109
1,147798,0.808853
2,10053,0.808764
3,54845,0.803626
4,582,0.802497


In [10]:
# give the MAD ranking table descriptive column names
mad_genes_df.columns = ['gene_id', 'mad']
# keep the IDs of the `mad_threshold` top-ranked genes, cast to str
# (presumably to match the expression column labels -- verify)
mad_genes = (
    mad_genes_df
    .head(mad_threshold)['gene_id']
    .astype(str)
    .values
)
print(mad_genes[:5])

['126695' '147798' '10053' '54845' '582']


In [11]:
# Restrict the feature matrix to the top-MAD genes, in ranked order.
X_df = X_df.reindex(columns=mad_genes)
print(X_df.shape)
X_df.iloc[:5, :5]

(9074, 100)


Unnamed: 0_level_0,126695,147798,10053,54845,582
SAMPLE_BARCODE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
TCGA-02-0047-01,-1.307892,-0.977865,-1.119059,-1.116767,0.458711
TCGA-02-0055-01,-1.30582,-0.964695,-1.1119,-1.114346,-0.551517
TCGA-02-2483-01,-1.307892,-0.984642,-1.119059,-1.116851,-0.470321
TCGA-02-2485-01,-1.306134,-0.984678,-1.118344,-1.113535,1.100724
TCGA-02-2486-01,-1.305639,-0.983574,-1.119059,-1.117114,1.00631


### Calculate pan-cancer univariate feature correlations

In [16]:
# now get univariate feature correlations with labels
# NOTE(review): this import sits mid-notebook; consider moving it to the
# imports cell at the top so all dependencies are visible in one place.
from sklearn.feature_selection import f_classif

# Per the scikit-learn docs, f_classif returns (F-statistics, p-values), so
# [1] selects the per-feature p-values rather than the F-statistics.
# NOTE(review): the name `f_stats` is therefore misleading; consider renaming
# it (e.g. to `p_values`) after checking how later cells use it.
f_stats = f_classif(X_df, y_df.status)[1]
print(f_stats)

[3.30376103e-061 5.89303262e-017 3.87480768e-009 1.97936420e-159
 1.36783750e-157 1.01019522e-096 2.10773419e-241 3.30813815e-038
 2.16020991e-002 2.57403481e-034 3.88371611e-005 1.02081438e-007
 6.27574327e-041 4.72294816e-041 5.38323720e-250 1.06467073e-022
 1.81748473e-003 1.10783243e-079 1.50747533e-006 4.06098083e-104
 1.23338680e-131 1.91652176e-019 9.32380111e-036 3.14246689e-220
 1.84882840e-022 1.81796563e-117 1.21833588e-157 1.59015079e-121
 1.26063171e-118 8.98865903e-002 2.27988910e-062 3.02784855e-043
 4.52288631e-034 6.82206522e-007 7.41179685e-005 1.67115046e-002
 1.52438886e-022 6.04835208e-004 1.51093597e-004 5.62790675e-027
 3.34299575e-050 1.16980359e-066 1.50087612e-095 8.35127997e-003
 1.44133374e-101 1.49492648e-008 2.21777383e-015 1.66186028e-030
 1.01991427e-158 1.25925318e-001 3.56574762e-088 3.31172474e-193
 1.29980085e-005 2.96384022e-181 7.40967401e-030 1.98052008e-232
 3.69402245e-083 1.21615995e-035 2.38969397e-004 6.04159042e-173
 1.71285445e-140 5.842665