# Detect Separation Between Two Phenotypes in TCGA, GTEx, and TARGET Data

**Gregory Way, 2019**

Perform a t-test between two distinct phenotypes. In TCGA and GTEx data, we perform a t-test between males and females, while in TARGET data, we test MYCN amplified vs. MYCN not amplified neuroblastoma tumors. We track the t-test p-values across k dimensions and algorithms to isolate the features that best distinguish the two groups.

In [1]:
import os
import sys
import numpy as np
import pandas as pd
from scipy.stats import ttest_ind, ttest_rel
import matplotlib.pyplot as plt
import seaborn as sns

sys.path.append("../8.gtex-interpret")
from scripts.utils import (
    load_weight_matrix,
    apply_signature,
    load_enrichment_results,
    extract_feature,
)

sys.path.append("../9.tcga-classify/")
from scripts.tcga_util import build_feature_dictionary

In [2]:
def ttest_difference(feature_series, group_a_ids, group_b_ids):
    """
    Compare one compressed feature between two groups of samples with a t-test.
    To be applied to a pandas dataframe by column.

    Arguments:
    feature_series - pandas Series of feature scores indexed by sample id. Its name must
                     have the form "<algorithm>_<feature number>" (for example "vae_108")
    group_a_ids - sample ids of the first group; ids missing from the index are ignored
    group_b_ids - sample ids of the second group; ids missing from the index are ignored

    Output:
    A list [t statistic, p value, algorithm, feature number]. The algorithm and feature
    number are the two string pieces of the series name.
    """
    # Split on the last underscore only, so that an algorithm name that itself contains
    # an underscore still parses instead of raising a ValueError on unpacking
    feature_algorithm, feature_num = feature_series.name.rsplit('_', 1)

    sample_ids = feature_series.index
    a_activation = feature_series[sample_ids.isin(group_a_ids)]
    b_activation = feature_series[sample_ids.isin(group_b_ids)]

    # Independent two-sample t-test between the groups (scipy default settings)
    t_stat, t_p = ttest_ind(a_activation, b_activation)

    return [t_stat, t_p, feature_algorithm, feature_num]


def get_ttest_results(z_matrix_dict, group_a_ids, group_b_ids, train_or_test='test'):
    """
    Loop through the z matrices performing a t-test on every compressed feature.

    Arguments:
    z_matrix_dict - nested dictionary accessed as
                    z_matrix_dict[signal][z_dim][seed][train_or_test]; every leaf is a
                    pandas DataFrame whose columns are features named
                    "<algorithm>_<feature number>" and whose index holds sample ids
    group_a_ids - sample ids of the first group
    group_b_ids - sample ids of the second group
    train_or_test - which partition key to read from each seed entry (default 'test')

    Output:
    A DataFrame with one row per tested feature and the columns t_stat, t_p, algorithm,
    feature_num, z_dim, signal, seed and neg_log_p (-log10 of t_p), sorted by neg_log_p
    in descending order. Row labels restart at 0 for every matrix and are not unique.
    """
    result_columns = ['t_stat', 't_p', 'algorithm', 'feature_num']

    # Perform t-test for all compressed features
    full_results = []
    for signal, z_dim_dict in z_matrix_dict.items():
        for z_dim, seed_dict in z_dim_dict.items():
            for seed, partition_dict in seed_dict.items():
                z_df = partition_dict[train_or_test]

                # Walk the columns explicitly rather than through DataFrame.apply. For a
                # function that returns equal-length lists, the object apply gives back
                # differs between pandas releases (a Series of lists in older ones, an
                # expanded DataFrame in newer ones), which broke the downstream reshaping.
                feature_results = [
                    ttest_difference(feature_series=feature_series,
                                     group_a_ids=group_a_ids,
                                     group_b_ids=group_b_ids)
                    for _, feature_series in z_df.items()
                ]

                # Missing results are replaced by 1, i.e. a non-significant p value.
                # NOTE(review): this also sets an undefined t statistic to 1; kept as is
                # so previously written result files stay reproducible.
                result_df = pd.DataFrame(feature_results, columns=result_columns).fillna(1)

                result_df = result_df.assign(
                    z_dim=z_dim,
                    signal=signal,
                    seed=seed
                )
                full_results.append(result_df)

    full_results_df = pd.concat(full_results)
    full_results_df = full_results_df.assign(neg_log_p=-np.log10(full_results_df.t_p))
    full_results_df = full_results_df.sort_values(by='neg_log_p', ascending=False)
    return full_results_df

## 1. GTEx Sex Analysis

In [3]:
# Read the GTEx subject phenotype annotations (tab-delimited text file)
file = os.path.join(
    "..",
    "0.expression-download",
    "download",
    "GTEx_v7_Annotations_SubjectPhenotypesDS.txt",
)
gtex_pheno_df = pd.read_csv(file, sep="\t")
gtex_pheno_df.head()

Unnamed: 0,SUBJID,SEX,AGE,DTHHRDY
0,GTEX-1117F,2,60-69,4.0
1,GTEX-111CU,1,50-59,0.0
2,GTEX-111FC,1,60-69,1.0
3,GTEX-111VG,1,60-69,3.0
4,GTEX-111YS,1,60-69,0.0


In [4]:
# Build the nested dictionary of GTEx compressed-feature (z) matrices.
# It is indexed below as [signal][z_dim][seed]['test'].
gtex_z_matrix_dict = build_feature_dictionary(
    dataset="GTEX",
    load_data=True,
    store_train_test="test",
)

In [5]:
# Collect the male and female sample ids. A single matrix serves as the source of
# sample ids (all matrices are aligned with the same IDs).
example_matrix_df = gtex_z_matrix_dict['signal']['8']['451283']['test']

# The subject id is the first two dash-separated fields of the sample id
gtex_subject_ids = [
    "-".join((parts[0], parts[1]))
    for parts in example_matrix_df.index.str.split('-')
]

# Left-merge keeps one row per sample, in the order of the matrix index
gtex_subject_pheno_df = pd.DataFrame(gtex_subject_ids, columns=['patient_id']).merge(
    gtex_pheno_df,
    how='left',
    left_on='patient_id',
    right_on='SUBJID',
)

# Append the full sample id as the last column (both frames share a 0..n-1 index);
# the column takes the name of the matrix index, read below as `sample_id`
patient_id_df = pd.concat(
    [gtex_subject_pheno_df, pd.DataFrame(example_matrix_df.index)],
    axis='columns',
)

# SEX code 1 is used for the male group and 2 for the female group
gtex_males = patient_id_df.loc[patient_id_df["SEX"] == 1, "sample_id"].tolist()
gtex_females = patient_id_df.loc[patient_id_df["SEX"] == 2, "sample_id"].tolist()

print(patient_id_df.shape)
patient_id_df.head()

(1169, 6)


Unnamed: 0,patient_id,SUBJID,SEX,AGE,DTHHRDY,sample_id
0,GTEX-ZTX8,GTEX-ZTX8,1,20-29,0.0,GTEX-ZTX8-1126-SM-51MRM
1,GTEX-Y3IK,GTEX-Y3IK,2,50-59,0.0,GTEX-Y3IK-2426-SM-4WWDU
2,GTEX-X62O,GTEX-X62O,1,50-59,0.0,GTEX-X62O-0826-SM-46MW8
3,GTEX-13O3O,GTEX-13O3O,2,60-69,3.0,GTEX-13O3O-0011-R5b-SM-5KM44
4,GTEX-X15G,GTEX-X15G,2,50-59,0.0,GTEX-X15G-1926-SM-4PQZQ


In [6]:
# Male vs. female t-test on every compressed GTEx feature (test partition)
gtex_full_results_df = get_ttest_results(
    z_matrix_dict=gtex_z_matrix_dict,
    group_a_ids=gtex_males,
    group_b_ids=gtex_females,
    train_or_test='test',
)

In [7]:
# Write the GTEx t-test table as a tab-separated file, without the row index
file = os.path.join("results", "sex_separation_gtex_t_test.tsv")
gtex_full_results_df.to_csv(
    file,
    sep="\t",
    index=False,
)

print(gtex_full_results_df.shape)
gtex_full_results_df.head()

(61700, 8)


Unnamed: 0,t_stat,t_p,algorithm,feature_num,z_dim,signal,seed,neg_log_p
708,-48.973734,2.710249e-285,vae,108,200,signal,486191,284.566991
590,-43.23931,1.3580680000000001e-244,vae,140,150,signal,978124,243.867078
511,38.567742,1.7318179999999998e-210,nmf,111,200,signal,451283,209.761498
411,38.180061,1.250056e-207,nmf,111,150,signal,486191,206.90307
361,38.083695,6.42639e-207,nmf,111,125,signal,486191,206.192033


## 2. TCGA Sex Analysis

In [8]:
# Read the TCGA clinical data resource spreadsheet (phenotype table)
file = os.path.join(
    "..",
    "0.expression-download",
    "download",
    "TCGA-CDR-SupplementalTableS1.xlsx",
)
tcga_pheno_df = pd.read_excel(file)

tcga_pheno_df.head()

Unnamed: 0,bcr_patient_barcode,type,age_at_initial_pathologic_diagnosis,gender,race,ajcc_pathologic_tumor_stage,clinical_stage,histological_type,histological_grade,initial_pathologic_dx_year,...,residual_tumor,OS,OS.time,DSS,DSS.time,DFI,DFI.time,PFI,PFI.time,Redaction
1,TCGA-OR-A5J1,ACC,58.0,MALE,WHITE,Stage II,[Not Applicable],Adrenocortical carcinoma- Usual Type,[Not Available],2000.0,...,,1.0,1355.0,1.0,1355.0,1.0,754.0,1.0,754.0,
2,TCGA-OR-A5J2,ACC,44.0,FEMALE,WHITE,Stage IV,[Not Applicable],Adrenocortical carcinoma- Usual Type,[Not Available],2004.0,...,,1.0,1677.0,1.0,1677.0,,,1.0,289.0,
3,TCGA-OR-A5J3,ACC,23.0,FEMALE,WHITE,Stage III,[Not Applicable],Adrenocortical carcinoma- Usual Type,[Not Available],2008.0,...,,0.0,2091.0,0.0,2091.0,1.0,53.0,1.0,53.0,
4,TCGA-OR-A5J4,ACC,23.0,FEMALE,WHITE,Stage IV,[Not Applicable],Adrenocortical carcinoma- Usual Type,[Not Available],2000.0,...,,1.0,423.0,1.0,423.0,,,1.0,126.0,
5,TCGA-OR-A5J5,ACC,30.0,MALE,WHITE,Stage III,[Not Applicable],Adrenocortical carcinoma- Usual Type,[Not Available],2000.0,...,,1.0,365.0,1.0,365.0,,,1.0,50.0,


In [9]:
# Build the nested dictionary of TCGA compressed-feature (z) matrices.
# It is indexed below as [signal][z_dim][seed]['test'].
tcga_z_matrix_dict = build_feature_dictionary(
    dataset="TCGA",
    load_data=True,
    store_train_test="test",
)

In [10]:
# Collect the male and female sample ids, using one matrix as the source of sample ids
example_matrix_df = tcga_z_matrix_dict['signal']['2']['451283']['test']

# The patient barcode is the first three dash-separated fields of the sample id
tcga_patient_barcodes = [
    "-".join((parts[0], parts[1], parts[2]))
    for parts in example_matrix_df.index.str.split('-')
]

# Left-merge keeps one row per sample, in the order of the matrix index
tcga_patient_pheno_df = pd.DataFrame(tcga_patient_barcodes, columns=['patient_id']).merge(
    tcga_pheno_df,
    how='left',
    left_on='patient_id',
    right_on='bcr_patient_barcode',
)

# Append the full sample id as the last column (both frames share a 0..n-1 index);
# the column takes the name of the matrix index, read below as `sample_id`
patient_id_df = pd.concat(
    [tcga_patient_pheno_df, pd.DataFrame(example_matrix_df.index)],
    axis='columns',
)

tcga_males = patient_id_df.loc[patient_id_df["gender"] == 'MALE', "sample_id"].tolist()
tcga_females = patient_id_df.loc[patient_id_df["gender"] == 'FEMALE', "sample_id"].tolist()

print(patient_id_df.shape)
patient_id_df.head()

(1106, 35)


Unnamed: 0,patient_id,bcr_patient_barcode,type,age_at_initial_pathologic_diagnosis,gender,race,ajcc_pathologic_tumor_stage,clinical_stage,histological_type,histological_grade,...,OS,OS.time,DSS,DSS.time,DFI,DFI.time,PFI,PFI.time,Redaction,sample_id
0,TCGA-CN-5365,TCGA-CN-5365,HNSC,38.0,MALE,WHITE,Stage IVB,Stage IVC,Head & Neck Squamous Cell Carcinoma,G2,...,1.0,351.0,1.0,351.0,,,1.0,351.0,,TCGA-CN-5365-01
1,TCGA-LP-A7HU,TCGA-LP-A7HU,CESC,53.0,FEMALE,ASIAN,[Not Available],Stage II,Endocervical Type of Adenocarcinoma,G3,...,0.0,406.0,0.0,406.0,0.0,406.0,0.0,406.0,,TCGA-LP-A7HU-01
2,TCGA-22-5491,TCGA-22-5491,LUSC,74.0,MALE,WHITE,Stage IA,[Not Applicable],Lung Squamous Cell Carcinoma,[Not Available],...,1.0,1713.0,,1713.0,,,0.0,1713.0,,TCGA-22-5491-11
3,TCGA-CS-6667,TCGA-CS-6667,LGG,39.0,FEMALE,WHITE,[Not Available],[Not Available],Astrocytoma,G2,...,0.0,1469.0,0.0,1469.0,,,0.0,1469.0,,TCGA-CS-6667-01
4,TCGA-20-1684,TCGA-20-1684,OV,51.0,FEMALE,WHITE,[Not Applicable],Stage IIIC,Serous Cystadenocarcinoma,G3,...,0.0,581.0,0.0,581.0,0.0,581.0,0.0,581.0,,TCGA-20-1684-01


In [11]:
# Male vs. female t-test on every compressed TCGA feature (test partition)
tcga_full_results_df = get_ttest_results(
    z_matrix_dict=tcga_z_matrix_dict,
    group_a_ids=tcga_males,
    group_b_ids=tcga_females,
    train_or_test='test',
)

In [12]:
# Write the TCGA t-test table as a tab-separated file, without the row index
file = os.path.join("results", "sex_separation_tcga_t_test.tsv")
tcga_full_results_df.to_csv(
    file,
    sep="\t",
    index=False,
)

print(tcga_full_results_df.shape)
tcga_full_results_df.head()

(61700, 8)


Unnamed: 0,t_stat,t_p,algorithm,feature_num,z_dim,signal,seed,neg_log_p
76,-13.875533,1.844775e-40,vae,16,20,signal,486191,39.734057
81,-12.611795,3.703814e-34,vae,6,25,signal,165158,33.431351
441,-12.418163,3.131056e-33,dae,81,90,signal,165158,32.504309
47,-11.907419,7.75328e-31,nmf,15,16,signal,486191,30.110515
47,-11.907092,7.780224000000001e-31,nmf,15,16,signal,165158,30.109008


## 3. TARGET NBL MYCN Status Analysis

In [13]:
# Read the harmonized TARGET clinical/phenotype table (tab-delimited text file)
file = os.path.join(
    "..",
    "0.expression-download",
    "data",
    "2017-09-30-TARGET update harmonized.txt",
)
nbl_pheno_df = pd.read_csv(file, sep="\t")
nbl_pheno_df.head()

Unnamed: 0,usi,Gender,Race,Ethnicity,Age at Diagnosis in Days,First Event,Event Free Survival Time in Days,Vital Status,Overall Survival Time in Days,Year of Diagnosis,...,Histology,Grade,MKI,Diagnostic Category,ICDO,ICDO Description,COG Risk Group,Site Relapse,Comment,target_update
0,PAAPFA,Male,White,Not Hispanic or Latino,1762,Event,444.0,Dead,487.0,1986.0,...,Unfavorable,Unknown,Unknown,Unknown,Unknown,Unknown,High Risk,,,old
1,PACLJN,Male,White,Not Hispanic or Latino,1475,Censored,5553.0,Alive,5553.0,1986.0,...,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,High Risk,,,old
2,PACPJG,Female,White,Not Hispanic or Latino,760,Unknown,,Unknown,,1987.0,...,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,,,old
3,PACRYY,Male,Unknown,Hispanic or Latino,1314,Censored,5296.0,Alive,5296.0,1987.0,...,Unfavorable,Unknown,Unknown,Unknown,Unknown,Lge Right retroperitoneal mass extending thru ...,High Risk,,,old
4,PACRZM,Male,White,Not Hispanic or Latino,3686,Event,922.0,Dead,922.0,1987.0,...,Unfavorable,Unknown,Unknown,Unknown,Unknown,Unknown,High Risk,,,old


In [14]:
# Build the nested dictionary of TARGET compressed-feature (z) matrices.
# It is indexed below as [signal][z_dim][seed]['train'].
target_z_matrix_dict = build_feature_dictionary(
    dataset="TARGET",
    load_data=True,
    store_train_test="train",
)

In [15]:
# Collect the sample ids of MYCN amplified and MYCN not amplified tumors,
# using one matrix as the source of sample ids
example_matrix_df = target_z_matrix_dict['signal']['8']['451283']['train']

# The patient identifier (usi) is the third dash-separated field of the sample id
target_usi_ids = [parts[2] for parts in example_matrix_df.index.str.split('-')]

# Left-merge keeps one row per sample, in the order of the matrix index
target_usi_pheno_df = pd.DataFrame(target_usi_ids, columns=['patient_id']).merge(
    nbl_pheno_df,
    how='left',
    left_on='patient_id',
    right_on='usi',
)

# Append the full sample id as the last column (both frames share a 0..n-1 index),
# then drop the samples that found no match in the phenotype table
patient_id_df = pd.concat(
    [target_usi_pheno_df, pd.DataFrame(example_matrix_df.index)],
    axis='columns',
)
patient_id_df = patient_id_df.dropna(subset=['usi'])

is_mycn_amplified = patient_id_df["MYCN status"] == "Amplified"
is_mycn_not_amplified = patient_id_df["MYCN status"] == "Not Amplified"

mycn_amp = patient_id_df.loc[is_mycn_amplified, "sample_id"].tolist()
mycn_nonamp = patient_id_df.loc[is_mycn_not_amplified, "sample_id"].tolist()

print(patient_id_df.shape)
patient_id_df.head()

(146, 28)


Unnamed: 0,patient_id,usi,Gender,Race,Ethnicity,Age at Diagnosis in Days,First Event,Event Free Survival Time in Days,Vital Status,Overall Survival Time in Days,...,Grade,MKI,Diagnostic Category,ICDO,ICDO Description,COG Risk Group,Site Relapse,Comment,target_update,sample_id
0,PARSBI,PARSBI,Female,White,Not Hispanic or Latino,2390.0,Relapse,1377.0,Dead,1743.0,...,Undifferentiated or Poorly Differentiated,Low,Neuroblastoma,C76.2,"Abdomen, NOS Abdominal wall, NOS Intra-abdom...",High Risk,Other metastatic sites,,old,TARGET-30-PARSBI-01
3,PATBMM,PATBMM,Male,White,Not Hispanic or Latino,1112.0,Relapse,653.0,Alive,768.0,...,Undifferentiated or Poorly Differentiated,Low,"Ganglioneuroblastoma, nodular",C74.9,"Adrenal gland, NOS Suprarenal gland Adrenal,...",High Risk,Bone,,old,TARGET-30-PATBMM-01
8,PANSBN,PANSBN,Male,White,Not Hispanic or Latino,2329.0,Relapse,505.0,Dead,836.0,...,Differentiating,High,"Ganglioneuroblastoma, nodular",C38.3,"Mediastinum, NOS",High Risk,Bone Marrow,,old,TARGET-30-PANSBN-01
11,PARNNC,PARNNC,Female,White,Not Hispanic or Latino,41.0,Relapse,735.0,Alive,2979.0,...,Undifferentiated or Poorly Differentiated,Low,Neuroblastoma,C48.0,Retroperitoneum\n\nPeriadrenal tissue\n\nPerin...,Low Risk,Primary site;; Other metastatic sites,,old,TARGET-30-PARNNC-01
18,PASVRU,PASVRU,Male,White,Hispanic or Latino,631.0,Event,254.0,Dead,440.0,...,Undifferentiated or Poorly Differentiated,High,Neuroblastoma,C48.0,Retroperitoneum Periadrenal tissue Perinephr...,High Risk,,,old,TARGET-30-PASVRU-01


In [16]:
# MYCN amplified vs. not amplified t-test on every compressed TARGET feature
# (train partition, the only one loaded for TARGET above)
target_full_results_df = get_ttest_results(
    z_matrix_dict=target_z_matrix_dict,
    group_a_ids=mycn_amp,
    group_b_ids=mycn_nonamp,
    train_or_test='train',
)

In [17]:
# Write the TARGET MYCN t-test table as a tab-separated file, without the row index
file = os.path.join("results", "nbl_mycn_separation_target_t_test.tsv")
target_full_results_df.to_csv(
    file,
    sep="\t",
    index=False,
)

print(target_full_results_df.shape)
target_full_results_df.head()

(61700, 8)


Unnamed: 0,t_stat,t_p,algorithm,feature_num,z_dim,signal,seed,neg_log_p
711,17.452038,2.988269e-37,vae,111,200,signal,451283,36.52458
325,16.118447,5.7464499999999994e-34,vae,55,90,signal,451283,33.2406
440,15.863525,2.493347e-33,vae,65,125,signal,978124,32.603217
516,15.157871,1.4996450000000001e-31,vae,66,150,signal,165158,30.824012
315,14.840073,9.637579e-31,nmf,15,150,signal,486191,30.016032
