In [1]:
import os
import sys
import numpy as np
import pandas as pd
from scipy.stats import ttest_ind, ttest_rel
import matplotlib.pyplot as plt
import seaborn as sns

from scripts.utils import (
    load_weight_matrix,
    apply_signature,
    load_enrichment_results,
    extract_feature,
)

sys.path.append("../9.tcga-classify/")
from scripts.tcga_util import build_feature_dictionary

In [88]:
def ttest_sex_difference(feature_series, male_ids, female_ids):
    """
    To be applied to a pandas dataframe by column
    """
    feature_name = feature_series.name
    feature_algorithm, feature_num = feature_name.split('_')
    
    male_activation = feature_series[feature_series.index.isin(male_ids)]
    female_activation = feature_series[feature_series.index.isin(female_ids)]
    
    # Perform t-test on two groups
    t_stat, t_p = ttest_ind(male_activation, female_activation)
    
    return([t_stat, t_p, feature_algorithm, feature_num])

In [9]:
! wget --directory-prefix="../0.expression-download/download/" "https://storage.googleapis.com/gtex_analysis_v7/annotations/GTEx_v7_Annotations_SubjectPhenotypesDS.txt"

--2019-02-05 12:28:41--  https://storage.googleapis.com/gtex_analysis_v7/annotations/GTEx_v7_Annotations_SubjectPhenotypesDS.txt
Resolving storage.googleapis.com (storage.googleapis.com)... 2607:f8b0:4006:804::2010, 172.217.6.240
Connecting to storage.googleapis.com (storage.googleapis.com)|2607:f8b0:4006:804::2010|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 15485 (15K) [text/plain]
Saving to: ‘../0.expression-download/download/GTEx_v7_Annotations_SubjectPhenotypesDS.txt.1’


2019-02-05 12:28:41 (20.3 MB/s) - ‘../0.expression-download/download/GTEx_v7_Annotations_SubjectPhenotypesDS.txt.1’ saved [15485/15485]



In [10]:
! md5sum ../0.expression-download/download/GTEx_v7_Annotations_SubjectPhenotypesDS.txt

73d9180f604c17e6c211ca1694dc9ff7  ../0.expression-download/download/GTEx_v7_Annotations_SubjectPhenotypesDS.txt


In [2]:
# Load GTEx phenotype data
file = os.path.join("..", "0.expression-download", "download", "GTEx_v7_Annotations_SubjectPhenotypesDS.txt")
gtex_pheno_df = pd.read_table(file)
gtex_pheno_df.head()

Unnamed: 0,SUBJID,SEX,AGE,DTHHRDY
0,GTEX-1117F,2,60-69,4.0
1,GTEX-111CU,1,50-59,0.0
2,GTEX-111FC,1,60-69,1.0
3,GTEX-111VG,1,60-69,3.0
4,GTEX-111YS,1,60-69,0.0


In [3]:
gtex_z_matrix_dict = build_feature_dictionary(dataset="GTEX", load_data=True, store_train_test='test')

In [87]:
# Extract male and female ids from the dataset
example_matrix_df = gtex_z_matrix_dict['signal']['8']['451283']['test']

patient_id_df = pd.concat(
    [
    pd.DataFrame(["{}-{}".format(x[0], x[1]) for x in example_matrix_df.index.str.split('-')],
                 columns=['patient_id'])
        .merge(gtex_pheno_df,
               how='left',
               left_on='patient_id',
               right_on='SUBJID'),
    pd.DataFrame(example_matrix_df.index)
    ],
    axis='columns'
)

males = patient_id_df.query("SEX == 1").sample_id.tolist()
females = patient_id_df.query("SEX == 2").sample_id.tolist()

print(patient_id_df.shape)
patient_id_df.head()

(1169, 6)


Unnamed: 0,patient_id,SUBJID,SEX,AGE,DTHHRDY,sample_id
0,GTEX-ZTX8,GTEX-ZTX8,1,20-29,0.0,GTEX-ZTX8-1126-SM-51MRM
1,GTEX-Y3IK,GTEX-Y3IK,2,50-59,0.0,GTEX-Y3IK-2426-SM-4WWDU
2,GTEX-X62O,GTEX-X62O,1,50-59,0.0,GTEX-X62O-0826-SM-46MW8
3,GTEX-13O3O,GTEX-13O3O,2,60-69,3.0,GTEX-13O3O-0011-R5b-SM-5KM44
4,GTEX-X15G,GTEX-X15G,2,50-59,0.0,GTEX-X15G-1926-SM-4PQZQ


In [90]:
full_results = []
for signal in gtex_z_matrix_dict.keys():
    for z_dim in gtex_z_matrix_dict[signal].keys():
        for seed in gtex_z_matrix_dict[signal][z_dim].keys():
            z_df = gtex_z_matrix_dict[signal][z_dim][seed]['test']
            
            result_df = pd.DataFrame(z_df.apply(lambda x:
                                                ttest_sex_difference(feature_series=x,
                                                                     male_ids=males,
                                                                     female_ids=females)),
                                     columns = ['result'])
            
            result_df = (
                pd.DataFrame(result_df.result.values.tolist(),
                             columns=['t_stat', 't_p', 'algorithm', 'feature_num'])
            ).fillna(1)

            result_df = result_df.assign(
                z_dim=z_dim,
                signal=signal,
                seed=seed
            )
            full_results.append(result_df)

In [96]:
full_results_df = pd.concat(full_results)
full_results_df = full_results_df.assign(neg_log_p=-np.log10(full_results_df.t_p))

file = os.path.join("results", "sex_separation_gtex_t_test.tsv")
full_results_df.to_csv(file, sep='\t', index=False)

print(full_results_df.shape)
full_results_df.head()

(61700, 8)


Unnamed: 0,t_stat,t_p,algorithm,feature_num,z_dim,signal,seed,neg_log_p
0,1.60467,0.108837,pca,0,8,signal,451283,0.963223
1,1.473363,0.140923,pca,1,8,signal,451283,0.851019
2,0.423628,0.671915,pca,2,8,signal,451283,0.172686
3,-0.005743,0.995419,pca,3,8,signal,451283,0.001994
4,2.915217,0.003622,pca,4,8,signal,451283,2.441015


In [101]:
# Load TARGET phenotype data
file = os.path.join("..", "0.expression-download", "data", "2017-09-30-TARGET update harmonized.txt")
nbl_pheno_df = pd.read_table(file)
nbl_pheno_df.head()

Unnamed: 0,usi,Gender,Race,Ethnicity,Age at Diagnosis in Days,First Event,Event Free Survival Time in Days,Vital Status,Overall Survival Time in Days,Year of Diagnosis,...,Histology,Grade,MKI,Diagnostic Category,ICDO,ICDO Description,COG Risk Group,Site Relapse,Comment,target_update
0,PAAPFA,Male,White,Not Hispanic or Latino,1762,Event,444.0,Dead,487.0,1986.0,...,Unfavorable,Unknown,Unknown,Unknown,Unknown,Unknown,High Risk,,,old
1,PACLJN,Male,White,Not Hispanic or Latino,1475,Censored,5553.0,Alive,5553.0,1986.0,...,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,High Risk,,,old
2,PACPJG,Female,White,Not Hispanic or Latino,760,Unknown,,Unknown,,1987.0,...,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,,,old
3,PACRYY,Male,Unknown,Hispanic or Latino,1314,Censored,5296.0,Alive,5296.0,1987.0,...,Unfavorable,Unknown,Unknown,Unknown,Unknown,Lge Right retroperitoneal mass extending thru ...,High Risk,,,old
4,PACRZM,Male,White,Not Hispanic or Latino,3686,Event,922.0,Dead,922.0,1987.0,...,Unfavorable,Unknown,Unknown,Unknown,Unknown,Unknown,High Risk,,,old


In [108]:
# Load TARGET matrices
target_z_matrix_dict = build_feature_dictionary(dataset="TARGET", load_data=True, store_train_test='train')

In [112]:
# Extract male and female ids from the dataset
example_matrix_df = target_z_matrix_dict['signal']['8']['451283']['train']

patient_id_df = pd.concat(
    [
    pd.DataFrame([x[2] for x in example_matrix_df.index.str.split('-')],
                 columns=['patient_id'])
        .merge(nbl_pheno_df,
               how='left',
               left_on='patient_id',
               right_on='usi'),
    pd.DataFrame(example_matrix_df.index)
    ],
    axis='columns'
).dropna(subset=['usi'])

print(patient_id_df.shape)
patient_id_df.head()

(146, 28)


Unnamed: 0,patient_id,usi,Gender,Race,Ethnicity,Age at Diagnosis in Days,First Event,Event Free Survival Time in Days,Vital Status,Overall Survival Time in Days,...,Grade,MKI,Diagnostic Category,ICDO,ICDO Description,COG Risk Group,Site Relapse,Comment,target_update,sample_id
0,PARSBI,PARSBI,Female,White,Not Hispanic or Latino,2390.0,Relapse,1377.0,Dead,1743.0,...,Undifferentiated or Poorly Differentiated,Low,Neuroblastoma,C76.2,"Abdomen, NOS Abdominal wall, NOS Intra-abdom...",High Risk,Other metastatic sites,,old,TARGET-30-PARSBI-01
3,PATBMM,PATBMM,Male,White,Not Hispanic or Latino,1112.0,Relapse,653.0,Alive,768.0,...,Undifferentiated or Poorly Differentiated,Low,"Ganglioneuroblastoma, nodular",C74.9,"Adrenal gland, NOS Suprarenal gland Adrenal,...",High Risk,Bone,,old,TARGET-30-PATBMM-01
8,PANSBN,PANSBN,Male,White,Not Hispanic or Latino,2329.0,Relapse,505.0,Dead,836.0,...,Differentiating,High,"Ganglioneuroblastoma, nodular",C38.3,"Mediastinum, NOS",High Risk,Bone Marrow,,old,TARGET-30-PANSBN-01
11,PARNNC,PARNNC,Female,White,Not Hispanic or Latino,41.0,Relapse,735.0,Alive,2979.0,...,Undifferentiated or Poorly Differentiated,Low,Neuroblastoma,C48.0,Retroperitoneum\n\nPeriadrenal tissue\n\nPerin...,Low Risk,Primary site;; Other metastatic sites,,old,TARGET-30-PARNNC-01
18,PASVRU,PASVRU,Male,White,Hispanic or Latino,631.0,Event,254.0,Dead,440.0,...,Undifferentiated or Poorly Differentiated,High,Neuroblastoma,C48.0,Retroperitoneum Periadrenal tissue Perinephr...,High Risk,,,old,TARGET-30-PASVRU-01


In [116]:
mycn_amp = patient_id_df.loc[patient_id_df["MYCN status"] == "Amplified", "sample_id"].tolist()
mycn_nonamp = patient_id_df.loc[patient_id_df["MYCN status"] == "Not Amplified", "sample_id"].tolist()

In [120]:
full_results = []
for signal in target_z_matrix_dict.keys():
    for z_dim in target_z_matrix_dict[signal].keys():
        for seed in target_z_matrix_dict[signal][z_dim].keys():
            z_df = target_z_matrix_dict[signal][z_dim][seed]['train']
            
            result_df = pd.DataFrame(z_df.apply(lambda x:
                                                ttest_sex_difference(feature_series=x,
                                                                     male_ids=mycn_amp,
                                                                     female_ids=mycn_nonamp)),
                                     columns = ['result'])
            
            result_df = (
                pd.DataFrame(result_df.result.values.tolist(),
                             columns=['t_stat', 't_p', 'algorithm', 'feature_num'])
            ).fillna(1)

            result_df = result_df.assign(
                z_dim=z_dim,
                signal=signal,
                seed=seed
            )
            full_results.append(result_df)

In [121]:
full_results_df = pd.concat(full_results)
full_results_df = full_results_df.assign(neg_log_p=-np.log10(full_results_df.t_p))

file = os.path.join("results", "nbl_mycn_separation_target_t_test.tsv")
full_results_df.to_csv(file, sep='\t', index=False)

print(full_results_df.shape)
full_results_df.head()

(61700, 8)


Unnamed: 0,t_stat,t_p,algorithm,feature_num,z_dim,signal,seed,neg_log_p
0,4.413269,1.992101e-05,pca,0,35,signal,451283,4.700689
1,7.804911,1.147526e-12,pca,1,35,signal,451283,11.940237
2,-1.194024,0.2344454,pca,2,35,signal,451283,0.629958
3,-4.88546,2.730816e-06,pca,3,35,signal,451283,5.563708
4,4.705744,5.909799e-06,pca,4,35,signal,451283,5.228427
