# Data Analysis - Survival Analysis

Data from: https://xenabrowser.net/datapages/?dataset=GDC-PANCAN.htseq_fpkm-uq.tsv&host=https%3A%2F%2Fgdc.xenahubs.net&removeHub=https%3A%2F%2Fxena.treehouse.gi.ucsc.edu%3A443

### TCGA Barcodes
The column headers are TCGA barcodes:
* In the format of: `project-tissuesourcesite-participant-sample|vial-portion|analyte-plate-center`
* https://docs.gdc.cancer.gov/Encyclopedia/pages/TCGA_Barcode/
* https://gdc.cancer.gov/resources-tcga-users/tcga-code-tables/tissue-source-site-codes

In [None]:
# Installations
# !pip install kaplanmeier
# !pip install gseapy

In [1]:
# Imports
import pandas as pd

In [31]:
# Read in the RNA matrix
df = pd.read_csv('./data/GDC-PANCAN.htseq_fpkm-uq.tsv', sep='\t')
display(df.head())

Unnamed: 0,xena_sample,TCGA-OR-A5JP-01A,TCGA-OR-A5JE-01A,TCGA-OR-A5JG-01A,TCGA-OR-A5L9-01A,TCGA-OR-A5JR-01A,TCGA-OR-A5KU-01A,TCGA-OR-A5LS-01A,TCGA-OR-A5J7-01A,TCGA-OR-A5JQ-01A,...,TARGET-50-PAJMKJ-01A,TARGET-50-CAAAAQ-11A,TARGET-50-PAKSCC-01A,TARGET-50-PAJNSL-11A,TARGET-50-PAJPAU-01A,TARGET-50-PAJNZU-01A,TARGET-50-PAJNNR-01A,TARGET-50-PAJNTJ-02A,TARGET-50-PAECJB-01A,TARGET-50-PALFRD-01A
0,ENSG00000242268.2,0.0,0.0,0.0,0.0,9.486642,0.0,0.0,0.0,0.0,...,11.700035,10.041859,13.398458,0.0,10.61723,11.933609,14.140998,11.659218,10.662028,12.878131
1,ENSG00000270112.3,10.689655,14.408626,14.022621,11.291444,10.221394,12.423503,12.830424,12.758888,11.547426,...,9.267574,12.513257,10.501003,10.452072,10.625798,8.310776,7.131909,7.678919,10.134942,11.116645
2,ENSG00000167578.15,18.536987,18.684183,17.334107,19.713465,16.76163,17.762472,18.114361,19.068519,17.47447,...,15.541309,16.684341,15.905948,16.991286,15.066989,13.953978,15.969451,14.607776,14.387707,15.886538
3,ENSG00000273842.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,ENSG00000078237.5,17.847476,18.227483,17.287893,16.722624,17.157762,17.001996,18.648729,18.076084,15.817248,...,15.37773,16.438256,16.733394,16.149538,16.277784,15.673957,16.305087,15.916629,15.850915,16.188748


### Align RNA / OS / Phenotype samples

---

In [32]:
# Read in ID/Gene Mapping file
mapping_df = pd.read_csv('./data/gencode.v22.annotation.gene.probeMap', sep='\t')
display(mapping_df.head())

Unnamed: 0,id,gene,chrom,chromStart,chromEnd,strand
0,ENSG00000223972.5,DDX11L1,chr1,11869,14409,+
1,ENSG00000227232.5,WASH7P,chr1,14404,29570,-
2,ENSG00000278267.1,MIR6859-3,chr1,17369,17436,-
3,ENSG00000243485.3,RP11-34P13.3,chr1,29554,31109,+
4,ENSG00000274890.1,MIR1302-9,chr1,30366,30503,+


In [33]:
# Check to see if the mapping file and the rna matrix file have the same id names
mapping_df.rename(columns={'id': 'xena_sample'}, inplace=True)
merged_df = pd.merge(mapping_df, df, on='xena_sample', how='outer', indicator=True)

# Check matching status
# Filter rows that do not have 'both' in the '_merge' column
non_matching_rows = merged_df[merged_df['_merge'] != 'both']

# Print the non-matching rows
print(non_matching_rows)

Empty DataFrame
Columns: [xena_sample, gene, chrom, chromStart, chromEnd, strand, TCGA-OR-A5JP-01A, TCGA-OR-A5JE-01A, TCGA-OR-A5JG-01A, TCGA-OR-A5L9-01A, TCGA-OR-A5JR-01A, TCGA-OR-A5KU-01A, TCGA-OR-A5LS-01A, TCGA-OR-A5J7-01A, TCGA-OR-A5JQ-01A, TCGA-OR-A5JS-01A, TCGA-OR-A5JL-01A, TCGA-OR-A5LC-01A, TCGA-OR-A5K2-01A, TCGA-P6-A5OG-01A, TCGA-OR-A5JW-01A, TCGA-OR-A5JZ-01A, TCGA-OR-A5J8-01A, TCGA-OR-A5K5-01A, TCGA-OR-A5KV-01A, TCGA-OR-A5L4-01A, TCGA-OR-A5KX-01A, TCGA-OR-A5K1-01A, TCGA-OR-A5JO-01A, TCGA-OR-A5LG-01A, TCGA-OR-A5LO-01A, TCGA-OR-A5JB-01A, TCGA-OR-A5JV-01A, TCGA-OR-A5LJ-01A, TCGA-OR-A5LA-01A, TCGA-OR-A5KY-01A, TCGA-OR-A5KO-01A, TCGA-OR-A5L6-01A, TCGA-OR-A5KZ-01A, TCGA-OR-A5J5-01A, TCGA-OR-A5LB-01A, TCGA-OR-A5LT-01A, TCGA-OR-A5LD-01A, TCGA-OR-A5J2-01A, TCGA-OR-A5LE-01A, TCGA-OR-A5K4-01A, TCGA-OR-A5K6-01A, TCGA-OR-A5JY-01A, TCGA-OR-A5JT-01A, TCGA-OR-A5KW-01A, TCGA-PK-A5H8-01A, TCGA-OR-A5JX-01A, TCGA-OR-A5LK-01A, TCGA-P6-A5OF-01A, TCGA-OR-A5JM-01A, TCGA-OR-A5JI-01A, TCGA-OR-A5JC-01A, 

In [35]:
display(df.head())
print(df.info())

Unnamed: 0,xena_sample,TCGA-OR-A5JP-01A,TCGA-OR-A5JE-01A,TCGA-OR-A5JG-01A,TCGA-OR-A5L9-01A,TCGA-OR-A5JR-01A,TCGA-OR-A5KU-01A,TCGA-OR-A5LS-01A,TCGA-OR-A5J7-01A,TCGA-OR-A5JQ-01A,...,TARGET-50-PAJMKJ-01A,TARGET-50-CAAAAQ-11A,TARGET-50-PAKSCC-01A,TARGET-50-PAJNSL-11A,TARGET-50-PAJPAU-01A,TARGET-50-PAJNZU-01A,TARGET-50-PAJNNR-01A,TARGET-50-PAJNTJ-02A,TARGET-50-PAECJB-01A,TARGET-50-PALFRD-01A
0,ENSG00000242268.2,0.0,0.0,0.0,0.0,9.486642,0.0,0.0,0.0,0.0,...,11.700035,10.041859,13.398458,0.0,10.61723,11.933609,14.140998,11.659218,10.662028,12.878131
1,ENSG00000270112.3,10.689655,14.408626,14.022621,11.291444,10.221394,12.423503,12.830424,12.758888,11.547426,...,9.267574,12.513257,10.501003,10.452072,10.625798,8.310776,7.131909,7.678919,10.134942,11.116645
2,ENSG00000167578.15,18.536987,18.684183,17.334107,19.713465,16.76163,17.762472,18.114361,19.068519,17.47447,...,15.541309,16.684341,15.905948,16.991286,15.066989,13.953978,15.969451,14.607776,14.387707,15.886538
3,ENSG00000273842.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,ENSG00000078237.5,17.847476,18.227483,17.287893,16.722624,17.157762,17.001996,18.648729,18.076084,15.817248,...,15.37773,16.438256,16.733394,16.149538,16.277784,15.673957,16.305087,15.916629,15.850915,16.188748


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60483 entries, 0 to 60482
Columns: 11769 entries, xena_sample to TARGET-50-PALFRD-01A
dtypes: float64(11768), object(1)
memory usage: 5.3+ GB
None


In [44]:
# Merge df with mapping_df on the 'id' column
df_merged = pd.merge(df, mapping_df, on='xena_sample', how='left')

In [45]:
# Print the updated df2 DataFrame
display(df_merged.head())

Unnamed: 0,xena_sample,TCGA-OR-A5JP-01A,TCGA-OR-A5JE-01A,TCGA-OR-A5JG-01A,TCGA-OR-A5L9-01A,TCGA-OR-A5JR-01A,TCGA-OR-A5KU-01A,TCGA-OR-A5LS-01A,TCGA-OR-A5J7-01A,TCGA-OR-A5JQ-01A,...,TARGET-50-PAJNZU-01A,TARGET-50-PAJNNR-01A,TARGET-50-PAJNTJ-02A,TARGET-50-PAECJB-01A,TARGET-50-PALFRD-01A,gene,chrom,chromStart,chromEnd,strand
0,ENSG00000242268.2,0.0,0.0,0.0,0.0,9.486642,0.0,0.0,0.0,0.0,...,11.933609,14.140998,11.659218,10.662028,12.878131,RP11-368I23.2,chr3,168903366,168921996,+
1,ENSG00000270112.3,10.689655,14.408626,14.022621,11.291444,10.221394,12.423503,12.830424,12.758888,11.547426,...,8.310776,7.131909,7.678919,10.134942,11.116645,RP11-742D12.2,chr18,46756487,46764408,+
2,ENSG00000167578.15,18.536987,18.684183,17.334107,19.713465,16.76163,17.762472,18.114361,19.068519,17.47447,...,13.953978,15.969451,14.607776,14.387707,15.886538,RAB4B,chr19,40778216,40796944,+
3,ENSG00000273842.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,AC104183.2,chr3,21382478,21382542,+
4,ENSG00000078237.5,17.847476,18.227483,17.287893,16.722624,17.157762,17.001996,18.648729,18.076084,15.817248,...,15.673957,16.305087,15.916629,15.850915,16.188748,C12orf5,chr12,4321205,4354593,+


In [19]:
# Read in the basic phenotype data
df_basic_phenotype = pd.read_csv('./data/GDC-PANCAN.basic_phenotype.tsv', sep='\t')

In [23]:
display(df_basic_phenotype.head())
print(df_basic_phenotype.info())

Unnamed: 0,sample,program,sample_type_id,sample_type,project_id,Age at Diagnosis in Years,Gender
0,TCGA-69-7978-01A,TCGA,1,Primary Tumor,TCGA-LUAD,59.0,Male
1,TCGA-AR-A24Z-01A,TCGA,1,Primary Tumor,TCGA-BRCA,57.0,Female
2,TCGA-D1-A103-01A,TCGA,1,Primary Tumor,TCGA-UCEC,87.0,Female
3,TARGET-20-PASRLS-09A,TARGET,9,Primary Blood Derived Cancer - Bone Marrow,TARGET-AML,0.816438,Female
4,TARGET-20-PASARK-14A,TARGET,14,Bone Marrow Normal,TARGET-AML,15.520548,Male


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19188 entries, 0 to 19187
Data columns (total 7 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   sample                     19188 non-null  object 
 1   program                    19188 non-null  object 
 2   sample_type_id             19188 non-null  int64  
 3   sample_type                19117 non-null  object 
 4   project_id                 18954 non-null  object 
 5   Age at Diagnosis in Years  18677 non-null  float64
 6   Gender                     18738 non-null  object 
dtypes: float64(1), int64(1), object(5)
memory usage: 1.0+ MB
None


In [20]:
# Read in the survival phenotype data
df_survival_phenotype = pd.read_csv('./data/GDC-PANCAN.survival.tsv', sep='\t')

In [24]:
display(df_survival_phenotype.head())
print(df_survival_phenotype.info())

Unnamed: 0,sample,OS,_PATIENT,OS.time
0,TCGA-OR-A5KZ-01A,1,TCGA-OR-A5KZ,125
1,TCGA-OR-A5LC-01A,1,TCGA-OR-A5LC,159
2,TCGA-P6-A5OF-01A,1,TCGA-P6-A5OF,207
3,TCGA-OR-A5JU-01A,1,TCGA-OR-A5JU,289
4,TCGA-OR-A5K9-11A,1,TCGA-OR-A5K9,344


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18492 entries, 0 to 18491
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   sample    18492 non-null  object
 1   OS        18492 non-null  int64 
 2   _PATIENT  18492 non-null  object
 3   OS.time   18492 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 578.0+ KB
None


In [None]:
# Create a list of all samples
samples_rna = list(df_merged.columns)
samples_pheno = list(df_basic_phenotype['sample'].values)
samples_survival = list(df_survival_phenotype['sample'].values)

In [54]:
# Find all common samples in all three lists
common_samples = list(set(samples_rna) & set(samples_pheno) & set(samples_survival))

In [57]:
# Subset and reorder all three datasets by common_samples
# Filter merged_df by columns in common_samples
df_merged_filtered = df_merged[common_samples]
df_basic_phenotype_filtered = df_basic_phenotype[df_basic_phenotype['sample'].isin(common_samples)]
df_survival_phenotype_filtered = df_survival_phenotype[df_survival_phenotype['sample'].isin(common_samples)]

In [59]:
display(df_merged_filtered.head())
display(df_basic_phenotype_filtered.head())
display(df_survival_phenotype_filtered.head())

Unnamed: 0,TCGA-HC-8264-01B,TCGA-C5-A1BJ-01A,TCGA-FC-A6HD-01A,TCGA-K1-A6RU-01A,TCGA-3A-A9IB-01A,TCGA-77-8138-01A,TARGET-15-SJMPAL044949-09A,TCGA-AN-A0XV-01A,TCGA-E2-A1LL-01A,TCGA-FY-A3R7-01A,...,TCGA-H5-A2HR-01A,TCGA-E9-A1N8-01A,TCGA-78-7154-01A,TCGA-C5-A2LZ-01A,TCGA-97-A4M0-01A,TCGA-BS-A0TJ-01A,TCGA-F2-A44H-01A,TCGA-A2-A1FW-01A,TCGA-25-1627-01A,TCGA-F4-6855-01A
0,0.0,8.086491,0.0,0.0,9.212124,0.0,0.0,0.0,0.0,13.878879,...,0.0,0.0,9.587563,8.971686,0.0,11.790971,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,6.258041,7.369036,0.0,8.736521,7.703624,0.0,7.86565,...,0.0,7.571425,0.0,0.0,0.0,0.0,7.847837,0.0,10.149879,0.0
2,15.683349,15.813341,17.414614,16.392072,15.955252,16.774913,15.486407,15.782273,14.88237,15.953637,...,17.164329,15.818564,15.797501,16.396978,17.135551,16.406838,16.849131,15.052879,15.514263,15.604356
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,14.902773,16.626816,14.711518,15.037346,16.336363,17.840189,15.558571,16.614208,17.99262,16.370329,...,16.165682,17.574627,17.575089,16.873085,15.738257,16.802025,16.092752,15.606818,17.371912,16.243352


Unnamed: 0,sample,program,sample_type_id,sample_type,project_id,Age at Diagnosis in Years,Gender
0,TCGA-69-7978-01A,TCGA,1,Primary Tumor,TCGA-LUAD,59.0,Male
1,TCGA-AR-A24Z-01A,TCGA,1,Primary Tumor,TCGA-BRCA,57.0,Female
2,TCGA-D1-A103-01A,TCGA,1,Primary Tumor,TCGA-UCEC,87.0,Female
5,TCGA-24-1435-01A,TCGA,1,Primary Tumor,TCGA-OV,57.0,Female
7,TCGA-63-A5MB-01A,TCGA,1,Primary Tumor,TCGA-LUSC,62.0,Male


Unnamed: 0,sample,OS,_PATIENT,OS.time
0,TCGA-OR-A5KZ-01A,1,TCGA-OR-A5KZ,125
1,TCGA-OR-A5LC-01A,1,TCGA-OR-A5LC,159
2,TCGA-P6-A5OF-01A,1,TCGA-P6-A5OF,207
5,TCGA-OR-A5K9-01A,1,TCGA-OR-A5K9,344
6,TCGA-OR-A5J5-01A,1,TCGA-OR-A5J5,365


In [63]:
# Reorder phenotype and survival dataframes to match the columns of the rna matrix
column_order = list(df_merged_filtered.columns)
df_basic_phenotype_filtered_ordered = df_basic_phenotype_filtered.set_index('sample').loc[column_order].reset_index()
df_survival_phenotype_filtered_ordered = df_survival_phenotype_filtered.set_index('sample').loc[column_order].reset_index()
# display(df_basic_phenotype_filtered_ordered)
# display(df_survival_phenotype_filtered_ordered)
# display(df_merged_filtered)

Unnamed: 0,sample,program,sample_type_id,sample_type,project_id,Age at Diagnosis in Years,Gender
0,TCGA-HC-8264-01B,TCGA,1,Primary Tumor,TCGA-PRAD,60.0,Male
1,TCGA-C5-A1BJ-01A,TCGA,1,Primary Tumor,TCGA-CESC,34.0,Female
2,TCGA-FC-A6HD-01A,TCGA,1,Primary Tumor,TCGA-PRAD,77.0,Male
3,TCGA-K1-A6RU-01A,TCGA,1,Primary Tumor,TCGA-SARC,66.0,Female
4,TCGA-3A-A9IB-01A,TCGA,1,Primary Tumor,TCGA-PAAD,69.0,Female
...,...,...,...,...,...,...,...
11501,TCGA-BS-A0TJ-01A,TCGA,1,Primary Tumor,TCGA-UCEC,59.0,Female
11502,TCGA-F2-A44H-01A,TCGA,1,Primary Tumor,TCGA-PAAD,65.0,Male
11503,TCGA-A2-A1FW-01A,TCGA,1,Primary Tumor,TCGA-BRCA,62.0,Female
11504,TCGA-25-1627-01A,TCGA,1,Primary Tumor,TCGA-OV,73.0,Female


Unnamed: 0,sample,OS,_PATIENT,OS.time
0,TCGA-HC-8264-01B,0,TCGA-HC-8264,48
1,TCGA-C5-A1BJ-01A,0,TCGA-C5-A1BJ,4385
2,TCGA-FC-A6HD-01A,0,TCGA-FC-A6HD,789
3,TCGA-K1-A6RU-01A,1,TCGA-K1-A6RU,711
4,TCGA-3A-A9IB-01A,1,TCGA-3A-A9IB,224
...,...,...,...,...
11501,TCGA-BS-A0TJ-01A,0,TCGA-BS-A0TJ,2068
11502,TCGA-F2-A44H-01A,0,TCGA-F2-A44H,586
11503,TCGA-A2-A1FW-01A,0,TCGA-A2-A1FW,528
11504,TCGA-25-1627-01A,1,TCGA-25-1627,394


Unnamed: 0,TCGA-HC-8264-01B,TCGA-C5-A1BJ-01A,TCGA-FC-A6HD-01A,TCGA-K1-A6RU-01A,TCGA-3A-A9IB-01A,TCGA-77-8138-01A,TARGET-15-SJMPAL044949-09A,TCGA-AN-A0XV-01A,TCGA-E2-A1LL-01A,TCGA-FY-A3R7-01A,...,TCGA-H5-A2HR-01A,TCGA-E9-A1N8-01A,TCGA-78-7154-01A,TCGA-C5-A2LZ-01A,TCGA-97-A4M0-01A,TCGA-BS-A0TJ-01A,TCGA-F2-A44H-01A,TCGA-A2-A1FW-01A,TCGA-25-1627-01A,TCGA-F4-6855-01A
0,0.000000,8.086491,0.000000,0.000000,9.212124,0.000000,0.000000,0.000000,0.000000,13.878879,...,0.000000,0.000000,9.587563,8.971686,0.000000,11.790971,0.000000,0.000000,0.000000,0.000000
1,0.000000,0.000000,0.000000,6.258041,7.369036,0.000000,8.736521,7.703624,0.000000,7.865650,...,0.000000,7.571425,0.000000,0.000000,0.000000,0.000000,7.847837,0.000000,10.149879,0.000000
2,15.683349,15.813341,17.414614,16.392072,15.955252,16.774913,15.486407,15.782273,14.882370,15.953637,...,17.164329,15.818564,15.797501,16.396978,17.135551,16.406838,16.849131,15.052879,15.514263,15.604356
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,14.902773,16.626816,14.711518,15.037346,16.336363,17.840189,15.558571,16.614208,17.992620,16.370329,...,16.165682,17.574627,17.575089,16.873085,15.738257,16.802025,16.092752,15.606818,17.371912,16.243352
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60478,17.488084,18.318117,17.793001,19.265656,18.696628,19.527041,19.426327,17.712625,18.142989,18.008997,...,18.079327,18.320616,18.658238,18.089116,18.464886,18.739685,18.483742,17.515678,18.933500,18.471828
60479,13.246070,11.317400,14.278994,10.909847,11.668187,12.297178,14.500089,13.135553,12.484611,14.447084,...,14.617002,11.764781,10.822145,12.977239,11.660268,12.556979,11.342222,12.148587,16.444590,10.592739
60480,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
60481,12.633438,13.881981,12.894713,14.373712,14.615599,13.927300,12.693156,12.727786,12.812400,12.904331,...,14.254714,12.386390,15.386432,13.526787,16.040043,13.271749,14.663916,11.268701,11.649429,13.762100


### KM Plot

---

OS Values:
* 1 = `deceased`
* 0 = `living`

Ref: https://docs.cbioportal.org/user-guide/faq/#what-is-the-meaning-of-os_status--os_months-and-pfs_status--pfs_months   
Ref: https://erdogant.github.io/kaplanmeier/pages/html/Examples.html

In [62]:
import kaplanmeier as km

In [None]:
# EXAMPLE: https://erdogant.github.io/kaplanmeier/pages/html/Examples.html
time_event = df_survival_phenotype_filtered_ordered['OS.time']
censoring = df_survival_phenotype_filtered_ordered['OS']
y = df['group']

print(df)
#       time  Died  group
# 0     485     0      1
# 1     526     1      2
# 2     588     1      2
# 3     997     0      1
# 4     426     1      1
# ..    ...   ...    ...
# 175   183     0      1
# 176  3196     0      1
# 177   457     1      2
# 178  2100     1      1
# 179   376     0      1
#
# [180 rows x 3 columns]

# Compute Survival
results = km.fit(time_event, censoring, y)

# Plot
km.plot(results)