# Data Analysis - Survival Analysis

Data from: https://xenabrowser.net/datapages/?dataset=GDC-PANCAN.htseq_fpkm-uq.tsv&host=https%3A%2F%2Fgdc.xenahubs.net&removeHub=https%3A%2F%2Fxena.treehouse.gi.ucsc.edu%3A443

### TCGA Barcodes
The column headers are TCGA barcodes:
* In the format of: `project-tissuesourcesite-participant-sample|vial-portion|analyte-plate-center`
* https://docs.gdc.cancer.gov/Encyclopedia/pages/TCGA_Barcode/
* https://gdc.cancer.gov/resources-tcga-users/tcga-code-tables/tissue-source-site-codes

In [2]:
# Installations
# !pip install kaplanmeier
# !pip install gseapy

In [1]:
# Imports
import pandas as pd

In [2]:
# Read in the RNA matrix
df = pd.read_parquet('./data/GDC-PANCAN.htseq_fpkm-uq.parquet')
display(df.head())

Unnamed: 0,xena_sample,TCGA-OR-A5JP-01A,TCGA-OR-A5JE-01A,TCGA-OR-A5JG-01A,TCGA-OR-A5L9-01A,TCGA-OR-A5JR-01A,TCGA-OR-A5KU-01A,TCGA-OR-A5LS-01A,TCGA-OR-A5J7-01A,TCGA-OR-A5JQ-01A,...,TARGET-50-PAJMKJ-01A,TARGET-50-CAAAAQ-11A,TARGET-50-PAKSCC-01A,TARGET-50-PAJNSL-11A,TARGET-50-PAJPAU-01A,TARGET-50-PAJNZU-01A,TARGET-50-PAJNNR-01A,TARGET-50-PAJNTJ-02A,TARGET-50-PAECJB-01A,TARGET-50-PALFRD-01A
0,ENSG00000242268.2,0.0,0.0,0.0,0.0,9.486642,0.0,0.0,0.0,0.0,...,11.700035,10.041859,13.398458,0.0,10.61723,11.933609,14.140998,11.659218,10.662028,12.878131
1,ENSG00000270112.3,10.689655,14.408626,14.022621,11.291444,10.221394,12.423503,12.830424,12.758888,11.547426,...,9.267574,12.513257,10.501003,10.452072,10.625798,8.310776,7.131909,7.678919,10.134942,11.116645
2,ENSG00000167578.15,18.536987,18.684183,17.334107,19.713465,16.76163,17.762472,18.114361,19.068519,17.47447,...,15.541309,16.684341,15.905948,16.991286,15.066989,13.953978,15.969451,14.607776,14.387707,15.886538
3,ENSG00000273842.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,ENSG00000078237.5,17.847476,18.227483,17.287893,16.722624,17.157762,17.001996,18.648729,18.076084,15.817248,...,15.37773,16.438256,16.733394,16.149538,16.277784,15.673957,16.305087,15.916629,15.850915,16.188748


### Align RNA / OS / Phenotype samples

---

In [3]:
# Read in ID/Gene Mapping file
mapping_df = pd.read_csv('./data/gencode.v22.annotation.gene.probeMap', sep='\t')
display(mapping_df.head())

Unnamed: 0,id,gene,chrom,chromStart,chromEnd,strand
0,ENSG00000223972.5,DDX11L1,chr1,11869,14409,+
1,ENSG00000227232.5,WASH7P,chr1,14404,29570,-
2,ENSG00000278267.1,MIR6859-3,chr1,17369,17436,-
3,ENSG00000243485.3,RP11-34P13.3,chr1,29554,31109,+
4,ENSG00000274890.1,MIR1302-9,chr1,30366,30503,+


In [4]:
# Check to see if the mapping file and the rna matrix file have the same id names
mapping_df.rename(columns={'id': 'xena_sample'}, inplace=True)
merged_df = pd.merge(mapping_df, df, on='xena_sample', how='outer', indicator=True)

# Check matching status
# Filter rows that do not have 'both' in the '_merge' column
non_matching_rows = merged_df[merged_df['_merge'] != 'both']

# Print the non-matching rows
print(non_matching_rows)

Empty DataFrame
Columns: [xena_sample, gene, chrom, chromStart, chromEnd, strand, TCGA-OR-A5JP-01A, TCGA-OR-A5JE-01A, TCGA-OR-A5JG-01A, TCGA-OR-A5L9-01A, TCGA-OR-A5JR-01A, TCGA-OR-A5KU-01A, TCGA-OR-A5LS-01A, TCGA-OR-A5J7-01A, TCGA-OR-A5JQ-01A, TCGA-OR-A5JS-01A, TCGA-OR-A5JL-01A, TCGA-OR-A5LC-01A, TCGA-OR-A5K2-01A, TCGA-P6-A5OG-01A, TCGA-OR-A5JW-01A, TCGA-OR-A5JZ-01A, TCGA-OR-A5J8-01A, TCGA-OR-A5K5-01A, TCGA-OR-A5KV-01A, TCGA-OR-A5L4-01A, TCGA-OR-A5KX-01A, TCGA-OR-A5K1-01A, TCGA-OR-A5JO-01A, TCGA-OR-A5LG-01A, TCGA-OR-A5LO-01A, TCGA-OR-A5JB-01A, TCGA-OR-A5JV-01A, TCGA-OR-A5LJ-01A, TCGA-OR-A5LA-01A, TCGA-OR-A5KY-01A, TCGA-OR-A5KO-01A, TCGA-OR-A5L6-01A, TCGA-OR-A5KZ-01A, TCGA-OR-A5J5-01A, TCGA-OR-A5LB-01A, TCGA-OR-A5LT-01A, TCGA-OR-A5LD-01A, TCGA-OR-A5J2-01A, TCGA-OR-A5LE-01A, TCGA-OR-A5K4-01A, TCGA-OR-A5K6-01A, TCGA-OR-A5JY-01A, TCGA-OR-A5JT-01A, TCGA-OR-A5KW-01A, TCGA-PK-A5H8-01A, TCGA-OR-A5JX-01A, TCGA-OR-A5LK-01A, TCGA-P6-A5OF-01A, TCGA-OR-A5JM-01A, TCGA-OR-A5JI-01A, TCGA-OR-A5JC-01A, 

In [5]:
display(merged_df.head())

Unnamed: 0,xena_sample,gene,chrom,chromStart,chromEnd,strand,TCGA-OR-A5JP-01A,TCGA-OR-A5JE-01A,TCGA-OR-A5JG-01A,TCGA-OR-A5L9-01A,...,TARGET-50-CAAAAQ-11A,TARGET-50-PAKSCC-01A,TARGET-50-PAJNSL-11A,TARGET-50-PAJPAU-01A,TARGET-50-PAJNZU-01A,TARGET-50-PAJNNR-01A,TARGET-50-PAJNTJ-02A,TARGET-50-PAECJB-01A,TARGET-50-PALFRD-01A,_merge
0,ENSG00000000003.13,TSPAN6,chrX,100627109,100639991,-,17.544533,17.623128,17.747056,18.246428,...,19.491339,20.104873,19.215693,19.308517,19.586799,19.567246,18.448389,19.536579,19.907498,both
1,ENSG00000000005.5,TNMD,chrX,100584802,100599885,+,10.592418,11.728494,12.427999,0.0,...,14.251711,10.770522,13.559971,9.758736,16.225119,11.877669,12.018519,16.369945,11.156833,both
2,ENSG00000000419.11,DPM1,chr20,50934867,50958555,-,20.046039,20.569446,20.392837,18.801009,...,19.230607,19.57999,19.37457,19.258578,19.058986,19.981892,19.085757,19.326476,19.838199,both
3,ENSG00000000457.12,SCYL3,chr1,169849631,169894267,-,15.323361,15.22832,15.075964,15.563598,...,15.451625,15.225552,15.411968,16.505343,15.762901,15.48178,17.207028,16.205465,15.483468,both
4,ENSG00000000460.15,C1orf112,chr1,169662007,169854080,+,14.066195,14.361755,14.457514,13.769704,...,12.789757,14.406227,14.04045,15.90826,15.992289,15.80077,16.772996,16.536652,15.21413,both


In [9]:
# Print the updated df DataFrame
display(merged_df.head()) # original one

Unnamed: 0,xena_sample,gene,chrom,chromStart,chromEnd,strand,TCGA-OR-A5JP-01A,TCGA-OR-A5JE-01A,TCGA-OR-A5JG-01A,TCGA-OR-A5L9-01A,...,TARGET-50-CAAAAQ-11A,TARGET-50-PAKSCC-01A,TARGET-50-PAJNSL-11A,TARGET-50-PAJPAU-01A,TARGET-50-PAJNZU-01A,TARGET-50-PAJNNR-01A,TARGET-50-PAJNTJ-02A,TARGET-50-PAECJB-01A,TARGET-50-PALFRD-01A,_merge
0,ENSG00000000003.13,TSPAN6,chrX,100627109,100639991,-,17.544533,17.623128,17.747056,18.246428,...,19.491339,20.104873,19.215693,19.308517,19.586799,19.567246,18.448389,19.536579,19.907498,both
1,ENSG00000000005.5,TNMD,chrX,100584802,100599885,+,10.592418,11.728494,12.427999,0.0,...,14.251711,10.770522,13.559971,9.758736,16.225119,11.877669,12.018519,16.369945,11.156833,both
2,ENSG00000000419.11,DPM1,chr20,50934867,50958555,-,20.046039,20.569446,20.392837,18.801009,...,19.230607,19.57999,19.37457,19.258578,19.058986,19.981892,19.085757,19.326476,19.838199,both
3,ENSG00000000457.12,SCYL3,chr1,169849631,169894267,-,15.323361,15.22832,15.075964,15.563598,...,15.451625,15.225552,15.411968,16.505343,15.762901,15.48178,17.207028,16.205465,15.483468,both
4,ENSG00000000460.15,C1orf112,chr1,169662007,169854080,+,14.066195,14.361755,14.457514,13.769704,...,12.789757,14.406227,14.04045,15.90826,15.992289,15.80077,16.772996,16.536652,15.21413,both


In [16]:
# Ensure the expression dataframe is in the format: indexed on gene names column labels as sample ids
merged_trimmed_df = merged_df
merged_trimmed_df.drop(columns=['xena_sample', 'chrom', 'chromStart', 'chromEnd', 'strand', '_merge'], axis=1, inplace=True)
merged_trimmed_df.set_index('gene', inplace=True)
display(merged_trimmed_df.head())

Unnamed: 0_level_0,TCGA-OR-A5JP-01A,TCGA-OR-A5JE-01A,TCGA-OR-A5JG-01A,TCGA-OR-A5L9-01A,TCGA-OR-A5JR-01A,TCGA-OR-A5KU-01A,TCGA-OR-A5LS-01A,TCGA-OR-A5J7-01A,TCGA-OR-A5JQ-01A,TCGA-OR-A5JS-01A,...,TARGET-50-PAJMKJ-01A,TARGET-50-CAAAAQ-11A,TARGET-50-PAKSCC-01A,TARGET-50-PAJNSL-11A,TARGET-50-PAJPAU-01A,TARGET-50-PAJNZU-01A,TARGET-50-PAJNNR-01A,TARGET-50-PAJNTJ-02A,TARGET-50-PAECJB-01A,TARGET-50-PALFRD-01A
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TSPAN6,17.544533,17.623128,17.747056,18.246428,18.257023,18.353113,18.292785,17.084432,18.562685,18.237882,...,19.286487,19.491339,20.104873,19.215693,19.308517,19.586799,19.567246,18.448389,19.536579,19.907498
TNMD,10.592418,11.728494,12.427999,0.0,8.30802,9.006822,13.378305,11.823974,9.67263,9.578988,...,9.520397,14.251711,10.770522,13.559971,9.758736,16.225119,11.877669,12.018519,16.369945,11.156833
DPM1,20.046039,20.569446,20.392837,18.801009,20.042714,20.133037,19.859895,19.585215,19.886858,20.067324,...,19.861846,19.230607,19.57999,19.37457,19.258578,19.058986,19.981892,19.085757,19.326476,19.838199
SCYL3,15.323361,15.22832,15.075964,15.563598,15.260088,14.850601,16.18701,15.453456,15.045246,15.309825,...,16.5117,15.451625,15.225552,15.411968,16.505343,15.762901,15.48178,17.207028,16.205465,15.483468
C1orf112,14.066195,14.361755,14.457514,13.769704,14.161787,13.557133,13.918952,15.328623,13.781018,14.321432,...,16.570038,12.789757,14.406227,14.04045,15.90826,15.992289,15.80077,16.772996,16.536652,15.21413


In [10]:
# Read in the basic phenotype data
df_basic_phenotype = pd.read_parquet('./data/GDC-PANCAN.basic_phenotype.parquet')

In [11]:
display(df_basic_phenotype.head())
print(df_basic_phenotype.info())

Unnamed: 0,sample,program,sample_type_id,sample_type,project_id,Age at Diagnosis in Years,Gender
0,TCGA-69-7978-01A,TCGA,1,Primary Tumor,TCGA-LUAD,59.0,Male
1,TCGA-AR-A24Z-01A,TCGA,1,Primary Tumor,TCGA-BRCA,57.0,Female
2,TCGA-D1-A103-01A,TCGA,1,Primary Tumor,TCGA-UCEC,87.0,Female
3,TARGET-20-PASRLS-09A,TARGET,9,Primary Blood Derived Cancer - Bone Marrow,TARGET-AML,0.816438,Female
4,TARGET-20-PASARK-14A,TARGET,14,Bone Marrow Normal,TARGET-AML,15.520548,Male


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19188 entries, 0 to 19187
Data columns (total 7 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   sample                     19188 non-null  object 
 1   program                    19188 non-null  object 
 2   sample_type_id             19188 non-null  int64  
 3   sample_type                19117 non-null  object 
 4   project_id                 18954 non-null  object 
 5   Age at Diagnosis in Years  18677 non-null  float64
 6   Gender                     18738 non-null  object 
dtypes: float64(1), int64(1), object(5)
memory usage: 1.0+ MB
None


In [12]:
# Read in the survival phenotype data
df_survival_phenotype = pd.read_parquet('./data/GDC-PANCAN.survival.parquet')

In [13]:
display(df_survival_phenotype.head())
print(df_survival_phenotype.info())

Unnamed: 0,sample,OS,_PATIENT,OS.time
0,TCGA-OR-A5KZ-01A,1,TCGA-OR-A5KZ,125
1,TCGA-OR-A5LC-01A,1,TCGA-OR-A5LC,159
2,TCGA-P6-A5OF-01A,1,TCGA-P6-A5OF,207
3,TCGA-OR-A5JU-01A,1,TCGA-OR-A5JU,289
4,TCGA-OR-A5K9-11A,1,TCGA-OR-A5K9,344


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18492 entries, 0 to 18491
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   sample    18492 non-null  object
 1   OS        18492 non-null  int64 
 2   _PATIENT  18492 non-null  object
 3   OS.time   18492 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 578.0+ KB
None


In [17]:
# Create a list of all samples
samples_rna = list(merged_trimmed_df.columns)
samples_pheno = list(df_basic_phenotype['sample'].values)
samples_survival = list(df_survival_phenotype['sample'].values)

In [18]:
# Find all common samples in all three lists
common_samples = list(set(samples_rna) & set(samples_pheno) & set(samples_survival))

In [19]:
# Subset and reorder all three datasets by common_samples
# Filter merged_df by columns in common_samples
df_merged_filtered = merged_trimmed_df[common_samples]
df_basic_phenotype_filtered = df_basic_phenotype[df_basic_phenotype['sample'].isin(common_samples)]
df_survival_phenotype_filtered = df_survival_phenotype[df_survival_phenotype['sample'].isin(common_samples)]

In [20]:
display(df_merged_filtered.head())
display(df_basic_phenotype_filtered.head())
display(df_survival_phenotype_filtered.head())

Unnamed: 0_level_0,TCGA-B0-5092-01A,TCGA-EM-A2P1-06A,TCGA-EK-A2RB-01A,TCGA-QR-A70U-01A,TCGA-DS-A7WH-01A,TCGA-BH-A0HB-01A,TCGA-CV-5977-01A,TCGA-E8-A2JQ-01A,TCGA-HZ-8001-01A,TCGA-J4-A67N-01A,...,TCGA-24-1562-01A,TCGA-DM-A1DB-01A,TCGA-ER-A19G-06A,TCGA-L5-A4OG-11A,TCGA-CJ-4904-01A,TCGA-FD-A5C0-01A,TCGA-C5-A1MF-01A,TCGA-D8-A1J9-01A,TCGA-OL-A6VO-01A,TARGET-20-PANSBH-09A
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TSPAN6,16.883139,18.044325,18.563318,14.261682,17.497327,17.621236,17.174381,18.300299,17.85913,17.999614,...,18.384206,21.076464,16.350409,17.52265,17.835202,19.842137,17.32298,17.029695,18.400249,10.08443
TNMD,11.400423,0.0,0.0,0.0,8.217318,11.63837,0.0,0.0,12.926527,8.528177,...,12.695337,15.336365,0.0,0.0,12.545499,8.110187,9.839878,0.0,10.343369,0.0
DPM1,18.941371,19.57733,19.740473,19.346546,19.812974,19.71889,19.916774,19.202222,19.373198,19.067896,...,19.749937,21.367464,19.639513,19.092025,18.759391,20.142187,19.494731,19.32185,19.180661,20.463873
SCYL3,15.761547,16.095025,15.309957,14.640364,15.594273,15.918741,14.994572,15.692706,15.506588,15.722934,...,15.637122,15.31873,15.992741,16.291034,15.284375,15.528646,15.273744,17.473718,15.746154,14.673866
C1orf112,14.114888,14.446739,15.688927,12.976915,16.931707,14.864885,15.052504,14.613625,14.184866,13.645658,...,15.367002,15.702452,16.290281,14.217542,13.764538,14.851685,16.014469,15.697993,16.343102,14.52376


Unnamed: 0,sample,program,sample_type_id,sample_type,project_id,Age at Diagnosis in Years,Gender
0,TCGA-69-7978-01A,TCGA,1,Primary Tumor,TCGA-LUAD,59.0,Male
1,TCGA-AR-A24Z-01A,TCGA,1,Primary Tumor,TCGA-BRCA,57.0,Female
2,TCGA-D1-A103-01A,TCGA,1,Primary Tumor,TCGA-UCEC,87.0,Female
5,TCGA-24-1435-01A,TCGA,1,Primary Tumor,TCGA-OV,57.0,Female
7,TCGA-63-A5MB-01A,TCGA,1,Primary Tumor,TCGA-LUSC,62.0,Male


Unnamed: 0,sample,OS,_PATIENT,OS.time
0,TCGA-OR-A5KZ-01A,1,TCGA-OR-A5KZ,125
1,TCGA-OR-A5LC-01A,1,TCGA-OR-A5LC,159
2,TCGA-P6-A5OF-01A,1,TCGA-P6-A5OF,207
5,TCGA-OR-A5K9-01A,1,TCGA-OR-A5K9,344
6,TCGA-OR-A5J5-01A,1,TCGA-OR-A5J5,365


In [21]:
# Reorder phenotype and survival dataframes to match the columns of the rna matrix
column_order = list(df_merged_filtered.columns)
df_basic_phenotype_filtered_ordered = df_basic_phenotype_filtered.set_index('sample').loc[column_order].reset_index()
df_survival_phenotype_filtered_ordered = df_survival_phenotype_filtered.set_index('sample').loc[column_order].reset_index()

## GSVA: gene set variation analysis

---

Ref: https://pypi.org/project/GSVA/   
Ref: https://jason-weirather.github.io/GSVA/   
Ref: https://bioconductor.statistik.tu-dortmund.de/packages/3.16/bioc/vignettes/GSVA/inst/doc/GSVA.html#1_Quick_start   
Ref: https://software.broadinstitute.org/cancer/software/gsea/wiki/index.php/Data_formats#GMT:_Gene_Matrix_Transposed_file_format_.28.2A.gmt.29   

In [17]:
# Locate the ID/Gene mapped merged dataframe
display(merged_df.head())

Unnamed: 0,xena_sample,gene,chrom,chromStart,chromEnd,strand,TCGA-OR-A5JP-01A,TCGA-OR-A5JE-01A,TCGA-OR-A5JG-01A,TCGA-OR-A5L9-01A,...,TARGET-50-CAAAAQ-11A,TARGET-50-PAKSCC-01A,TARGET-50-PAJNSL-11A,TARGET-50-PAJPAU-01A,TARGET-50-PAJNZU-01A,TARGET-50-PAJNNR-01A,TARGET-50-PAJNTJ-02A,TARGET-50-PAECJB-01A,TARGET-50-PALFRD-01A,_merge
0,ENSG00000000003.13,TSPAN6,chrX,100627109,100639991,-,17.544533,17.623128,17.747056,18.246428,...,19.491339,20.104873,19.215693,19.308517,19.586799,19.567246,18.448389,19.536579,19.907498,both
1,ENSG00000000005.5,TNMD,chrX,100584802,100599885,+,10.592418,11.728494,12.427999,0.0,...,14.251711,10.770522,13.559971,9.758736,16.225119,11.877669,12.018519,16.369945,11.156833,both
2,ENSG00000000419.11,DPM1,chr20,50934867,50958555,-,20.046039,20.569446,20.392837,18.801009,...,19.230607,19.57999,19.37457,19.258578,19.058986,19.981892,19.085757,19.326476,19.838199,both
3,ENSG00000000457.12,SCYL3,chr1,169849631,169894267,-,15.323361,15.22832,15.075964,15.563598,...,15.451625,15.225552,15.411968,16.505343,15.762901,15.48178,17.207028,16.205465,15.483468,both
4,ENSG00000000460.15,C1orf112,chr1,169662007,169854080,+,14.066195,14.361755,14.457514,13.769704,...,12.789757,14.406227,14.04045,15.90826,15.992289,15.80077,16.772996,16.536652,15.21413,both


### 1) GSEAPY GSVA: gene set variation analysis

---

Takes roughly 1h20m to run.

In [22]:
# !pip install gseapy

In [25]:
import gseapy as gp
import threading

In [27]:
# Test data
signature = {'ecsig': ['EGFR', 'FGR']}
n_threads=threading.active_count()-1

scores=gp.gsva(data=merged_trimmed_df,
               gene_sets=signature,
               threads=n_threads,
               min_size=2,
               outdir=None,
               verbose=True)

2024-09-13 16:56:29,633 [INFO] Parsing data files for GSVA.............................
2024-09-13 16:56:56,135 [INFO] Estimating ECDFs with Gaussian kernels.
2024-09-13 16:56:56,142 [INFO] 0000 gene_sets have been filtered out when max_size=1000 and min_size=2
2024-09-13 16:56:56,143 [INFO] 0001 gene_sets used for further statistical testing.....
2024-09-13 16:56:56,143 [INFO] Start to run GSVA...Might take a while................
2024-09-13 18:16:59,629 [INFO] Done


In [28]:
# Display 2D results
scores.res2d

Unnamed: 0,Name,Term,ES
0,TCGA-OR-A5JP-01A,ecsig,-0.851434
1,TCGA-XK-AAJA-01A,ecsig,-0.60125
2,TCGA-KK-A8I7-01A,ecsig,0.319677
3,TCGA-J4-A83L-01A,ecsig,-0.319414
4,TCGA-HC-7817-01B,ecsig,0.514978
...,...,...,...
11763,TCGA-CJ-4874-01A,ecsig,0.81312
11764,TCGA-B0-5697-11A,ecsig,-0.837024
11765,TCGA-CZ-5463-11A,ecsig,0.451239
11766,TCGA-CZ-5469-11A,ecsig,0.506551


In [None]:
# Pivot table to display a matrix of scores
scores.res2d.pivot(index='Term', columns='Name', values='ES').head()

### 2) GSVA using R in Python (via rpy2): gene set variation analysis

Testing using R inside Python with R's highly optimized GSVA calculations

---

Transform dataframe into R-compatible object 14 minutes.   
gsva calculation 3:21-??

In [31]:
# !pip install rpy2

In [32]:
import rpy2.robjects as ro
from rpy2.robjects import pandas2ri
pandas2ri.activate()
# ro.r('BiocManager::install("GSVA")') # Install GSVA library in R

In [34]:
# Convert pandas DataFrame to an R-compatible DataFrame
r_merged_trimmed_df = pandas2ri.py2rpy(merged_trimmed_df)

# Assign the DataFrame to the R environment
ro.globalenv['merged_trimmed_df'] = r_merged_trimmed_df

# Your gene signature (if it's a Python dict, convert it to an R list)
signature = {'ecsig': ['EGFR', 'FGR']}
r_signature = ro.ListVector(signature)
ro.globalenv['signature'] = r_signature



In [None]:
ro.r('library(GSVA)')
ro.r('gsva_result <- gsva(as.matrix(merged_trimmed_df), signature)')

In [None]:
# To retrieve the result back to Python
gsva_result = ro.r('as.data.frame(gsva_result)')

## Gene Signatures & Scoring Methods - Alternatives to GSVA

---

### 1) Single-Sample Gene Set Enrichment Analysis (ssGSEA) with gseapy

---

ssGSEA is similar to GSVA but calculates enrichment scores for each sample separately. ssGSEA can sometimes be faster than GSVA, especially for large datasets.

Takes roughly 8 minutes to run.   
Takes roughly 6 minutes to run. with cProfile

In [19]:
import gseapy as gp
import threading
import numpy as np
import cProfile
from datetime import datetime

In [20]:
signature = {'ecsig': ['EGFR', 'FGR']}
n_threads=threading.active_count()-1

In [None]:
# Run using cProfile
def run_ssgsea():
    signature = {'ecsig': ['EGFR', 'FGR']}
    ss = gp.ssgsea(data=merged_trimmed_df, gene_sets=signature, outdir=None, 
                   sample_norm_method='rank', threads=n_threads, min_size=2, 
                   verbose=True)

cProfile.run('run_ssgsea()')

In [21]:
# Run normally, without using cProfile
ss = gp.ssgsea(data=merged_trimmed_df, gene_sets=signature, outdir=None, 
               sample_norm_method='rank', threads=n_threads, min_size=2, 
               verbose=True)

2024-09-19 09:54:53,362 [INFO] Parsing data files for ssGSEA...........................
2024-09-19 09:57:29,135 [INFO] 0000 gene_sets have been filtered out when max_size=500 and min_size=2
2024-09-19 09:57:29,136 [INFO] 0001 gene_sets used for further statistical testing.....
2024-09-19 09:57:29,137 [INFO] Start to run ssGSEA...Might take a while................


In [22]:
today = datetime.now().strftime("%Y-%m-%d_%H:%M:%S")
print(today)

2024-09-19_10:01:52


In [25]:
print(ss.res2d.head())

               Name   Term            ES       NES
0  TCGA-CV-6933-01A  ecsig  27817.584014  0.870469
1  TCGA-DU-A5TY-01A  ecsig  27703.696631  0.866905
2  TCGA-E1-A7YN-01A  ecsig  27701.631012  0.866841
3  TCGA-BP-4354-01A  ecsig  27627.828306  0.864531
4  TCGA-24-2262-01A  ecsig  27533.418743  0.861577


### 2) pySCENIC

---

A python version of R's SCENIC pipeline - significantly faster than R's implementation.

aucell auc_mtx calculation: 1:00pm--1:01pm with error

In [20]:
# !pip install pyscenic

In [35]:
import pyscenic
from pyscenic.aucell import aucell

In [29]:
signature = {'ecsig': ['EGFR', 'FGR']}

In [31]:
# Create a list of all unique genes
all_genes = list(set(gene for genes in signature.values() for gene in genes))

# Initialize an empty DataFrame with all genes as the index and regulons as columns
regulons_df = pd.DataFrame(index=all_genes, columns=signature.keys()).fillna(0)

# Fill the DataFrame with 1s where the gene is present in the regulon
for regulon, genes in signature.items():
    regulons_df.loc[genes, regulon] = 1

  regulons_df = pd.DataFrame(index=all_genes, columns=signature.keys()).fillna(0)


In [32]:
print(regulons_df.head())

      ecsig
FGR       1
EGFR      1


In [None]:
auc_mtx = aucell(merged_trimmed_df, regulons_df)

## KM Plot

---

OS Values:
* 1 = `deceased`
* 0 = `living`

Ref: https://docs.cbioportal.org/user-guide/faq/#what-is-the-meaning-of-os_status--os_months-and-pfs_status--pfs_months   
Ref: https://erdogant.github.io/kaplanmeier/pages/html/Examples.html

In [62]:
import kaplanmeier as km

In [None]:
# EXAMPLE: https://erdogant.github.io/kaplanmeier/pages/html/Examples.html
time_event = df_survival_phenotype_filtered_ordered['OS.time']
censoring = df_survival_phenotype_filtered_ordered['OS']
y = df['group']

print(df)
#       time  Died  group
# 0     485     0      1
# 1     526     1      2
# 2     588     1      2
# 3     997     0      1
# 4     426     1      1
# ..    ...   ...    ...
# 175   183     0      1
# 176  3196     0      1
# 177   457     1      2
# 178  2100     1      1
# 179   376     0      1
#
# [180 rows x 3 columns]

# Compute Survival
results = km.fit(time_event, censoring, y)

# Plot
km.plot(results)

### Transfer CSV files to Parquet

---

In [20]:
# Read in the csv into a df
# df_transfer = pd.read_csv('./data/GDC-PANCAN.htseq_fpkm-uq.tsv', sep='\t')
# df_survival_transfer = pd.read_csv('./data/GDC-PANCAN.survival.tsv', sep='\t')
# df_basic_pheno_transfer = pd.read_csv('./data/GDC-PANCAN.basic_phenotype.tsv', sep='\t')

# Transform csv's into parquet files
# df_transfer.to_parquet('./data/GDC-PANCAN.htseq_fpkm-uq.parquet', compression=None)
# df_survival_transfer.to_parquet('./data/GDC-PANCAN.survival.parquet', compression=None)
# df_basic_pheno_transfer.to_parquet('./data/GDC-PANCAN.basic_phenotype.parquet', compression=None)