In [1]:
library(tidyverse)

# Custom package
library(rutils)

── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
✔ ggplot2 3.3.2     ✔ purrr   0.3.4
✔ tibble  3.0.3     ✔ dplyr   1.0.0
✔ tidyr   1.1.0     ✔ stringr 1.4.0
✔ readr   1.3.1     ✔ forcats 0.5.0
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()


In [2]:
# Project directory lookup via the custom rutils helper.
# NOTE(review): presumably returns a list with at least `data_dir` and
# `analysis_dir` (both are accessed in this notebook) -- confirm in rutils.
dirs <- rutils::get_dev_directories(
    dev_paths_file = "../dev_paths.txt"
)

# TCGA project identifiers (not referenced by the cells visible here).
projects <- c(
    "TCGA-CESC",
    "TCGA-UCS",
    "TCGA-UCEC",
    "TCGA-OV"
)

# File-name prefixes of the unified datasets; indexed by `dset_idx` below.
unified_dsets <- c(
    "unified_cervical_data",
    "unified_uterine_data",
    "unified_uterine_endometrial_data"
)

# Location of the matrisome master list under the data directory.
matrisome_path <- paste0(
    dirs$data_dir,
    "/matrisome/matrisome_hs_masterlist.tsv"
)

In [3]:
# 1-based index into `unified_dsets`; decides which dataset's result files
# the cells below load. 2 selects "unified_uterine_data".
dset_idx <- 2

In [4]:
# Cutoffs used when filtering result tables.
# `<-` is used for assignment, consistent with the rest of this notebook.

# Upper bound on the adjusted p-value (compared against `padj` in the DEG cell).
p_thresh <- 0.05
# Lower bound on |log2FoldChange|; log2(2) equals 1, i.e. a two-fold change.
lfc_thresh <- log2(2)
# NOTE(review): presumably a cutoff for Cox PH coefficients; it is not
# referenced by any cell visible here -- confirm where it is consumed.
coxph_coeff_thresh <- 0.0

# DEG quick glance

In [5]:
# Load the DESeq results table for the dataset selected by `dset_idx`.
deg_results_df <- read_tsv(
    file = paste0(
        dirs$analysis_dir, "/",
        unified_dsets[dset_idx], "_DESeq_results.tsv"
    )
)

Parsed with column specification:
cols(
  geneID = col_character(),
  baseMean = col_double(),
  log2FoldChange = col_double(),
  lfcSE = col_double(),
  stat = col_double(),
  pvalue = col_double(),
  padj = col_double()
)


In [6]:
# Preview of the strongest differential-expression hits: keep the display
# columns, require padj below `p_thresh` and |log2FoldChange| above
# `lfc_thresh`, then show the 20 smallest padj values (ties kept) in
# ascending order.
deg_results_df %>%
    dplyr::select(geneID, baseMean, log2FoldChange, padj) %>%
    dplyr::filter(
        padj < p_thresh,
        abs(log2FoldChange) > lfc_thresh
    ) %>%
    dplyr::top_n(n = -20, wt = padj) %>%
    dplyr::arrange(padj)

geneID,baseMean,log2FoldChange,padj
<chr>,<dbl>,<dbl>,<dbl>
CBX7,5921.31225,-4.618208,0.0
AURKA,562.38661,4.84462,4.775893e-249
AUNIP,102.95701,4.245145,2.558909e-232
NXPH4,447.19461,8.081839,8.750641e-230
CCNF,487.9666,3.362696,2.193272e-226
TCF23,825.94556,-8.268469,9.582031999999999e-224
KIF2C,868.44071,4.903658,7.673212e-219
CBX2,1123.3753,4.196122,3.082652e-214
CDKN2A,1344.24703,6.989934,2.8465740000000003e-208
OTX1,76.90961,6.74742,6.4998800000000004e-205


# Cox PH quick glance

In [7]:
# Load the Cox PH results table for the dataset selected by `dset_idx`.
coxph_results_df <-
    paste0(
        dirs$analysis_dir, "/",
        unified_dsets[dset_idx], "_coxph_results.tsv"
    ) %>%
    read_tsv()

Parsed with column specification:
cols(
  geneID = col_character(),
  gene_pval = col_double(),
  gene_coeff = col_double()
)


In [8]:
# Show the 20 genes with the smallest Cox PH p-values, ascending.
# Sorting first and then applying the rank filter yields the same rows in the
# same order, because top_n() keeps rows in their incoming order.
coxph_results_df %>%
    dplyr::arrange(gene_pval) %>%
    dplyr::top_n(n = -20, wt = gene_pval)

geneID,gene_pval,gene_coeff
<chr>,<dbl>,<dbl>
COL7A1,0.0002336249,-0.7225286
TGM3,0.0004093898,1.1245609
CST6,0.0005719445,-1.0079816
SDC1,0.0010491945,-0.7943372
MUC12,0.0011265731,1.2784272
COL5A3,0.0015920889,0.8329635
WNT10A,0.0017444436,-0.572024
IFNA1,0.0020149675,-6.510222
WNT7B,0.0025813922,-0.4752282
TIMP4,0.0027979185,0.4775624


# Correlation quick glance

In [9]:
# Load the correlation results table for the dataset selected by `dset_idx`.
cor_results_df <- read_tsv(
    file = paste0(
        dirs$analysis_dir, "/",
        unified_dsets[dset_idx], "_cor_results.tsv"
    )
)

Parsed with column specification:
cols(
  geneID = col_character(),
  cor = col_double(),
  pval = col_double(),
  n = col_double()
)


In [10]:
# Show the 20 correlations with the smallest p-values (ties kept), ascending.
# The sample-size column `n` is dropped from the display at the end.
cor_results_df %>%
    dplyr::top_n(n = -20, wt = pval) %>%
    dplyr::arrange(pval) %>%
    dplyr::select(geneID, cor, pval)

geneID,cor,pval
<chr>,<dbl>,<dbl>
FGF16,0.7061038,3.866132e-05
IL9,0.5942475,0.001081511
IFNA2,0.5942475,0.001081511
TSKU,0.5878516,0.001261943
TECTB,0.5407042,0.003592
ANGPTL6,0.5320337,0.004284821
OGN,0.5116528,0.006372819
FGL1,-0.4973148,0.008308743
FGF6,0.486533,0.01007027
TGM6,0.4859619,0.01017166


# MI survival quick glance

In [11]:
# Load the MI survival results table for the dataset selected by `dset_idx`.
mi_survival_results_df <-
    paste0(
        dirs$analysis_dir, "/",
        unified_dsets[dset_idx], "_MI_survival_results.tsv"
    ) %>%
    read_tsv()

Parsed with column specification:
cols(
  geneID = col_character(),
  MI_est_median = col_double()
)


In [12]:
# Show the 20 genes with the largest MI_est_median (ties kept), descending.
# Sorting first is equivalent: top_n() only filters and preserves row order.
mi_survival_results_df %>%
    dplyr::arrange(desc(MI_est_median)) %>%
    dplyr::top_n(n = 20, wt = MI_est_median)

geneID,MI_est_median
<chr>,<dbl>
COL9A2,0.3096317
CTSG,0.28912
SEMA3D,0.2537188
IL17C,0.2498483
S100A3,0.2403626
ADAM33,0.2396814
TGFA,0.2301018
SPP1,0.2249892
COL27A1,0.2236791
MMP13,0.2233744


# MAE GBR quick glance

In [13]:
# Load the MAE GBR results table for the dataset selected by `dset_idx`.
mae_gbr_results_df <- read_tsv(
    file = paste0(
        dirs$analysis_dir, "/",
        unified_dsets[dset_idx], "_mae_gbr_results.tsv"
    )
)

Parsed with column specification:
cols(
  geneID = col_character(),
  mean_imp_0 = col_double(),
  score_pct_improvement_0 = col_double(),
  mean_imp_1 = col_double(),
  score_pct_improvement_1 = col_double(),
  mean_imp_2 = col_double(),
  score_pct_improvement_2 = col_double(),
  mean_imp_3 = col_double(),
  score_pct_improvement_3 = col_double(),
  mean_imp_4 = col_double(),
  score_pct_improvement_4 = col_double(),
  consensus_imp_mean = col_double(),
  consensus_imp_std = col_double(),
  consensus_imp_cv = col_double(),
  consensus_vote = col_logical()
)


In [14]:
# Per gene, average every column whose name contains "pct" (the
# score_pct_improvement_* columns) into `consensus_mean_pct_imp`, keep only
# the gene ID, consensus vote and that mean, then show the 20 largest means
# (ties kept) in descending order.
mae_gbr_results_df %>%
    dplyr::transmute(
        geneID,
        consensus_vote,
        consensus_mean_pct_imp = rowMeans(
            dplyr::select(., contains("pct"))
        )
    ) %>%
    dplyr::top_n(n = 20, wt = consensus_mean_pct_imp) %>%
    dplyr::arrange(desc(consensus_mean_pct_imp))

geneID,consensus_vote,consensus_mean_pct_imp
<chr>,<lgl>,<dbl>
CTSV,True,1.03170675
INHBC,True,0.9924938
CCL16,True,0.08939644
ANGPTL7,True,0.07644699
TMPRSS15,False,0.0714004
SFRP5,True,0.04796304
FN1,True,0.04042421
WISP1,True,0.03666048
COL25A1,True,0.03610745
CLEC12A,True,0.03374429


# EV GBR quick glance

In [17]:
# Load the EV GBR results table for the dataset selected by `dset_idx`.
ev_gbr_results_df <-
    paste0(
        dirs$analysis_dir, "/",
        unified_dsets[dset_idx], "_ev_gbr_results.tsv"
    ) %>%
    read_tsv()

Parsed with column specification:
cols(
  geneID = col_character(),
  mean_imp_0 = col_double(),
  score_pct_improvement_0 = col_double(),
  mean_imp_1 = col_double(),
  score_pct_improvement_1 = col_double(),
  mean_imp_2 = col_double(),
  score_pct_improvement_2 = col_double(),
  mean_imp_3 = col_double(),
  score_pct_improvement_3 = col_double(),
  mean_imp_4 = col_double(),
  score_pct_improvement_4 = col_double(),
  consensus_imp_mean = col_double(),
  consensus_imp_std = col_double(),
  consensus_imp_cv = col_double(),
  consensus_vote = col_logical()
)


In [23]:
# Per gene, average every column whose name contains "pct" (the
# score_pct_improvement_* columns) into `consensus_mean_pct_imp`, keep only
# the gene ID, consensus vote and that mean, then show the 20 largest means
# (ties kept) in descending order.
ev_gbr_results_df %>%
    dplyr::transmute(
        geneID,
        consensus_vote,
        consensus_mean_pct_imp = rowMeans(
            dplyr::select(., contains("pct"))
        )
    ) %>%
    dplyr::top_n(n = 20, wt = consensus_mean_pct_imp) %>%
    dplyr::arrange(desc(consensus_mean_pct_imp))

geneID,consensus_vote,consensus_mean_pct_imp
<chr>,<lgl>,<dbl>
FGF1,True,32.1966959
INHBC,True,30.7086815
LGALS9B,True,28.983941
SLIT2,True,23.8823489
CTSG,True,15.2848516
SFRP2,True,11.8558355
CTSV,True,7.6207291
WNT7B,True,7.5323356
TMPRSS15,True,5.6932101
OMD,True,5.4181083


# MI FIGO quick glance

In [27]:
# Load the MI FIGO results table for the dataset selected by `dset_idx`.
mi_figo_results_df <- read_tsv(
    file = paste0(
        dirs$analysis_dir, "/",
        unified_dsets[dset_idx], "_MI_figo_results.tsv"
    )
)

Parsed with column specification:
cols(
  geneID = col_character(),
  MI_est_median = col_double()
)


In [29]:
# Show the 20 genes with the largest MI_est_median (ties kept), descending.
mi_figo_results_df %>%
    dplyr::top_n(
        n = 20,
        wt = MI_est_median
    ) %>%
    dplyr::arrange(
        desc(MI_est_median)
    )

geneID,MI_est_median
<chr>,<dbl>
LOXL3,0.9755391
ADAM8,0.9283197
ANXA8L1,0.8728699
CCL3L3,0.8561879
PPBP,0.8160963
NTNG1,0.8131801
IFNA5,0.7771863
PLXNA2,0.758161
SBSPON,0.7567825
CX3CL1,0.7385588


# F1 GBC quick glance

In [15]:
# Load the GBC results table for the dataset selected by `dset_idx`.
f1_gbc_results_df <-
    paste0(
        dirs$analysis_dir, "/",
        unified_dsets[dset_idx], "_gbc_results.tsv"
    ) %>%
    read_tsv()

Parsed with column specification:
cols(
  geneID = col_character(),
  mean_imp_0 = col_double(),
  score_pct_improvement_0 = col_double(),
  mean_imp_1 = col_double(),
  score_pct_improvement_1 = col_double(),
  mean_imp_2 = col_double(),
  score_pct_improvement_2 = col_double(),
  mean_imp_3 = col_double(),
  score_pct_improvement_3 = col_double(),
  mean_imp_4 = col_double(),
  score_pct_improvement_4 = col_double(),
  consensus_imp_mean = col_double(),
  consensus_imp_std = col_double(),
  consensus_imp_cv = col_double(),
  consensus_vote = col_logical()
)


In [31]:
# Per gene, average every column whose name contains "pct" (the
# score_pct_improvement_* columns) into `consensus_mean_pct_imp`, keep only
# the gene ID, consensus vote and that mean, then show the 20 largest means
# (ties kept) in descending order.
f1_gbc_results_df %>%
    dplyr::transmute(
        geneID,
        consensus_vote,
        consensus_mean_pct_imp = rowMeans(
            dplyr::select(., contains("pct"))
        )
    ) %>%
    dplyr::top_n(n = 20, wt = consensus_mean_pct_imp) %>%
    dplyr::arrange(desc(consensus_mean_pct_imp))

geneID,consensus_vote,consensus_mean_pct_imp
<chr>,<lgl>,<dbl>
AMELX,False,2.200979
NCAN,True,1.832294
CSF3,False,1.675147
XCL1,False,1.2901529
LPA,False,0.9904496
TGM3,False,0.8955166
PF4V1,False,0.7594193
CHRDL2,False,0.7342582
OSM,False,0.6853571
SERPINA4,False,0.6424801


# F1 RFC quick glance

In [32]:
# Load the RFC results table for the dataset selected by `dset_idx`.
f1_rfc_results_df <- read_tsv(
    file = paste0(
        dirs$analysis_dir, "/",
        unified_dsets[dset_idx], "_rfc_results.tsv"
    )
)

Parsed with column specification:
cols(
  geneID = col_character(),
  mean_imp_0 = col_double(),
  score_pct_improvement_0 = col_double(),
  mean_imp_1 = col_double(),
  score_pct_improvement_1 = col_double(),
  mean_imp_2 = col_double(),
  score_pct_improvement_2 = col_double(),
  mean_imp_3 = col_double(),
  score_pct_improvement_3 = col_double(),
  mean_imp_4 = col_double(),
  score_pct_improvement_4 = col_double(),
  consensus_imp_mean = col_double(),
  consensus_imp_std = col_double(),
  consensus_imp_cv = col_double(),
  consensus_vote = col_logical()
)


In [33]:
# Per gene, average every column whose name contains "pct" (the
# score_pct_improvement_* columns) into `consensus_mean_pct_imp`, keep only
# the gene ID, consensus vote and that mean, then show the 20 largest means
# (ties kept) in descending order.
f1_rfc_results_df %>%
    dplyr::transmute(
        geneID,
        consensus_vote,
        consensus_mean_pct_imp = rowMeans(
            dplyr::select(., contains("pct"))
        )
    ) %>%
    dplyr::top_n(n = 20, wt = consensus_mean_pct_imp) %>%
    dplyr::arrange(desc(consensus_mean_pct_imp))

geneID,consensus_vote,consensus_mean_pct_imp
<chr>,<lgl>,<dbl>
NCAN,False,2.2305067
TCHH,False,2.2244728
CHRDL2,False,1.8886852
XCL1,False,1.8300103
COL11A1,False,1.6658268
GDF6,False,1.4732691
TGM3,False,1.4680128
LPA,False,1.2138383
TIMP2,False,1.1756255
ZP3,False,1.1405991


# L1 LR quick glance

In [34]:
# Load the L1 LR results table for the dataset selected by `dset_idx`.
l1_lr_results_df <-
    paste0(
        dirs$analysis_dir, "/",
        unified_dsets[dset_idx], "_l1_lr_results.tsv"
    ) %>%
    read_tsv()

Parsed with column specification:
cols(
  geneID = col_character(),
  mean_imp_0 = col_double(),
  score_pct_improvement_0 = col_double(),
  mean_imp_1 = col_double(),
  score_pct_improvement_1 = col_double(),
  mean_imp_2 = col_double(),
  score_pct_improvement_2 = col_double(),
  mean_imp_3 = col_double(),
  score_pct_improvement_3 = col_double(),
  mean_imp_4 = col_double(),
  score_pct_improvement_4 = col_double(),
  consensus_imp_mean = col_double(),
  consensus_imp_std = col_double(),
  consensus_imp_cv = col_double(),
  consensus_vote = col_logical()
)


In [39]:
# Per gene, average every column whose name contains "pct" (the
# score_pct_improvement_* columns) into `consensus_mean_pct_imp`, keep only
# the gene ID, consensus vote and that mean, then show the 20 largest means
# (ties kept) in descending order.
l1_lr_results_df %>%
    dplyr::transmute(
        geneID,
        consensus_vote,
        consensus_mean_pct_imp = rowMeans(
            dplyr::select(., contains("pct"))
        )
    ) %>%
    dplyr::top_n(n = 20, wt = consensus_mean_pct_imp) %>%
    dplyr::arrange(desc(consensus_mean_pct_imp))

geneID,consensus_vote,consensus_mean_pct_imp
<chr>,<lgl>,<dbl>
FGF1,True,9.549765
CELA1,True,8.284395
PDGFC,True,7.715524
CCL16,True,6.328994
NCAN,True,6.214198
XCL1,True,5.835793
MEGF11,True,5.443356
BMPER,True,5.424592
WNT1,True,4.79032
SERPINB9,True,4.666706
