In [1]:
library(clusterProfiler)
library(org.Hs.eg.db)
library(ggplot2)
library(readr)



clusterProfiler v4.6.2  For help: https://yulab-smu.top/biomedical-knowledge-mining-book/

If you use clusterProfiler in published research, please cite:
T Wu, E Hu, S Xu, M Chen, P Guo, Z Dai, T Feng, L Zhou, W Tang, L Zhan, X Fu, S Liu, X Bo, and G Yu. clusterProfiler 4.0: A universal enrichment tool for interpreting omics data. The Innovation. 2021, 2(3):100141


Attaching package: ‘clusterProfiler’


The following object is masked from ‘package:stats’:

    filter


Loading required package: AnnotationDbi

Loading required package: stats4

Loading required package: BiocGenerics


Attaching package: ‘BiocGenerics’


The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs


The following objects are masked from ‘package:base’:

    anyDuplicated, aperm, append, as.data.frame, basename, cbind,
    colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
    get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
    match, mget, order, p

In [2]:
input_base_dir = "../DEMGs/"
input_dir = paste0(input_base_dir, "demgs/")
stage_significant_input_dir = paste0(input_base_dir, "stage_significant_demgs/")

output_base_dir <- "../enrichment_analysis/"
output_dir <- paste0(output_base_dir, "results/")
stage_significant_output_dir <- paste0(output_base_dir, "stage_significant_results/")

In [3]:
# Create output directories if they don't exist

if (!dir.exists(output_base_dir)) {
    dir.create(output_base_dir)
}

if (!dir.exists(output_dir)) {
    dir.create(output_dir)
}

if (!dir.exists(stage_significant_output_dir)) {
    dir.create(stage_significant_output_dir)
}

In [12]:
### Functions for enrichment analysis

get_enriched_pathways <- function(input_dir, output_dir) {
  gene_list_fp <- paste0(input_dir, "union_demg.txt")
  gene_list <- scan(gene_list_fp, what = "", sep = "\n")

  # Map the gene symbols to Entrez IDs
  genes_entrez <- bitr(gene_list, fromType = "SYMBOL", toType = "ENTREZID", OrgDb = org.Hs.eg.db)
  genes_entrez <- genes_entrez[!is.na(genes_entrez$ENTREZID),]

  # Perform the enrichment analysis for GO
  ego <- enrichGO(gene = genes_entrez$ENTREZID,
                  OrgDb = org.Hs.eg.db,
                  keyType = 'ENTREZID',
                  ont = "ALL",
                  pAdjustMethod = "BH",
                  qvalueCutoff = 0.05,
                  readable = TRUE)

  # Perform the enrichment analysis for KEGG
  kegg_result <- enrichKEGG(gene = genes_entrez$ENTREZID,
                            organism = 'hsa',
                            keyType = 'kegg', 
                            pAdjustMethod = "BH",
                            qvalueCutoff = 0.05)

  # Simplify the GO results and write to file
  ego_simplified <- simplify(ego, cutoff=0.7, by="p.adjust", select_fun=min)
  write.table(ego@result, file = paste0(output_dir, "go_results.tsv"), sep = "\t", quote = FALSE, row.names = FALSE)
  write.table(ego_simplified@result, file = paste0(output_dir, "go_simplified_results.tsv"), sep = "\t", quote = FALSE, row.names = FALSE)
  write.table(kegg_result@result, file = paste0(output_dir, "kegg_results.tsv"), sep = "\t", quote = FALSE, row.names = FALSE)
}

# Function to generate enrichment plot
ea_plot <- function(df, y_ax_limit=100, title = "", top = 15, label_size = 16, qval_thresh = 0.05, type_filter = "") {
  print(colnames(df))

  df <- df %>%
    dplyr::filter(qvalue < qval_thresh) %>%
    {if (type_filter != "") dplyr::filter(., str_detect(Description, type_filter)) else . } %>%
    dplyr::top_n(n = -top, wt = qvalue) %>%
    dplyr::arrange(-log10(qvalue)) %>%
    dplyr::mutate(type_fact = factor(Description, levels = unique(Description)))  # Use unique() to remove duplicates

  ggplot(df) +
    geom_bar(aes(x = type_fact, y = -log10(qvalue)), stat = "identity") +
    theme_classic() +
    theme(
      text = element_text(size = 13, family = "ArialMT"),
      axis.title.y = element_blank(),
      axis.ticks.y = element_blank(),
      axis.text.x = element_text(color = "#4d4d4d"),
      axis.text.y = element_text(color = "#4d4d4d"),
      axis.line = element_line(size = 0.8, color = "black"),
      axis.ticks = element_line(size = 0.8),
      axis.ticks.length = unit(0.15, "cm"),
    ) +
    labs(y = bquote(paste(-log[10], italic(q), "-value"))) +
    coord_flip() +
    scale_y_continuous(limits = c(0, y_ax_limit))
}

create_barplots <- function(folder_name, h, w) {
  # Read the GO and KEGG results
  go_results <- read_tsv(paste0(folder_name, "/go_results.tsv"))
  go_simplified_results <- read_tsv(paste0(folder_name, "/go_simplified_results.tsv"))
  kegg_results <- read_tsv(paste0(folder_name, "/kegg_results.tsv"))

  # Generate and save the plots
  go_plot <- ea_plot(go_results, y_ax_limit=150, top = 15)
  ggsave(paste0(folder_name, "/go_plot.png"), go_plot, height = h, width = w, dpi = 320)

  go_simplified_plot <- ea_plot(go_simplified_results, y_ax_limit=150, top = 15)
  ggsave(paste0(folder_name, "/go_simplified_plot.png"), go_simplified_plot, height = h, width = w, dpi = 320)

  kegg_plot <- ea_plot(kegg_results, y_ax_limit=20, top = 15)
  ggsave(paste0(folder_name, "/kegg_plot.png"), kegg_plot, height = h, width = w*1.25, dpi = 320)
}

In [5]:
# Run the enrichment analysis
get_enriched_pathways(input_dir, output_dir)
get_enriched_pathways(stage_significant_input_dir, stage_significant_output_dir)

'select()' returned 1:1 mapping between keys and columns

Reading KEGG annotation online: "https://rest.kegg.jp/link/hsa/pathway"...

Reading KEGG annotation online: "https://rest.kegg.jp/list/pathway/hsa"...

'select()' returned 1:1 mapping between keys and columns



In [6]:
# Create barplots of the top 15 enriched pathways
create_barplots(output_dir, 4.5, 4.5)
create_barplots(stage_significant_output_dir, 4.5, 4.5)

[1mRows: [22m[34m954[39m [1mColumns: [22m[34m10[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m "\t"
[31mchr[39m (6): ONTOLOGY, ID, Description, GeneRatio, BgRatio, geneID
[32mdbl[39m (4): pvalue, p.adjust, qvalue, Count

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.
[1mRows: [22m[34m311[39m [1mColumns: [22m[34m10[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m "\t"
[31mchr[39m (6): ONTOLOGY, ID, Description, GeneRatio, BgRatio, geneID
[32mdbl[39m (4): pvalue, p.adjust, qvalue, Count

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.
[1mRows: [

 [1] "ONTOLOGY"    "ID"          "Description" "GeneRatio"   "BgRatio"    
 [6] "pvalue"      "p.adjust"    "qvalue"      "geneID"      "Count"      


“[1m[22mThe `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
[36mℹ[39m Please use the `linewidth` argument instead.”


 [1] "ONTOLOGY"    "ID"          "Description" "GeneRatio"   "BgRatio"    
 [6] "pvalue"      "p.adjust"    "qvalue"      "geneID"      "Count"      
[1] "ID"          "Description" "GeneRatio"   "BgRatio"     "pvalue"     
[6] "p.adjust"    "qvalue"      "geneID"      "Count"      


[1mRows: [22m[34m943[39m [1mColumns: [22m[34m10[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m "\t"
[31mchr[39m (6): ONTOLOGY, ID, Description, GeneRatio, BgRatio, geneID
[32mdbl[39m (4): pvalue, p.adjust, qvalue, Count

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.
[1mRows: [22m[34m311[39m [1mColumns: [22m[34m10[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m "\t"
[31mchr[39m (6): ONTOLOGY, ID, Description, GeneRatio, BgRatio, geneID
[32mdbl[39m (4): pvalue, p.adjust, qvalue, Count

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.
[1mRows: [

 [1] "ONTOLOGY"    "ID"          "Description" "GeneRatio"   "BgRatio"    
 [6] "pvalue"      "p.adjust"    "qvalue"      "geneID"      "Count"      
 [1] "ONTOLOGY"    "ID"          "Description" "GeneRatio"   "BgRatio"    
 [6] "pvalue"      "p.adjust"    "qvalue"      "geneID"      "Count"      
[1] "ID"          "Description" "GeneRatio"   "BgRatio"     "pvalue"     
[6] "p.adjust"    "qvalue"      "geneID"      "Count"      


In [None]:
# Read the expression data and the clinical data
expr_data <- read.table("../data/all/all_phases_all_matrisome_counts.tsv", header = TRUE, row.names = 1, sep = "\t")
clinical_data <- read.table("../data_prep/metadata/stagewise_coldata.tsv", header = TRUE, sep = "\t")

# Transpose expr_data to get samples in rows and genes in columns
expr_data <- t(expr_data)

# Merge the expression data and the clinical data
data <- merge(clinical_data, expr_data, by = "row.names", all = TRUE)
rownames(data) <- data$Row.names
data$Row.names <- NULL

# Get the list of all genes used in the enrichment analysis
all_genes <- get_all_genes_from_pathways(go_results, kegg_results)

# Calculate ANOVA for each gene
anova_results <- sapply(all_genes, function(gene) {
  if (gene %in% colnames(data)) {
    fit <- aov(as.formula(paste(gene, "~ phase")), data = data)
    summary(fit)[[1]][["Pr(>F)"]]  # Extract the p-value
  } else {
    NA  # Return NA for genes not present in the expression data
  }
})

# Identify genes with significant differences
sig_genes <- names(anova_results)[anova_results < 0.05]

# Plot these genes
for (gene in sig_genes) {
  plot_gene_expression(data[,c("phase", gene)])  # You need to implement plot_gene_expression
}
