In [1]:
# Load necessary libraries.
library(dplyr)
library(tidyr)
library(readxl)
library(clusterProfiler)
library(org.Mm.eg.db)


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union




clusterProfiler v4.16.0 Learn more at https://yulab-smu.top/contribution-knowledge-mining/

Please cite:

G Yu. Thirteen years of clusterProfiler. The Innovation. 2024,
5(6):100722


Attaching package: ‘clusterProfiler’


The following object is masked from ‘package:stats’:

    filter


Loading required package: AnnotationDbi

Loading required package: stats4

Loading required package: BiocGenerics

Loading required package: generics


Attaching package: ‘generics’


The following object is masked from ‘package:dplyr’:

    explain


The following objects are masked from ‘package:base’:

    as.difftime, as.factor, as.ordered, intersect, is.element, setdiff,
    setequal, union



Attaching package: ‘BiocGenerics’


The following object is masked from ‘package:dplyr’:

    combine


Th

In [2]:
# Select a block of columns from `df` by position.
#
# Returns the selected columns as a plain data.frame, or an empty
# data.frame() (after printing a warning line) when `col_indices` is empty,
# contains NA, contains a value below 1, or reaches past the last column.
.select_column_block <- function(df, col_indices, df_name_desc) {
  actual_cols_in_df <- ncol(df)

  # Validate before calling min()/max(): both warn on empty input, and an NA
  # index would make the scalar condition below error.
  indices_usable <- length(col_indices) > 0 &&
    !anyNA(col_indices) &&
    all(col_indices >= 1) &&
    max(col_indices) <= actual_cols_in_df

  if (!indices_usable) {
    cat("Warning: Cannot create", df_name_desc,
        "- insufficient columns or invalid indices.\n\tRequested indices:",
        paste(col_indices, collapse = ", "),
        "\n\tAvailable cols:", actual_cols_in_df, "\n")
    return(data.frame())
  }

  cat("For", df_name_desc, ": Attempting to select columns",
      min(col_indices), "to", max(col_indices),
      "\n\tMax needed:", max(col_indices), "\n\tActual cols in df:",
      actual_cols_in_df, "\n")

  # Positional selection: the sheet is read with .name_repair = "minimal", so
  # column names are left unrepaired and cannot be relied on to be unique.
  selected_df <- dplyr::select(df, dplyr::all_of(col_indices))
  cat(df_name_desc, "created with columns:",
      paste(names(selected_df), collapse = ", "), "\n")
  as.data.frame(selected_df)
}

# Drop rows in which every cell is NA or whitespace-only.
#
# A data frame with zero rows or zero columns is returned unchanged.
.drop_blank_rows <- function(df, df_name) {
  if (nrow(df) == 0 || ncol(df) == 0) {
    cat("Skipping blank row filtering for:", df_name,
        "(0 rows or 0 columns).\n")
    return(df)
  }

  is_blank_cell <- function(column) {
    is.na(column) | trimws(as.character(column)) == ""
  }
  # Column-wise masks combined with `&` give one logical per row without the
  # data-frame-to-matrix coercion that apply() performs.
  row_is_blank <- Reduce(`&`, lapply(df, is_blank_cell))
  rows_to_keep <- !row_is_blank

  cat("Filtering blank rows for:", df_name, "\n\tOriginal rows:",
      nrow(df), "\n\tRows after filtering:", sum(rows_to_keep), "\n")
  df[rows_to_keep, , drop = FALSE]
}

#' Read the Excel dataset and split it into three per-time-point data frames.
#'
#' The sheet is read with every column as text, split by column position into
#' a 2-month block (columns 1-8), a 4-month block (columns 10-17) and a
#' 6-month block (columns 19-26), and rows that are entirely NA/blank are
#' removed from each block. Columns 9 and 18 are not selected (presumably
#' spacer columns in the sheet -- confirm against the workbook). Progress is
#' reported on stdout via cat().
#'
#' @param dataset_file_path Path to the .xlsx file.
#' @param dataset_sheet Name of the sheet to read.
#' @return A named list with elements `df_2month`, `df_4month` and
#'   `df_6month`, each a data.frame. A block whose columns are not available
#'   in the sheet comes back as an empty data.frame().
#' @details Any error (including a missing file) is reported on stdout and
#'   then re-raised with the original message appended.
parse_data <- function(dataset_file_path = "./data/dataset.xlsx",
                       dataset_sheet = "Sheet1") {
  # Column positions of each time point's block within the sheet.
  column_blocks <- list(
    df_2month = 1:8,    # A-H
    df_4month = 10:17,  # J-Q
    df_6month = 19:26   # S-Z
  )
  block_labels <- c(
    df_2month = "2-Month",
    df_4month = "4-Month",
    df_6month = "6-Month"
  )
  block_keys <- names(column_blocks)

  tryCatch({
    # --- Step 1: Load in all the data from the Excel file ---
    cat("Attempting to read Excel file:", dataset_file_path,
        ", Sheet:", dataset_sheet, "\n")

    if (!file.exists(dataset_file_path)) {
      stop(paste0("File not found at the specified path: ",
                  dataset_file_path,
                  ". Current R working directory is: ", getwd()))
    }

    full_data <- readxl::read_excel(dataset_file_path,
                                    sheet = dataset_sheet,
                                    col_types = "text",
                                    .name_repair = "minimal")
    cat("Excel file read successfully (all columns initially as text).\n")
    cat("Number of columns in full_data after reading:",
        ncol(full_data), "\n")

    # --- Step 2: Split the data immediately into three dataframes ---
    cat("--- Splitting data into 2-month, 4-month,",
        "and 6-month raw dataframes ---\n")
    raw_blocks <- lapply(block_keys, function(key) {
      .select_column_block(full_data, column_blocks[[key]],
                           paste("Raw", block_labels[[key]], "Dataframe"))
    })
    names(raw_blocks) <- block_keys

    # --- Step 3: Filter out rows with entries "" or NA for each dataframe ---
    cat("--- Filtering completely blank rows from each dataframe ---\n")
    final_blocks <- lapply(block_keys, function(key) {
      .drop_blank_rows(raw_blocks[[key]], paste(block_labels[[key]], "Data"))
    })
    names(final_blocks) <- block_keys

    # --- Step 4: Print out final dimensions ---
    cat("--- Final Data Dimensions (after all cleaning and filtering) ---\n")
    for (key in block_keys) {
      cat("   ", block_labels[[key]], " Data (", key, "): ",
          paste(dim(final_blocks[[key]]), collapse = " x "), "\n",
          sep = "")
    }

    cat("--- Data parsing and cleaning complete.",
        "Returning dataframes as a list. ---\n")
    final_blocks
  }, error = function(e) {
    cat("An error occurred within the parse_data function.\n")
    cat("Specific error message from R (conditionMessage(e)):\n",
        conditionMessage(e), "\n")
    cat("Full error object (e):\n")
    print(e)
    stop(paste("Stopping due to error in parse_data function.",
               "Original error was:", conditionMessage(e)),
         call. = FALSE)
  })
}

In [None]:
# Run the parser, expose each time point as its own top-level data frame,
# and preview the first rows of each one.
processed_data <- parse_data()
if (is.null(processed_data)) {
  # Defensive branch for a parser that yields nothing.
  cat("parse_data() did not return any parsed data, \n",
      "      check for errors above.\n",
      sep = "")
} else {
  data_2_month <- processed_data[["df_2month"]]
  data_4_month <- processed_data[["df_4month"]]
  data_6_month <- processed_data[["df_6month"]]

  # local() keeps the loop helpers out of the global environment; the three
  # data frames above are looked up from the enclosing (top-level) scope.
  local({
    previews <- list(
      "\n--- Head of Unpacked 2-Month Data ---\n" = data_2_month,
      "\n--- Head of Unpacked 4-Month Data ---\n" = data_4_month,
      "\n--- Head of Unpacked 6-Month Data ---\n" = data_6_month
    )
    for (banner in names(previews)) {
      cat(banner)
      print(head(previews[[banner]]))
    }
  })
}

Attempting to read Excel file: ./data/dataset.xlsx , 
      Sheet: Sheet1 
Excel file read successfully (all columns initially as text).
Number of columns in full_data after reading: 27 
--- Splitting data into 2-month, 4-month, 
        and 6-month raw dataframes ---
For Raw 2-Month Dataframe : Attempting to select columns 1 to 8 
	Max needed: 8 
	Actual cols in df: 27 
Raw 2-Month Dataframe created with columns: baseMean, log2FoldChange, FC, lfcSE, pvalue, padj, Gene, threshold 
For Raw 4-Month Dataframe : Attempting to select columns 10 to 17 
	Max needed: 17 
	Actual cols in df: 27 
Raw 4-Month Dataframe created with columns: baseMean, log2FoldChange, FC, lfcSE, pvalue, padj, Gene, threshold 
For Raw 6-Month Dataframe : Attempting to select columns 19 to 26 
	Max needed: 26 
	Actual cols in df: 27 
Raw 6-Month Dataframe created with columns: baseMean, log2FoldChange, FC, lfcSE, pvalue, padj, Gene, threshold 
--- Filtering completely blank rows from each dataframe ---
Filtering blan