In [1]:
library(reticulate)
library(dplyr)

# Point reticulate at the "sc_rna_env_python" conda environment.
# required = TRUE asks reticulate to fail instead of silently falling back to
# another Python when that environment cannot be used.
use_condaenv("sc_rna_env_python", required = TRUE)

# --- Functions to read h5 files ---

# Handles to the Python modules. delay_load = TRUE postpones the real Python
# import until the handle is first accessed. `anndata` is used by load_h5ad();
# `scanpy` is not referenced in the cells visible here.
anndata <- import("anndata", delay_load = TRUE)
scanpy <- import("scanpy", delay_load = TRUE)



Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union




In [4]:
#' Load an AnnData `.h5ad` file into R-friendly pieces
#'
#' Reads the file through the Python `anndata` module and assembles a per-cell
#' data frame with the UMAP embedding and selected `obs` annotations.
#'
#' @param file_path Path to the file, as a single string. A leading `~` is
#'   expanded before the path is handed to Python.
#' @return A named list with:
#'   \describe{
#'     \item{df}{Data frame with `UMAP_1`, `UMAP_2`, `cluster`, `cell_type`,
#'       `condition`, `time`, `doublet`, `index`, `MAGIC_1`, `MAGIC_2`.
#'       `doublet` and the MAGIC columns are all-`NA` when the file does not
#'       provide them.}
#'     \item{expr}{`ad_obj$X` as returned by reticulate.}
#'     \item{genes}{`ad_obj$var_names`.}
#'     \item{ad}{The AnnData object itself.}
#'     \item{seacell_df}{Data frame built from `uns[["SEACells_summary"]]`,
#'       or `NULL` when that entry is absent.}
#'   }
load_h5ad <- function(file_path) {
  stopifnot(is.character(file_path), length(file_path) == 1L)
  file_path <- path.expand(file_path)
  if (!file.exists(file_path)) {
    stop("h5ad file not found: ", file_path, call. = FALSE)
  }

  # Key membership that works for both converted R lists and live Python
  # mappings. names() on a non-dict Python object reports its attributes, not
  # its mapping keys, so Python containers are asked directly.
  has_key <- function(container, key) {
    if (inherits(container, "python.builtin.object")) {
      isTRUE(container$`__contains__`(key))
    } else {
      key %in% names(container)
    }
  }

  ad_obj <- anndata$read_h5ad(file_path)
  obsm <- ad_obj$obsm

  # UMAP coordinates
  umap <- obsm["X_umap"]
  df <- as.data.frame(umap)
  colnames(df) <- c("UMAP_1", "UMAP_2")
  n_cells <- nrow(df)

  # Fetch obs once: every `ad_obj$obs` access converts the whole table again.
  obs <- ad_obj$obs
  required_cols <- c("cluster_feature", "celltype", "Condition", "Time")
  missing_cols <- setdiff(required_cols, colnames(obs))
  if (length(missing_cols) > 0) {
    stop(
      "Missing required obs column(s): ",
      paste(missing_cols, collapse = ", "),
      call. = FALSE
    )
  }

  # Adapted columns from obs
  df$cluster <- as.factor(obs[["cluster_feature"]])   # previously pheno_leiden
  df$cell_type <- as.factor(obs[["celltype"]])        # previously majority_voting
  df$condition <- as.factor(obs[["Condition"]])       # additional: Condition
  df$time <- as.factor(obs[["Time"]])                 # additional: Time

  # Doublet info is optional: fill with NA instead of failing when absent.
  df$doublet <- if ("doublet" %in% colnames(obs)) {
    as.factor(obs[["doublet"]])
  } else {
    factor(rep(NA, n_cells))
  }

  # NOTE(review): these are the data frame's own row names (sequential in the
  # printed output), not cell barcodes -- confirm this is what callers expect.
  df$index <- rownames(df)

  # MAGIC UMAP (if present)
  if (has_key(obsm, "X_magic_umap")) {
    umap_magic <- obsm["X_magic_umap"]
    df$MAGIC_1 <- umap_magic[, 1]
    df$MAGIC_2 <- umap_magic[, 2]
  } else {
    df$MAGIC_1 <- rep(NA_real_, n_cells)
    df$MAGIC_2 <- rep(NA_real_, n_cells)
  }

  # SEACell summary (if present)
  seacell_df <- NULL
  uns <- ad_obj$uns
  if (has_key(uns, "SEACells_summary")) {
    seacell_ad <- uns[["SEACells_summary"]]
    seacell_df <- as.data.frame(seacell_ad$obsm["X_umap"])
    colnames(seacell_df) <- c("UMAP_1", "UMAP_2")
    seacell_df$SEACell <- rownames(seacell_df)
    seacell_df$cell_type <- seacell_ad$obs["cluster_feature"][[1]]
  }

  # Genes (index) and expression
  genes <- ad_obj$var_names
  expr <- ad_obj$X

  list(df = df, expr = expr, genes = genes, ad = ad_obj, seacell_df = seacell_df)
}


In [None]:
# Location of the annotated dataset to load.
h5_file <- "/home/genos/gmunoz/SCRATCH/Projects/scRNA_Project/relaxed_epdsc_annotated_data.h5"
# Read it into the list returned by load_h5ad(). A stray trailing `b` after the
# call made this line a syntax error and has been removed.
h5_data <- load_h5ad(h5_file)

In [6]:
h5_data

$df
            UMAP_1       UMAP_2 cluster              cell_type condition time
1     3.5484893322 -3.564047813       0            DB_advanced      PIMQ  D30
2     3.5399062634 -2.660188437       0            DB_advanced      PIMQ   Y1
3     3.7690057755 -3.677712202       0            DB_advanced   Control   Y1
4     3.4253854752 -3.236126184       0            DB_advanced      PIMQ  D30
5    10.9052362442  3.693690300       0       UB_proliferation   Control  D30
6     9.8347120285  5.029740334       0       UB_proliferation      PIMQ  D30
7    -1.4239920378 -1.343407989       0             UB_classic      PIMQ   Y1
8    11.1183309555  4.139722824       0             UB_classic   Control  D30
9    10.2104091644  3.090096235       0       UB_proliferation      PIMQ  D30
10    9.6037645340  5.135641575       0       UB_proliferation   Control  D30
11    9.5326890945  4.184043884       0       UB_proliferation      PIMQ   Y1
12   10.5058841705  4.065787315       0       UB_proliferati