0. Packages & options (≃ “Importing Packages”)

In [None]:
# Fix the random seed once so that stochastic steps are reproducible.
the.seed <- 42
set.seed(seed = the.seed)

In [None]:
## Packages 
library(Seurat)
library(SeuratDisk)   
library(Matrix)
library(dplyr)
library(ggplot2)
library(patchwork)


1. Import des données (≃ adata = sc.read_h5ad(...))

In [None]:
## 1.1 One-off conversion h5ad -> h5seurat (run ONCE) ----
# setwd("...")  # set your working directory if needed
# Convert("AD_PD_CTRL.h5ad", dest = "h5seurat", overwrite = TRUE)

## 1.2 Load the converted file into a Seurat object ----
h5seurat_file <- "AD_PD_CTRL.h5seurat"
sobj <- LoadH5Seurat(h5seurat_file)

# Quick look at the object and its main components
sobj
head(x = colnames(sobj))   # cell names
head(x = rownames(sobj))   # gene names
head(x = sobj@meta.data)   # counterpart of adata.obs




2. QC & métadonnées (≃ sc.pp.calculate_qc_metrics + plots)

En Python tu as :

sc.pp.calculate_qc_metrics(adata, inplace=True)

En R/Seurat, on utilise déjà nFeature_RNA, nCount_RNA et on ajoute le % mitochondrial :


In [None]:

## 2.1 Add per-cell QC metrics ----

# One metadata column per gene family; each value is the regular expression
# passed to PercentageFeatureSet() to select the genes of that family:
#   percent.mt   - typical human mitochondrial genes
#   percent.ribo - ribosomal genes (optional, as in the .Rmd)
#   percent.hb   - hemoglobin genes (optional, as in the .Rmd)
qc_patterns <- c(
  percent.mt   = "^MT-",
  percent.ribo = "^RPL|^RPS",
  percent.hb   = "^HB[AB]"
)

for (metric in names(qc_patterns)) {
  sobj[[metric]] <- PercentageFeatureSet(sobj, pattern = qc_patterns[[metric]])
}

head(sobj@meta.data)




2.2 Visualisation des QC metrics


In [None]:
# (≃ sc.pl.violin(adata, ['n_counts','n_genes']) + scatter)

## Violin plots of nFeature_RNA, nCount_RNA and percent.mt ----
qc_features <- c("nFeature_RNA", "nCount_RNA", "percent.mt")
VlnPlot(sobj, features = qc_features, ncol = 3, pt.size = 0.1)

## Scatter plots (counterpart of the QC scatter plots in scanpy) ----
# Both panels share nCount_RNA on the x axis and are combined side by side
# with the patchwork `+` operator.
scatter_genes <- FeatureScatter(
  sobj,
  feature1 = "nCount_RNA",
  feature2 = "nFeature_RNA"
)
scatter_mito <- FeatureScatter(
  sobj,
  feature1 = "nCount_RNA",
  feature2 = "percent.mt"
)
scatter_genes + scatter_mito




3. Filtrage des cellules (≃ “Filtering low quality cells / mt / ribo / hb”)


In [None]:

## 3.1 Filtering thresholds (inspired by the .Rmd and your notebook) ----
# All three are used as strict bounds by the filtering step.
maxMT    <- 15     # maximum mitochondrial percentage (example value)
minUMI   <- 1000   # minimum UMI count per cell; adjust as needed
minGenes <- 200    # minimum detected genes per cell, as in the instructor's material


In [None]:

## 3.2 Build the filtered object ----
# A cell is kept only when it passes all three thresholds (strict inequalities).
sobj_filtrd <- subset(
  sobj,
  subset = percent.mt < maxMT & nCount_RNA > minUMI & nFeature_RNA > minGenes
)

## Same violin plots as before, on the filtered cells ----
qc_features <- c("nFeature_RNA", "nCount_RNA", "percent.mt")
VlnPlot(sobj_filtrd, features = qc_features, ncol = 3, pt.size = 0.1)


In [None]:

# Optional: drop ribosomal and hemoglobin genes, as mentioned in your notebook.
# Genes whose name starts with RPL, RPS, HBA or HBB are removed; all others stay.
all_genes <- rownames(sobj_filtrd)
genes_to_remove <- all_genes[grepl("^RPL|^RPS|^HB[AB]", all_genes)]

genes_to_keep <- all_genes[!all_genes %in% genes_to_remove]
sobj_filtrd <- sobj_filtrd[genes_to_keep, ]


4. Normalisation (≃ sc.pp.normalize_total + sc.pp.log1p)




5. HVG – Highly Variable Genes (≃ sc.pp.highly_variable_genes)


In [None]:

# Normalisation (≃ sc.pp.normalize_total + sc.pp.log1p).
# Section 4 of this notebook announces this step but contains no code cell, so
# without it ScaleData/RunPCA and the differential-expression tests below would
# work on whatever the "data" slot held at load time.
# NOTE(review): assumes the "counts" slot holds raw counts - confirm for this
# h5ad export before relying on the normalised values.
sobj_filtrd <- NormalizeData(
  sobj_filtrd,
  normalization.method = "LogNormalize",
  scale.factor = 10000
)

# Select the 3000 most variable genes with the "vst" method.
sobj_filtrd <- FindVariableFeatures(
  sobj_filtrd,
  selection.method = "vst",
  nfeatures = 3000
)


In [None]:

# Names of the highly variable genes selected above, and how many there are
hvg <- VariableFeatures(object = sobj_filtrd)
length(x = hvg)


6. Scaling & PCA (≃ sc.pp.scale + sc.tl.pca)


In [None]:
# Number of principal components to compute; adjust to match your notebook.
n_pcs <- 50

# Scale only the highly variable genes, then run the PCA on those same genes.
sobj_filtrd <- ScaleData(sobj_filtrd, features = hvg)

sobj_filtrd <- RunPCA(
  sobj_filtrd,
  features = hvg,
  npcs = n_pcs
)

# Elbow plot to choose how many PCs to keep. ndims is set explicitly because
# the default shows only the first 20 PCs, which is fewer than the number of
# PCs computed here and used by the downstream steps.
ElbowPlot(sobj_filtrd, ndims = n_pcs)


7. Voisins, clustering, UMAP


In [None]:
# 7.1 Nearest-neighbour graph
# Built on the first 30 dimensions; adjust to the number of PCs you retained.
neighbor_dims <- 1:30
sobj_filtrd <- FindNeighbors(sobj_filtrd, dims = neighbor_dims)


In [None]:

# 7.2 Clustering: Seurat's FindClusters as the counterpart of the Leiden step
# NOTE(review): the clustering algorithm is left at FindClusters' default -
# confirm it is the one you want to compare with the Python Leiden run.
cluster_resolution <- 0.5  # set to the resolution you tested in Python
sobj_filtrd <- FindClusters(sobj_filtrd, resolution = cluster_resolution)

# Number of cells per cluster
table(sobj_filtrd$seurat_clusters)


In [None]:
# 7.3 UMAP embedding computed on dimensions 1 to 30
umap_dims <- 1:30
sobj_filtrd <- RunUMAP(sobj_filtrd, dims = umap_dims)


In [None]:

# UMAP coloured by cluster (with cluster labels drawn on the plot)
DimPlot(
  sobj_filtrd,
  group.by = "seurat_clusters",
  reduction = "umap",
  label = TRUE
)

# UMAP coloured by each metadata column of interest
DimPlot(sobj_filtrd, group.by = "disease", reduction = "umap")
DimPlot(sobj_filtrd, group.by = "genetic_ancestry", reduction = "umap")
DimPlot(sobj_filtrd, group.by = "subtype", reduction = "umap")



8. Annotation manuelle (≃ section “Manual Annotation” + marker plots)


In [None]:
## Example marker genes (adapt them to your dataset) ----
marker_genes <- c("CD3D", "CD4", "CD8A", "P2RY12", "GFAP")

# UMAP coloured by expression, from light grey to red
expression_palette <- c("lightgrey", "red")
FeaturePlot(sobj_filtrd, features = marker_genes, cols = expression_palette)

# Violin plots per cluster; pt.size = 0 hides the individual points
VlnPlot(
  sobj_filtrd,
  features = marker_genes,
  group.by = "seurat_clusters",
  pt.size = 0
)

# Next, you can rename your clusters (as in the instructor's .Rmd):

# Mapping from cluster id (names) to manual cell-type label (values).
# Add one entry per remaining cluster.
new.cluster.ids <- c(
  "0" = "CD8_T",
  "1" = "CD4_T",
  "2" = "Microglia",
  "3" = "Astrocytes"
)

# Translate the cluster ids into labels and store them as a new metadata column.
cluster_ids <- names(new.cluster.ids)
sobj_filtrd$celltype_manual <- plyr::mapvalues(
  sobj_filtrd$seurat_clusters,
  from = cluster_ids,
  to = new.cluster.ids
)

DimPlot(
  sobj_filtrd,
  group.by = "celltype_manual",
  reduction = "umap",
  label = TRUE
)




9. Annotation automatique (≃ CellTypist)


In [None]:

# In Python you use celltypist.
# In R, CellTypist is not directly available, but you can use e.g. SingleR or scType.

# To keep things simple (and since this is not in the instructor's .Rmd), here is only a very short skeleton:

# Example *idea* with SingleR (if you want to go further):
# library(SingleR)
# library(celldex)
# ref <- HumanPrimaryCellAtlasData()
# sce <- as.SingleCellExperiment(sobj_filtrd)
# pred <- SingleR(test = sce, ref = ref, labels = ref$label.main)
# sobj_filtrd$celltype_auto <- pred$labels
# DimPlot(sobj_filtrd, group.by = "celltype_auto", label = TRUE)

# You can also stop at the manual annotation.



10. Differential Expression (≃ sc.tl.rank_genes_groups)


In [None]:

## 10.1 Use the "disease" metadata column as the active identity ----
Idents(sobj_filtrd) <- "disease"

## 10.2 Compare one group against the reference group ----
case_label    <- "dementia || Alzheimer disease"  # must match the exact label in your data
control_label <- "normal"

# Wilcoxon test between the two groups, with thresholds on log fold change
# (logfc.threshold) and on detection rate (min.pct).
markers_AD_vs_ctrl <- FindMarkers(
  sobj_filtrd,
  ident.1 = case_label,
  ident.2 = control_label,
  test.use = "wilcox",
  min.pct = 0.2,
  logfc.threshold = 0.25
)


In [None]:

# First rows of the case-vs-control marker table
head(x = markers_AD_vs_ctrl)


In [None]:

# Switch the active identity back to the clusters, then look for the positive
# markers of every cluster.
Idents(sobj_filtrd) <- "seurat_clusters"

markers_all <- FindAllMarkers(
  sobj_filtrd,
  logfc.threshold = 0.25,
  min.pct = 0.25,
  only.pos = TRUE
)

head(x = markers_all)



11. Pseudo-bulk (≃ adata.to_df().groupby(adata.obs["donor_id"]).sum())

In [None]:


## 11.1 Retrieve the count matrix ----
counts <- GetAssayData(sobj_filtrd, slot = "counts")

## 11.2 Check that the metadata has a donor_id column ----
head(sobj_filtrd$donor_id)

## 11.3 Pseudo-bulk aggregation per donor ----
donors <- sobj_filtrd$donor_id

# Column indices of `counts` grouped by donor. drop = TRUE discards factor
# levels that no longer have any cell, so they do not turn into all-zero
# pseudo-bulk columns.
cells_by_donor <- split(seq_len(ncol(counts)), donors, drop = TRUE)

# Sum the counts of each donor's cells, gene by gene. vapply() (rather than
# sapply()) fixes the shape of each result to one value per row of `counts`,
# so the output is a matrix whatever the number of donors.
pb_mat <- vapply(
  X = cells_by_donor,
  FUN = function(cells) Matrix::rowSums(counts[, cells, drop = FALSE]),
  FUN.VALUE = numeric(nrow(counts))
)

pb_mat <- as.matrix(pb_mat)  # genes x donors

dim(pb_mat)
colnames(pb_mat)  # donors

