Use remote Rmd files

czbiohub-sf · Mar 5, 2018 · 5c5b6a2 · 5c5b6a2
1 parent f057437
commit 5c5b6a2
Show file tree

Hide file tree

Showing 62 changed files with 10,028 additions and 1,125 deletions.
diff --git a/00_data_ingest/02_tissue_analysis_rmd/Aorta_facs.Rmd b/00_data_ingest/02_tissue_analysis_rmd/Aorta_facs.Rmd
@@ -150,3 +150,11 @@ write.csv(FetchData(tiss, c('plate.barcode','cell_ontology_class', 'free_annotat
 
 # Figures for Supplement
 
+
+Write the cell ontology and free annotations to CSV.
+
+```{r}
+filename = here('00_data_ingest', '03_tissue_annotation_csv', 
+                    paste0(tissue_of_interest, "_facs_annotation.csv"))
+write.csv(tiss@meta.data[,c('plate.barcode,'cell_ontology_class','cell_ontology_id', 'free.annotation')], file=filename)
+```
diff --git a/00_data_ingest/02_tissue_analysis_rmd/Aorta_facs.Rmd.backup b/00_data_ingest/02_tissue_analysis_rmd/Aorta_facs.Rmd.backup
@@ -0,0 +1,310 @@
+---
+title: "Aorta FACS Notebook"
+output: html_notebook
+---
+
+Specify the tissue of interest, run the boilerplate code which sets up the functions and environment, load the tissue object.
+
+```{r}
+tissue_of_interest = "Aorta"
+library(here)
+source(here("00_data_ingest", "02_tissue_analysis_rmd", "boilerplate.R"))
+load_tissue_facs(tissue_of_interest)
+```
+
+Visualize top genes in principal components
+
+```{r, echo=FALSE, fig.height=4, fig.width=8}
+PCHeatmap(object = tiss, pc.use = 1:3, cells.use = 500, do.balanced = TRUE, label.columns = FALSE, num.genes = 8)
+```
+
+Later on (in FindClusters and TSNE) you will pick a number of principal components to use. This has the effect of keeping the major directions of variation in the data and, ideally, supressing noise. There is no correct answer to the number to use, but a decent rule of thumb is to go until the plot plateaus.
+
+```{r}
+PCElbowPlot(object = tiss)
+```
+
+Choose the number of principal components to use.
+```{r}
+# Set number of principal components. 
+n.pcs = 10
+```
+
+
+The clustering is performed based on a nearest neighbors graph. Cells that have similar expression will be joined together. The Louvain algorithm looks for groups of cells with high modularity--more connections within the group than between groups. The resolution parameter determines the scale...higher resolution will give more clusters, lower resolution will give fewer.
+
+For the top-level clustering, aim to under-cluster instead of over-cluster. It will be easy to subset groups and further analyze them below.
+
+```{r}
+# Set resolution 
+res.used <- 0.5
+
+tiss <- FindClusters(object = tiss, reduction.type = "pca", dims.use = 1:n.pcs, 
+    resolution = res.used, print.output = 0, save.SNN = TRUE)
+```
+
+To visualize 
+```{r}
+# If cells are too spread out, you can raise the perplexity. If you have few cells, try a lower perplexity (but never less than 10).
+tiss <- RunTSNE(object = tiss, dims.use = 1:n.pcs, seed.use = 10, perplexity=30)
+```
+
+```{r}
+# note that you can set do.label=T to help label individual clusters
+TSNEPlot(object = tiss, do.label = T)
+```
+
+Check expression of genes of interset.
+
+```{r, echo=FALSE, fig.height=20, fig.width=8}
+genes_to_check = c('Vim','Krt5','Krt8','Ptprc','Epcam','H2-Ab1','H2-Aa','Cd34','Cdh5','Cd14','Vwf','Syk','Col1a2','Timp2','Cd74','Laptm5','Selplg','Pecam1','Fabp4','Sox18','Icam1','Tek','Vcam1','Vegfa','Adipor1','Car3','Alas2','Acta2','Dcn','Myocd','Tagln','Myh11','S100a4','Cd3e','Pdgfra','Kit','Gypa','Col6a3')
+#genes_to_check = c('Alb', 'Cyp2f2', 'Cyp2e1', 'Hamp')
+
+FeaturePlot(tiss, genes_to_check, pt.size = 1, nCol = 3)
+```
+
+Dotplots let you see the intensity of exppression and the fraction of cells expressing for each of your genes of interest.
+
+```{r, echo=FALSE, fig.height=4, fig.width=25}
+# To change the y-axis to show raw counts, add use.raw = T.
+DotPlot(tiss, genes_to_check, plot.legend = T)
+```
+
+How big are the clusters?
+```{r}
+table(tiss@ident)
+```
+
+Which markers identify a specific cluster?
+
+```{r}
+clust.markers <- FindMarkers(object = tiss, ident.1 = 0, only.pos = TRUE, min.pct = 0.25, thresh.use = 0.25)
+```
+
+```{r}
+print(x = head(x= clust.markers, n = 10))
+```
+
+You can also compute all markers for all clusters at once. This may take some time.
+```{r}
+tiss.markers <- FindAllMarkers(object = tiss, only.pos = TRUE, min.pct = 0.25, thresh.use = 0.25)
+```
+
+Display the top markers you computed above.
+```{r}
+tiss.markers %>% group_by(cluster) %>% top_n(5, avg_diff)
+```
+
+## Assigning cell type identity to clusters
+
+At a coarse level, we can use canonical markers to match the unbiased clustering to known cell types:
+
+0: heterogenous group of cells
+1: endothelial cells
+2: adipocytes
+3: endothelial cells
+4: fibroblasts
+5: hematopoetic 
+
+
+
+
+
+```{r}
+# stash current cluster IDs
+tiss <- StashIdent(object = tiss, save.name = "cluster.ids")
+
+# enumerate current cluster IDs and the labels for them
+cluster.ids <- c(0, 1, 2, 3, 4, 5)
+free_annotation <- c()
+cell_ontology_class <-c("heterogenous group of cells", "endothelial cell", "epicardial adipocytes", "endothelial cells", "fibroblasts", "hematopoetic cells")
+cell_ontology_id = c(NA, "CL:0000115", "CL:1000309", "CL:0000115", "CL:0000057", "CL:0000988")
+
+tiss@meta.data[,'free_annotation'] <- NA
+tiss@meta.data[,'cell_ontology_class'] <- plyr::mapvalues(x = tiss@meta.data$cluster.ids, from = cluster.ids, to = cell_ontology_class)
+tiss@meta.data[,'cell_ontology_id'] <- plyr::mapvalues(x = tiss@meta.data$cluster.ids, from = cluster.ids, to = cell_ontology_id)
+
+TSNEPlot(object = tiss, do.label = TRUE, pt.size = 0.5, group.by='cell_ontology_class')
+```
+
+
+## Checking for batch effects
+
+```{r}
+head(tiss@meta.data)
+```
+
+
+
+Color by metadata, like plate barcode, to check for batch effects.
+```{r}
+TSNEPlot(object = tiss, do.return = TRUE, group.by = "plate.barcode")
+```
+
+Print a table showing the count of cells in each identity category from each plate.
+
+```{r}
+table(as.character(tiss@ident), as.character(tiss@meta.data$plate.barcode))
+```
+
+
+# Subset and iterate
+
+We can repeat the above analysis on a subset of cells, defined using cluster IDs or some other metadata. This is a good way to drill down and find substructure.
+
+## First subset
+
+```{r}
+# Subset data based on cluster id
+subtiss <- SubsetData(object = tiss, ident.use = c(0), do.center = F, do.scale = F, cells.use = )
+
+# To subset data based on cell_ontology_class or other metadata, you can explicitly pass cell names
+
+# anno = 'thymocyte'
+# cells.to.use = tiss@cell.names[which(tiss@meta.data$cell_ontology_class == anno)]
+# subtiss <- SubsetData(object = tiss, cells.use = cells.to.use, do.center = F, do.scale = F)
+
+```
+
+```{r}
+subtiss <- NormalizeData(object = subtiss)
+subtiss <- ScaleData(object = subtiss, vars.to.regress = c("nReads", "percent.ribo","Rn45s"))
+```
+
+```{r}
+subtiss <- FindVariableGenes(object = subtiss, do.plot = TRUE, x.high.cutoff = Inf, y.cutoff = 0.8)
+subtiss <- RunPCA(object = subtiss, pcs.compute = 20)
+subtiss <- ProjectPCA(object = subtiss, do.print = FALSE)
+```
+
+
+<!-- Run Principal Component Analysis. -->
+<!-- ```{r} -->
+<!-- subtiss <- RunPCA(object = subtiss, do.print = FALSE) -->
+<!-- subtiss <- ProjectPCA(object = subtiss, do.print = FALSE) -->
+<!-- ``` -->
+
+```{r}
+# If this fails for your subset, it may be that cells.use is more cells than you have left! Try reducing it.
+PCHeatmap(object = subtiss, pc.use = 1:3, cells.use = 100, do.balanced = TRUE, label.columns = FALSE, num.genes = 12)
+```
+
+Later on (in FindClusters and TSNE) you will pick a number of principal components to use. This has the effect of keeping the major directions of variation in the data and, ideally, supressing noise. There is no correct answer to the number to use, but a decent rule of thumb is to go until the plot plateaus.
+
+```{r}
+PCElbowPlot(object = subtiss)
+```
+
+Choose the number of principal components to use.
+```{r}
+# Set number of principal components. 
+sub.n.pcs = 5
+```
+
+
+The clustering is performed based on a nearest neighbors graph. Cells that have similar expression will be joined together. The Louvain algorithm looks for groups of cells with high modularity--more connections within the group than between groups. The resolution parameter determines the scale...higher resolution will give more clusters, lower resolution will give fewer.
+
+```{r}
+# Set resolution 
+sub.res.used <- 1
+
+subtiss <- FindClusters(object = subtiss, reduction.type = "pca", dims.use = 1:sub.n.pcs, 
+    resolution = sub.res.used, print.output = 0, save.SNN = TRUE, force.recalc = TRUE)
+```
+
+To visualize 
+```{r}
+# If cells are too spread out, you can raise the perplexity. If you have few cells, try a lower perplexity (but never less than 10).
+subtiss <- RunTSNE(object = subtiss, dims.use = 1:sub.n.pcs, seed.use = 10, perplexity=20)
+```
+
+```{r}
+# note that you can set do.label=T to help label individual clusters
+TSNEPlot(object = subtiss, do.label = T)
+```
+
+```{r}
+subtiss.markers <- FindAllMarkers(object = subtiss, only.pos = TRUE, min.pct = 0.25, thresh.use = 0.25)
+```
+
+```{r}
+subtiss.markers %>% group_by(cluster) %>% top_n(6, avg_diff)
+```
+
+Check expression of genes of interset.
+```{r, fig.height=16, fig.width=8}
+genes_to_check = (subtiss.markers %>% group_by(cluster) %>% top_n(6, avg_diff))$gene
+# genes_to_check = c('Vim','Krt5','Krt8','Ptprc','Epcam','H2-D1','H2-Aa','H2-Ab1','Cd34','Itgam','Syk','Col1a2','Timp2','Cd74','Laptm5','Selplg','Fabp4','Icam1','Vcam1','Adipor1','Alas2','Cnn1','Acta2','Dcn','Myocd','Mmp2','Lmna','Eln','Col3a1','Col4a1','Col1a1','Pdgfra','Gypa','Col6a3','Pecam1','Pdgfrb','Psmb8','Myh10','Tpm4','Cald1','Rbp1', 'Cd36')
+
+FeaturePlot(subtiss, genes_to_check, pt.size = 1)
+```
+
+Dotplots let you see the intensity of exppression and the fraction of cells expressing for each of your genes of interest.
+
+```{r, echo=FALSE, fig.height=4, fig.width=25}
+# To change the y-axis to show raw counts, add use.raw = T.
+DotPlot(subtiss, genes_to_check, plot.legend = T)
+```
+
+How big are the clusters?
+```{r}
+table(subtiss@ident)
+```
+
+## Checking for batch effects
+
+Color by metadata, like plate barcode, to check for batch effects.
+```{r}
+TSNEPlot(object = subtiss, do.return = TRUE, group.by = "plate.barcode")
+```
+
+Print a table showing the count of cells in each identity category from each plate.
+
+```{r}
+table(as.character(subtiss@ident), as.character(subtiss@meta.data$plate.barcode))
+```
+
+
+
+### Assigning subcell_ontology_classs
+
+For the subsets, we produce subcell_ontology_classs. These will be written back as metadata in the original object, so we can see all subcell_ontology_classs together.
+
+If some of the clusters you find in the subset deserve additional cell_ontology_class, you can add that right here. Use NA for clusters for which no subcell_ontology_class is needed.
+
+```{r}
+subcluster.ids <- c(0, 1,2)
+subfree_annotation <- c()
+cell_ontology_class <-c(NA, "epicardial adipocyte", "smooth muscle cell")
+sub_cell_ontology_id = c(NA, "CL:1000309", "CL:0000192")
+
+subtiss@meta.data[,'free_annotation'] <- NA
+tiss@meta.data[,'cell_ontology_class'] <- plyr::mapvalues(x = subtiss@ident, from = subcluster.ids, to = subcell_ontology_class)
+subtiss@meta.data[,'cell_ontology_id'] <- plyr::mapvalues(x = subtiss@ident, from = subcluster.ids, to = sub_cell_ontology_id)
+
+tiss@meta.data[subtiss@cell.names,'cell_ontology_class'] <- as.character(subtiss@meta.data$cell_ontology_class)
+tiss@meta.data[subtiss@cell.names,'cell_ontology_id'] <- as.character(subtiss@meta.data$cell_ontology_id)
+
+TSNEPlot(object = tiss, do.label = TRUE, pt.size = 0.5, group.by='cell_ontology_class')
+```
+
+Save the tissue object with updated cell_ontology_classs
+```{r}
+filename = here('00_data_ingest', '04_tissue_robj_generated', 
+                     paste0("facs", tissue_of_interest, "_seurat_tiss.Robj"))
+print(filename)
+save(tiss, file=filename)
+```
+
+# Export the final metadata
+
+So that Biohub can easily combine all your cell_ontology_classs, please export them as a simple csv.
+
+```{r}
+filename = here('00_data_ingest', '03_tissue_cell_ontology_class_csv', 
+                     paste0(tissue_of_interest, "_cell_ontology_class.csv"))
+write.csv(tiss@meta.data[,c('plate.barcode','cell_ontology_class','cell_ontology_id')], file=filename)
+```
+
+
+
diff --git a/00_data_ingest/02_tissue_analysis_rmd/Bladder_droplet.Rmd b/00_data_ingest/02_tissue_analysis_rmd/Bladder_droplet.Rmd
@@ -412,7 +412,15 @@ head(tiss@meta.data)
 
 ```{r}
 filename = here('00_data_ingest', '03_tissue_annotation_csv', 
-                     paste0(tissue_of_interest, "_droplet_annotation.csv"))
-write.csv(FetchData(tiss, c('channel','cell_ontology_class','cell_ontology_id', 'free_annotation', 'tSNE_1', 'tSNE_2')), file=filename)
+                    paste0(tissue_of_interest, "_droplet+_annotation.csv"))
+write.csv(tiss@meta.data[,c('channel','cell_ontology_class','cell_ontology_id', 'free.annotation')], file=filename)
 ```
 
+
+Write the cell ontology and free annotations to CSV.
+
+```{r}
+filename = here('00_data_ingest', '03_tissue_annotation_csv', 
+                    paste0(tissue_of_interest, "_droplet_annotation.csv"))
+write.csv(tiss@meta.data[,c('channel,'cell_ontology_class','cell_ontology_id', 'free.annotation')], file=filename)
+```