# PART 3
## Normalize data

Scran normalization: since the cell types show intrinsic differences in library size, we run the normalization on each cell type separately.

- **INPUT:**
    - ```counts_cf_ctf_gf_sf.RData``` 
    - ```anno_cells_cf_ctf_gf_sf.RData```
    - ```anno_samples_cf_ctf_gf_sf.RData```
    - ```anno_genes_cf_ctf_gf_sf.RData```
    - ```cell_relabelling.csv``` file containing unified cell type annotations. Stored in additional_input_files sub-directory.
    
    
- **OUTPUT:**

    -  ```counts_norm.mtx``` 
    -  ```anno_cells_norm.txt``` 
    -  ```anno_samples_norm.txt``` 
    -  ```anno_genes_norm.txt``` 

### load data

In [1]:
library(ggplot2)
library(SingleCellExperiment)
library(scran)
library(scater)
library(Matrix)

Loading required package: SummarizedExperiment

Loading required package: MatrixGenerics

Loading required package: matrixStats


Attaching package: ‘MatrixGenerics’


The following objects are masked from ‘package:matrixStats’:

    colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
    colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
    colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
    colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
    colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
    colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
    colWeightedMeans, colWeightedMedians, colWeightedSds,
    colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
    rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
    rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
    rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
    rowOrderStats, rowProds, rowQuantiles, rowRanges

In [2]:
# if (!require("BiocManager", quietly = TRUE))
#     install.packages("BiocManager")

# BiocManager::install(c("SingleCellExperiment","scran","scater"))

In [3]:
# Relative locations used throughout this notebook. Every path ends with "/"
# so that file names can be appended directly with paste0().
# - additional input files (holds cell_relabelling.csv)
path_to_additional_files <- "../../../data/data_preprocessing/vanGalen_Hourigan/additional_input_files/"
# - filtered inputs (*.RData) read by the load cells
path_in <- "../../../results/data_preprocessing/vanGalen_Hourigan/filtered/"
# - destination of the normalized exports
path_out <- "../../../results/data_preprocessing/vanGalen_Hourigan/normalized/"

In [4]:
# Read the cell relabelling table (semicolon-separated CSV) from the
# additional input files directory.
print("load cell_relabelling")
file <- "cell_relabelling.csv"
cell_relabelling <- read.csv(
    file = paste0(path_to_additional_files, file),
    sep = ";"
)
# use the original cell type label as row name so rows can be addressed by it
rownames(cell_relabelling) <- cell_relabelling[["cell_type_original"]]
# show the structure of the table
print(str(cell_relabelling))

[1] "load cell_relabelling"
'data.frame':	40 obs. of  9 variables:
 $ cell_type_original    : chr  "B" "CD10+ B cells" "CD20+ B cells" "ProB" ...
 $ cell_type             : chr  "B" "B" "B" "B" ...
 $ source                : chr  "van Galen et al., 2019" "Oetjen et al., 2018" "Oetjen et al., 2018" "van Galen et al., 2019" ...
 $ cell_type_color       : chr  "green" "green" "green" "green" ...
 $ cell_type_color_hex   : chr  "#4FC384" "#4FC384" "#4FC384" "#4FC384" ...
 $ cell_type_shape       : int  12 12 12 12 13 13 13 13 13 13 ...
 $ cell_subtype          : chr  "B" "B" "B" "B" ...
 $ cell_subtype_color    : chr  "green" "green" "green" "green" ...
 $ cell_subtype_color_hex: chr  "#4FC384" "#4FC384" "#4FC384" "#4FC384" ...
NULL


In [5]:
# Load the filtered counts; the .RData file is expected to restore an object
# named counts_cf_ctf_gf_sf into the workspace.
print("load counts")
load(file = paste0(path_in, "counts_cf_ctf_gf_sf.RData"))
# report its dimensions
print("dim(counts_cf_ctf_gf_sf)")
print(dim(counts_cf_ctf_gf_sf))

[1] "load counts"


In [None]:
# Load the per-cell annotation; the .RData file is expected to restore an
# object named anno_cells_cf_ctf_gf_sf into the workspace.
print("load cell annotation")
load(file = paste0(path_in, "anno_cells_cf_ctf_gf_sf.RData"))
# show its structure
print("str(anno_cells_cf_ctf_gf_sf)")
print(str(anno_cells_cf_ctf_gf_sf))

In [None]:
# Load the per-sample annotation; the .RData file is expected to restore an
# object named anno_samples_cf_ctf_gf_sf into the workspace.
print("load sample annotation")
load(file = paste0(path_in, "anno_samples_cf_ctf_gf_sf.RData"))
# show its structure
print("str(anno_samples_cf_ctf_gf_sf)")
print(str(anno_samples_cf_ctf_gf_sf))

In [None]:
# Load the per-gene annotation; the .RData file is expected to restore an
# object named anno_genes_cf_ctf_gf_sf into the workspace.
print("load gene annotation")
load(file = paste0(path_in, "anno_genes_cf_ctf_gf_sf.RData"))
# show its structure
print("str(anno_genes_cf_ctf_gf_sf)")
print(str(anno_genes_cf_ctf_gf_sf))

In [None]:
# color scheme for celltypes: a named vector mapping each unified cell type to
# its hex colour, taken from the first row in which that cell type appears
color_celltype <- with(
    cell_relabelling[!duplicated(cell_relabelling$cell_type), ],
    setNames(cell_type_color_hex, cell_type)
)

### process data 

In [None]:
# Start from a copy of the filtered counts; the normalization loop overwrites
# its columns cell type by cell type.
counts_norm <- counts_cf_ctf_gf_sf

In [None]:
# Normalize each cell type separately with scran.
#
# For every distinct value of anno_cells_cf_ctf_gf_sf$cell_type the loop
#   1. wraps the counts of those cells in a SingleCellExperiment,
#   2. estimates per-cell size factors with computeSumFactors(),
#   3. stores them in anno_cells_cf_ctf_gf_sf$sizeFactor,
#   4. writes the size-factor-scaled (not log-transformed) counts into the
#      matching columns of counts_norm.
for (cluster in unique(anno_cells_cf_ctf_gf_sf$cell_type)) {
    print(cluster)

    # logical mask selecting the cells of the current cell type
    idx_cluster <- anno_cells_cf_ctf_gf_sf$cell_type == cluster

    # subset anno_cells_cf_ctf_gf_sf
    anno_cells_sub <- anno_cells_cf_ctf_gf_sf[idx_cluster, ]

    # create SingleCellExperiment object
    # drop = FALSE keeps the subset two-dimensional (and keeps its column
    # names) even when a cell type consists of a single cell
    print("create SingleCellExperiment object")
    original_sce <- SingleCellExperiment(
        assays = list(
            counts = as.matrix(counts_cf_ctf_gf_sf[, idx_cluster, drop = FALSE])
        ),
        colData = anno_cells_sub
    )

    # compute size factors
    # no `clusters` argument is passed: each call only sees one cell type
    print("comupte size factors")
    original_sce <- computeSumFactors(original_sce)

    anno_cells_cf_ctf_gf_sf$sizeFactor[idx_cluster] <- sizeFactors(original_sce)

    # normalize; log = FALSE stores the scaled counts in the "normcounts" assay
    print("scran normalization")
    original_sce <- logNormCounts(original_sce, log = FALSE)

    # read the assay through the public accessor rather than through the
    # internal S4 slot layout of the object
    counts_norm[, idx_cluster] <- normcounts(original_sce)
}


### visualize 

In [None]:
# Visual check of the normalization: for the raw ("counts") and the normalized
# ("normcounts") matrix, build per-cell_type_ID pseudo bulks (row means over
# the cells sharing a cell_type_ID) and plot log10 library sizes, once for all
# genes and once for ligand/receptor genes only.
print("construct pseudo bulks for cell types")
for (counts_type in c("counts", "normcounts")) {

  # pick the matrix to summarise
  if (counts_type == "counts") {
    counts <- counts_cf_ctf_gf_sf
  } else {
    counts <- counts_norm
  }

  # one pseudo-bulk column per distinct cell_type_ID, initialised with NA
  cell_type_IDs <- unique(anno_cells_cf_ctf_gf_sf$cell_type_ID)
  counts_pseudobulkCellType <- as.data.frame(
    matrix(nrow = nrow(counts), ncol = length(cell_type_IDs))
  )

  for (i in seq_along(cell_type_IDs)) {
    cell_type_ID <- cell_type_IDs[i]
    idx_cell_type_ID <- anno_cells_cf_ctf_gf_sf$cell_type_ID == cell_type_ID
    n_cells <- sum(idx_cell_type_ID)

    if (n_cells == 0) {
      counts_pseudobulkCellType[, i] <- 0
    } else if (n_cells == 1) {
      # a single cell: its profile is the pseudo bulk
      counts_pseudobulkCellType[, i] <- counts[, idx_cell_type_ID]
    } else {
      counts_pseudobulkCellType[, i] <- rowMeans(counts[, idx_cell_type_ID])
    }
  }
  colnames(counts_pseudobulkCellType) <- cell_type_IDs
  rownames(counts_pseudobulkCellType) <- rownames(counts)
  print("str(counts_pseudobulkCellType)")
  print(str(counts_pseudobulkCellType))

  # visualize expression in pseudo sorted bulk
  print("visualize expression in pseudo-bulk cell type")
  for (genes_type in c("all genes", "ligands and receptors")) {

    # gene mask: every gene, or only genes flagged as ligand or receptor
    if (genes_type == "all genes") {
      idx_genes <- rep(TRUE, nrow(anno_genes_cf_ctf_gf_sf))
    } else {
      idx_genes <- anno_genes_cf_ctf_gf_sf$isLigand |
        anno_genes_cf_ctf_gf_sf$isReceptor
    }

    # NOTE(review): the two sub() calls assume cell_type_ID looks like
    # "<cell_type>_<sample_ID>" -- cell_type is the text before the first "_",
    # sample_ID the text after the last "_"; confirm against the annotation.
    pseudobulk_IDs <- colnames(counts_pseudobulkCellType)
    df <- data.frame(
      log10_lib_size = log10(
        colSums(counts_pseudobulkCellType[idx_genes, ]) + 1
      ),
      sample_ID = sub(".*_", "", pseudobulk_IDs),
      cell_type = sub("_.*", "", pseudobulk_IDs)
    )

    print(
      ggplot(
        data = df,
        aes(x = sample_ID, y = log10_lib_size, color = cell_type)
      ) +
        geom_jitter(height = NULL, width = 0, alpha = 0.75) +
        ggtitle(paste(genes_type, ":", counts_type)) +
        coord_flip() +
        ylim(c(0, 4.5)) +
        scale_color_manual(values = color_celltype) +
        theme_bw()
    )

    # visualize expression after gene filter (per-cell library sizes)
    print("visualize expression after gene filter")
    df <- data.frame(
      log10_lib_size = log10(colSums(counts[idx_genes, ]) + 1),
      sample_ID = anno_cells_cf_ctf_gf_sf$sample_ID
    )
    print(str(df))

    print(
      ggplot(
        data = df,
        aes(x = sample_ID, y = log10_lib_size)
      ) +
        geom_jitter(height = NULL, size = 0.1, alpha = 0.2) +
        geom_violin() +
        ggtitle(paste("log10 expression", counts_type, genes_type)) +
        coord_flip() +
        ylim(c(0, 6)) +
        theme_bw()
    )
    rm(df)

  }

}


In [None]:
# Copy the annotation tables to the "_norm" names used by the export cells.
# At this point anno_cells_cf_ctf_gf_sf also carries the sizeFactor column
# written by the normalization loop.
anno_cells_norm <- anno_cells_cf_ctf_gf_sf
anno_genes_norm <- anno_genes_cf_ctf_gf_sf
anno_samples_norm <- anno_samples_cf_ctf_gf_sf

### export

In [14]:
# export ####
print("export")

# make sure the output directory exists before the first file is written;
# without it the export fails when path_out has not been created yet
if (!dir.exists(path_out)) {
    dir.create(path_out, recursive = TRUE)
}

# counts_norm is converted to a base matrix and then to a sparse Matrix object
# before being written in MatrixMarket format
print("export counts_norm.mtx")
writeMM(
    obj = Matrix(as.matrix(counts_norm), sparse = TRUE),
    file = paste0(path_out, "counts_norm.mtx")
)

[1] "export"
[1] "export counts_norm.mtx"


NULL

In [15]:
# export anno_cells_norm as a tab-separated text file without row names
print("export anno_cells_norm.txt")
write.table(
    x = anno_cells_norm,
    file = paste0(path_out, "anno_cells_norm.txt"),
    sep = "\t",
    row.names = FALSE
)

[1] "export anno_cells_norm.txt"


In [16]:
# export anno_genes_norm as a tab-separated text file without row names
print("export anno_genes_norm.txt")
write.table(
    x = anno_genes_norm,
    file = paste0(path_out, "anno_genes_norm.txt"),
    sep = "\t",
    row.names = FALSE
)

[1] "export anno_genes_norm.txt"


In [17]:
# export anno_samples_norm as a tab-separated text file without row names
print("export anno_samples_norm.txt")
write.table(
    x = anno_samples_norm,
    file = paste0(path_out, "anno_samples_norm.txt"),
    sep = "\t",
    row.names = FALSE
)

[1] "export anno_samples_norm.txt"
