# PART 1
## 1.1 READ IN AND CLEAN UP DATA

- **INPUT:**
    - ```$path_in``` argument: rdata directory containing the files
    - raw count matrices from the SCP259 dataset
    
    
- **NOTES:**

    -  all-zero genes are excluded
    
    
- **OUTPUT:**

    -  ```counts.RData``` raw merged counts

In [1]:
library(dplyr)
library(Matrix)


Attache Paket: ‘dplyr’


Die folgenden Objekte sind maskiert von ‘package:stats’:

    filter, lag


Die folgenden Objekte sind maskiert von ‘package:base’:

    intersect, setdiff, setequal, union




### set input and output path

In [2]:
# Input (raw data) and output (processed data) directories, both resolved
# relative to the current working directory.
path_in <- file.path(getwd(), "rdata")
print("input path is:")
print(path_in)

path_out <- file.path(getwd(), "pdata")
# Create the output directory up front so the save() calls further down do not
# fail on a missing folder after the expensive steps have already run.
# This is a no-op when the directory already exists.
dir.create(path_out, showWarnings = FALSE, recursive = TRUE)
print("output path is:")
print(path_out)

[1] "input path is:"
[1] "/work/project/ladcol_013/algo_ulc/rdata"
[1] "output path is:"
[1] "/work/project/ladcol_013/algo_ulc/pdata"


### read gene/barcode data & assign to variables

In [3]:
file_names <- c("Epi", "Fib", "Imm")

# Gene tables: one headerless, tab-separated file per entry of file_names.
gene_data <- lapply(file_names, function(prefix) {
  genes_file <- paste0(path_in, "/", prefix, ".genes.tsv")
  read.csv(genes_file, sep = "\t", header = FALSE)
})
# Publish each table under the name <prefix>_genes (e.g. Epi_genes).
for (k in seq_along(gene_data)) {
  assign(paste0(file_names[k], "_genes"), gene_data[[k]])
}

# Barcode tables: read the same way, then transposed before being stored.
bar_data <- lapply(file_names, function(prefix) {
  barcodes_file <- paste0(path_in, "/", prefix, ".barcodes2.tsv")
  t(read.csv(barcodes_file, sep = "\t", header = FALSE))
})
# Publish each transposed table under the name <prefix>_bar (e.g. Epi_bar).
for (k in seq_along(bar_data)) {
  assign(paste0(file_names[k], "_bar"), bar_data[[k]])
}

In [4]:
# Union of the three gene tables, built once per compartment with that
# compartment's own table listed first, followed by the other two.
Epi_all_genes <- Reduce(union, list(Epi_genes, Fib_genes, Imm_genes))
Fib_all_genes <- Reduce(union, list(Fib_genes, Imm_genes, Epi_genes))
Imm_all_genes <- Reduce(union, list(Imm_genes, Epi_genes, Fib_genes))

### import matrices & attach row/col names

In [5]:
print("importing sorted matrices and creating counts submatrices")
# For every entry of file_names: read its Matrix Market file, append all-zero
# rows so the row count equals that of <name>_all_genes, attach barcodes as
# column names and the united gene list as row names, and store the result in
# the global environment as <name>_m. `all_genes` keeps the same matrices as a
# list (assign() returns the assigned value).
all_genes <- lapply(file_names, function(file_name) {
  own_genes <- get(paste0(file_name, "_genes"))
  united_genes <- get(paste0(file_name, "_all_genes"))
  barcodes <- get(paste0(file_name, "_bar"))

  # Build the zero padding as a sparse matrix. A dense matrix(0, ...) would
  # allocate (missing genes x cells) doubles before the bind even starts.
  zero_rows <- Matrix(
    0,
    nrow = nrow(united_genes) - nrow(own_genes),
    ncol = ncol(barcodes),
    sparse = TRUE
  )

  tmp_mat <- readMM(paste0(path_in, "/gene_sorted-", file_name, ".matrix.mtx")) %>%
    rbind(zero_rows) %>%
    `colnames<-`(barcodes) %>%
    `rownames<-`(t(united_genes))
  assign(paste0(file_name, "_m"), tmp_mat, envir = .GlobalEnv)
})

[1] "importing sorted matrices and creating counts submatrices"


###  combine submatrices to count matrix

In [6]:
print("combining submatrices to counts matrix")
# Sort the rows of every submatrix by row name, then bind them column-wise in
# the order Epi, Fib, Imm.
counts <- do.call(
  cbind,
  lapply(list(Epi_m, Fib_m, Imm_m), function(m) m[order(rownames(m)), ])
)

[1] "combining submatrices to counts matrix"


### export counts matrix

In [7]:
# Write the merged counts matrix to counts.RData inside path_out.
print("saving counts matrix")
save(counts, file = paste0(path_out,"/counts.RData"))
print("DONE")

[1] "saving counts matrix"
[1] "DONE"


##  1.2 Preprocessing sample annotation files

- **INPUT:**
    - sample annotation file ```all.meta2.txt```

    
- **OUTPUT:**

    -  ```anno_samples.RData``` cleaned and merged sample annotation file

### load data

In [8]:
# Read the tab-separated sample annotation (all.meta2.txt) from path_in.
# The first data row of the file holds type descriptors ("TYPE", "group",
# "numeric"), so every column is read as character; that row is removed in
# the processing step below.
anno_samples <- read.csv(
  file = paste0(path_in, "/all.meta2.txt"),
  sep = "\t",
  header = TRUE
)
# str() prints by itself; wrapping it in print() only adds a stray "NULL".
str(anno_samples)

'data.frame':	365493 obs. of  8 variables:
 $ NAME    : chr  "TYPE" "N7.EpiA.AAACATACACACTG" "N7.EpiA.AAACCGTGCATCAG" "N7.EpiA.AAACGCACAATCGC" ...
 $ Cluster : chr  "group" "TA 1" "TA 1" "TA 2" ...
 $ nGene   : chr  "numeric" "328" "257" "300" ...
 $ nUMI    : chr  "numeric" "891" "663" "639" ...
 $ Subject : chr  "group" "N7" "N7" "N7" ...
 $ Health  : chr  "group" "Non-inflamed" "Non-inflamed" "Non-inflamed" ...
 $ Location: chr  "group" "Epi" "Epi" "Epi" ...
 $ Sample  : chr  "group" "N7.EpiA" "N7.EpiA" "N7.EpiA" ...
NULL


### process data
Clean and process sample annotation

In [9]:
# Show the structure of the annotation table before it is cleaned.
str(anno_samples)

'data.frame':	365493 obs. of  8 variables:
 $ NAME    : chr  "TYPE" "N7.EpiA.AAACATACACACTG" "N7.EpiA.AAACCGTGCATCAG" "N7.EpiA.AAACGCACAATCGC" ...
 $ Cluster : chr  "group" "TA 1" "TA 1" "TA 2" ...
 $ nGene   : chr  "numeric" "328" "257" "300" ...
 $ nUMI    : chr  "numeric" "891" "663" "639" ...
 $ Subject : chr  "group" "N7" "N7" "N7" ...
 $ Health  : chr  "group" "Non-inflamed" "Non-inflamed" "Non-inflamed" ...
 $ Location: chr  "group" "Epi" "Epi" "Epi" ...
 $ Sample  : chr  "group" "N7.EpiA" "N7.EpiA" "N7.EpiA" ...


In [10]:
# Drop the first row, switch to the column names used in the rest of the
# notebook, append the region to sample_ID, and sort the rows by region.
anno_samples <- anno_samples[-1, ] %>%
  rename(
    cell_ID = NAME,
    cell_type_original = Cluster,
    patient_ID = Subject,
    region = Health,
    sample_ID = Sample
  ) %>%
  mutate(sample_ID = paste0(sample_ID, "_", region)) %>%
  arrange(region)

str(anno_samples)

'data.frame':	365492 obs. of  8 variables:
 $ cell_ID           : chr  "N10.EpiA.AAACATACAACCAC" "N10.EpiA.AAACATACAGGCGA" "N10.EpiA.AAACATACCACTAG" "N10.EpiA.AAACATACCCTTTA" ...
 $ cell_type_original: chr  "Enterocyte Progenitors" "Cycling TA" "Immature Goblet" "Secretory TA" ...
 $ nGene             : chr  "425" "1695" "391" "1327" ...
 $ nUMI              : chr  "968" "7273" "1190" "5620" ...
 $ patient_ID        : chr  "N10" "N10" "N10" "N10" ...
 $ region            : chr  "Healthy" "Healthy" "Healthy" "Healthy" ...
 $ Location          : chr  "Epi" "Epi" "Epi" "Epi" ...
 $ sample_ID         : chr  "N10.EpiA_Healthy" "N10.EpiA_Healthy" "N10.EpiA_Healthy" "N10.EpiA_Healthy" ...


In [11]:
# Keep only the columns each table needs. Columns are selected by name so a
# change in column order upstream raises an error instead of silently keeping
# the wrong columns.
# anno_samplex: per-cell information, used later to build anno_cells.
anno_samplex <- anno_samples[, c("cell_ID", "cell_type_original", "region", "sample_ID")]
# anno_samples: per-sample information only.
anno_samples <- anno_samples[, c("patient_ID", "region", "sample_ID")]
str(anno_samples)

'data.frame':	365492 obs. of  3 variables:
 $ patient_ID: chr  "N10" "N10" "N10" "N10" ...
 $ region    : chr  "Healthy" "Healthy" "Healthy" "Healthy" ...
 $ sample_ID : chr  "N10.EpiA_Healthy" "N10.EpiA_Healthy" "N10.EpiA_Healthy" "N10.EpiA_Healthy" ...


In [12]:
# Reduce the table to one row per sample_ID (first occurrence wins), reset the
# row names, move sample_ID to the front and derive two labels from region:
#   case_or_control: 'case' for 'Inflamed', otherwise 'control'
#   health_status:   'Healthy' for 'Healthy', otherwise 'Colitis'
anno_samples <- anno_samples[!duplicated(anno_samples$sample_ID), ]
rownames(anno_samples) <- NULL
anno_samples <- anno_samples %>%
  relocate(sample_ID) %>%
  mutate(
    case_or_control = ifelse(region %in% 'Inflamed', 'case', 'control'),
    health_status = ifelse(region %in% 'Healthy', 'Healthy', 'Colitis')
  )
anno_samples

sample_ID,patient_ID,region,case_or_control,health_status
<chr>,<chr>,<chr>,<chr>,<chr>
N10.EpiA_Healthy,N10,Healthy,control,Healthy
N10.EpiB_Healthy,N10,Healthy,control,Healthy
N10.LPA_Healthy,N10,Healthy,control,Healthy
N10.LPB_Healthy,N10,Healthy,control,Healthy
N8.EpiA_Healthy,N8,Healthy,control,Healthy
N8.EpiB_Healthy,N8,Healthy,control,Healthy
N8.LPA_Healthy,N8,Healthy,control,Healthy
N8.LPB_Healthy,N8,Healthy,control,Healthy
N11.EpiA_Healthy,N11,Healthy,control,Healthy
N11.EpiB_Healthy,N11,Healthy,control,Healthy


### export anno_samples

In [13]:
# Write the per-sample annotation table to anno_samples.RData inside path_out.
save(anno_samples, file = paste0(path_out,"/anno_samples.RData"))

## 1.3 Preprocessing cell annotation files

- **INPUT:**
    - ```cell_relabelling.csv``` file containing unified cell type annotations. Stored in /rdata.
    - metadata from ```all.meta2.txt``` accessed via ```anno_samplex``` variable
    
    
- **NOTES:**

    -  ```anno_cells``` file should contain columns "cell_ID", "cell_type" and "sample_ID" for communication analysis
    
    
- **OUTPUT:**

    -  ```anno_cells.RData``` cleaned and merged cell annotation file

### load data

In [14]:
# Read the semicolon-separated cell type relabelling table from path_in.
anno_cells <- read.csv(
  file = paste0(path_in, "/cell_relabelling.csv"),
  sep = ";"
)
str(anno_cells)

'data.frame':	51 obs. of  6 variables:
 $ cell_type_original      : chr  "Stem" "TA 1" "TA 2" "Cycling TA" ...
 $ tissue_type             : chr  "Epithelial" "Epithelial" "Epithelial" "Epithelial" ...
 $ cell_type_original_color: chr  "#CB4335" "#CB4335" "#CB4335" "#CB4335" ...
 $ cell_type_color         : chr  "#CB4335" "#CB4335" "#CB4335" "#CB4335" ...
 $ population_shape        : int  19 19 19 19 19 19 19 19 19 19 ...
 $ cell_type               : chr  "Epithelial" "Epithelial" "Epithelial" "Epithelial" ...


### process data

In [15]:
# Attach the relabelling information to every cell listed in anno_samplex.
# right_join keeps all cells, including those whose cell_type_original has no
# entry in the relabelling table.
anno_cells <- anno_cells %>%
  right_join(
    anno_samplex,
    by = "cell_type_original",
    multiple = "all",
    # Each cell may match at most one relabelling row; a duplicated
    # cell_type_original in the relabelling table would otherwise silently
    # duplicate cells.
    relationship = "one-to-many"
  ) %>%
  # Drop the plotting columns by name rather than by position.
  select(-c(cell_type_original_color, cell_type_color, population_shape)) %>%
  relocate(cell_ID) %>%
  arrange(region)
# health_status: 'Healthy' for region 'Healthy', otherwise 'Colitis'.
anno_cells$health_status <- ifelse(anno_cells$region %in% 'Healthy', 'Healthy', 'Colitis')
str(anno_cells)

'data.frame':	365492 obs. of  7 variables:
 $ cell_ID           : chr  "N10.EpiA.AACATTGAAAGTAG" "N10.EpiA.AACCCAGACGTAAC" "N10.EpiA.AATCTCACCGCATA" "N10.EpiA.ACAATAACCGCTAA" ...
 $ cell_type_original: chr  "Stem" "Stem" "Stem" "Stem" ...
 $ tissue_type       : chr  "Epithelial" "Epithelial" "Epithelial" "Epithelial" ...
 $ cell_type         : chr  "Epithelial" "Epithelial" "Epithelial" "Epithelial" ...
 $ region            : chr  "Healthy" "Healthy" "Healthy" "Healthy" ...
 $ sample_ID         : chr  "N10.EpiA_Healthy" "N10.EpiA_Healthy" "N10.EpiA_Healthy" "N10.EpiA_Healthy" ...
 $ health_status     : chr  "Healthy" "Healthy" "Healthy" "Healthy" ...


### export anno_cells

In [16]:
# Write the per-cell annotation table to anno_cells.RData inside path_out.
save(anno_cells, file = paste0(path_out,"/anno_cells.RData"))

## 1.4 Preprocessing gene annotation files

- **INPUT:**
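    - ```LR_database.rda``` ligand-receptor database (stored in the additional_input_files sub-directory; the code below loads it from ```$path_in```, so a copy must be present there)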
<!--     - ```- counts.RData```  -->
   
    
- **OUTPUT:**

    -  ```anno_genes.RData``` contains information about what genes are ligands and receptors

### load LR_database.rda data base

In [17]:
# Load the ligand-receptor database from path_in. The following cells expect
# this file to define an object named LR_database.
load(file = paste0(path_in,"/LR_database.rda"))
cat("str(LR_database)\n\n")

str(LR_database)



### create anno_genes

In [18]:
# Start the gene annotation table with one row per row name of the counts
# matrix.
anno_genes <- data.frame(gene_symbol = rownames(counts))
# str() prints by itself; wrapping it in print() only adds a stray "NULL".
str(anno_genes)

'data.frame':	21784 obs. of  1 variable:
 $ gene_symbol: chr  "7SK" "A1BG" "A1BG-AS1" "A1CF" ...
NULL


In [19]:
# Flag every gene symbol by its presence in the Ligand and/or Receptor column
# of LR_database; inDB is TRUE when the symbol occurs in either column.
anno_genes <- anno_genes %>%
  mutate(
    inDB = gene_symbol %in% LR_database$Ligand | gene_symbol %in% LR_database$Receptor,
    isLigand = gene_symbol %in% LR_database$Ligand,
    isReceptor = gene_symbol %in% LR_database$Receptor
  )

In [20]:
# str() prints by itself; wrapping it in print() only adds a stray "NULL".
str(anno_genes)

'data.frame':	21784 obs. of  4 variables:
 $ gene_symbol: chr  "7SK" "A1BG" "A1BG-AS1" "A1CF" ...
 $ inDB       : logi  FALSE TRUE FALSE FALSE TRUE FALSE ...
 $ isLigand   : logi  FALSE FALSE FALSE FALSE TRUE FALSE ...
 $ isReceptor : logi  FALSE TRUE FALSE FALSE FALSE FALSE ...
NULL


In [21]:
# Summary report: number of annotated genes, number of unique ligands and
# receptors listed in LR_database, and how many genes of anno_genes are
# flagged as ligand or receptor.
cat("total nr genes are", nrow(anno_genes),"\n")
cat("nr ligands in LR_database are", length(unique(LR_database$Ligand)),"\n")
cat("nr ligands in our data are",sum(anno_genes$isLigand),"\n")
cat("nr receptors in LR_database are",length(unique(LR_database$Receptor)),"\n")
cat("nr receptors in our data are",sum(anno_genes$isReceptor),'\n')

total nr genes are 21784 
nr ligands in LR_database are 1515 
nr ligands in our data are 1248 
nr receptors in LR_database are 1311 
nr receptors in our data are 1081 


### export anno_genes

In [22]:
# Write the gene annotation table to anno_genes.RData inside path_out.
save(anno_genes, file = paste0(path_out,"/anno_genes.RData"))