# PART 1
## READ IN AND CLEAN UP DATA

- **INPUT:**
    - ```$path_in``` argument: raw_data directory containing the sub-directory GSE185381_RAW
    - raw count matrices from the GSE185381_RAW dataset
    
    
- **NOTES:**

    -  all-zero genes are excluded
    
    
- **OUTPUT:**

    -  ```counts.RData``` raw merged counts

### prepare list of samples

In [1]:
# Sys.setenv('R_MAX_VSIZE'=3200000000000)

In [2]:
library(data.table) #to read gz file

In [3]:
# Input: the raw_data directory, which holds the GSE185381_RAW sub-directory.
# Output: the current working directory.
path_in <- "../../../../data/Lasry/raw_data/"
# Alternative (disabled): derive the input path from the working directory.
# path_in <- paste0(dirname(dirname(dirname(getwd())))
#                  ,"/raw_data")
path_out <- getwd()

# Report both locations so that they show up in the notebook output.
print("input path is:")
print(path_in)
print("output path is:")
print(path_out)

[1] "input path is:"
[1] "../../../../data/Lasry/raw_data/"
[1] "output path is:"
[1] "/work/project/ladcol_011/polish/community-paper/src/data_preprocessing/Lasry/1.preprocess_data"


In [4]:
# variables####
# Name of the dataset directory (below path_in) that is processed here.
datasets <- "GSE185381_RAW"
print("variables")
print("datasets:")
print(datasets)

[1] "variables"
[1] "datasets:"
[1] "GSE185381_RAW"


In [5]:
# List the file names found in every dataset directory below path_in.
# The result is a list with one character vector per dataset name.
files <- lapply(datasets, function(dataset) {
    dataset_dir <- paste0(path_in, "/", dataset)
    list.files(dataset_dir)
})
names(files) <- datasets
print("files:")
print(files)

[1] "files:"
$GSE185381_RAW
  [1] "GSM5613744_2019-07-01-count-1_ADT_processed.csv"                 
  [2] "GSM5613744_2019-07-01-count-1_metadata.csv"                      
  [3] "GSM5613744_2019-07-01-count-1_RNA_soupx_processed.csv"           
  [4] "GSM5613745_2019-07-01-count-2_ADT_processed.csv"                 
  [5] "GSM5613745_2019-07-01-count-2_metadata.csv"                      
  [6] "GSM5613745_2019-07-01-count-2_RNA_soupx_processed.csv"           
  [7] "GSM5613746_2019-07-01-count-3_ADT_processed.csv"                 
  [8] "GSM5613746_2019-07-01-count-3_metadata.csv"                      
  [9] "GSM5613746_2019-07-01-count-3_RNA_soupx_processed.csv"           
 [10] "GSM5613747_2019-08-01-count-1_ADT_processed.csv"                 
 [11] "GSM5613747_2019-08-01-count-1_metadata.csv"                      
 [12] "GSM5613747_2019-08-01-count-1_RNA_soupx_processed.csv"           
 [13] "GSM5613748_2019-08-01-count-2_ADT_processed.csv"                 
 [14] "GSM5613748_2019-

### load data

In [6]:
# read in raw counts
print("read in raw counts")
# For every dataset, read all files whose name contains "RNA" and return them
# as a list of data frames named by file. The column "V1" of each table is
# moved into the row names.
counts_raw <- lapply(datasets, function(ds) {
    # select correct files
    rna_files <- grep("RNA", files[[ds]], value = TRUE)

    # read in files
    tables <- lapply(rna_files, function(f) {
        tbl <- as.data.frame(fread(paste0(path_in, "/", ds, "/", f)
                                   ,header = TRUE))
        rownames(tbl) <- tbl$V1
        tbl[, !(colnames(tbl) %in% "V1")]
    })

    names(tables) <- rna_files
    tables
})
names(counts_raw) <- datasets
print(str(counts_raw))

[1] "read in raw counts"
List of 1
 $ GSE185381_RAW:List of 46
  ..$ GSM5613744_2019-07-01-count-1_RNA_soupx_processed.csv           :'data.frame':	36601 obs. of  8686 variables:
  .. ..$ X2019.07.01.count.1.AAACCCAAGACTCTTG: num [1:36601] 0 0 0 0 0 0 0 0 0 0 ...
  .. ..$ X2019.07.01.count.1.AAACCCACAAATGGCG: num [1:36601] 0 0 0 0 0 0 0 0 0 0 ...
  .. ..$ X2019.07.01.count.1.AAACCCACACACACGC: num [1:36601] 0 0 0 0 0 0 0 0 0 0 ...
  .. ..$ X2019.07.01.count.1.AAACCCAGTACTAGCT: num [1:36601] 0 0 0 0 0 0 0 0 0 0 ...
  .. ..$ X2019.07.01.count.1.AAACCCAGTAGACGTG: num [1:36601] 0 0 0 0 0 0 0 0 0 0 ...
  .. ..$ X2019.07.01.count.1.AAACCCAGTGACCGAA: num [1:36601] 0 0 0 0 0 0 0 0 0 0 ...
  .. ..$ X2019.07.01.count.1.AAACCCAGTTCTCCTG: num [1:36601] 0 0 0 0 0 0 0 0 0 0 ...
  .. ..$ X2019.07.01.count.1.AAACCCATCGCTGATA: num [1:36601] 0 0 0 0 0 0 0 0 0 0 ...
  .. ..$ X2019.07.01.count.1.AAACGAAAGAAGTATC: num [1:36601] 0 0 0 0 0 0 0 0 0 0 ...
  .. ..$ X2019.07.01.count.1.AAACGAAAGCTTTCTT: num [1:36

### process data

Check that all genes are identical.

In [7]:
# check genes in overlap
print("check genes in overlap")
# Intersect the row names (genes) of all count tables of the dataset.
overlap_genes <- Reduce(intersect, lapply(counts_raw[[datasets]], rownames))

print(paste("we have", length(overlap_genes), "overlapping genes"))

print(str(overlap_genes))


[1] "check genes in overlap"
[1] "we have 36601 overlapping genes"
 chr [1:36601] "MIR1302-2HG" "FAM138A" "OR4F5" "AL627309.1" "AL627309.3" ...
NULL


In [8]:
# Run the garbage collector and display the memory usage report.
gc()

Unnamed: 0,used,(Mb),gc trigger,(Mb).1,max used,(Mb).2
Ncells,1127626,60.3,1847802,98.7,1847802,98.7
Vcells,7744320421,59084.5,11166574838,85194.3,8932027729,68146.0


In [9]:
# subset all counts for overlapping genes
print("subset all counts for overlapping genes")

# Restrict every count table to the shared genes, in the order of overlap_genes.
counts_raw_geneOverlap <- lapply(counts_raw[[datasets]]
                                 ,function(tbl) tbl[overlap_genes, ])

print(str(counts_raw_geneOverlap))

# The unfiltered list is not referenced again below.
rm(counts_raw)

[1] "subset all counts for overlapping genes"
List of 46
 $ GSM5613744_2019-07-01-count-1_RNA_soupx_processed.csv           :'data.frame':	36601 obs. of  8686 variables:
  ..$ X2019.07.01.count.1.AAACCCAAGACTCTTG: num [1:36601] 0 0 0 0 0 0 0 0 0 0 ...
  ..$ X2019.07.01.count.1.AAACCCACAAATGGCG: num [1:36601] 0 0 0 0 0 0 0 0 0 0 ...
  ..$ X2019.07.01.count.1.AAACCCACACACACGC: num [1:36601] 0 0 0 0 0 0 0 0 0 0 ...
  ..$ X2019.07.01.count.1.AAACCCAGTACTAGCT: num [1:36601] 0 0 0 0 0 0 0 0 0 0 ...
  ..$ X2019.07.01.count.1.AAACCCAGTAGACGTG: num [1:36601] 0 0 0 0 0 0 0 0 0 0 ...
  ..$ X2019.07.01.count.1.AAACCCAGTGACCGAA: num [1:36601] 0 0 0 0 0 0 0 0 0 0 ...
  ..$ X2019.07.01.count.1.AAACCCAGTTCTCCTG: num [1:36601] 0 0 0 0 0 0 0 0 0 0 ...
  ..$ X2019.07.01.count.1.AAACCCATCGCTGATA: num [1:36601] 0 0 0 0 0 0 0 0 0 0 ...
  ..$ X2019.07.01.count.1.AAACGAAAGAAGTATC: num [1:36601] 0 0 0 0 0 0 0 0 0 0 ...
  ..$ X2019.07.01.count.1.AAACGAAAGCTTTCTT: num [1:36601] 0 0 0 0 0 0 0 0 0 0 ...
  ..$ X201

In [10]:
# concatenate: counts
# Bind all per-file tables column-wise into a single data frame. The list
# names (file names) become a prefix of the resulting column names.
counts_raw_merged <- do.call(what = cbind.data.frame
                             ,args = counts_raw_geneOverlap)
print(head(str(counts_raw_merged)))

'data.frame':	36601 obs. of  211442 variables:
 $ GSM5613744_2019-07-01-count-1_RNA_soupx_processed.csv.X2019.07.01.count.1.AAACCCAAGACTCTTG                      : num  0 0 0 0 0 0 0 0 0 0 ...
 $ GSM5613744_2019-07-01-count-1_RNA_soupx_processed.csv.X2019.07.01.count.1.AAACCCACAAATGGCG                      : num  0 0 0 0 0 0 0 0 0 0 ...
 $ GSM5613744_2019-07-01-count-1_RNA_soupx_processed.csv.X2019.07.01.count.1.AAACCCACACACACGC                      : num  0 0 0 0 0 0 0 0 0 0 ...
 $ GSM5613744_2019-07-01-count-1_RNA_soupx_processed.csv.X2019.07.01.count.1.AAACCCAGTACTAGCT                      : num  0 0 0 0 0 0 0 0 0 0 ...
 $ GSM5613744_2019-07-01-count-1_RNA_soupx_processed.csv.X2019.07.01.count.1.AAACCCAGTAGACGTG                      : num  0 0 0 0 0 0 0 0 0 0 ...
 $ GSM5613744_2019-07-01-count-1_RNA_soupx_processed.csv.X2019.07.01.count.1.AAACCCAGTGACCGAA                      : num  0 0 0 0 0 0 0 0 0 0 ...
 $ GSM5613744_2019-07-01-count-1_RNA_soupx_processed.csv.X2019.07.01.count.1.

In [11]:
# remove additional samples name in front of the cell_ID
# cbind.data.frame() prefixed every column with "<file name>." and the column
# names read from the files carry a leading "X" in front of the date (see the
# str() output of counts_raw_merged).
# The previous greedy pattern ".*X" deleted everything up to the LAST capital
# "X" of a name, which would truncate any cell ID whose sample label itself
# contains an "X". Here exactly the known file-name prefix is removed, and the
# leading "X" is dropped only where it precedes a digit.
colnames(counts_raw_merged) <- local({
    cell_ids <- colnames(counts_raw_merged)
    # one prefix per merged column, in the order the tables were bound
    file_prefix <- paste0(rep(names(counts_raw_geneOverlap)
                              ,times = vapply(counts_raw_geneOverlap
                                              ,NCOL
                                              ,integer(1)))
                          ,".")
    stopifnot(length(file_prefix) == length(cell_ids))
    has_prefix <- startsWith(cell_ids, file_prefix)
    cell_ids[has_prefix] <- substring(cell_ids[has_prefix]
                                      ,nchar(file_prefix[has_prefix]) + 1L)
    sub("^X(?=[0-9])", "", cell_ids, perl = TRUE)
})

# report the size of every input table ...
for(i in names(counts_raw_geneOverlap)){
    print(i)
    print(dim(counts_raw_geneOverlap[[i]]))
}

# ... and of the merged table
print("dimensions of the merged dataset (genes x cells):")
print(dim(counts_raw_merged))


print(str(counts_raw_merged))

[1] "GSM5613744_2019-07-01-count-1_RNA_soupx_processed.csv"
[1] 36601  8686
[1] "GSM5613745_2019-07-01-count-2_RNA_soupx_processed.csv"
[1] 36601  9430
[1] "GSM5613746_2019-07-01-count-3_RNA_soupx_processed.csv"
[1] 36601  9570
[1] "GSM5613747_2019-08-01-count-1_RNA_soupx_processed.csv"
[1] 36601   269
[1] "GSM5613748_2019-08-01-count-2_RNA_soupx_processed.csv"
[1] 36601  5445
[1] "GSM5613749_2019-08-01-count-3_RNA_soupx_processed.csv"
[1] 36601  4778
[1] "GSM5613750_2019-08-01-count-4_RNA_soupx_processed.csv"
[1] 36601  4858
[1] "GSM5613751_2019-08-29-count-1_RNA_soupx_processed.csv"
[1] 36601  8973
[1] "GSM5613752_2019-08-29-count-2_RNA_soupx_processed.csv"
[1] 36601  8383
[1] "GSM5613753_2019-09-03-count-1_RNA_soupx_processed.csv"
[1] 36601   180
[1] "GSM5613754_2019-09-03-count-2_RNA_soupx_processed.csv"
[1] 36601   183
[1] "GSM5613755_2019-09-03-count-3_RNA_soupx_processed.csv"
[1] 36601   184
[1] "GSM5613756_2019-10-25-count-5_RNA_soupx_processed.csv"
[1] 36601  5297
[1] "GSM5613

In [12]:
# Drop the per-file list; it is not referenced again below.
rm(counts_raw_geneOverlap)

We will filter out the genes that have zero expression in all cells.

In [13]:
#filter zero genes
# Logical genes x cells matrix marking the non-zero entries ...
idx_nonZero <- counts_raw_merged != 0

# ... and, per gene, the number of cells with a non-zero value.
idx_nonZero_sum <- rowSums(idx_nonZero)
print(sprintf("we have %d all-zero genes. These genes will be removed."
              ,sum(idx_nonZero_sum == 0)))

[1] "we have 4758 all-zero genes. These genes will be removed."


In [14]:
# Keep only the genes with a non-zero value in at least one cell.
counts <- counts_raw_merged[!(idx_nonZero_sum == 0), ]

# Compare the dimensions before and after filtering.
print("dim(counts_raw_merged)")
print(dim(counts_raw_merged))
print("dim(counts)")
print(dim(counts))

[1] "dim(counts_raw_merged)"
[1]  36601 211442
[1] "dim(counts)"
[1]  31843 211442


### export

Save the intermediate results

In [15]:
# save intermediate results: counts
# save() does not create missing directories, so make sure "outs" exists
# (no-op when it is already there).
dir.create("outs", showWarnings = FALSE, recursive = TRUE)
save(counts, file = "outs/counts.RData")

# Preprocessing sample annotation files


- **INPUT:**
    - sample annotation file "Supp1. Adult AML-Table 1.tsv" stored in additional_input_files sub-directory of the working directory
    - sample annotation file "Supp1. Control BM-Table 1.tsv" stored in additional_input_files sub-directory of the working directory

    
- **OUTPUT:**

    -  ```anno_samples.RData``` cleaned and merged sample annotation file

### load data

In [16]:
library(dplyr)


Attaching package: ‘dplyr’


The following objects are masked from ‘package:data.table’:

    between, first, last


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union




In [17]:
# Derive the additional_input_files location from path_in by replacing the
# "raw_data" path component.
# NOTE: this value is overwritten by a hard-coded relative path two cells below.
path_to_additional_files <- sub("raw_data", "additional_input_files", path_in)

In [18]:
# Show the working directory; the relative paths used below depend on it.
getwd()

In [19]:
# Directory with the sample annotation tables, relative to the working
# directory. This replaces the value derived from path_in above.
path_to_additional_files <- "../additional_input_files/"

In [20]:
# read in sample annotation
print("read in sample annotation")
# Read the tab-separated sample annotation table of each cohort. The table is
# located by searching the file names in path_to_additional_files for the
# cohort label. Exactly one file has to match; otherwise we stop with a clear
# message instead of handing zero or several paths to read.csv().
anno_GSE185381_samples <- lapply(c("Control"
                   ,"AML")
                    ,function(hs){
                        # define path
                        path <- path_to_additional_files

                        # select correct files
                        my_files <- list.files(path)
                        my_file <- my_files[grepl(hs,my_files)]
                        if (length(my_file) != 1L) {
                            stop("expected exactly one annotation file matching '"
                                 ,hs
                                 ,"' in '"
                                 ,path
                                 ,"', found "
                                 ,length(my_file)
                                 ,call. = FALSE)
                        }

                        # read in files
                        read.csv(paste0(path,"/",my_file)
                                 ,header = TRUE
                                 ,sep = "\t")
                    })
names(anno_GSE185381_samples) <- c("Control"
                   ,"AML")
print(str(anno_GSE185381_samples))

[1] "read in sample annotation"
List of 2
 $ Control:'data.frame':	11 obs. of  5 variables:
  ..$ Sample    : int [1:11] 1 2 3 4 5 4 5 58 82 4003 ...
  ..$ Sex       : chr [1:11] "F" "M" "F" "M" ...
  ..$ Age       : int [1:11] 26 39 50 20 47 53 22 20 19 42 ...
  ..$ Sequencing: chr [1:11] "CITE-Seq" "CITE-Seq" "CITE-Seq" "CITE-Seq" ...
  ..$ X         : logi [1:11] NA NA NA NA NA NA ...
 $ AML    :'data.frame':	24 obs. of  50 variables:
  ..$ Institution        : chr [1:24] "OSU" "OSU" "OSU" "OSU" ...
  ..$ Sample.ID          : chr [1:24] "U-06-0024" "U-11-0693" "U-12-0134" "U-15-2975" ...
  ..$ Manuscript.ID      : chr [1:24] "AML0024" "AML0693" "" "" ...
  ..$ Gender             : chr [1:24] "M" "F" "M" "F" ...
  ..$ Age                : int [1:24] 64 77 42 74 74 78 71 57 32 62 ...
  ..$ Sequencing         : chr [1:24] "CITE-seq" "CITE-seq + scTCR-seq" "scTCR-seq" "scTCR-seq" ...
  ..$ Dx                 : chr [1:24] "AML" "AML" "AML+ leukemia cutis" "AML" ...
  ..$ DxELN_Cytogeneti

### process data

Clean and process sample annotation.

In [21]:
# remove empty column in the control annotation
# (the column "X" contains only NA, see the str() output above)
anno_GSE185381_samples[["Control"]] <- anno_GSE185381_samples[["Control"]][
    ,is.na(match(colnames(anno_GSE185381_samples[["Control"]]), "X"))
]
print(str(anno_GSE185381_samples[["Control"]]))

'data.frame':	11 obs. of  4 variables:
 $ Sample    : int  1 2 3 4 5 4 5 58 82 4003 ...
 $ Sex       : chr  "F" "M" "F" "M" ...
 $ Age       : int  26 39 50 20 47 53 22 20 19 42 ...
 $ Sequencing: chr  "CITE-Seq" "CITE-Seq" "CITE-Seq" "CITE-Seq" ...
NULL


In [22]:
# add sample_ID to anno_GSE185381_samples$Control
# The ID is the sample number prefixed with "healthy-", e.g. "healthy-1".
anno_GSE185381_samples$Control$sample_ID <- paste0(
    "healthy-"
    ,anno_GSE185381_samples$Control$Sample
)
print(str(anno_GSE185381_samples$Control))

'data.frame':	11 obs. of  5 variables:
 $ Sample    : int  1 2 3 4 5 4 5 58 82 4003 ...
 $ Sex       : chr  "F" "M" "F" "M" ...
 $ Age       : int  26 39 50 20 47 53 22 20 19 42 ...
 $ Sequencing: chr  "CITE-Seq" "CITE-Seq" "CITE-Seq" "CITE-Seq" ...
 $ sample_ID : chr  "healthy-1" "healthy-2" "healthy-3" "healthy-4" ...
NULL


In [23]:
# rename the column "Sex" to "sex", similar to what it is in the cell annotation files
names(anno_GSE185381_samples$Control)[
    names(anno_GSE185381_samples$Control) == "Sex"
] <- "sex"

In [24]:
# rename the column "Gender" to "sex", similar to what it is in the cell annotation files
names(anno_GSE185381_samples$AML)[
    names(anno_GSE185381_samples$AML) == "Gender"
] <- "sex"

In [25]:
# rename the column "Sample" to "Sample.ID"
names(anno_GSE185381_samples$Control)[
    names(anno_GSE185381_samples$Control) == "Sample"
] <- "Sample.ID"
# store the IDs as character, as in the AML annotation
anno_GSE185381_samples$Control$Sample.ID <- as.character(
    anno_GSE185381_samples$Control$Sample.ID
)

In [26]:
# fill in the missing values in the "Manuscript.ID" column in anno_GSE185381_samples$AML
# An empty ID is replaced by "AML" followed by characters 6 to 9 of Sample.ID
# (e.g. "U-12-0134" becomes "AML0134").
idx_missing <- anno_GSE185381_samples$AML$Manuscript.ID == ""
anno_GSE185381_samples$AML$Manuscript.ID[idx_missing] <- paste0(
    "AML"
    ,substring(anno_GSE185381_samples$AML$Sample.ID[idx_missing], 6, 9)
)
print(str(anno_GSE185381_samples$AML$Manuscript.ID))

 chr [1:24] "AML0024" "AML0693" "AML0134" "AML2975" "AML0160" "AML1133" ...
NULL


In [27]:
# add sample_ID to anno_GSE185381_samples$AML
# Insert a hyphen after "AML", e.g. "AML0024" becomes "AML-0024".
anno_GSE185381_samples$AML$sample_ID <- gsub("AML"
                                             ,"AML-"
                                             ,anno_GSE185381_samples$AML$Manuscript.ID
                                             ,fixed = TRUE)
print(str(anno_GSE185381_samples$AML))

'data.frame':	24 obs. of  51 variables:
 $ Institution        : chr  "OSU" "OSU" "OSU" "OSU" ...
 $ Sample.ID          : chr  "U-06-0024" "U-11-0693" "U-12-0134" "U-15-2975" ...
 $ Manuscript.ID      : chr  "AML0024" "AML0693" "AML0134" "AML2975" ...
 $ sex                : chr  "M" "F" "M" "F" ...
 $ Age                : int  64 77 42 74 74 78 71 57 32 62 ...
 $ Sequencing         : chr  "CITE-seq" "CITE-seq + scTCR-seq" "scTCR-seq" "scTCR-seq" ...
 $ Dx                 : chr  "AML" "AML" "AML+ leukemia cutis" "AML" ...
 $ DxELN_Cytogenetic  : chr  "Adverse" "Adverse" "Adverse" "Adverse" ...
 $ Cytogenetics       : chr  "48,XY,+8,+8,i(8)(p10),t(9;15)(q33;q15),del(10)(q22.1q24),del(13)(q14q21)" "74-89<4n>,XXXX,-8,-9,-9,-12,-17,i(17)(q10),+mar1,+mar2,+dmin[cp18]/46,XX[2] .ish dmin(amp CMYC)" "41,XY,add(4)(p16),der(4)t(4;12)(q21;q15),-5,-7,add(8)(p21),del(11)(q23),-12 \n,dic(13;21)(p11.2;p11.2),-16,add("| __truncated__ "45,XX,del(5)(q13q31),ins(7;12)(q11.2;q13q24.1),-12,del(20)(q13.1)[10

From the AML samples, we will keep only the primary AML diagnosis without any additional diagnosis.

In [28]:
# Keep the samples whose diagnosis is exactly "AML".
# %in% never returns NA, so a missing Dx drops the sample instead of
# producing an all-NA row, which `== "AML"` would do.
idx_prim <- anno_GSE185381_samples$AML$Dx %in% "AML"
anno_GSE185381_samples$AML <- anno_GSE185381_samples$AML[idx_prim,]

Merge the two annotation files.

In [29]:
# Stack the control and the AML annotation into one table.
# bind_rows() matches columns by name and fills the columns that are missing
# in one of the tables with NA. The result has the control columns first,
# followed by the AML-only columns, with the control rows on top.
# This replaces two natural left_join() calls that were used only to add the
# missing columns and relied on no row accidentally matching on the shared
# columns.
anno_samples <- bind_rows(anno_GSE185381_samples$Control
                          ,anno_GSE185381_samples$AML)

print(str(anno_samples))

[1m[22mJoining with `by = join_by(Sample.ID, sex, Age, Sequencing, sample_ID)`
[1m[22mJoining with `by = join_by(Institution, Sample.ID, Manuscript.ID, sex, Age, Sequencing, Dx, DxELN_Cytogenetic, Cytogenetics, Overall.survival, Flow.report.summary, RAS.pathway, NPM1, IDH, TET2, TP53,
RUNX1, CBF, MLL.rearranged, IDH1, IDH2, ASXL1, BCOR, CBL, CEBPAdm, CSF3R, DNMT3A, ETV6, FLT3.TKD, FLT3.ITD, GATA2, JAK2, KIT, NPM1.1, NRAS, KRAS, PTPN11, PHF6, RAD21, RUNX1.1, SETBP1, SMC1A, STAG2,
SRSF2, SF3B1, TET2.1, U2AF1, WT1, TP53.1, ZRSR2, sample_ID)`


'data.frame':	22 obs. of  51 variables:
 $ Sample.ID          : chr  "1" "2" "3" "4" ...
 $ sex                : chr  "F" "M" "F" "M" ...
 $ Age                : int  26 39 50 20 47 53 22 20 19 42 ...
 $ Sequencing         : chr  "CITE-Seq" "CITE-Seq" "CITE-Seq" "CITE-Seq" ...
 $ sample_ID          : chr  "healthy-1" "healthy-2" "healthy-3" "healthy-4" ...
 $ Institution        : chr  NA NA NA NA ...
 $ Manuscript.ID      : chr  NA NA NA NA ...
 $ Dx                 : chr  NA NA NA NA ...
 $ DxELN_Cytogenetic  : chr  NA NA NA NA ...
 $ Cytogenetics       : chr  NA NA NA NA ...
 $ Overall.survival   : int  NA NA NA NA NA NA NA NA NA NA ...
 $ Flow.report.summary: chr  NA NA NA NA ...
 $ RAS.pathway        : chr  NA NA NA NA ...
 $ NPM1               : chr  NA NA NA NA ...
 $ IDH                : chr  NA NA NA NA ...
 $ TET2               : chr  NA NA NA NA ...
 $ TP53               : chr  NA NA NA NA ...
 $ RUNX1              : chr  NA NA NA NA ...
 $ CBF                : logi  NA NA NA

In [30]:
# Label every sample as "healthy" when its sample_ID contains "healthy",
# otherwise as "AML". grepl() and ifelse() are vectorised, so no per-element
# sapply() is needed (which also attached the sample IDs as names to the column).
anno_samples$health_status <- ifelse(grepl("healthy", anno_samples$sample_ID)
                                     ,"healthy"
                                     ,"AML")

In [31]:
# Healthy samples are the controls, all other samples are cases.
# Vectorised replacement of the per-element sapply()/ifelse() combination.
anno_samples$case_or_control <- ifelse(grepl("healthy", anno_samples$health_status)
                                       ,"control"
                                       ,"case")

Two healthy individuals (healthy-4 and healthy-5) were sequenced twice. We will remove the duplicates.

In [32]:
# Keep the first row of every sample_ID and drop later repeats.
# NOTE(review): in the control table printed above, the repeated Sample values
# 4 and 5 are listed with different ages (20 vs 53 and 47 vs 22) - confirm that
# these rows really describe the same individuals and that keeping the first
# row is intended.
anno_samples <- anno_samples[!duplicated(anno_samples[["sample_ID"]]), ]

In [33]:
# Print a header followed by the final sample annotation table.
cat("anno_samples:\n\n**************\n\n")
print(anno_samples)

anno_samples:

**************

   Sample.ID sex Age           Sequencing    sample_ID Institution
1          1   F  26             CITE-Seq    healthy-1        <NA>
2          2   M  39             CITE-Seq    healthy-2        <NA>
3          3   F  50             CITE-Seq    healthy-3        <NA>
4          4   M  20             CITE-Seq    healthy-4        <NA>
5          5   M  47             CITE-Seq    healthy-5        <NA>
8         58   M  20             CITE-Seq   healthy-58        <NA>
9         82   M  19 CITE-Seq + scTCR-seq   healthy-82        <NA>
10      4003   M  42 CITE-Seq + scTCR-seq healthy-4003        <NA>
11       182   M  51            scTCR-seq  healthy-182        <NA>
12 U-06-0024   M  64             CITE-seq     AML-0024         OSU
13 U-11-0693   F  77 CITE-seq + scTCR-seq     AML-0693         OSU
14 U-15-2975   F  74            scTCR-seq     AML-2975         OSU
15 U-16-0160   M  74             CITE-seq     AML-0160         OSU
16 U-16-1133   F  78           

### export anno_samples

In [34]:
# save() does not create missing directories, so make sure "outs" exists
# (no-op when it is already there).
dir.create("outs", showWarnings = FALSE, recursive = TRUE)
save(anno_samples, file = "outs/anno_samples.RData")

# Preprocessing cell annotation files

- **INPUT:**
    - ```cell_relabelling.csv``` file containing unified cell type annotations. Stored in additional_input_files sub-directory
	- metadata tables from the GSE185381_RAW dataset
    
    
- **NOTES:**

    -  ```anno_cells``` file should contain columns "cell_ID", "cell_type" and "sample_ID" for communication analysis
    
    
- **OUTPUT:**

    -  ```anno_cells.RData``` cleaned and merged cell annotation file

### load data

In [35]:
cat("load cell_relabelling\n\n**************\n\n")
# Semicolon-separated table that maps original cell type labels to unified ones.
tsv_file <- "../../cell_relabelling.csv"
cell_relabelling <- read.csv(file = tsv_file, sep = ";")

# Keep the rows of this study only and index the table by the original label.
cell_relabelling <- cell_relabelling[
    cell_relabelling$source == "Lasry et al., 2022"
    ,
]
rownames(cell_relabelling) <- cell_relabelling$cell_type_original
print(cell_relabelling)

load cell_relabelling

**************

                  cell_type_original      cell_type             source
CD4+ T                        CD4+ T              T Lasry et al., 2022
CD8+ T                        CD8+ T              T Lasry et al., 2022
CD16+ monocyte        CD16+ monocyte           Mono Lasry et al., 2022
cDC2                            cDC2             DC Lasry et al., 2022
B                                  B              B Lasry et al., 2022
CD14+ monocyte        CD14+ monocyte           Mono Lasry et al., 2022
NK                                NK             NK Lasry et al., 2022
cDC1                            cDC1             DC Lasry et al., 2022
HLA-II+ monocyte    HLA-II+ monocyte           Mono Lasry et al., 2022
HSC                              HSC           HSPC Lasry et al., 2022
MAIT                            MAIT              T Lasry et al., 2022
CD11c+                        CD11c+           Mono Lasry et al., 2022
Pre-B                          Pre-B  

In [36]:
# Show the distinct unified cell type labels.
unique(cell_relabelling$cell_type)

In [37]:
# read in cell annotations
print("read in cell annotations")
# For every dataset, read all files whose name contains "metadata" and return
# them as a list of data frames named by file. The column "V1" of each table
# is moved into the row names.
meta <- lapply(datasets, function(ds) {
    # select correct files
    meta_files <- grep("metadata", files[[ds]], value = TRUE)

    # read in files
    tables <- lapply(meta_files, function(f) {
        tbl <- as.data.frame(fread(paste0(path_in, "/", ds, "/", f)
                                   ,header = TRUE))
        rownames(tbl) <- tbl$V1
        tbl[, !(colnames(tbl) %in% "V1")]
    })

    names(tables) <- meta_files
    tables
})
names(meta) <- datasets
print(str(meta))

[1] "read in cell annotations"
List of 1
 $ GSE185381_RAW:List of 46
  ..$ GSM5613744_2019-07-01-count-1_metadata.csv           :'data.frame':	8686 obs. of  16 variables:
  .. ..$ cell               : chr [1:8686] "2019-07-01-count-1:AAACCCAAGACTCTTG" "2019-07-01-count-1:AAACCCACAAATGGCG" "2019-07-01-count-1:AAACCCACACACACGC" "2019-07-01-count-1:AAACCCAGTACTAGCT" ...
  .. ..$ UMAP_1             : num [1:8686] -8.74 -8.32 10.84 -7.42 -8.35 ...
  .. ..$ UMAP_2             : num [1:8686] 4.707 4.423 -0.609 4.015 4.323 ...
  .. ..$ orig.ident         : chr [1:8686] "2019-07-01-count-1" "2019-07-01-count-1" "2019-07-01-count-1" "2019-07-01-count-1" ...
  .. ..$ samples            : chr [1:8686] "AML0612" "AML3762" "AML3762" "AML3762" ...
  .. ..$ Broad_cell_identity: chr [1:8686] "HSC" "MPP" "Ery" "HSC" ...
  .. ..$ Cell_type_identity : chr [1:8686] "HSC" "MPP_GMP" "Ery-3" "HSC" ...
  .. ..$ clusters_res.2     : int [1:8686] 1 16 58 16 16 55 58 1 27 8 ...
  .. ..$ CNV_pos            : chr [

### process data

In [38]:
# add column with the GSM number
# For every metadata table, add two columns taken from its file name
# ("<GSM>_<YYYY-MM-DD>-<label>_metadata.csv"):
#   GSM  - everything in front of the first underscore
#   date - the first YYYY-MM-DD token (NA when the name contains none)
# The previous date pattern "\\-[A-Z].*" only cut at a hyphen followed by an
# upper-case letter, so for names such as "..._2019-07-01-count-1_metadata.csv"
# the whole tail ("2019-07-01-count-1_metadata.csv") ended up in the column.
n <- names(meta$GSE185381_RAW)
meta$GSE185381_RAW <- lapply(n
               ,function(gsm){
                   tbl <- meta$GSE185381_RAW[[gsm]]
                   tbl$GSM <- sub("\\_.*","",gsm)
                   date_hit <- regmatches(gsm
                                          ,regexpr("[0-9]{4}-[0-9]{2}-[0-9]{2}", gsm))
                   tbl$date <- if (length(date_hit) == 1L) date_hit else NA_character_
                   tbl
                            })
names(meta$GSE185381_RAW) <- n
print(str(meta$GSE185381_RAW))

List of 46
 $ GSM5613744_2019-07-01-count-1_metadata.csv           :'data.frame':	8686 obs. of  18 variables:
  ..$ cell               : chr [1:8686] "2019-07-01-count-1:AAACCCAAGACTCTTG" "2019-07-01-count-1:AAACCCACAAATGGCG" "2019-07-01-count-1:AAACCCACACACACGC" "2019-07-01-count-1:AAACCCAGTACTAGCT" ...
  ..$ UMAP_1             : num [1:8686] -8.74 -8.32 10.84 -7.42 -8.35 ...
  ..$ UMAP_2             : num [1:8686] 4.707 4.423 -0.609 4.015 4.323 ...
  ..$ orig.ident         : chr [1:8686] "2019-07-01-count-1" "2019-07-01-count-1" "2019-07-01-count-1" "2019-07-01-count-1" ...
  ..$ samples            : chr [1:8686] "AML0612" "AML3762" "AML3762" "AML3762" ...
  ..$ Broad_cell_identity: chr [1:8686] "HSC" "MPP" "Ery" "HSC" ...
  ..$ Cell_type_identity : chr [1:8686] "HSC" "MPP_GMP" "Ery-3" "HSC" ...
  ..$ clusters_res.2     : int [1:8686] 1 16 58 16 16 55 58 1 27 8 ...
  ..$ CNV_pos            : chr [1:8686] "CNV+" "CNV-" "CNV+" "CNV-" ...
  ..$ malignant          : chr [1:8686] "maligna

In [39]:
# Print the dimensions of every metadata table; the collected dimensions are
# also returned as a 2 x n integer matrix. seq_along() keeps this safe for an
# empty list and vapply() fixes the return type.
vapply(seq_along(meta$GSE185381_RAW)
       ,function(i) print(dim(meta$GSE185381_RAW[[i]]))
       ,integer(2))

[1] 8686   18
[1] 9430   18
[1] 9570   18
[1] 269  18
[1] 5445   18
[1] 4778   18
[1] 4858   18
[1] 8973   18
[1] 8383   18
[1] 180  18
[1] 183  18
[1] 184  18
[1] 5297   18
[1] 4602   18
[1] 3331   18
[1] 3843   18
[1] 7954   18
[1] 8008   18
[1] 9484   18
[1] 8910   18
[1] 7803   18
[1] 8393   18
[1] 2096   18
[1] 2185   18
[1] 6542   18
[1] 6183   18
[1] 5873   18
[1] 659  18
[1] 674  18
[1] 1252   18
[1] 2973   18
[1] 2665   18
[1] 3026   18
[1] 2964   18
[1] 6141   18
[1] 2844   18
[1] 5931   18
[1] 2776   18
[1] 466  18
[1] 3604   18
[1] 8636   18
[1] 5053   18
[1] 5015   18
[1] 1640   18
[1] 1888   18
[1] 1792   18


0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
8686,9430,9570,269,5445,4778,4858,8973,8383,180,⋯,5931,2776,466,3604,8636,5053,5015,1640,1888,1792
18,18,18,18,18,18,18,18,18,18,⋯,18,18,18,18,18,18,18,18,18,18


In [40]:
#check if all metadata's colnames are matching
# Print the name of every table whose columns differ from those of the
# second table; no output means that all tables share the same columns.
for (x in names(meta$GSE185381_RAW)) {
    if (identical(colnames(meta$GSE185381_RAW[[x]])
                  ,colnames(meta$GSE185381_RAW[[2]]))) {
        next
    }
    print(x)
}

In [41]:
# Stack the per-file metadata tables of GSE185381_RAW row-wise into a single
# data frame.
anno <- do.call(rbind, meta[["GSE185381_RAW"]])

# sapply(1:length(meta$GSE185381_RAW),function(i)print(dim(meta$GSE185381_RAW[[i]])))

print(str(anno))

'data.frame':	211442 obs. of  18 variables:
 $ cell               : chr  "2019-07-01-count-1:AAACCCAAGACTCTTG" "2019-07-01-count-1:AAACCCACAAATGGCG" "2019-07-01-count-1:AAACCCACACACACGC" "2019-07-01-count-1:AAACCCAGTACTAGCT" ...
 $ UMAP_1             : num  -8.74 -8.32 10.84 -7.42 -8.35 ...
 $ UMAP_2             : num  4.707 4.423 -0.609 4.015 4.323 ...
 $ orig.ident         : chr  "2019-07-01-count-1" "2019-07-01-count-1" "2019-07-01-count-1" "2019-07-01-count-1" ...
 $ samples            : chr  "AML0612" "AML3762" "AML3762" "AML3762" ...
 $ Broad_cell_identity: chr  "HSC" "MPP" "Ery" "HSC" ...
 $ Cell_type_identity : chr  "HSC" "MPP_GMP" "Ery-3" "HSC" ...
 $ clusters_res.2     : int  1 16 58 16 16 55 58 1 27 8 ...
 $ CNV_pos            : chr  "CNV+" "CNV-" "CNV+" "CNV-" ...
 $ malignant          : chr  "malignant" "malignant" "malignant" "malignant" ...
 $ aml                : chr  "AML" "AML" "AML" "AML" ...
 $ ap_aml_age         : chr  "adult_AML" "adult_AML" "adult_AML" "adult_AML

### unify columns

In [42]:
# make anno with all columns needed
# cell_ID: the cell name with "-" and ":" replaced by "."
anno$cell_ID <- chartr("-:", "..", anno$cell)
# cell_type_original: the broad cell identity from the metadata
anno$cell_type_original <- anno$Broad_cell_identity
# sample_ID: "Control" becomes "healthy-", and "AML" becomes "AML-"
anno$sample_ID <- gsub("AML"
                       ,"AML-"
                       ,gsub("Control", "healthy-", anno$samples))

print(str(anno))

'data.frame':	211442 obs. of  21 variables:
 $ cell               : chr  "2019-07-01-count-1:AAACCCAAGACTCTTG" "2019-07-01-count-1:AAACCCACAAATGGCG" "2019-07-01-count-1:AAACCCACACACACGC" "2019-07-01-count-1:AAACCCAGTACTAGCT" ...
 $ UMAP_1             : num  -8.74 -8.32 10.84 -7.42 -8.35 ...
 $ UMAP_2             : num  4.707 4.423 -0.609 4.015 4.323 ...
 $ orig.ident         : chr  "2019-07-01-count-1" "2019-07-01-count-1" "2019-07-01-count-1" "2019-07-01-count-1" ...
 $ samples            : chr  "AML0612" "AML3762" "AML3762" "AML3762" ...
 $ Broad_cell_identity: chr  "HSC" "MPP" "Ery" "HSC" ...
 $ Cell_type_identity : chr  "HSC" "MPP_GMP" "Ery-3" "HSC" ...
 $ clusters_res.2     : int  1 16 58 16 16 55 58 1 27 8 ...
 $ CNV_pos            : chr  "CNV+" "CNV-" "CNV+" "CNV-" ...
 $ malignant          : chr  "malignant" "malignant" "malignant" "malignant" ...
 $ aml                : chr  "AML" "AML" "AML" "AML" ...
 $ ap_aml_age         : chr  "adult_AML" "adult_AML" "adult_AML" "adult_AML

We will now merge the information from the anno_samples object into the cell annotation object. This will remove cells that belong to the samples that we filtered out in the previous section. It will also filter out cells that belong to samples that are not present in the original sample annotation file.

In [43]:
# Inner join of the cell annotation with the sample annotation on sample_ID:
# only cells whose sample_ID occurs in anno_samples are kept.
anno_cells <- merge(anno, anno_samples, by = "sample_ID")

print(str(anno_cells))

'data.frame':	58354 obs. of  73 variables:
 $ sample_ID          : chr  "AML-0024" "AML-0024" "AML-0024" "AML-0024" ...
 $ cell               : chr  "2020-09-15-AML0024:CATCAAGGTTAGCGGA" "2020-09-15-AML0024:CATCAAGTCCGAGAAG" "2020-09-15-AML0024:CATCCACAGGGACCAT" "2020-09-15-AML0024:CCTCAACAGAGCAAGA" ...
 $ UMAP_1             : num  -0.731 -2.2 -2.867 -1.666 -0.972 ...
 $ UMAP_2             : num  -15.8 -16.7 -16.1 -16.1 -17.5 ...
 $ orig.ident         : chr  "2020-09-15-AML0024" "2020-09-15-AML0024" "2020-09-15-AML0024" "2020-09-15-AML0024" ...
 $ samples            : chr  "AML0024" "AML0024" "AML0024" "AML0024" ...
 $ Broad_cell_identity: chr  "CD14+ monocyte" "CD14+ monocyte" "CD16+ monocyte" "CD14+ monocyte" ...
 $ Cell_type_identity : chr  "CD14+ IFN+" "CD14+" "CD16+" "CD14+ IFN+" ...
 $ clusters_res.2     : int  7 7 7 7 80 7 7 7 7 7 ...
 $ CNV_pos            : chr  "CNV+" "CNV+" "CNV+" "CNV+" ...
 $ malignant          : chr  "malignant" "malignant" "malignant" "malignant" ...
 $ a

These are the remaining samples:

### unify cell annotation
For communication analysis, the ```anno_cells``` file should contain a column ```cell_type```. To make the communication analysis more robust, we merge the very fine-grained original cell type annotations into broader cell type categories ("cell_type" column). We preserve the original cell type annotation in the "cell_type_original" column. For later visualization purposes, we also add cell subtypes ("cell_subtype" column), which have a higher resolution than "cell_type" but are still not as detailed as the original cell type annotations.

In [44]:
# deduplicate cell_relabelling file
# Keep the first row per original label, then index the table by that label.
cell_relabelling <- cell_relabelling[
    !duplicated(cell_relabelling[["cell_type_original"]])
    ,
]
rownames(cell_relabelling) <- cell_relabelling[["cell_type_original"]]

In [45]:
# Map every cell's original label to the unified annotation.
# match() compares labels exactly. Indexing the data frame by row name, as
# done before, partially matches row names, so an unlisted label that is a
# unique prefix of a listed one would silently have been mapped to it.
relabel_idx <- match(anno_cells$cell_type_original
                     ,cell_relabelling$cell_type_original)

# Labels without an entry in cell_relabelling end up as NA; report them.
unmapped_labels <- unique(anno_cells$cell_type_original[is.na(relabel_idx)])
if (length(unmapped_labels) > 0) {
    warning("no entry in cell_relabelling for: "
            ,paste(unmapped_labels, collapse = ", ")
            ,call. = FALSE)
}

anno_cells$cell_type <- cell_relabelling[relabel_idx,"cell_type"]
# add cell_subtype column
anno_cells$cell_subtype <- cell_relabelling[relabel_idx,"cell_subtype"]


### export anno_cells

In [46]:
# save() does not create missing directories, so make sure "outs" exists
# (no-op when it is already there).
dir.create("outs", showWarnings = FALSE, recursive = TRUE)
save(anno_cells, file = "outs/anno_cells.RData")

# preprocessing gene annotation files

- **INPUT:**
    - ```LR_database.rda```  stored in additional_input_files sub-directory
<!--     - ```- counts.RData```  -->
   
    
- **OUTPUT:**

    -  ```anno_genes.RData``` contains information about what genes are ligands and receptors

### load LR_database.rda data base


In [47]:
library(community)

In [48]:
# Load the LR_database dataset into the workspace
# (presumably shipped with the community package attached above - confirm).
data(LR_database)

In [49]:
# file="LR_database.rda"
# load(paste(path_to_additional_files,file,sep=""))
# cat("str(LR_database)\n\n")
# print(str(LR_database))

In [50]:
# load("./AML_healthy/counts.RData")

### create anno_genes

The initial `anno_genes` file will contain the columns "gene_symbol", "is_in_LR_database", "isLigand", and "isReceptor".

In [51]:
# One row per gene of the filtered count table, identified by its row name.
anno_genes <- data.frame(gene_symbol = rownames(counts))

In [52]:
# mark if genes are in LR_database
# TRUE when the gene symbol is listed as a ligand or as a receptor.
anno_genes$is_in_LR_database <-
    anno_genes$gene_symbol %in% LR_database$Ligand |
    anno_genes$gene_symbol %in% LR_database$Receptor


In [53]:
# mark if gene are ligands or receptors
# A gene is flagged when its symbol is found in the respective column of
# LR_database.
anno_genes$isLigand <- !is.na(match(anno_genes$gene_symbol
                                    ,LR_database$Ligand))
anno_genes$isReceptor <- !is.na(match(anno_genes$gene_symbol
                                      ,LR_database$Receptor))

cat("str(anno_genes)\n\n")
print(str(anno_genes))

str(anno_genes)

'data.frame':	31843 obs. of  4 variables:
 $ gene_symbol      : chr  "MIR1302-2HG" "AL627309.1" "AL627309.3" "AL627309.5" ...
 $ is_in_LR_database: logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
 $ isLigand         : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
 $ isReceptor       : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
NULL


In [54]:
# Summary: number of genes in anno_genes, number of distinct ligands and
# receptors in LR_database, and how many genes of our data are flagged as
# ligand or receptor.
cat("total nr genes are", nrow(anno_genes),"\n")
cat("nr ligands in LR_database are", length(unique(LR_database$Ligand)),"\n")
cat("nr ligands in our data are",sum(anno_genes$isLigand),"\n")
cat("nr receptors in LR_database are",length(unique(LR_database$Receptor)),"\n")
cat("nr receptors in our data are",sum(anno_genes$isReceptor),'\n')

total nr genes are 31843 
nr ligands in LR_database are 1521 
nr ligands in our data are 1443 
nr receptors in LR_database are 1306 
nr receptors in our data are 1248 


### export


In [55]:
# save() does not create missing directories, so make sure "outs" exists
# (no-op when it is already there).
dir.create("outs", showWarnings = FALSE, recursive = TRUE)
save(anno_genes, file = "outs/anno_genes.RData")