# PART 1
## READ IN AND CLEAN UP DATA

- **INPUT:**
    - ```$path_in``` argument: raw_data directory containing two sub-directories: GSE116256_RAW and GSE120221_RAW
    - raw count matrices from the GSE116256_RAW dataset
	- raw count matrices from the GSE120221_RAW dataset
    
- **NOTES:**

    -  samples BM5, MUTZ3, and OCI-AML3 are excluded from the GSE116256_RAW dataset
    -  all non-D0 samples are excluded from the GSE116256_RAW dataset
    -  samples Ck, C2, S1, Sk1, and S2 are excluded from the GSE120221_RAW dataset, as those are replicating samples C1 and Sk2.
    -  all-zero genes are excluded
    
- **OUTPUT:**

    -  ```counts.RData``` raw merged counts
    -  ```samples.RData``` variable containing IDs of selected samples

### prepare list of samples

In [1]:
library(Matrix)

In [2]:
# raw_data directory contains two sub-directories: GSE116256_RAW and GSE120221_RAW

path_in <- "../../../data/data_preprocessing/vanGalen_Hourigan/raw_data"
print("input path is:")
print(path_in)

path_out <- "../../../results/data_preprocessing/vanGalen_Hourigan/preprocessed/"
print("output path is:")
print(path_out)

[1] "input path is:"
[1] "../../../data/data_preprocessing/vanGalen_Hourigan/raw_data"
[1] "output path is:"
[1] "../../../results/data_preprocessing/vanGalen_Hourigan/preprocessed/"


In [3]:
# variables####
print("variables")
datasets <- list.files(path_in)
print("datasets:")
print(datasets)

[1] "variables"
[1] "datasets:"
[1] "GSE116256_RAW" "GSE120221_RAW"


In [4]:
files <- lapply(datasets
               ,function(dataset){
                   list.files(paste0(path_in
                                   ,"/"
                                   ,dataset))
               })
names(files) <- datasets
print("files:")
print(files)

[1] "files:"
$GSE116256_RAW
 [1] "GSM3587923_AML1012-D0.dem.txt"    "GSM3587924_AML1012-D0.anno.txt"  
 [3] "GSM3587925_AML210A-D0.dem.txt"    "GSM3587926_AML210A-D0.anno.txt"  
 [5] "GSM3587927_AML314-D0.dem.txt"     "GSM3587928_AML314-D0.anno.txt"   
 [7] "GSM3587929_AML314-D31.dem.txt"    "GSM3587930_AML314-D31.anno.txt"  
 [9] "GSM3587931_AML328-D0.dem.txt"     "GSM3587932_AML328-D0.anno.txt"   
[11] "GSM3587933_AML328-D113.dem.txt"   "GSM3587934_AML328-D113.anno.txt" 
[13] "GSM3587935_AML328-D171.dem.txt"   "GSM3587936_AML328-D171.anno.txt" 
[15] "GSM3587937_AML328-D29.dem.txt"    "GSM3587938_AML328-D29.anno.txt"  
[17] "GSM3587939_AML328.nanopore.txt"   "GSM3587940_AML329-D0.dem.txt"    
[19] "GSM3587941_AML329-D0.anno.txt"    "GSM3587942_AML329-D20.dem.txt"   
[21] "GSM3587943_AML329-D20.anno.txt"   "GSM3587944_AML329-D37.dem.txt"   
[23] "GSM3587945_AML329-D37.anno.txt"   "GSM3587946_AML371-D0.dem.txt"    
[25] "GSM3587947_AML371-D0.anno.txt"    "GSM3587948_AML371-D34.dem.txt" 

The function below takes datasets which are the files in the path and excludes BM5-34p, MUTZ3, OCI-AML3 from the GSE116256_RAW dataset, as well as excludes samples S1, Sk1, S2, Ck, C2 from the GSE120221_RAW dataset.

In [5]:
# Define patient IDs
samples <- lapply(datasets
                        ,function(dataset){
                            my_files <- files[[dataset]]
                            if(any(grepl(".txt"
                                        ,my_files))){
                                samples <- sub(".*_", "", my_files)
                                samples <- sub("[.]dem.*","", samples)
                                samples <- samples[!grepl("anno",samples) & !grepl("nano", samples)]
                                # exclude BM5-34p, MUTZ3, OCI-AML3
                                samples <- samples[!grepl("BM5",samples) & !grepl("MUTZ3", samples) 
                                                   & 
                                                   !grepl("OCI-AML3", samples)]
                                samples
                            } else {
                                samples <- sub(".*barcodes_", "", my_files)
                                samples <- sub("[.]tsv.*","", samples)
                                samples <- samples[!grepl("matrix",samples) & !grepl("genes", samples)]
                                # exclude duplicating samples
                                samples <- samples[!grepl("S1",samples) & !grepl("Sk1", samples) 
                                                   & 
                                                   !grepl("S2", samples) ]
                                samples <- samples[!grepl("Ck",samples) & !grepl("C2", samples) ]
                                samples
                            }
                        }
                        )
names(samples) <- datasets

In the GSE116256_RAW dataset, only healthy bone marrow samples, as well as bone marrow samples at day 0 of diagnosis are kept.

In [6]:
print("remove non-diagnosis AML samlpes")
idx_D0_or_healthy <- grepl("-D0", samples$GSE116256_RAW ) | grepl("BM", samples$GSE116256_RAW )
samples$GSE116256_RAW <- samples$GSE116256_RAW[idx_D0_or_healthy]

[1] "remove non-diagnosis AML samlpes"


In the GSE116256_RAW dataset, the AML samples with less than 50% blasts are removed.

In [7]:
cat("remove samples with less than 50% blasts at diagnosis:\n\n")
cat("AML314-D0, AML371-D0, AML475-D0, AML997-D0, AML329-D0, AML420-D0\n\n")
idx_less_50percent_blasts <- samples$GSE116256_RAW %in% c("AML314-D0"
                                                          , "AML371-D0"
                                                          , "AML475-D0"
                                                          , "AML997-D0"
                                                          , "AML329-D0"
                                                          , "AML420-D0"
                                                          )
samples$GSE116256_RAW <- samples$GSE116256_RAW[!idx_less_50percent_blasts]

cat("all samples:\n\n")
print(samples)

remove samples with less than 50% blasts at diagnosis:

AML314-D0, AML371-D0, AML475-D0, AML997-D0, AML329-D0, AML420-D0

all samples:

$GSE116256_RAW
 [1] "AML1012-D0" "AML210A-D0" "AML328-D0"  "AML419A-D0" "AML420B-D0"
 [6] "AML556-D0"  "AML707B-D0" "AML722B-D0" "AML870-D0"  "AML916-D0" 
[11] "AML921A-D0" "BM1"        "BM2"        "BM3"        "BM4"       

$GSE120221_RAW
 [1] "A"   "B"   "C1"  "E"   "F"   "G"   "H"   "J"   "K"   "L"   "M"   "N"  
[13] "O"   "P"   "Q"   "R"   "Sk2" "T"   "U"   "W"  



### load data

In [8]:
# read in raw counts
print("read in raw counts")
counts_raw <- unlist(lapply(datasets
                ,function(dataset){
                    my_path_in <- paste0(path_in
                                           ,"/"
                                           ,dataset)
                    
                    my_samples <- samples[[dataset]]
                    
                    my_files <- files[[dataset]]
                    my_files <- unlist(sapply(my_samples
                                      ,function(sample){
                                          my_files[grepl(paste0("_",sample)
                                                        ,my_files)]
                                      }))
                    
                    idx_dem <- grepl("dem", my_files)
                    
                    idx_mtx <- grepl("mtx", my_files)
                    names(idx_mtx) <- my_samples
                    
                    if(sum(idx_dem) != 0){
                        lapply(my_files[idx_dem]
                              ,function(file){
                                  print(paste("loading file"
                                             ,file))
                                  counts <- read.csv(paste0(my_path_in
                                                            ,"/"
                                                            ,file
                                                            )
                                                     ,sep = "\t"
                                                     ,check.names = F
                                                     ,row.names = 1
                                                    )
                                  counts
                              })
                    } else if(sum(idx_mtx) != 0) {
                        lapply(my_samples
                              ,function(my_sample){
                                  
                                  print(paste("loading file"
                                             ,my_files[3,my_sample]))
                                  
                                  colnames <- read.csv(paste0(my_path_in
                                                            ,"/"
                                                            ,my_files[1,my_sample]
                                                            )
                                                     ,sep = "\t"
                                                     ,check.names = F
                                                     ,header=FALSE
                                                    )[,1]
                                  # remove "-1" at the end of each cell_ID
                                  colnames <- gsub("-1.*", "", colnames)
                                  
                                  # add sample name 
                                  colnames <- paste0("BM-"
                                                    ,my_sample
                                                    ,"_"
                                                    ,colnames)
                                  
                                  genes <- read.csv(paste0(my_path_in
                                                            ,"/"
                                                            ,my_files[2,my_sample]
                                                            )
                                                     ,sep = "\t"
                                                     ,check.names = F
                                                     ,header=FALSE
                                                    )
                                  colnames(genes) <- c("EnsID", "Symbol")
                                  
                                  counts <- as.data.frame(as.matrix(Matrix::readMM(paste0(my_path_in
                                                            ,"/"
                                                            ,my_files[3,my_sample]
                                                            )
                                                                         )
                                                                   )
                                                         )
                                  
                                  rownames(counts) <- genes$EnsID
                                  colnames(counts) <- colnames
                                  
                                  # remove gene duplicates
                                  counts <- counts[!duplicated(genes$Symbol),]
                                  genes <- genes[!duplicated(genes$Symbol),]
                                  rownames(counts) <- genes$Symbol
                                  
                                  counts
                              })
                    }
                }
                    )
                     ,recursive=FALSE
                     )

[1] "read in raw counts"
[1] "loading file GSM3587923_AML1012-D0.dem.txt"
[1] "loading file GSM3587925_AML210A-D0.dem.txt"
[1] "loading file GSM3587931_AML328-D0.dem.txt"
[1] "loading file GSM3587950_AML419A-D0.dem.txt"
[1] "loading file GSM3587953_AML420B-D0.dem.txt"
[1] "loading file GSM3587963_AML556-D0.dem.txt"
[1] "loading file GSM3587969_AML707B-D0.dem.txt"
[1] "loading file GSM3587980_AML722B-D0.dem.txt"
[1] "loading file GSM3587984_AML870-D0.dem.txt"
[1] "loading file GSM3587988_AML916-D0.dem.txt"
[1] "loading file GSM3587990_AML921A-D0.dem.txt"
[1] "loading file GSM3587996_BM1.dem.txt"
[1] "loading file GSM3587997_BM2.dem.txt"
[1] "loading file GSM3587998_BM3.dem.txt"
[1] "loading file GSM3588000_BM4.dem.txt"
[1] "loading file GSM3396161_matrix_A.mtx"
[1] "loading file GSM3396162_matrix_B.mtx"
[1] "loading file GSM3396163_matrix_C1.mtx"
[1] "loading file GSM3396166_matrix_E.mtx"
[1] "loading file GSM3396167_matrix_F.mtx"
[1] "loading file GSM3396168_matrix_G.mtx"


“sparse->dense coercion: allocating vector of size 1.1 GiB”


[1] "loading file GSM3396169_matrix_H.mtx"


“sparse->dense coercion: allocating vector of size 1.1 GiB”


[1] "loading file GSM3396170_matrix_J.mtx"
[1] "loading file GSM3396171_matrix_K.mtx"


“sparse->dense coercion: allocating vector of size 1.8 GiB”


[1] "loading file GSM3396172_matrix_L.mtx"


“sparse->dense coercion: allocating vector of size 1.1 GiB”


[1] "loading file GSM3396173_matrix_M.mtx"
[1] "loading file GSM3396174_matrix_N.mtx"


“sparse->dense coercion: allocating vector of size 1.1 GiB”


[1] "loading file GSM3396175_matrix_O.mtx"


“sparse->dense coercion: allocating vector of size 1.3 GiB”


[1] "loading file GSM3396176_matrix_P.mtx"
[1] "loading file GSM3396177_matrix_Q.mtx"
[1] "loading file GSM3396178_matrix_R.mtx"
[1] "loading file GSM3396182_matrix_Sk2.mtx"


“sparse->dense coercion: allocating vector of size 1.2 GiB”


[1] "loading file GSM3396183_matrix_T.mtx"


“sparse->dense coercion: allocating vector of size 1.1 GiB”


[1] "loading file GSM3396184_matrix_U.mtx"


“sparse->dense coercion: allocating vector of size 1.0 GiB”


[1] "loading file GSM3396185_matrix_W.mtx"


In [9]:
names(counts_raw) <- unlist(samples)

### process data

We will keep only the genes that are present in both datasets.

In [10]:
# check genes in overlap
print("check genes in overlap")
overlap_genes <- Reduce(intersect
                        ,lapply(counts_raw
                                , function(set){
                                    rownames(set)
                                }
                               )
                       )

print(paste("we have"
           ,length(overlap_genes
                  )
            , "overlapping genes"
           )
     )

print(str(overlap_genes))


[1] "check genes in overlap"
[1] "we have 21112 overlapping genes"
 chr [1:21112] "A1BG" "A1BG-AS1" "A1CF" "A2M" "A2M-AS1" "A2ML1" "A3GALT2" ...
NULL


In [11]:
# subset all counts for overlapping genes
print("subset all counts for overlapping genes")

counts_raw_geneOverlap <- lapply(unlist(samples)
                             ,function(sample){
                                 my_counts <- counts_raw[[sample]]
                                 
                                 my_counts <- my_counts[overlap_genes,]
                                 my_counts
                             })

names(counts_raw_geneOverlap) <- unlist(samples)

rm(counts_raw)

[1] "subset all counts for overlapping genes"


In [12]:
# concatenate: counts
counts_raw_merged <- do.call(cbind.data.frame, counts_raw_geneOverlap)

In [13]:
# remove additional samples name in front of the cell_ID
colnames(counts_raw_merged) <- gsub(".*[.]", "", colnames(counts_raw_merged))
for(i in names(counts_raw_geneOverlap)){
    print(i)
    print(dim(counts_raw_geneOverlap[[i]]))
}

print("dimensions of the merged dataset (genes x cells):")
print(dim(counts_raw_merged))


[1] "AML1012-D0"
[1] 21112  1136
[1] "AML210A-D0"
[1] 21112   748
[1] "AML328-D0"
[1] 21112  1094
[1] "AML419A-D0"
[1] 21112  1189
[1] "AML420B-D0"
[1] 21112   485
[1] "AML556-D0"
[1] 21112  2328
[1] "AML707B-D0"
[1] 21112  1586
[1] "AML722B-D0"
[1] 21112    79
[1] "AML870-D0"
[1] 21112   345
[1] "AML916-D0"
[1] 21112   933
[1] "AML921A-D0"
[1] 21112  3813
[1] "BM1"
[1] 21112   108
[1] "BM2"
[1] 21112   188
[1] "BM3"
[1] 21112   643
[1] "BM4"
[1] 21112  3738
[1] "A"
[1] 21112  2994
[1] "B"
[1] 21112  3293
[1] "C1"
[1] 21112  3556
[1] "E"
[1] 21112  3939
[1] "F"
[1] 21112  3746
[1] "G"
[1] 21112  4283
[1] "H"
[1] 21112  4516
[1] "J"
[1] 21112  3446
[1] "K"
[1] 21112  7247
[1] "L"
[1] 21112  4548
[1] "M"
[1] 21112  3964
[1] "N"
[1] 21112  4522
[1] "O"
[1] 21112  5013
[1] "P"
[1] 21112  3383
[1] "Q"
[1] 21112  1700
[1] "R"
[1] 21112  3593
[1] "Sk2"
[1] 21112  4726
[1] "T"
[1] 21112  4293
[1] "U"
[1] 21112  4118
[1] "W"
[1] 21112  3643
[1] "dimensions of the merged dataset (genes x cells):

In [14]:
rm(counts_raw_geneOverlap)

We will filter out the genes that have zero expression in all cells.

In [15]:
#filter zero genes
idx_nonZero <- counts_raw_merged != 0

idx_nonZero_sum <- rowSums(idx_nonZero)
print(paste("we have"
            ,sum(idx_nonZero_sum == 0)
            ,"all-zero genes. These genes will be removed."))

[1] "we have 1809 all-zero genes. These genes will be removed."


In [16]:
counts <- counts_raw_merged[idx_nonZero_sum != 0,]
print("dim(counts_raw_merged)")
print(dim(counts_raw_merged))
print("dim(counts)")
print(dim(counts))

[1] "dim(counts_raw_merged)"
[1] 21112 98936
[1] "dim(counts)"
[1] 19303 98936


### export

Save the intermediate results

In [17]:
# save intermediate results: counts
save(counts, file = paste0(path_out,"counts.RData"))

In [18]:
# save samples variable
save(samples, file = paste0(path_out,"samples.RData"))

# Preprocessing sample annotation files


- **INPUT:**
    - sample annotation file vanGalen_TableS1_Patient_Info.tsv stored in additional_input_files sub-directory
    - sample annotation file Oetjen_Table1_patientInfo.tsv stored in additional_input_files sub-directory
<!-- 	- ```samples.RData``` variable containing IDs of selected samples -->

    
- **OUTPUT:**

    -  ```anno_samples.RData``` cleaned and merged sample annotation file

In [19]:
path_to_additional_files <- sub("raw_data", "additional_input_files/", path_in)

In [20]:
# read in sample annotation for GSE116256_RAW
tsv_file="vanGalen_TableS1_Patient_Info.tsv"
anno_GSE116256_samples <- read.csv(paste(path_to_additional_files,tsv_file,sep="")
                                  ,sep = "\t"
                                  ,header = TRUE)
print(str(anno_GSE116256_samples))

'data.frame':	43 obs. of  17 variables:
 $ Sample              : chr  "BM1" "BM2" "BM3" "BM4" ...
 $ Days.from.diagnosis : chr  NA NA NA NA ...
 $ Tissue              : chr  "Bone marrow" "Bone marrow" "Bone marrow" "Bone marrow" ...
 $ Gender              : chr  "M" "M" "M" "M" ...
 $ Age                 : int  52 21 56 23 45 45 32 67 54 57 ...
 $ Blast.count         : chr  NA NA NA NA ...
 $ RHP.Mutations       : chr  NA NA NA NA ...
 $ Cytogenetics        : chr  "Unknown" "Unknown" "Unknown" "Unknown" ...
 $ Common.translocation: chr  NA NA NA NA ...
 $ Remarks             : chr  "" "" "" "" ...
 $ Cell.number         : int  108 188 643 3738 1431 1590 1136 748 1189 933 ...
 $ UMIs.min            : int  1069 1180 1007 1000 1000 1005 1002 1001 1001 1000 ...
 $ UMIs.mean           : int  3018 3908 3093 4059 2822 2558 2308 2684 2965 2179 ...
 $ UMIs.max            : int  13186 19653 26834 39196 16225 11932 14065 20440 28543 9348 ...
 $ Genes.min           : int  503 503 500 500 500 501 

In [21]:
# read in sanple annotation for GSE120221_RAW
tsv_file="Oetjen_Table1_patientInfo.tsv"
anno_GSE120221_samples <- read.csv(paste(path_to_additional_files,tsv_file,sep="")
                                   ,sep = "\t"
                                  ,header = TRUE)

print(str(anno_GSE120221_samples))

'data.frame':	25 obs. of  8 variables:
 $ Sample              : chr  "T" "W" "E" "R" ...
 $ Patient_ID          : chr  "T" "W" "E" "R" ...
 $ Sex                 : chr  "F" "F" "M" "M" ...
 $ Age                 : int  24 28 30 31 41 43 46 47 50 50 ...
 $ Technical_replicate : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
 $ TecReplicate_sample : chr  NA NA NA NA ...
 $ Biological_replicate: logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
 $ BioReplicate_sample : chr  NA NA NA NA ...
NULL


### process data

Clean and process sample annotation for the GSE116256_RAW dataset.

In [22]:
# add day to anno_GSE116256_samples
anno_GSE116256_samples$day <- as.integer(gsub("D","",anno_GSE116256_samples$Days.from.diagnosis))

In [23]:
# add sample_ID to anno_GSE116256_samples
anno_GSE116256_samples$sample_ID <- sapply(1:nrow(anno_GSE116256_samples)
                                          ,function(i){
                                              ifelse(is.na(anno_GSE116256_samples$Days.from.diagnosis[i]
                                                          )
                                                    ,anno_GSE116256_samples$Sample[i]
                                                    ,paste0(anno_GSE116256_samples$Sample[i]
                                                            ,"-"
                                                            ,anno_GSE116256_samples$Days.from.diagnosis[i]))
                                          })

In [24]:
# order anno_GSE116256_samples
rownames(anno_GSE116256_samples) <- anno_GSE116256_samples$sample_ID
anno_GSE116256_samples <- anno_GSE116256_samples[samples[["GSE116256_RAW"]],]
print("str(anno_GSE116256_samples)")
print(str(anno_GSE116256_samples))

[1] "str(anno_GSE116256_samples)"
'data.frame':	15 obs. of  19 variables:
 $ Sample              : chr  "AML1012" "AML210A" "AML328" "AML419A" ...
 $ Days.from.diagnosis : chr  "D0" "D0" "D0" "D0" ...
 $ Tissue              : chr  "Bone marrow" "Bone marrow" "Bone marrow" "Bone marrow" ...
 $ Gender              : chr  "F" "M" "F" "F" ...
 $ Age                 : int  32 67 74 54 58 70 26 52 32 57 ...
 $ Blast.count         : chr  "65 %" "83 %" "55 %" "60 %" ...
 $ RHP.Mutations       : chr  "KRAS NM_004985 c.38G>A p.G13D (4.6%) /// NRAS NM_002524 c.38G>A p.G13D (39.0%) /// NOTCH2 NM_024408 c.4238T>A p"| __truncated__ "DNMT3A NM_175629 c.2644C>T p.R882C (43.4%) /// NPM1 NM_002520 c.859_860insTCTG p.W288fs*>9 (42.7%) /// TET2 NM_"| __truncated__ "DNMT3A NM_175629 c.1910T>A p.L637Q (43.9%) /// TP53 NM_000546 c.431A>C p.Q144P (38.7%, VUS) /// TP53 NM_000546 "| __truncated__ "CEBPA NM_004364 c.118_118insC p.P39fs* (42.9%) /// DNMT3A NM_175629 c.2644C>T p.R882C (41.9%) /// NPM1 NM_00252"| _

Clean and process sample annotation for the GSE120221_RAW dataset.

In [25]:
# order anno_GSE120221_samples
rownames(anno_GSE120221_samples) <- anno_GSE120221_samples$Sample
anno_GSE120221_samples <- anno_GSE120221_samples[samples[["GSE120221_RAW"]],]
print("str(anno_GSE120221_samples)")
print(str(anno_GSE120221_samples))

[1] "str(anno_GSE120221_samples)"
'data.frame':	20 obs. of  8 variables:
 $ Sample              : chr  "A" "B" "C1" "E" ...
 $ Patient_ID          : chr  "A" "B" "C" "E" ...
 $ Sex                 : chr  "F" "M" "F" "M" ...
 $ Age                 : int  59 47 60 30 41 58 50 43 84 57 ...
 $ Technical_replicate : logi  FALSE FALSE TRUE FALSE FALSE FALSE ...
 $ TecReplicate_sample : chr  NA NA "C2" NA ...
 $ Biological_replicate: logi  FALSE FALSE TRUE FALSE FALSE FALSE ...
 $ BioReplicate_sample : chr  NA NA "Ck" NA ...
NULL


In [26]:
# add sample_ID to anno_GSE120221_samples
anno_GSE120221_samples$sample_ID <- paste0("BM-"
                                           ,anno_GSE120221_samples$Sample)

Merge the two annotation files.

In [27]:
# unify columns
anno_samples <- data.frame(sample_ID = c(anno_GSE116256_samples$sample_ID
                                       ,anno_GSE120221_samples$sample_ID
                                       )
                          ,patient_ID = c(anno_GSE116256_samples$Sample
                                         ,anno_GSE120221_samples$Patient_ID)
                          ,dataset = c(rep(gsub("\\_.*","",names(samples)[1])
                                          ,nrow(anno_GSE116256_samples))
                                      ,rep(gsub("\\_.*","",names(samples)[2])
                                          ,nrow(anno_GSE120221_samples)))
                          ,gender = c(anno_GSE116256_samples$Gender
                                      ,anno_GSE120221_samples$Sex
                                     )
                          ,age = c(anno_GSE116256_samples$Age
                                       ,anno_GSE120221_samples$Age
                                       )
                         )

In [28]:
names(samples)

In [29]:
# gsub("\\_.*","",names(samples))[1]

In [30]:
table(anno_samples$dataset)


GSE116256 GSE120221 
       15        20 

In [31]:
anno_samples$health_status <- sapply(anno_samples$sample_ID
                                    ,function(sample_ID){
                                        ifelse(grepl("BM", sample_ID)
                                              ,"healthy"
                                              ,"AML")
                                    }
                                   )

In [32]:
anno_samples$case_or_control <- sapply(anno_samples$health_status
                                      ,function(health_status){
                                              ifelse(grepl("healthy", health_status)
                                                     ,"control"
                                                     ,"case")
                                      }
                                      )

In [33]:
anno_samples$day <- c(anno_GSE116256_samples$day
                    ,rep(NA, nrow(anno_GSE120221_samples)))

In [34]:
cat("anno_samples:\n\n**************\n\n")
print(anno_samples)

anno_samples:

**************

    sample_ID patient_ID   dataset gender age health_status case_or_control day
1  AML1012-D0    AML1012 GSE116256      F  32           AML            case   0
2  AML210A-D0    AML210A GSE116256      M  67           AML            case   0
3   AML328-D0     AML328 GSE116256      F  74           AML            case   0
4  AML419A-D0    AML419A GSE116256      F  54           AML            case   0
5  AML420B-D0    AML420B GSE116256      M  58           AML            case   0
6   AML556-D0     AML556 GSE116256      M  70           AML            case   0
7  AML707B-D0    AML707B GSE116256      M  26           AML            case   0
8  AML722B-D0    AML722B GSE116256      F  52           AML            case   0
9   AML870-D0     AML870 GSE116256      M  32           AML            case   0
10  AML916-D0     AML916 GSE116256      F  57           AML            case   0
11 AML921A-D0    AML921A GSE116256      M  42           AML            case   0
12       

### export anno_samples

In [35]:
save(anno_samples, file = paste0(path_out,"anno_samples.RData"))

# Preprocessing cell annotation files

- **INPUT:**
    - ```cell_relabelling.csv``` file containing unified cell type annotations. Stored in additional_input_files sub-directory
	- cell annotation matrices from the GSE116256_RAW dataset
    - ```Oetjen_celltype.csv``` cell annotation file stored in additional_input_files sub-directory
    
- **NOTES:**

    -  ```anno_cells``` file should contain columns "cell_ID", "cell_type" and "sample_ID" for communication analysis
    
- **OUTPUT:**

    -  ```anno_cells.RData``` cleaned and merged cell annotation file

In [36]:
cat("load cell_relabelling\n\n**************\n\n")
tsv_file="cell_relabelling.csv"
cell_relabelling <- read.csv(file = paste(path_to_additional_files,tsv_file,sep="")
                             ,sep = ";"
                             )
rownames(cell_relabelling) <- cell_relabelling$cell_type_original
print(cell_relabelling)

load cell_relabelling

**************

                                       cell_type_original      cell_type
B                                                       B              B
CD10+ B cells                               CD10+ B cells              B
CD20+ B cells                               CD20+ B cells              B
ProB                                                 ProB              B
Mono                                                 Mono           Mono
CD14+ monocytes                           CD14+ monocytes           Mono
CD16+ monocytes                           CD16+ monocytes           Mono
Mono-like                                       Mono-like           Mono
ProMono                                           ProMono           Mono
Monocyte progenitors                 Monocyte progenitors           Mono
ProMono-like                                 ProMono-like           Mono
T                                                       T              T
CD4+ memory 

In [37]:
# read in raw annotation for GSE116256_RAW
anno_GSE116256_RAW <- lapply(samples[["GSE116256_RAW"]]
                            ,function(sample){
                                my_input_path <- paste0(path_in
                                                               ,"/GSE116256_RAW"
                                                              )
                                file <- list.files(path=my_input_path
                                                   ,pattern=paste0("*"
                                                                  ,sample 
                                                                  ,".anno.txt"
                                                                  )
                                                   )
                                print(file) 
                                
                                read.csv(paste0(my_input_path
                                                ,"/"
                                                ,file )
                                         ,sep = "\t"
                                         ,check.names = FALSE 
                                        )
                            })

[1] "GSM3587924_AML1012-D0.anno.txt"
[1] "GSM3587926_AML210A-D0.anno.txt"
[1] "GSM3587932_AML328-D0.anno.txt"
[1] "GSM3587951_AML419A-D0.anno.txt"
[1] "GSM3587954_AML420B-D0.anno.txt"
[1] "GSM3587964_AML556-D0.anno.txt"
[1] "GSM3587970_AML707B-D0.anno.txt"
[1] "GSM3587981_AML722B-D0.anno.txt"
[1] "GSM3587985_AML870-D0.anno.txt"
[1] "GSM3587989_AML916-D0.anno.txt"
[1] "GSM3587991_AML921A-D0.anno.txt"
[1] "GSM3587996_BM1.anno.txt"
[1] "GSM3587997_BM2.anno.txt"
[1] "GSM3587999_BM3.anno.txt"
[1] "GSM3588001_BM4.anno.txt"


In [38]:
names(anno_GSE116256_RAW) <- samples[["GSE116256_RAW"]]
cat("str(anno_GSE116256_RAW[[1]])\n\n")
print(str(anno_GSE116256_RAW[[1]]))

str(anno_GSE116256_RAW[[1]])

'data.frame':	1136 obs. of  28 variables:
 $ Cell                  : chr  "AML1012-D0_AAAAAGTTACGT" "AML1012-D0_AAAACACCAATC" "AML1012-D0_AAAATAGCCTTT" "AML1012-D0_AAACATTAAACG" ...
 $ NumberOfReads         : int  25292 85496 35481 39408 22632 67767 19245 59107 64919 19221 ...
 $ AlignedToGenome       : int  18329 63659 28600 29874 17136 51466 14938 45082 51291 14587 ...
 $ AlignedToTranscriptome: int  14531 43073 17200 23273 12459 37700 10455 36843 39665 11502 ...
 $ TranscriptomeUMIs     : int  1464 4810 2305 2266 1561 3572 1297 3886 2719 1572 ...
 $ NumberOfGenes         : int  670 1713 853 976 744 1416 638 1433 1203 667 ...
 $ CyclingScore          : num  -0.49 -0.484 -0.557 -0.581 -0.222 0.527 -0.386 -0.722 0.204 -0.233 ...
 $ CyclingBinary         : chr  "no" "no" "no" "no" ...
 $ MutTranscripts        : chr  "" "" "" "" ...
 $ WtTranscripts         : chr  "" "" "" "" ...
 $ PredictionRF2         : chr  "normal" "malignant" "malignant" "normal" ...
 

In [39]:
# read in raw annotation for GSE120221_RAW
tsv_file="Oetjen_celltype.csv"

anno_GSE120221_RAW_unsplit <- read.csv(paste(path_to_additional_files,tsv_file,sep="")
                                       ,sep = ",")
colnames(anno_GSE120221_RAW_unsplit) <- c("cell_ID"
                                          ,"cell_type")
cat("str(anno_GSE120221_RAW_unsplit)\n\n")
print(str(anno_GSE120221_RAW_unsplit))

str(anno_GSE120221_RAW_unsplit)

'data.frame':	76645 obs. of  2 variables:
 $ cell_ID  : chr  "S1_AAACCTGAGTTCCACA" "S1_AAACGGGGTCTAGTCA" "S1_AAAGATGCAATCGAAA" "S1_AAAGATGGTGTGCCTG" ...
 $ cell_type: chr  "Plasmacytoid dendritic cells" "CD20+ B cells" "CD4+ memory T cells" "CD14+ monocytes" ...
NULL


### process data

In [40]:
# add "BM-" to sample names in GSE120221_RAW
samples[["GSE120221_RAW"]] <- paste0("BM-"
                                     ,samples[["GSE120221_RAW"]])

# add columns cell_ID and cell_type
anno_GSE116256_RAW <- lapply(samples[["GSE116256_RAW"]]
                             ,function(sample){
                                 my_anno <- anno_GSE116256_RAW[[sample]]
                                 my_anno$cell_ID <- my_anno$Cell
                                 my_anno$cell_type <- my_anno$CellType
                                 anno_GSE116256_RAW[[sample]] <- my_anno
                             }
                            )
names(anno_GSE116256_RAW) <- samples[["GSE116256_RAW"]]

In [41]:
# add "BM-" to cell IDs
anno_GSE120221_RAW_unsplit$cell_ID <- paste0("BM-"
                                            ,anno_GSE120221_RAW_unsplit$cell_ID)

# add column sample_ID
anno_GSE120221_RAW_unsplit$sample_ID <- sapply(anno_GSE120221_RAW_unsplit$cell_ID
                                              ,function(cell_ID){
                                                  sub("\\_.*","",cell_ID)
                                              })

In [42]:
# split anno_GSE120221_RAW_unsplit
anno_GSE120221_RAW <- lapply(samples[["GSE120221_RAW"]]
                            ,function(sample){
                                idx_sample <- anno_GSE120221_RAW_unsplit$sample_ID == sample
                                anno_GSE120221_RAW_unsplit[idx_sample,]
                            })
names(anno_GSE120221_RAW) <- samples[["GSE120221_RAW"]]

In [43]:
# merge anno_GSE116256_RAW and anno_GSE120221_RAW into one list
anno_GSE116256_RAW_GSE120221_RAW <- c(anno_GSE116256_RAW
                                     ,anno_GSE120221_RAW)

### unify columns

In [44]:
# make anno with all columns needed
anno <- lapply(unlist(samples)
              ,function(sample){
                  my_anno <- anno_GSE116256_RAW_GSE120221_RAW[[sample]]
                  my_df <- data.frame(cell_ID = my_anno$cell_ID
                                      ,cell_type_original = my_anno$cell_type
                                      ,sample_ID = rep(sample, nrow(my_anno))
                                      ,malignant = grepl("-like", my_anno$cell_type) # in the GSE116256_RAW dataset, the cells marked as "-like" bare a mutation, we thus mark them as malignant
                                     )
                  my_df
              })
names(anno) <- unlist(samples)

In [45]:
# add anno_samples info
anno <- lapply(unlist(samples)
              ,function(sample){
                  my_anno <- anno[[sample]]
                  my_anno_sample <- anno_samples[anno_samples$sample_ID == sample,]
                  my_anno <- cbind(my_anno
                                  ,my_anno_sample[-1])
              })
names(anno) <- unlist(samples)

“row names were found from a short variable and have been discarded”
“row names were found from a short variable and have been discarded”
“row names were found from a short variable and have been discarded”
“row names were found from a short variable and have been discarded”
“row names were found from a short variable and have been discarded”
“row names were found from a short variable and have been discarded”
“row names were found from a short variable and have been discarded”
“row names were found from a short variable and have been discarded”
“row names were found from a short variable and have been discarded”
“row names were found from a short variable and have been discarded”
“row names were found from a short variable and have been discarded”
“row names were found from a short variable and have been discarded”
“row names were found from a short variable and have been discarded”
“row names were found from a short variable and have been discarded”
“row names were found from a short

In [46]:
# concatenate
anno_cells <- do.call(rbind.data.frame, anno)

### unify cell annotation
For communication analysis, the `anno_cells` file should contain a column `cell_type`. To make the communication analysis more robust, we merge the very fine-grained original cell type annotations into bigger cell type categories ("cell_type" column). We preserve the original cell type annotation information in the "original_cell_type" column. For later visualization purposes, we also add cell subtypes ("cell_subtype" column), which has a higher cell type resolution than the "cell_type", but is still not as detailed as the original cell type annotations.

In [47]:
anno_cells$cell_type <- cell_relabelling[anno_cells$cell_type_original,"cell_type"]
# add cell_subtype column
anno_cells$cell_subtype <- cell_relabelling[anno_cells$cell_type_original,"cell_subtype"]

In [48]:
# add cell_type_original_datasetName column
datasetName <- rep(NA,nrow(anno_cells))
idx_vanGalen <- anno_cells$dataset == "GSE116256"
datasetName[idx_vanGalen] <- "vanGalen"
datasetName[!idx_vanGalen] <- "Oetjen"
anno_cells$cell_type_original_datasetName <- paste(anno_cells$cell_type_original
                                                   ,datasetName
                                                   ,sep = "_")

In [49]:
print(str(anno_cells))

'data.frame':	87333 obs. of  14 variables:
 $ cell_ID                       : chr  "AML1012-D0_AAAAAGTTACGT" "AML1012-D0_AAAACACCAATC" "AML1012-D0_AAAATAGCCTTT" "AML1012-D0_AAACATTAAACG" ...
 $ cell_type_original            : chr  "GMP" "GMP-like" "Prog-like" "ProMono-like" ...
 $ sample_ID                     : chr  "AML1012-D0" "AML1012-D0" "AML1012-D0" "AML1012-D0" ...
 $ malignant                     : logi  FALSE TRUE TRUE TRUE TRUE TRUE ...
 $ patient_ID                    : chr  "AML1012" "AML1012" "AML1012" "AML1012" ...
 $ dataset                       : chr  "GSE116256" "GSE116256" "GSE116256" "GSE116256" ...
 $ gender                        : chr  "F" "F" "F" "F" ...
 $ age                           : int  32 32 32 32 32 32 32 32 32 32 ...
 $ health_status                 : chr  "AML" "AML" "AML" "AML" ...
 $ case_or_control               : chr  "case" "case" "case" "case" ...
 $ day                           : int  0 0 0 0 0 0 0 0 0 0 ...
 $ cell_type                     : 

### export anno_cells

In [50]:
save(anno_cells, file = paste0(path_out, "anno_cells.RData"))

# preprocessing gene annotation files

- **INPUT:**
    - ```LR_database.rda```  stored in additional_input_files sub-directory
<!--     - ```- counts.RData```  -->
   
    
- **OUTPUT:**

    -  ```anno_genes.RData``` contains information about what genes are ligands and receptors

### load LR_database.rda data base


In [51]:
file="LR_database.rda"
load(paste(path_to_additional_files,file,sep=""))
cat("str(LR_database)\n\n")
print(str(LR_database))

str(LR_database)

'data.frame':	7244 obs. of  26 variables:
 $ Pair.Name            : chr  "A2M_LRP1" "ACTR2_ADRB2" "ACTR2_LDLR" "ACTR2_LRP2" ...
 $ Ligand               : chr  "A2M" "ACTR2" "ACTR2" "ACTR2" ...
 $ Ligand.Name          : chr  "alpha-2-macroglobulin" "actin related protein 2" "actin related protein 2" "actin related protein 2" ...
 $ Receptor             : chr  "LRP1" "ADRB2" "LDLR" "LRP2" ...
 $ Receptor.Name        : chr  "LDL receptor related protein 1" "adrenoceptor beta 2" "low density lipoprotein receptor" "LDL receptor related protein 2" ...
 $ complex_pair         : chr  NA NA NA NA ...
 $ partner_a            : chr  "P01023" "P61160" "P61160" "P61160" ...
 $ partner_b            : chr  "Q07954" "P07550" "P01130" "P98164" ...
 $ source               : chr  "P01023" "P61160" "P61160" "P61160" ...
 $ target               : chr  "Q07954" "P07550" "P01130" "P98164" ...
 $ source_genesymbol    : chr  "A2M" "ACTR2" "ACTR2" "ACTR2" ...
 $ target_genesymbol    : chr  "LR

In [52]:
# load("./AML_healthy/counts.RData")

### create anno_genes

The initial `anno_genes` file will contain the column "gene_symbol", "is_in _LR_database", "isLigand", and "isReceptor.

In [53]:
anno_genes <- data.frame(gene_symbol = rownames(counts))

In [54]:
# mark if genes are in LR_database 
anno_genes$is_in_LR_database <- (anno_genes$gene_symbol %in% LR_database$Ligand.ApprovedSymbol) | (
        anno_genes$gene_symbol %in% LR_database$Receptor.ApprovedSymbol
)


In [55]:
# mark if gene are ligands or receptors
anno_genes$isLigand <- anno_genes$gene_symbol %in% LR_database$Ligand
anno_genes$isReceptor <- anno_genes$gene_symbol %in% LR_database$Receptor

cat("str(anno_genes)\n\n")
print(str(anno_genes))

str(anno_genes)

'data.frame':	19303 obs. of  4 variables:
 $ gene_symbol      : chr  "A1BG" "A1BG-AS1" "A1CF" "A2M" ...
 $ is_in_LR_database: logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
 $ isLigand         : logi  FALSE FALSE FALSE TRUE FALSE FALSE ...
 $ isReceptor       : logi  TRUE FALSE FALSE FALSE FALSE FALSE ...
NULL


In [56]:
cat("total nr genes are", nrow(anno_genes),"\n")
cat("nr ligands in LR_database are", length(unique(LR_database$Ligand)),"\n")
cat("nr ligands in our data are",sum(anno_genes$isLigand),"\n")
cat("nr receptors in LR_database are",length(unique(LR_database$Receptor.)),"\n")
cat("nr receptors in our data are",sum(anno_genes$isReceptor),'\n')

total nr genes are 19303 
nr ligands in LR_database are 1585 
nr ligands in our data are 1445 
nr receptors in LR_database are 1408 
nr receptors in our data are 1314 


### export


In [57]:
save(anno_genes, file = paste0(path_out, "anno_genes.RData"))