## Data preparation for the app

Data cleaning, gene filtering, etc. (no sample outlier removal is applied here; samples are only flagged).

- sample: keep all patients except PPCM
- gene: standard filtering for gene-based analyses


In [2]:
library(dplyr)
library(stringr)


In [3]:
# prep data
# Load the raw counts, the sample metadata and the duplication metrics,
# harmonise the metadata labels, and align samples between `cts` (columns)
# and `datTraits` (rows).

parentDir <- 'magnetiqueCode2022/'

ctsFile <- file.path(parentDir, 'gene_count_matrix.csv')
traitsFile <- file.path(parentDir, 'MAGE_metadata.txt')
dedupFile <- file.path(parentDir, 'mage_dedup_metrics_final.txt') # CD

# counts: row names come from `gene_id`, one column per run
cts <- read.csv(ctsFile, row.names = "gene_id", as.is = TRUE)
colnames(cts) <- gsub("_stringtieRef", "", colnames(cts))

datTraits <- read.csv(traitsFile, stringsAsFactors = FALSE)
datTraits <- datTraits[c('Run', 'etiology', 'race', 'Age', 'sex')]
colnames(datTraits) <- c('Run', 'Etiology', 'Race', 'Age', 'Sex')
# drop samples without an etiology; the is.na() guard avoids the all-NA rows
# that a bare `!= ''` produces when the column contains NA
datTraits <- datTraits[!is.na(datTraits$Etiology) & datTraits$Etiology != '', ]

# Recode by indexing the column vector instead of `df[cond, ]$col <- value`:
# the latter stops with "replacement has 1 row, data has 0" as soon as one
# label is absent from the cohort. `%in%` keeps the masks free of NA.
# etiology: only the first letter of the raw label is used
datTraits$Etiology <- str_sub(datTraits$Etiology, start = 1, end = 1)
datTraits$Etiology[datTraits$Etiology %in% 'D'] <- 'DCM'
datTraits$Etiology[datTraits$Etiology %in% 'H'] <- 'HCM'
datTraits$Etiology[datTraits$Etiology %in% 'N'] <- 'NFD'
datTraits$Etiology[datTraits$Etiology %in% 'P'] <- 'PPCM'
datTraits$Etiology <- relevel(as.factor(datTraits$Etiology), ref = 'NFD')
datTraits$Race[datTraits$Race %in% 'African American'] <- 'AA'
datTraits$Race[datTraits$Race %in% 'Caucasian'] <- 'C'
datTraits$Race <- relevel(as.factor(datTraits$Race), ref = 'C')
datTraits$Sex[datTraits$Sex %in% 'female'] <- 'F'
datTraits$Sex[datTraits$Sex %in% 'male'] <- 'M'
datTraits$Sex <- relevel(as.factor(datTraits$Sex), ref = 'M')

# keep only runs present in both tables, in the same order
# (idx is an intersection, so match() cannot return NA here)
idx <- intersect(datTraits$Run, colnames(cts))
datTraits <- datTraits[match(idx, datTraits$Run), ]
cts <- cts[, match(idx, colnames(cts)), drop = FALSE]

# duplication metrics: column 1 holds the run label, column 3 the rate
dedup <- read.table(dedupFile, as.is = TRUE, header = FALSE)
dedup <- dedup[, c(1, 3)]
dedup[, 1] <- gsub(".metrics.txt-Unknown", "", dedup[, 1])
colnames(dedup) <- c("Run", "DuplicationRate")
# inner join: runs without duplication metrics are dropped, rows re-sorted by Run
datTraits <- merge(datTraits, dedup, by.x = 'Run', by.y = 'Run')
rownames(datTraits) <- datTraits$Run

# re-align after the merge; drop = FALSE keeps cts a data frame for one sample
ins <- intersect(rownames(datTraits), colnames(cts))
datTraits <- datTraits[ins, ]
cts <- cts[, ins, drop = FALSE]

# fail fast on misalignment, then echo the check as cell output
stopifnot(identical(rownames(datTraits), colnames(cts)))
all(rownames(datTraits) == colnames(cts))

In [4]:
# dimensions of the count table after alignment with the sample metadata
dim(cts)

We write two files to disk: the full data with _flagged arrays_, and the final data after filtering (using the flags).

In [12]:
# gene-level filtering
# Three successive filters (total raw count, variance, depth-normalised mean
# expression); the outcome is stored as a 0/1 flag for every gene in cts.

varQuant <- 0.01
countQuant <- 0.01

# raw threshold: total count over all samples must exceed 10
chk <- cts[rowSums(cts) > 10, ]

# variance threshold: drop genes at or below the varQuant quantile of the
# per-gene variance (varQuant is a fraction, so 0.01 = lowest 1%)
var.all <- apply(chk, 1, var)
var.cutoff <- quantile(var.all, probs = varQuant, type = 8)
chk <- chk[var.all > var.cutoff, ]

# expression threshold: same idea with the countQuant quantile of the mean of
# size-factor normalised counts
sf <- DESeq2::estimateSizeFactorsForMatrix(chk)
chk.norm <- t(t(chk) / sf)  # divide each sample (column) by its size factor
mean.norm <- rowMeans(chk.norm)
keep <- mean.norm > quantile(mean.norm, probs = countQuant, type = 8)
chk <- chk[keep, ]

# flag vector over all genes of cts: 1 = passed every filter, 0 = filtered out
background <- rownames(cts)
universe <- background %in% background
selection <- is.element(background, rownames(chk))
genes <- setNames(factor(as.integer(selection[universe])), background)

dim(chk)

In [13]:
# sample-level filtering (no outlier removal)
# only PPCM samples are flagged for removal: 1 = keep, 0 = drop

ppcm.runs <- datTraits$Run[datTraits$Etiology == 'PPCM']
samples <- ifelse(colnames(cts) %in% ppcm.runs, 0, 1)
names(samples) <- colnames(cts)
samples <- as.factor(samples)


In [14]:
# full data as RData
# Stores the unfiltered counts together with the gene and sample flags.
dirloc <- 'magnetiqueCode2022/analysis/data/'
# save() and write.table() fail if the target directory is missing
dir.create(dirloc, recursive = TRUE, showWarnings = FALSE)

gene.attrs <- list(GeneFlag = genes)
cell.attrs <- list(SampleFlag = samples,
                   Run = datTraits$Run,
                   Etiology = datTraits$Etiology,
                   Race = datTraits$Race,
                   Age = datTraits$Age,
                   Sex = datTraits$Sex,
                   DuplicationRate = datTraits$DuplicationRate)

save(cts, cell.attrs, gene.attrs, file = file.path(dirloc, 'MAGNet_full.RData'))

# write sample info - metadata
# Run goes first, then the flag and the covariates; columns are picked by
# name so the output does not depend on the order of cell.attrs.
# NOTE(review): the final-data cell writes to the same samples.txt path and
# replaces this file - confirm that is intended.
sample.cols <- c('Run', 'SampleFlag', 'Etiology', 'Race', 'Age', 'Sex',
                 'DuplicationRate')
write.table(as.data.frame(cell.attrs)[sample.cols], file.path(dirloc, 'samples.txt'),
            row.names = FALSE, col.names = TRUE, quote = FALSE, sep = ",")

Now write the final data in a convenient format for differential gene expression (DGE) analysis, etc. This is the data used by the app.

In [15]:
# get data
# load() restores cts, cell.attrs and gene.attrs and returns their names
MAGNet_data <- load(file.path(dirloc, 'MAGNet_full.RData'))

meta <- as.data.frame(cell.attrs)

# decode the 0/1 flag factors into logical masks (once, reused below)
keep.genes <- as.logical(as.numeric(levels(gene.attrs$GeneFlag))[gene.attrs$GeneFlag])
keep.samples <- as.logical(as.numeric(levels(meta$SampleFlag))[meta$SampleFlag])

# already filtered
gene_counts <- cts[keep.genes, keep.samples, drop = FALSE]
# filter meta after filtering counts!
meta <- meta[keep.samples, ]
# then drop unused levels
meta$Etiology <- droplevels(meta$Etiology)

# add new pheno data
pheno_data <- read.csv(file.path(parentDir, 'phenoData.csv'))
# mapping
mapping <- read.csv(file.path(parentDir, 'map_SRR_to_pid.csv'))
pheno_data <- merge(pheno_data, mapping[, c('V2', 'Run', 'Experiment')],
                    by.x = 'sample_name', by.y = 'V2')
meta <- merge(meta, pheno_data, by = 'Run', all.x = TRUE)

# merge() re-sorts the rows by Run and duplicates a row when a run maps to
# several pheno records. The count columns are relabelled from the row names
# of meta below, so the two tables must describe the same runs, one row each,
# in the same order - otherwise samples would be silently mislabelled.
if (anyDuplicated(meta$Run) > 0 ||
    !setequal(as.character(meta$Run), colnames(gene_counts))) {
  stop("sample metadata and count matrix do not describe the same runs",
       call. = FALSE)
}
meta <- meta[match(colnames(gene_counts), as.character(meta$Run)), ]

# NOTE: `names` is kept as the variable name for compatibility with the
# original notebook, although it shadows base::names
names <- c("sample_name", "Run", "Experiment", "Library.Pool", "TIN.median.", 'RIN', "DuplicationRate", "tissue_source", "Etiology", "Race", "Age", "Sex", "weight", "height", "hw", "lv_mass", "afib", "VTVF", "Diabetes", "Hypertension", "LVEF")
meta <- meta[names]
# runs without pheno data come out of the left join with NA sample_name,
# which cannot be used as a row name
if (anyNA(meta$sample_name)) {
  stop("some runs have no sample_name in the pheno data", call. = FALSE)
}
rownames(meta) <- meta[, 1]
meta$sample_name <- NULL

colnames(meta) <- c("Run", "Experiment", "LibraryPool", "TIN", 'RIN', "DuplicationRate", "TissueSource", "Etiology", "Race", "Age", "Sex", "Weight", "Height", "HW", "LVMass", "AFib", "VTVF", "Diabetes", "Hypertension", "LVEF")

# echo the alignment checks as cell output; the count columns are relabelled
# from run ids to sample names in between
all.equal(colnames(gene_counts), meta$Run)
colnames(gene_counts) <- rownames(meta)
all.equal(colnames(gene_counts), rownames(meta))

countData <- gene_counts
colData <- meta
save(countData, colData, file = file.path(dirloc, 'MAGNet.RData'))
# NOTE: this replaces the samples.txt written next to MAGNet_full.RData
write.table(colData, file = file.path(dirloc, 'samples.txt'), sep = ",",
            row.names = TRUE, col.names = TRUE, quote = FALSE)

In [17]:
# dimensions of the final tables: filtered counts, then sample metadata
dim(countData)
dim(colData)