In [1]:
suppressMessages({
    library(edgeR)
    library(tximport)
    library(zinbwave)
    library(SingleCellExperiment)
})

In [2]:
# Load the sample sheet of a dataset and keep only the cells that can be used.
#
# Arguments:
#   dataset       Dataset name; labels are read from
#                 'Datasets/<dataset>-labels.tsv' (tab-separated, with at
#                 least the columns `Cell` and `CellType`).
#   drop          Cell types to exclude (default: none).
#   keep          If non-empty, only these cell types are retained.
#   age_cutoff    If TRUE and `dataset` is 'Lab_Pvalb', keep only cells whose
#                 `Age` in 'Datasets/<dataset>-transcriptional_labels.tsv'
#                 is greater than 20.
#   kallisto_root Directory containing the per-dataset kallisto folders
#                 ('<dataset>_95/<Cell>/abundance.h5'). The default is the
#                 location that was previously hard-coded.
#
# Returns:
#   The filtered labels data frame, with row names set to the cell names and
#   restricted to cells whose 'abundance.h5' file exists.
get_samples_list <- function(dataset, drop=c(), keep=c(), age_cutoff=FALSE,
                             kallisto_root='/media/soma/DavidWork/kallisto/kallisto_95'){
    fname <- sprintf('Datasets/%s-labels.tsv', dataset)
    samples <- read.table(fname, header=TRUE, sep='\t', stringsAsFactors=FALSE)
    # `&&` is the scalar, short-circuiting operator meant for `if` conditions.
    if (age_cutoff && dataset == 'Lab_Pvalb'){
        fname <- sprintf('Datasets/%s-transcriptional_labels.tsv', dataset)
        labels <- read.table(fname, header=TRUE, sep='\t', stringsAsFactors=FALSE)
        labels <- labels[labels$Age > 20, ]
        samples <- samples[samples$Cell %in% labels$Cell, ]
    }

    # Cell-type filters: first the exclusions, then the optional whitelist.
    samples <- samples[!(samples$CellType %in% drop), ]
    if (length(keep) > 0){
        samples <- samples[samples$CellType %in% keep, ]
    }
    row.names(samples) <- samples$Cell

    # Only keep cells for which a kallisto quantification is present on disk.
    dir <- file.path(kallisto_root, sprintf('%s_95', dataset))
    files <- file.path(dir, samples$Cell, 'abundance.h5')
    samples <- samples[file.exists(files),]

    return (samples)
}

# Collapse a transcript-level tximport object to gene level.
#
# The transcript-to-gene map is read from 'References/tx2gene.95.tsv'; its
# `GENESYMBOL` column is used as the gene identifier (renamed to `GENEID`
# before being handed to `summarizeToGene`).
#
# Arguments:
#   txi  Transcript-level object as produced by `tximport(..., txOut=TRUE)`.
#
# Returns:
#   The result of `summarizeToGene` for `txi` and the symbol-based map.
sum_gene_level <- function(txi){
    annotation <- read.table('References/tx2gene.95.tsv', header=TRUE, sep='\t',
                             stringsAsFactors=FALSE)

    # Two-column map: transcript name -> gene symbol, under the column names
    # 'TXNAME' and 'GENEID'.
    tx_to_symbol <- setNames(annotation[, c('TXNAME', 'GENESYMBOL')],
                             c('TXNAME', 'GENEID'))

    gene_level <- summarizeToGene(txi, tx_to_symbol)
    gene_level
}

# Import kallisto quantifications and summarise them per gene.
#
# Arguments:
#   files  Paths to kallisto abundance files, one per sample.
#
# Returns:
#   The gene-level object returned by `sum_gene_level`.
get_txi_from_files <- function(files){
    # Import at transcript level first (txOut=TRUE); the gene-level
    # summarisation is done separately by sum_gene_level().
    transcript_level <- tximport(files, type='kallisto', txOut=TRUE)
    gene_level <- sum_gene_level(transcript_level)
    gene_level
}

# Import the kallisto quantifications of the given cells from one directory.
#
# Arguments:
#   cells  Cell names; each cell is expected at '<dir>/<cell>/abundance.h5'.
#   dir    Directory holding one sub-folder per cell.
#
# Returns:
#   The gene-level object returned by `get_txi_from_files`.
get_txi_from_cells <- function(cells, dir){
    abundance_paths <- file.path(dir, cells, 'abundance.h5')
    gene_level <- get_txi_from_files(abundance_paths)
    gene_level
}

# Convert a gene-level tximport object into an edgeR DGEList with offsets.
#
# Arguments:
#   txi        List with (at least) the matrices `counts` and `length`
#              (genes x samples).
#   cells      Cell names to use as column names of the counts and as row
#              names of the sample table; must have one entry per sample
#              (or be NULL).
#   min_count  A gene counts as detected in a sample when its count is
#              strictly greater than this value.
#   min_cells  Minimum number of samples in which a gene must be detected to
#              be kept. The default of 0 keeps every gene, which reproduces
#              the previous hard-coded filter (`rowSums(counts > 5) > -1`).
#
# Returns:
#   A DGEList holding the kept counts, with the offset matrix set through
#   `scaleOffset`.
txi_to_edgeR <- function(txi, cells, min_count=5, min_cells=0){
    counts <- txi$counts
    # Fail before the expensive steps rather than at the final renaming.
    if (!is.null(cells) && length(cells) != ncol(counts)){
        stop(sprintf('`cells` has %d names but `txi` has %d samples',
                     length(cells), ncol(counts)), call.=FALSE)
    }

    kept <- rowSums(counts > min_count) >= min_cells
    # drop=FALSE keeps matrices even if a single gene survives the filter.
    cts <- counts[kept, , drop=FALSE]
    normMat <- txi$length[kept, , drop=FALSE]

    # Obtaining per-observation scaling factors for length, adjusted to avoid
    # changing the magnitude of the counts.
    normMat <- normMat/exp(rowMeans(log(normMat)))
    normCts <- cts/normMat

    # Computing effective library sizes from scaled counts, to account for
    # composition biases between samples.
    eff.lib <- calcNormFactors(normCts) * colSums(normCts)

    # Combining effective library sizes with the length factors, and calculating
    # offsets for a log-link GLM.
    normMat <- sweep(normMat, 2, eff.lib, "*")
    normMat <- log(normMat)

    # Creating a DGEList object for use in edgeR.
    dge <- DGEList(cts)
    dge <- scaleOffset(dge, normMat)

    # add cell names
    colnames(dge$counts) <- cells
    row.names(dge$samples) <- cells

    return (dge)
}

# Store the cell type of every sample as the `group` of a DGEList.
#
# Arguments:
#   dge      Object whose `samples` element is a data frame with one row per
#            sample (e.g. an edgeR DGEList).
#   samples  Data frame with a `CellType` column, in the same row order as
#            `dge$samples` (labels are assigned by position).
#
# Returns:
#   `dge` with `dge$samples$group` replaced by `samples$CellType`.
add_celltypes <- function(dge, samples){
    cell_types <- samples$CellType
    # Assigning NULL would silently delete the `group` column.
    if (is.null(cell_types)){
        stop('`samples` has no `CellType` column', call.=FALSE)
    }
    # A shorter vector whose length divides the row count would be recycled
    # silently, attaching wrong labels; require an exact match instead.
    n_samples <- nrow(dge$samples)
    if (length(cell_types) != n_samples){
        stop(sprintf('`samples` has %d cell types but `dge` has %d samples',
                     length(cell_types), n_samples), call.=FALSE)
    }

    dge$samples$group <- cell_types
    return (dge)
}

# Attach ZINB-WaVE observational weights to a DGEList.
#
# Arguments:
#   dge      DGEList whose `counts` matrix is used as input.
#   samples  Per-cell annotation, used as `colData` of the intermediate
#            SingleCellExperiment.
#
# Returns:
#   `dge` with `dge$weights` set to the `weights` assay computed by zinbwave.
add_weights <- function(dge, samples){
    # create scedata
    # Coerce the whole matrix to integer at once; dimensions and dimnames are
    # preserved, and values are converted as by as.integer().
    dge_int <- dge$counts
    storage.mode(dge_int) <- 'integer'
    sceset <- SingleCellExperiment(assays = list(counts = dge_int),
                                   colData=samples, rowData=row.names(dge_int))

    # calculate weights
    # NOTE(review): `epsilon` differs between the two calls (1000 vs 1e10);
    # confirm which value is intended.
    zinb <- zinbFit(sceset, K=2, epsilon=1000)
    # observationalWeights defaults to FALSE, in which case no `weights` assay
    # is created and the assignment below would store NULL.
    sceset <- zinbwave(sceset, fitted_model=zinb, K=2, epsilon=1e10,
                       observationalWeights=TRUE)

    # add weights
    dge$weights <- assays(sceset)$weights

    return (dge)
}

# Collect everything needed to import one dataset.
#
# Arguments:
#   dataset, drop, keep, age_cutoff  Forwarded unchanged to get_samples_list().
#
# Returns:
#   A list with
#     samples  the filtered sample sheet,
#     cells    its `Cell` column,
#     files    the path of each cell's 'abundance.h5' file.
get_dataset_targets <- function(dataset, drop=c(), keep=c(), age_cutoff=FALSE){
    sample_sheet <- get_samples_list(dataset, drop=drop, keep=keep, age_cutoff=age_cutoff)
    cell_names <- sample_sheet$Cell

    # One sub-folder per cell inside the dataset's kallisto directory.
    quant_dir <- sprintf('/media/soma/DavidWork/kallisto/kallisto_95/%s_95', dataset)

    list(samples=sample_sheet,
         cells=cell_names,
         files=file.path(quant_dir, cell_names, 'abundance.h5'))
}

# Build a SingleCellExperiment from the tabulated files of a dataset.
#
# Reads 'Datasets/<dataset>-tpm.tsv', 'Datasets/<dataset>-counts.tsv' (both
# with a `Gene` column used as row names) and 'Datasets/<dataset>-labels.tsv'
# (with a `Cell` column used as row names).
#
# Arguments:
#   dataset  Dataset name used to build the three file names.
#   cells    Optional cell names; if non-empty, the result is restricted to
#            these columns.
#
# Returns:
#   A SingleCellExperiment with the assays `counts` (integer) and
#   `normcounts` (the TPM table), and the labels as `colData`.
get_sceset <- function(dataset, cells=c()){
    fname <- sprintf('Datasets/%s-tpm.tsv', dataset)
    df_tpm <- read.table(fname, sep='\t', header=TRUE, stringsAsFactors=FALSE, row.names='Gene')
    fname <- sprintf('Datasets/%s-counts.tsv', dataset)
    df_count <- read.table(fname, sep='\t', header=TRUE, stringsAsFactors=FALSE, row.names='Gene')
    # Coerce column by column while keeping the data frame (and its row
    # names); unlike sapply(), this does not collapse to a plain vector when
    # the table has a single row.
    df_count[] <- lapply(df_count, as.integer)
    fname <- sprintf('Datasets/%s-labels.tsv', dataset)
    df_label <- read.table(fname, sep='\t', header=TRUE, stringsAsFactors=FALSE, row.names='Cell')

    df_label$CellType <- as.factor(df_label$CellType)

    sceset <- SingleCellExperiment(assays = list(counts=as.matrix(df_count), normcounts=as.matrix(df_tpm)),
                                   colData = df_label, rowData = rownames(df_tpm))

    if (length(cells) > 0){
        sceset <- sceset[,cells]
    }

    return (sceset)
}

# Build the edgeR object of one dataset and save it to disk.
#
# Arguments:
#   dataset, drop, keep, age_cutoff  Forwarded to get_dataset_targets().
#   savename  Name used in the output file; when it is the empty string the
#             dataset name is used instead.
#
# Side effect:
#   Writes the DGEList with saveRDS() to 'Datasets/<savename>_edgeR.RData'.
#   The SingleCellExperiment and the ZINB-WaVE weights are not computed here.
generate_dataset_data <- function(dataset, drop=c(), keep=c(), age_cutoff=FALSE, savename=''){
    targets <- get_dataset_targets(dataset, drop=drop, keep=keep, age_cutoff=age_cutoff)
    out_name <- if (savename == '') dataset else savename

    # Import the quantifications, then annotate each cell with its type.
    gene_level <- get_txi_from_files(targets$files)
    dge <- txi_to_edgeR(gene_level, targets$cells)
    dge <- add_celltypes(dge, targets$samples)

    saveRDS(dge, file=sprintf('Datasets/%s_edgeR.RData', out_name))
}

# Build and save one edgeR object combining the Lab_Pvalb and Lab_OLM cells.
#
# Lab_Pvalb is loaded with the age cutoff enabled; Lab_OLM is loaded without
# the 'Htr3a-OLM' cell type.
#
# Side effect:
#   Writes the DGEList with saveRDS() to 'Datasets/Lab_Dataset_edgeR.RData'.
generate_lab_dataset <- function(){
    targets_pvalb <- get_dataset_targets('Lab_Pvalb', age_cutoff=TRUE)
    targets_olm <- get_dataset_targets('Lab_OLM', drop='Htr3a-OLM')

    # Restrict BOTH sheets to the shared columns before stacking them, so an
    # extra column in either labels file cannot make rbind() fail.
    label_cols <- c('Cell', 'CellType')
    samples <- rbind(targets_pvalb$samples[, label_cols],
                     targets_olm$samples[, label_cols])
    files <- c(targets_pvalb$files, targets_olm$files)
    cells <- c(targets_pvalb$cells, targets_olm$cells)

    txi <- get_txi_from_files(files)
    dge <- txi_to_edgeR(txi, cells)

    dge <- add_celltypes(dge, samples)
    #dge <- add_weights(dge)

    saveRDS(dge, file='Datasets/Lab_Dataset_edgeR.RData')
}

In [3]:
# Build and save the edgeR object of the Lab_Pvalb dataset, with the age
# cutoff enabled.
generate_dataset_data(dataset='Lab_Pvalb', age_cutoff=TRUE)

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 
summarizing abundance
summarizing counts
summarizing length
summarizing inferential replicates


In [4]:
# Build and save the combined Lab_Pvalb + Lab_OLM edgeR object
# ('Datasets/Lab_Dataset_edgeR.RData').
generate_lab_dataset()

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 
summarizing abundance
summarizing counts
summarizing length
summarizing inferential replicates


In [None]:
# Cell types retained from the 'Gauwens' dataset: the Pvalb and Sst subtypes
# listed below. Two labels ('Sst Crh 4930553C11Rik ' and 'Sst Myh8 Etv1 ')
# end with a space and are kept verbatim -- presumably they match the labels
# file exactly; TODO confirm.
keep <- c(
    # Pvalb subtypes
    'Pvalb Akr1c18 Ntf3', 'Pvalb Calb1 Sst', 'Pvalb Gabrg1',
    'Pvalb Gpr149 Islr', 'Pvalb Reln Itm2a', 'Pvalb Reln Tac1',
    'Pvalb Sema3e Kank4', 'Pvalb Th Sst', 'Pvalb Tpbg', 'Pvalb Vipr2',
    # Sst subtypes
    'Sst Calb2 Necab1', 'Sst Calb2 Pdlim5', 'Sst Chodl',
    'Sst Chrna2 Glra3', 'Sst Chrna2 Ptgdr', 'Sst Crh 4930553C11Rik ',
    'Sst Crhr2 Efemp1', 'Sst Esm1', 'Sst Hpse Cbln4',
    'Sst Hpse Sema3c', 'Sst Mme Fam114a1', 'Sst Myh8 Etv1 ',
    'Sst Myh8 Fibin', 'Sst Nr2f2 Necab1', 'Sst Nts',
    'Sst Rxfp1 Eya1', 'Sst Rxfp1 Prdm8', 'Sst Tac1 Htr1d',
    'Sst Tac1 Tacr3', 'Sst Tac2 Myh4', 'Sst Tac2 Tacstd2'
)
generate_dataset_data(dataset='Gauwens', keep=keep)

In [None]:
# Pvalb subtypes retained from the GSE99888 dataset; the result is saved
# under the name 'Harris_Pvalb' instead of the dataset name.
keep <- c(
    'Pvalb.C1ql1.Cpne5', 'Pvalb.C1ql1.Npy', 'Pvalb.C1ql1.Pvalb',
    'Pvalb.Tac1.Akr1c18', 'Pvalb.Tac1.Nr4a2', 'Pvalb.Tac1.Sst',
    'Pvalb.Tac1.Syt2'
)
generate_dataset_data(dataset='GSE99888', keep=keep, savename='Harris_Pvalb')