In [2]:
# Loading the libraries
RPYTHON_PYTHON_VERSION=3.4
library(genefilter)
library(statmod)
require(ggplot2)
library(gplots)
require(DESeq2)
library(scLVM)
library(scde)
library(tools)
library(RColorBrewer)
library(biomaRt)
library(GO.db)

In [3]:
# Folder that holds the per-sample htseq-count output files
directory <- "/home/baker/Rna-seq Data Analysis/Single-Cell_Rna_seq_lane_effect/htseq_count_files"

In [5]:
no_of_cells <- 96
# Lane numbers every cell was sequenced on; one count file is expected per
# cell/lane combination.
lane_ids <- 4:8
# Collect every file in `directory` whose name contains "htseq"
sampleFiles <- grep("htseq", list.files(directory), value = TRUE)
# One condition label per file, "Cell_<i>_Lane<j>", with lanes cycling within
# each cell.
# NOTE(review): this assumes list.files() returns the files ordered by cell and
# then by lane -- confirm against the actual file names.
sampleCondition <- paste0(
  "Cell_", rep(seq_len(no_of_cells), each = length(lane_ids)),
  "_Lane", lane_ids
)
#sampleCondition <- c('C1','C2')
# data.frame() silently recycles a shorter vector when the lengths are exact
# multiples, which would attach wrong labels to files; fail loudly instead.
stopifnot(length(sampleFiles) == length(sampleCondition))
sampleTable <- data.frame(
  sampleName = file_path_sans_ext(sampleFiles),
  fileName = sampleFiles,
  condition = sampleCondition
)

In [72]:
# Show the file name recorded for the first sample
sampleTable[["fileName"]][1]

In [62]:
# Peek at the first rows of the first sample's htseq-count file
head(
  read.table(
    file.path(directory, sampleTable[1, "fileName"])
  )
)

Unnamed: 0,V1,V2
1,ENSMUSG00000000001.4,0
2,ENSMUSG00000000003.12,0
3,ENSMUSG00000000028.11,0
4,ENSMUSG00000000031.12,0
5,ENSMUSG00000000037.13,0
6,ENSMUSG00000000049.8,0


In [7]:
# Build the DESeq2 data set directly from the raw (non-normalized) htseq-count
# files listed in sampleTable, with `condition` as the only design term
ddsHTSeq <- DESeqDataSetFromHTSeqCount(
  sampleTable = sampleTable,
  directory = directory,
  design = ~ condition
)

# #Reading from a pregenerated file
#countsTable<-read.csv("Single-cell_RNA_Lane_effect_valid_counts.csv",check.names=FALSE,row.names=1)
#cd<-read.csv("Single_cell_Lane_effect_RNA_seq_RAW_Count.csv",check.names=FALSE,row.names=1)
#myNames <- file_path_sans_ext(sampleFiles[colSums(cd)>1e4])
#sampleConditionValid <- sampleCondition[colSums(cd)>1e4]
#colDataNames<-data.frame(sampleName=myNames, condition=sampleCondition)
#colDataNames<-data.frame(sampleName=myNames, condition=sampleConditionValid)
# #Generating the desing formula
#des<-formula(~condition)
# #Reading the count table that was generated using htseq-count
#ddsHTSeqCount <-DESeqDataSetFromMatrix(countsTable, colData=colDataNames, design=des, ignoreRank = FALSE)

# #Running the actual DESeq program
#ddsHTSeqDE<-DESeq(ddsHTSeqCount)

In [67]:
# Raw (non-normalized) counts as a data frame, one column per sample
NonNormCounts <- as.data.frame(counts(ddsHTSeq, normalized = FALSE))
# Shorten the column names: strip every letter and underscore, then prefix "C"
names(NonNormCounts) <- paste0(
  "C",
  gsub("[_A-Za-z]", "", names(NonNormCounts))
)
head(NonNormCounts, 20)
# Save the raw count table so it can be reloaded without re-reading the files
write.csv(NonNormCounts, file = "Single_cell_Lane_effect_RNA_seq_RAW_Count.csv")

Unnamed: 0,C01-004,C01-005,C01-006,C01-007,C01-008,C02-004,C02-005,C02-006,C02-007,C02-008,ellip.h,C95-004,C95-005,C95-006,C95-007,C95-008,C96-004,C96-005,C96-006,C96-007,C96-008
ENSMUSG00000000001.4,0,0,0,0,0,1,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
ENSMUSG00000000003.12,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
ENSMUSG00000000028.11,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
ENSMUSG00000000031.12,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
ENSMUSG00000000037.13,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
ENSMUSG00000000049.8,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
ENSMUSG00000000056.7,0,0,0,0,0,12,22,18,16,16,⋯,0,0,0,0,0,0,0,0,0,0
ENSMUSG00000000058.6,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,1,0,0
ENSMUSG00000000078.6,0,0,0,1,1,162,213,188,194,200,⋯,0,0,1,0,0,505,478,518,471,468
ENSMUSG00000000085.13,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0


In [68]:
# Estimate per-sample size factors.
# Note: `sizeFactor` holds the object returned by estimateSizeFactors(); the
# factors themselves are extracted with sizeFactors() when writing the CSV.
sizeFactor <- estimateSizeFactors(ddsHTSeq)
write.csv(
  sizeFactors(sizeFactor),
  file = "Single_cell_RNA_seq_Lane_effect_sizeFactor.csv"
)

In [None]:
# Run the DESeq pipeline (beta prior disabled) so that normalized counts
# can be extracted from the fitted object
ddsHTSeq <- DESeq(ddsHTSeq, betaPrior = FALSE)

# Pull out the normalized counts, preview them, and store them as CSV
NormCounts <- as.data.frame(counts(ddsHTSeq, normalized = TRUE))
head(NormCounts)
write.csv(
  NormCounts,
  file = "Single_cell_RNA_seq_Lane_Effect_Normalized_Count.csv"
)

estimating size factors
estimating dispersions
In .local(object, ...): same number of samples and coefficients to fit,
  estimating dispersion by treating samples as replicates.
  read the ?DESeq section on 'Experiments without replicates'gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing


In [None]:
# Reload the raw count table written earlier; the first column supplies the
# row names and the column names are kept exactly as stored in the file
cd <- read.csv(
  file = "Single_cell_Lane_effect_RNA_seq_RAW_Count.csv",
  row.names = 1,
  check.names = FALSE
)


In [None]:
# Omit genes that are never detected (zero total count across all cells).
# drop = FALSE keeps `cd` a data frame even if a single column remains.
cd <- cd[rowSums(cd) > 0, , drop = FALSE]
# Omit cells with very poor coverage (total count of 1e4 or less).
# Without drop = FALSE a single surviving cell would collapse `cd` to a bare
# vector and lose its column name.
cd <- cd[, colSums(cd) > 1e4, drop = FALSE]

In [None]:
# List the cells (columns of `cd`) that are still present
dimnames(cd)[[2L]]

In [None]:
# Per-sample (column-wise) mean and standard deviation of the normalized counts
meanNormCounts <- colMeans(NormCounts)
# Iterate over the data-frame columns directly; yields a named numeric vector
sdNormCounts <- vapply(NormCounts, sd, numeric(1))

In [None]:
# Scatter plot of `y` against `x` using the standard plot() function, with
# points coloured by the levels of a grouping variable and a legend drawn in
# the right-hand margin.
#
# Arguments:
#   x, y          coordinates, passed on to plot().
#   factor        grouping variable; converted with factor() unless it already
#                 is a factor (ordered factors are accepted as they are).
#   data          not used by the function; kept so existing calls still work.
#   title.legend  title shown above the legend (NULL draws no title).
#   pch           plotting symbol for both the points and the legend keys.
#   ...           further arguments forwarded to plot().
#
# One colour per level is drawn at random from colours(), so the colours
# differ between calls unless the random seed is fixed beforehand.
# An earlier version used an RColorBrewer "Set1" palette, which limited the
# number of levels; that code path is gone, so the package is not needed here.
#
# Called for its plotting side effect; returns NULL invisibly. The graphical
# parameters `xpd` and `mar` are restored to their previous values on exit.
plot.col <- function(x, y, factor, data, title.legend = NULL, pch = 16, ...) {
  # is.factor() also handles ordered factors, whose class vector has length 2
  # and would make a `class(factor) != "factor"` test an invalid if-condition.
  if (!is.factor(factor)) {
    factor <- factor(factor)
  }
  lvl <- nlevels(factor)
  # Draw exactly one colour per level; fall back to sampling with replacement
  # only when there are more levels than named colours.
  all_colours <- colours()
  pal <- sample(all_colours, lvl, replace = lvl > length(all_colours))
  # Widen the right margin and allow drawing outside the plot region so the
  # legend fits; put the caller's settings back even if plotting fails.
  old_par <- par(xpd = TRUE, mar = c(5, 4, 4, 10))
  on.exit(par(old_par), add = TRUE)
  plot(x, y, col = pal[factor], pch = pch, ...)
  # Anchor the legend just right of the data (5% of the x range) at the top;
  # NAs are ignored so missing points do not break the placement.
  x_range <- range(x, na.rm = TRUE)
  xcoord <- x_range[2] + diff(x_range) / 20
  ycoord <- max(y, na.rm = TRUE)
  legend(
    xcoord, ycoord,
    legend = levels(factor), col = pal, pch = pch, title = title.legend
  )
  invisible(NULL)
}

In [None]:
# Per-sample cell label as a factor: every cell appears five times in a row,
# and the levels follow numeric cell order (Cell_1, Cell_2, ...)
sampleConditionFactor <- factor(
  paste0("Cell_", rep(1:no_of_cells, each = 5)),
  levels = paste0("Cell_", 1:no_of_cells),
  labels = paste0("Cell_", 1:no_of_cells)
)

In [None]:
# Per-sample summary: condition label, mean, SD and coefficient of variation
# of the normalized counts.
# `CV` is not assigned anywhere in this notebook; unless the session already
# holds a numeric `CV`, derive it as SD / mean.
# NOTE(review): confirm SD / mean is the intended definition.
if (!exists("CV", mode = "numeric")) {
  CV <- sdNormCounts / meanNormCounts
}
# `sampleConditionValid` is only assigned in commented-out code; fall back to
# the labels of all samples, which is what the statistics were computed on.
# NOTE(review): confirm the unfiltered labels are the intended ones.
if (!exists("sampleConditionValid")) {
  sampleConditionValid <- sampleCondition
}
# cbind() would recycle mismatched lengths and pair labels with wrong samples
stopifnot(
  length(sampleConditionValid) == length(meanNormCounts),
  length(CV) == length(meanNormCounts)
)
# Mixing the character labels with numeric columns makes this a character
# matrix; convert columns back with as.numeric() before doing arithmetic.
totalInfo <- cbind(sampleConditionValid, meanNormCounts, sdNormCounts, CV)

In [None]:
# Two transformed versions of the count data, both computed blind to the
# experimental design: a variance-stabilizing transformation and rlog
vsd <- varianceStabilizingTransformation(
  ddsHTSeq,
  blind = TRUE
)
rlogVal <- rlog(
  ddsHTSeq,
  blind = TRUE
)