In [None]:
# http://combine-australia.github.io/RNAseq-R/06-rnaseq-day1.html

In [None]:
# One-time setup. The commented lines show how the other dependencies were
# installed; uncomment them on a fresh machine.
# if (!requireNamespace("BiocManager", quietly = TRUE))
#     install.packages("BiocManager")
# BiocManager::install("gplots")
# biocLite("limma")
# Install ggrepel only when it is missing, so re-running the notebook does not
# reinstall the package on every execution.
if (!requireNamespace("ggrepel", quietly = TRUE)) {
    install.packages("ggrepel")
}


In [None]:
# Print the R version details of the current session.
version

In [None]:
library(limma)
library(reshape2)
library(Glimma)
library(ggplot2)
library(ggrepel)
# library(reshape)
library(edgeR)
library(gplots)
library(amap)
# library(Mus.musculus)

In [None]:

# Working directory for the notebook; the relative data paths defined below
# are resolved against it.
# NOTE(review): this absolute path is machine-specific -- change it to the
# location of your own project checkout before running.
setwd("C:\\Users\\Pete\\DSC180B-PROJECT")

In [None]:
# Name and download URL of the GEO series archive (GSE44639).
file_name <- "GSE44639_RAW.tar"
file_url <- "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE44639&format=file"
# Directory for the downloaded archive and directory for the extracted files.
# Both keep a trailing slash because they are joined to file names by plain
# string concatenation.
raw_dir <- "data/raw/"
temp_dir <- "data/temp/"

In [None]:
#' Download a tar archive and unpack it.
#'
#' @param file_url URL of the archive to download.
#' @param file_name File name the archive is stored under inside `raw_dir`.
#' @param raw_dir Directory for the downloaded archive; created (recursively)
#'   when it does not exist. It is concatenated directly with `file_name`, so
#'   it must end with a path separator.
#' @param temp_dir Directory the archive is extracted into.
#' @return The value returned by `utils::untar()`.
ingest_data <- function(file_url, file_name, raw_dir, temp_dir) {
    if (!dir.exists(raw_dir)) {
        dir.create(raw_dir, recursive = TRUE)
    }
    out <- paste0(raw_dir, file_name)
    # mode = "wb": the archive is binary and must not be altered on download.
    status <- utils::download.file(file_url, destfile = out, mode = "wb")
    # download.file() reports some failures through a non-zero status code
    # instead of an error; stop here rather than untar a missing/partial file.
    if (!identical(as.integer(status), 0L)) {
        stop("Download of ", file_url, " failed with status ", status,
             call. = FALSE)
    }
    utils::untar(out, exdir = temp_dir)
}

ingest_data(file_url, file_name, raw_dir, temp_dir)

In [None]:
#' Read every per-sample table in a directory into one long data frame.
#'
#' Each file is read with `read.delim()` through `gzfile()`. A file is skipped
#' when it contains any column that is not listed in `required_columns`.
#' Three annotation columns are derived from the file name (not from the
#' directory part of the path) and added to every accepted table:
#'   * `sampleName`: the file name up to, not including, the first ".txt"
#'   * `cellType`:   the third "_"-separated token of the file name, with
#'                   everything from ".txt" onwards removed
#'   * `healthy`:    FALSE when the file name contains a capital "P",
#'                   TRUE otherwise
#'
#' @param file_dir Directory containing the sample files.
#' @param required_columns Character vector or list of the column names a file
#'   may contain, spelled as `read.delim()` exposes them.
#' @return A data frame with the rows of all accepted files stacked in file
#'   order, or an empty `list()` when no file was accepted.
read_all_files_in_dir_with_columns <- function(file_dir, required_columns) {
    allowed_columns <- unlist(required_columns)
    file_paths <- list.files(path = file_dir, full.names = TRUE)

    # Collect the per-file tables and bind them once at the end instead of
    # growing a data frame with rbind() on every iteration.
    tables <- list()

    for (fp in file_paths) {
        seqdata <- read.delim(gzfile(fp), stringsAsFactors = FALSE)

        # ignore samples with columns not present in required_columns
        if (!all(colnames(seqdata) %in% allowed_columns)) {
            next
        }

        # Parse the file name only, so that "/", "_" or "P" characters in the
        # directory part of the path cannot corrupt the annotations.
        file_base <- basename(fp)

        # The dot is escaped: as a regular expression, a bare ".txt" would
        # match any character followed by "txt".
        sample_name <- sub("\\.txt.*$", "", file_base)

        name_tokens <- strsplit(file_base, "_", fixed = TRUE)[[1]]
        cell_type <- sub("\\.txt.*$", "", name_tokens[3])

        seqdata$healthy <- !grepl("P", file_base, fixed = TRUE)
        seqdata$cellType <- cell_type
        seqdata$sampleName <- sample_name

        tables[[length(tables) + 1L]] <- seqdata
    }

    if (length(tables) == 0) {
        return(list())
    }

    do.call(rbind, tables)
}

# Column names a sample file may contain (files with any other column are
# skipped by the reader).
required_columns <- list("miRNA", "Chromosome", "Position", "Strand", "Total.miRNA.reads", "RPM..reads.per.million.")
# Long-format table holding the rows of every accepted sample file in temp_dir.
all <- read_all_files_in_dir_with_columns(temp_dir, required_columns)

In [None]:
# Distinct cell types present in the data (returned as a one-column data frame).
unique(all['cellType'])

# TEST USING MANUALLY CALCULATED RPM


In [None]:
# Pivot the long table into one row per miRNA and one column per sample,
# holding the total read counts; unobserved combinations are filled with 0.
countmatrix <- dcast(
    data = all,
    formula = miRNA ~ sampleName,
    value.var = "Total.miRNA.reads",
    fill = 0
)
# Move the miRNA identifiers out of the table body and into the row names.
row.names(countmatrix) <- countmatrix[["miRNA"]]
countmatrix[["miRNA"]] <- NULL

# Counts per million for every sample, then display them.
testcpm <- cpm(countmatrix)
testcpm

In [None]:
# Display the count table (miRNAs in rows, samples in columns).
countmatrix

In [None]:
# Which values in testcpm are greater than 100?
thresh <- testcpm > 100
# This produces a logical matrix with TRUEs and FALSEs
head(thresh)

In [None]:
# rowSums(thresh) is the number of samples in which a miRNA passes the
# threshold; the table shows how many miRNAs pass in 0, 1, 2, ... samples.
table(rowSums(thresh))

In [None]:
# Keep miRNAs that pass the CPM threshold in at least 9 samples, i.e. rows of
# thresh with at least 9 TRUEs.
keep <- rowSums(thresh) >= 9
# Subset the rows of countmatrix -- the table thresh was computed from -- to
# keep the more highly expressed miRNAs. (countdata is only created later in
# the notebook, from the Naive rows alone, so it must not be used here.)
counts.keep <- countmatrix[keep,]
summary(keep)

In [None]:
# Rows (miRNAs) and columns (samples) remaining after filtering.
dim(counts.keep)


In [None]:
# Build an edgeR DGEList from the filtered counts.
y <- DGEList(counts.keep)
# Apply normalisation to DGEList object
y <- calcNormFactors(y)
y

In [None]:
# One metadata row per sample: name, health status and cell type.
sampleinfo <- unique(all[, c("sampleName", "healthy", "cellType")])
# Align the rows with the columns of y so that labels and groups derived from
# sampleinfo refer to the same samples, in the same order, as the counts.
sampleinfo <- sampleinfo[match(colnames(y$counts), sampleinfo$sampleName), ]
row.names(sampleinfo) <- NULL

# The column is spelled cellType; sampleinfo$CellType would be NULL.
data.frame(sampleinfo$cellType)
head(sampleinfo)

In [None]:
# Per-sample label: "<sampleName> <cellType> <healthy>".
labels <- paste(sampleinfo$sampleName, sampleinfo$cellType, sampleinfo$healthy)
# Per-sample group: "<cellType>.<healthy>", turned into a factor.
group <- paste(sampleinfo$cellType,sampleinfo$healthy,sep=".")
group <- factor(group)
# Glimma MDS plot of y, labelled and grouped as above, using the "mds" folder.
glMDSPlot(y, labels=labels, groups=group, folder="mds")

In [None]:
# our groups
group

In [None]:
# Specify a design matrix without an intercept term
# (one indicator column per level of group).
design <- model.matrix(~ 0 + group)

In [None]:
# Rename the columns to the bare group levels so they can be referenced by
# name when contrasts are defined.
colnames(design) <- levels(group)

In [None]:
# Single plotting panel, then voom-transform y under the design (plot = TRUE
# also draws voom's diagnostic plot).
par(mfrow=c(1,1))
v <- voom(y,design,plot = TRUE)

In [None]:
# Display the DGEList.
y

In [None]:
# Fit a linear model per miRNA to the voom output and list the components
# of the fit object.
fit <- lmFit(v)
names(fit)

In [None]:
# we want to compare healthy naive cells to pre-t1 naive cells
# (requires design to have columns named Naive.TRUE and Naive.FALSE)
cont.matrix <- makeContrasts(B.healthVsdiab=Naive.TRUE - Naive.FALSE,levels=design)
cont.matrix

In [None]:
# Re-express the fit in terms of the contrast, then apply empirical Bayes
# moderation.
fit.cont <- contrasts.fit(fit, cont.matrix)
fit.cont <- eBayes(fit.cont)
dim(fit.cont)


In [None]:
# Classify each miRNA for the contrast and tabulate the outcome.
summa.fit <- decideTests(fit.cont)
summary(summa.fit)

# END TEST

In [None]:
# Rows of `all` that come from pre-T1D (non-healthy) samples.
pret1 <- all[!all$healthy, ]
# Rows of `all` that come from samples of the Naive cell type.
naive <- all[all[["cellType"]] == "Naive", ]


In [None]:
# Pivot the Naive rows into one row per miRNA and one column per sample of
# total read counts; unobserved combinations are filled with 0.
countdata <- dcast(naive, miRNA ~ sampleName, value.var= "Total.miRNA.reads", fill= 0)

# Move the miRNA identifiers into the row names and drop the column.
rownames(countdata) <- countdata$miRNA
countdata$miRNA <- NULL
countdata

In [None]:
# Derive each sample's group from the data instead of hard-coding
# "6 pre-T1D followed by 7 healthy": the hard-coded labels silently mislabel
# samples whenever the column order or sample count of countdata differs.
naive_samples <- unique(naive[, c("sampleName", "healthy")])
sample_is_healthy <- naive_samples$healthy[
    match(colnames(countdata), naive_samples$sampleName)
]
group <- factor(ifelse(sample_is_healthy, "healthyNaive", "pret1Naive"))

# # create model
# design2<-model.matrix(~0+group)
# design1<-model.matrix(~group)
# design<-cbind(design1[,1],design1[,3],design2[,1],design1[,4:6])
# # colnames(design)<-c("(Intercept)","grouprTreg","groupaTreg","groupTcm","groupTem","groupTtm")
# design


In [None]:
# DGEList of the Naive-sample counts with the per-sample group attached.
d <- DGEList(counts=countdata,group=group)

# (expression filter currently disabled)
# #filter, need counts over 1 in 6 samples
# keep <- rowSums(cpm(d)>1) >= 6
# d <- d[keep,]
dim (d)

In [None]:
# Per-sample normalisation factors computed from the count table.
nf<-calcNormFactors(d$counts)
# Design with an intercept: column 1 is the intercept (the first level of
# group), column 2 is the effect of the second level relative to it.
design<-model.matrix(~group)
design

In [None]:
# Normalisation factors, used to scale the raw library sizes passed to voom.
nf <- calcNormFactors(d$counts)
y <- voom(
    d$counts,
    design = design,
    lib.size = colSums(d$counts) * nf,
    plot = TRUE
)

# Linear model per miRNA followed by empirical Bayes moderation.
fit <- eBayes(lmFit(y, design))
summary(decideTests(fit))
# Top 5 rows for the second coefficient of the design.
degene <- topTable(fit, coef = 2, number = 5)

In [None]:
# Top 20 miRNAs for the second coefficient, ordered by log fold change.
topTable(fit,coef=2, sort.by='logFC',number=20)

In [None]:
# Same table restricted to the top 10.
topTable(fit,coef=2, sort.by='logFC',number=10)

In [None]:
par(mfrow=c(1,2))
# Plot the second coefficient: in the ~group design it is the difference
# between the two groups, which is the comparison named in the title (and the
# coefficient used by the topTable calls). Coefficient 1 is the intercept.
# status supplies the per-miRNA test outcome that values = c(-1, 1) refers to;
# without it the values argument has nothing to highlight.
plotMD(fit, coef = 2, status = decideTests(fit)[, 2], values = c(-1, 1))

# For the volcano plot we have to specify how many of the top genes to highlight.
# We can also specify the names to print for the highlighted genes.
# let's highlight the top 20 most DE miRNAs, labelled by row name
volcanoplot(fit, coef = 2, highlight = 20, names = rownames(fit))
title('Healthy vs Pre-T1D in Naive Cells')

In [None]:
# topTable(fit,coef=2,sort.by="p", n = "Inf")


In [None]:
# selects only rtreg and atreg
# Takes columns 8..20 of countdata and adds a 0.5 pseudo-count to every cell.
# NOTE(review): at this point countdata was built from the Naive rows only, so
# it may not have 20 columns or contain rTreg/aTreg samples -- confirm which
# count table this cell is meant to use.
c1<- countdata[8:20]+0.5
# Hard-coded labels: first 7 selected columns are rTreg, last 6 are aTreg.
group<-factor(c(rep("rTreg",7),rep("aTreg",6)))
d <- DGEList(counts=c1,group=group)
d$samples
# Keep rows whose total count is at least 1.5 per sample on average.
dd <- d[rowSums(d$counts) >= 1.5*ncol(d), ]
# Counts per million, using the library sizes of the unfiltered object d;
# from here on d is a plain matrix rather than a DGEList.
d<-1e+06 * dd$count/expandAsMatrix(d$samples$lib.size,dim(dd))

d<-log2(d)

# F starts as a one-cell data frame holding nrow(d); the loop below assigns
# F[i], i.e. column i, so F ends up as a one-row data frame with one column
# per miRNA. NOTE(review): the name F also hides the F shorthand for FALSE.
F<-data.frame(nrow(d))


# x starts as a one-row, two-column data frame; the loop fills row i with the
# two per-group means of miRNA i.
x<-data.frame(nrow(d),2)

# ABS F FUNCTION
for (i in 1:nrow(d)) { # for each mirna
    # a: between-group term -- absolute difference of each group mean from the
    # overall mean, weighted by the group sizes (7 and 6).
    a<-7*abs(mean(d[i,1:7])-mean(d[i,1:13]))+6*abs(mean(d[i,8:13])-mean(d[i,1:13]))
    # mean of the first group - mean of all groups for each mirna, + the difference in means between the second group and all mirna
    # MULTIPLY EACH ABS DIFF BY THE GROUP SIZE.
    # bb: within-group term -- sum of absolute deviations of every sample from
    # its own group mean.
    bb = 0
    for (j in 1:7) { # FIRST GROUP
        b<-abs(d[i,j]-mean(d[i,1:7]))
        bb = bb +b
        }

    for (j in 8:13) { # SECOND GROUP
        b<-abs(d[i,j]-mean(d[i,8:13]))
        bb = bb +b
    }

    # Score = 12 * between / within.
    # NOTE(review): the origin of the constant 12 is not evident here -- confirm.
    F[i]<-12*a/bb

    # Column 2 holds the mean of samples 1:7, column 1 the mean of samples 8:13.
    x[i,2]<-mean(d[i,1:7])
    x[i,1]<-mean(d[i,8:13])

}

# F holds the absF score for each mirna (one column per miRNA, see above).
names(F)<-rownames(d)
rownames(x)<-rownames(d)
# NOTE(review): presumably meant to count the miRNAs scoring above 9.6; F is a
# data frame here, so verify that this expression returns that count.
length(F[F>9.6,])

## 93 for rTreg 2 aTreg
# 109

# Distance function for the heatmap: amap::Dist with method "pearson".
mydist<-function(c) {
require(amap)
Dist(c,method="pearson")
 }
# Clustering function for the heatmap: hclust with the ward.D method.
myclust<-function(c) { hclust(c,method='ward.D') }
# Row-scaled heatmap of the log2 CPM matrix with the custom distance and
# clustering functions.
heatmap.2(d,distfun=mydist,hclustfun=myclust,trace="none",cexRow=1.0,cexCol=1.2,las=2,col=greenred(30),lhei=c(1,7),scale="row",sepcolor="yellow", sepwidth = 0.1,margin=c(9,9))




In [None]:
# For the first 93 names of -sort(-F) (intended: the 93 highest absF scores),
# take column 1 of x minus column 2 of x, i.e. mean of samples 8:13 minus mean
# of samples 1:7 on the log2 scale.
# NOTE(review): 93 is hard-coded; it appears to come from the count noted in
# the previous cell -- confirm.
f<-x[names(-sort(-F))[1:93],1]-x[names(-sort(-F))[1:93],2]
names(f)<-names(-sort(-F))[1:93]
# Names whose difference is at least 2 in either direction.
id<-c(names(f[f>=2]),names(f[-f>=2]))

In [None]:
# Display the log2 CPM matrix.
d

In [None]:
# When in doubt, a threshold of 1 CPM in at least the minimum group's sample size is a good rule of thumb.

In [None]:
# print(required_columns)
# countdata <- cast(all, miRNA ~ sampleName, value= 'Total.miRNA.reads', fill= 0)
# countdata

In [None]:
# myCPM <- cpm(countdata)
# NOTE: with the cpm() call above commented out, myCPM holds the values of
# countdata unchanged, not counts per million.
myCPM <- countdata
head(myCPM)

In [None]:
# Which values in myCPM are greater than 1?
thresh <- myCPM > 1
# This produces a logical matrix with TRUEs and FALSEs
head(thresh)

In [None]:
# How many miRNAs pass the threshold in 0, 1, 2, ... samples.
table(rowSums(thresh))

In [None]:
# we would like to keep genes that have at least 7 TRUES in each row of thresh
keep <- rowSums(thresh) >= 7
# Subset the rows of countdata to keep the more highly expressed genes
counts.keep <- countdata[keep,]
summary(keep)

In [None]:
# Rows (miRNAs) and columns (samples) remaining after filtering.
dim(counts.keep)


In [None]:
# rownames(counts.keep) <- counts.keep$miRNA
# counts.keep$miRNA <- NULL
# counts.keep

# Get log2 counts per million
# (computed from the unfiltered countdata, not from counts.keep)
logcounts <- cpm(countdata, log=TRUE)
# Check distributions of samples using boxplots
boxplot(logcounts, xlab="", ylab="Log2 reads per million")
# Let's add a blue horizontal line that corresponds to the median logCPM
# abline(h=median(logcounts),col="blue")
title("Boxplots of logCPMs (unnormalised)")


In [None]:

# DGEList of the unfiltered counts, kept separately as y2.
y2 <- DGEList(countdata)
# Apply normalisation to DGEList object
y2 <- calcNormFactors(y2)
y2

In [None]:
# DGEList of the filtered counts; this is the object used by the cells below.
y <- DGEList(counts.keep)
# Apply normalisation to DGEList object
y <- calcNormFactors(y)
y

In [None]:
# Distribution of log2 RPM over all rows of `all`. The column is addressed by
# its exact name (note the trailing dot) instead of relying on `$` partial
# matching, which silently picks whatever column happens to share the prefix.
hist(log2(all[["RPM..reads.per.million."]]), main='Histogram of log2 of miRNA reads per million')

In [None]:
# Distribution of log2 total read counts over all rows of `all`.
hist(log2(all$"Total.miRNA.reads"), main='Histogram of log2 of Total miRNA reads')


In [None]:
# Display the filtered count table.
counts.keep

In [None]:
# One metadata row per sample: name, health status and cell type.
sampleinfo <- unique(all[, c("sampleName", "healthy", "cellType")])
# `all` covers every cell type, while y was built from the Naive-only count
# table. Keep just the samples that are columns of y, in the same order, so
# that labels and groups derived from sampleinfo match the counts.
sampleinfo <- sampleinfo[match(colnames(y$counts), sampleinfo$sampleName), ]
row.names(sampleinfo) <- NULL

# The column is spelled cellType; sampleinfo$CellType would be NULL.
data.frame(sampleinfo$cellType)
head(sampleinfo)

In [None]:
# Per-sample label: "<sampleName> <cellType> <healthy>".
labels <- paste(sampleinfo$sampleName, sampleinfo$cellType, sampleinfo$healthy)
# Groups are named "<cellType>.<healthy>" (e.g. Naive.TRUE). The contrast
# defined further down refers to Naive.TRUE and Naive.FALSE, so the cell type
# must be part of the group name; bare "TRUE"/"FALSE" levels cannot be used
# there.
group <- paste(sampleinfo$cellType, sampleinfo$healthy, sep = ".")
group <- factor(group)
glMDSPlot(y, labels=labels, groups=group, folder="mds")

In [None]:
# our groups
group

In [None]:
# Specify a design matrix without an intercept term
# (one indicator column per level of group).
design <- model.matrix(~ 0 + group)

In [None]:
# Rename the columns to the bare group levels so contrasts can refer to them.
colnames(design) <- levels(group)

In [None]:
# Single plotting panel, then voom-transform y under the design (plot = TRUE
# also draws voom's diagnostic plot).
par(mfrow=c(1,1))
v <- voom(y,design,plot = TRUE)

In [None]:
# Side-by-side comparison: logCPM of the unfiltered counts (left) against the
# voom-transformed values v$E (right).
par(mfrow=c(1,2))
boxplot(logcounts, xlab="", ylab="Log2 counts per million",las=2,main="Unnormalised logCPM")
## Let's add a blue horizontal line that corresponds to the median logCPM
abline(h=median(logcounts),col="blue")
boxplot(v$E, xlab="", ylab="Log2 counts per million",las=2,main="Voom transformed logCPM")
## Let's add a blue horizontal line that corresponds to the median logCPM
abline(h=median(v$E),col="blue")

In [None]:
# Components of the voom output.
names(v)

In [None]:
# Fit a linear model per miRNA to the voom output and list the components
# of the fit object.
fit <- lmFit(v)
names(fit)

In [None]:
# we want to compare healthy naive cells to pre-t1 naive cells
# (requires design to have columns named Naive.TRUE and Naive.FALSE)
cont.matrix <- makeContrasts(B.healthVsdiab=Naive.TRUE - Naive.FALSE,levels=design)
cont.matrix

In [None]:
# Re-express the fit in terms of the contrast, then apply empirical Bayes
# moderation.
fit.cont <- contrasts.fit(fit, cont.matrix)
fit.cont <- eBayes(fit.cont)
dim(fit.cont)


In [None]:
# Classify each miRNA for the contrast and tabulate the outcome.
summa.fit <- decideTests(fit.cont)
summary(summa.fit)

In [None]:
# Top 30 miRNAs for the contrast, ordered by log fold change.
topTable(fit.cont,coef="B.healthVsdiab",sort.by="logFC", number=30)


In [None]:
par(mfrow=c(1,2))
# Mean-difference plot for the contrast, highlighting the miRNAs that
# decideTests classified as -1 or 1.
plotMD(fit.cont,coef=1,status=summa.fit[,"B.healthVsdiab"], values = c(-1, 1))

# For the volcano plot we have to specify how many of the top genes to highlight.
# let's highlight the top 100 most DE miRNAs.
# The highlighted points are labelled with the row names of the fit (the miRNA
# identifiers). No gene annotation was attached when the DGEList was built, so
# fit.cont$genes$SYMBOL is NULL and cannot supply the labels.
volcanoplot(fit.cont,coef=1,highlight=100,names=rownames(fit.cont))

In [None]:
# Display the full contrast fit object.
fit.cont