In [3]:
library(limma)
library(reshape2)
library(Glimma)
library(ggplot2)
library(ggrepel)
# library(reshape)
library(edgeR)
library(gplots)
library(amap)
library(ggfortify)
library(data.table)
# library(Mus.musculus)


Attaching package: 'gplots'


The following object is masked from 'package:stats':

    lowess



Attaching package: 'data.table'


The following objects are masked from 'package:reshape2':

    dcast, melt




In [4]:

# NOTE: machine-specific absolute path. Change this to the local checkout of
# the project before running; the data directories used below ("data/raw/",
# "data/temp/") are relative to this working directory.
setwd("C:\\Users\\Pete\\DSC180B-PROJECT")

In [5]:
# Source archive and local layout used by the ingestion step.
file_name <- "GSE44639_RAW.tar"
file_url <- "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE44639&format=file"
raw_dir <- "data/raw/"    # downloaded archive is stored here
temp_dir <- "data/temp/"  # archive contents are extracted here

In [6]:
#' Download a tar archive and unpack it.
#'
#' @param file_url URL of the archive to download.
#' @param file_name File name to store the download under inside `raw_dir`.
#' @param raw_dir Directory for the downloaded archive; created (recursively)
#'   if it does not exist. It is concatenated directly with `file_name`, so it
#'   must end in a path separator.
#' @param temp_dir Directory the archive is extracted into.
#' @return The value returned by `utils::untar()`.
ingest_data <- function(file_url, file_name, raw_dir, temp_dir) {
    if (!dir.exists(raw_dir)) {
        dir.create(raw_dir, recursive = TRUE)
    }
    archive_path <- paste0(raw_dir, file_name)

    # Some download methods report failure only through a non-zero status code
    # (with a warning), so check it before trying to unpack a partial file.
    status <- utils::download.file(file_url, destfile = archive_path, mode = "wb")
    if (!identical(as.integer(status), 0L)) {
        stop("Download of ", file_url, " failed with status ", status,
             call. = FALSE)
    }

    utils::untar(archive_path, exdir = temp_dir)
}

# Fetch and unpack the raw archive. ingest_data() downloads unconditionally,
# so re-running this cell repeats the network download.
ingest_data(file_url, file_name, raw_dir, temp_dir)

In [7]:
#' Read every sample file in a directory and stack them into one data frame.
#'
#' Each file is read as (optionally gzip-compressed) tab-delimited text. Files
#' containing any column that is not listed in `required_columns` are skipped.
#' Three columns are appended to every kept file, all derived from the file
#' name (assumed to look like `<id>_<donor>_<cellType>.txt[.gz]`):
#' `sampleName` (file name up to `.txt`), `cellType` (third `_`-separated
#' field) and `healthy` (`TRUE` when the sample name contains no "P").
#'
#' @param file_dir Directory holding the sample files.
#' @param required_columns List or character vector of permitted column names.
#' @return A data frame with the rows of all kept files, or an empty `list()`
#'   when no file was kept.
read_all_files_in_dir_with_columns <- function(file_dir, required_columns) {
    paths <- list.files(path = file_dir, full.names = TRUE)
    allowed <- unlist(required_columns)

    frames <- list()
    for (fp in paths) {
        seqdata <- read.delim(gzfile(fp), stringsAsFactors = FALSE)

        # ignore samples with columns not present in required_columns
        if (!all(colnames(seqdata) %in% allowed)) {
            next
        }

        # Parse metadata from the file name only, never from the directory
        # part: a "/", "_" or "P" in the directory path must not change it.
        sample_name <- sub("\\.txt.*$", "", basename(fp))
        cell_type <- strsplit(sample_name, "_", fixed = TRUE)[[1]][3]
        healthy <- !grepl("P", sample_name, fixed = TRUE)

        seqdata$healthy <- healthy
        seqdata$cellType <- cell_type
        seqdata$sampleName <- sample_name

        frames[[length(frames) + 1]] <- seqdata
    }

    # Keep the original "nothing matched" result: an empty list.
    if (length(frames) == 0) {
        return(list())
    }

    # Bind once at the end instead of growing a data frame inside the loop.
    do.call(rbind, frames)
}

# Columns a sample file is allowed to contain; files with any other column are
# skipped by read_all_files_in_dir_with_columns().
# NOTE: the combined table is stored in a variable named `all`, the same name
# as base::all(). Later cells refer to it by this name.
required_columns <- list("miRNA", "Chromosome", "Position", "Strand", "Total.miRNA.reads", "RPM..reads.per.million.")
all <- read_all_files_in_dir_with_columns(temp_dir, required_columns)

In [10]:
# Preview the first rows of the combined per-sample table.
head(all)

Unnamed: 0_level_0,miRNA,Chromosome,Position,Strand,Total.miRNA.reads,RPM..reads.per.million.,healthy,cellType,sampleName
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<int>,<int>,<lgl>,<chr>,<chr>
1,hsa-miR-378c,chr10,132760851,-,7,1,True,Naive,GSM1088200_M7_Naive
2,hsa-mir-1321,chrX,85090785,+,26,6,True,Naive,GSM1088200_M7_Naive
3,hsa-miR-378d,chr8,94928250,-,2,1,True,Naive,GSM1088200_M7_Naive
4,hsa-miR-2110,chr10,115933864,-,5,1,True,Naive,GSM1088200_M7_Naive
5,hsa-mir-200b,chr1,1102484,+,2,1,True,Naive,GSM1088200_M7_Naive
6,hsa-miR-125a-5p,chr19,52196507,+,7,1,True,Naive,GSM1088200_M7_Naive


In [97]:
#' Pivot ingested miRNA counts into a miRNA x sample table.
#'
#' @param countdata Long-format data frame from the ingestion step; must have
#'   the columns `miRNA` and `sampleName` plus the column named by
#'   `value_column`.
#' @param value_column Name of the column holding the reads.
#' @param groupNames Vector (or list) of group names.
#' @param groupSizes Vector (or list) with the number of samples per group, in
#'   the same order as `groupNames`.
#' @return A list with `countdata` (wide table, miRNA as row names, one column
#'   per sample, missing combinations filled with 0) and `group` (factor with
#'   each group name repeated by its size).
pivot_data <- function(countdata, value_column, groupNames, groupSizes) {
    # Pivot the data that was passed in, not a global variable.
    wide <- dcast(countdata, miRNA ~ sampleName,
                  value.var = value_column, fill = 0)
    rownames(wide) <- wide$miRNA
    wide$miRNA <- NULL

    # Labels are positional: group i covers the next groupSizes[i] columns.
    group <- factor(rep(unlist(groupNames), times = unlist(groupSizes)))
    if (length(group) != ncol(wide)) {
        warning("Number of group labels (", length(group),
                ") differs from number of samples (", ncol(wide), ")",
                call. = FALSE)
    }

    list("countdata" = wide, "group" = group)
}

#' Filter lowly expressed miRNAs and return log2 reads-per-million.
#'
#' @param countdata Numeric miRNA x sample count table.
#' @param group Factor with one group label per sample (column).
#' @return Matrix of log2 RPM values for the rows that pass the filter.
filter_logcounts <- function(countdata, group) {
    # The 0.5 offset keeps every count positive, so log2() below stays finite.
    offset_counts <- countdata + 0.5
    dge <- DGEList(counts = offset_counts, group = group)

    # got values for filter from paper, Yuxia Zhang
    # Keep rows whose summed counts reach 1.5 * (number of samples).
    # NOTE(review): the sum is taken after the 0.5 offset, so every row already
    # includes 0.5 per sample -- confirm this matches the intended filter.
    keep <- rowSums(dge$counts) >= 1.5 * ncol(dge$counts)
    kept_counts <- dge$counts[keep, , drop = FALSE]

    # Scale by the library sizes of the unfiltered data to get RPM.
    lib_sizes <- expandAsMatrix(dge$samples$lib.size, dim(kept_counts))
    rpm <- 1e+06 * kept_counts / lib_sizes

    log2(rpm)
}

In [99]:
# Group labels are assigned purely by position: pivot_data() repeats each name
# groupSizes[i] times, so the order here must match the column order of the
# pivoted count table.
# NOTE(review): the recorded output shows the first columns are the M* samples,
# which the ingestion step flags healthy = TRUE (no "P" in the name), yet the
# first labels here are the "pret1*" groups and the trailing P* columns receive
# "healthy*" labels. One of the two conventions looks inverted -- confirm
# against the GEO sample annotation.
# NOTE(review): the pret1 list is ordered "...Tcm, Ttm, rTem" while the healthy
# list is ordered "...Tcm, Tem, Ttm"; verify both against the column order.
groupNames <- c("pret1Naive","pret1rTreg","pret1aTreg","pret1Tcm","pret1Ttm","pret1rTem","healthyNaive","healthyrTreg",
               "healthyaTreg","healthyTcm","healthyTem","healthyTtm")
groupSizes <- c(6,8,9,5,5,5,7,7,6,7,7,7)
dataAndGroupList <- pivot_data(all, "Total.miRNA.reads", groupNames, groupSizes)
# The filtered log2 RPM matrix is only displayed here, not stored.
filter_logcounts(dataAndGroupList$countdata, dataAndGroupList$group)



Unnamed: 0,GSM1088200_M7_Naive,GSM1088201_M8_Naive,GSM1088202_M9_Naive,GSM1088203_M10_Naive,GSM1088204_M12_Naive,GSM1088205_M29_Naive,GSM1088206_M7_rTreg,GSM1088207_M8_rTreg,GSM1088208_M9_rTreg,GSM1088209_M10_rTreg,...,GSM1088270_P5_Tem,GSM1088271_P6_Tem,GSM1088272_P7_Tem,GSM1088273_P1_Ttm,GSM1088274_P2_Ttm,GSM1088275_P3_Ttm,GSM1088276_P4_Ttm,GSM1088277_P5_Ttm,GSM1088278_P6_Ttm,GSM1088279_P7_Ttm
hsa-let-7a,15.601431,14.667918,15.931762,15.345674,15.737569,16.7248120,14.662525,15.519505,14.599880,15.926746,...,15.304624,16.065307,15.752622,13.858913,16.149960,15.810463,15.539709,15.6540257,15.671697,15.630897
hsa-let-7a-1,10.959754,8.627902,9.849978,10.852441,10.494944,10.1990432,7.522973,12.655194,8.946438,9.670511,...,9.640929,13.259520,9.786825,7.595817,9.153334,9.956967,9.112977,8.7341197,9.059052,9.861231
hsa-let-7a-2,11.013193,10.006414,10.165284,9.620815,10.737283,10.2595460,7.522973,10.079882,9.683404,9.899416,...,8.747845,9.939419,9.905654,7.968860,9.314697,10.309735,9.412896,9.0535916,9.163075,9.991969
hsa-let-7a-3,10.786422,8.627902,9.805240,9.334907,10.446557,10.1786705,7.522973,10.079882,8.946438,9.630801,...,8.747845,9.518281,9.733061,7.497969,9.128162,9.883177,9.030177,8.6914228,9.059052,9.831817
hsa-let-7a*,5.231833,6.305974,2.816555,3.145082,4.283326,4.3549630,7.522973,5.436026,7.361476,1.882608,...,5.940490,3.635638,4.808737,5.913007,4.118104,1.461112,2.726396,6.4839993,2.317585,1.493452
hsa-let-7b,14.117530,14.864395,14.174657,12.153511,14.278151,14.8376991,15.022819,12.330844,13.701326,15.442583,...,12.540403,13.806815,14.325295,12.322398,15.131468,13.990299,13.452330,13.7198614,14.794078,13.728569
hsa-let-7b*,5.231833,6.305974,2.816555,3.145082,4.283326,0.3667723,7.522973,5.436026,7.361476,1.882608,...,5.940490,3.635638,4.127267,3.987007,2.695871,1.461112,1.141433,2.2192961,3.902547,2.715845
hsa-let-7c,10.589385,9.765405,11.204573,10.439703,9.824208,10.1553133,12.732427,10.862291,13.089396,9.519233,...,7.525452,8.764921,8.865933,7.731169,8.716462,9.337629,9.538038,8.6622396,10.118485,9.024834
hsa-let-7d,11.674777,12.087334,12.622299,13.001507,12.871208,13.0178922,9.107936,14.363804,11.448939,12.536349,...,11.366754,13.831626,12.698340,11.033586,12.768674,12.908712,12.418139,12.6259735,12.855774,12.994792
hsa-let-7d*,5.231833,6.305974,6.275987,4.730044,6.322855,3.1378311,7.522973,5.436026,8.946438,4.204536,...,5.940490,5.220601,6.312236,7.219668,2.806902,4.920544,4.311358,5.3531518,6.018025,5.791133


In [90]:
# Display the pivoted count table.
# NOTE(review): the error recorded below belongs to an earlier execution
# (In [90], lower than the In [97]/In [99] counts above) -- presumably
# dataAndGroupList was not yet a list at that point; re-run to refresh it.
dataAndGroupList$countdata

ERROR: Error in dataAndGroupList$countdata: $ operator is invalid for atomic vectors
