## The purpose of this notebook is to create a count matrix and metadata frame for the 2019 pilot data

### Load packages

In [None]:
library(tidyverse)

### Import Counts

In [None]:
### Due to time constraints, we skip the import of the raw count files and
### restore the previously saved count objects from an .RData image instead.

cntfile <- '/home/jovyan/work/scratch/analysis_output/out/hts-pilot-2019.RData'

### Fail early with a clear error if the image file is missing
stopifnot(file.exists(cntfile))

### Checksum documents exactly which image file was used
tools::md5sum(cntfile)

### load() is used instead of attach(): re-running this cell no longer stacks
### duplicate entries on the search path. load() returns the names of the
### restored objects, which replaces the position-dependent ls(2).
loaded_objects <- load(cntfile)
sort(loaded_objects)



In [None]:
### Peek at the count table: first 8 rows, first 5 columns
genecounts[seq_len(8), seq_len(5)]

### Overall size of the table (rows, columns)
dim(genecounts)

### Filter out 2018 libraries from count file

In [None]:
### Keep only the rows whose expid contains "2019"
genecounts2019 <- genecounts %>%
    filter(str_detect(expid, "2019"))

### Size of the table after filtering
dim(genecounts2019)

### Import metadata file

In [None]:
### Location of the tab-separated metadata file
metadtfile <- '/data/hts_2019_data/hts2019_pilot_rawdata/2019_pilot_metadata.tsv'

### Checksum documents exactly which metadata file was read
tools::md5sum(metadtfile)

### Read the metadata into a tibble and display it
mtdf <- readr::read_tsv(file = metadtfile)

mtdf

### Filter out 2018 samples from the metadata

In [None]:
### Keep only the metadata rows whose Label contains "2019"
mtdf2019 <- mtdf %>%
    filter(str_detect(Label, "2019"))

### Size of the metadata after filtering
dim(mtdf2019)

### Add a label column to the counts object so that it can be merged with the metadata

In [None]:
### Pattern for the suffix that follows the label inside expid:
### _S<1-999>_L00<1-4>_ReadsPerGene.out.tab
### The dots are escaped so they match a literal "." rather than any character.
myregex <- "_S[1-9][0-9]{0,2}_L00[1-4]_ReadsPerGene\\.out\\.tab"


### dim before adding label column
dim(genecounts2019)

### Label is expid with the suffix removed; rows sharing a Label can then be
### combined and matched against the Label column of the metadata
genecounts2019 <- genecounts2019 %>%
    mutate(Label = str_replace(expid, myregex, ""))

### dim after adding label column (one additional column is expected)

dim(genecounts2019)

In [None]:
### Peek at the first 8 rows: the first 5 columns plus the last column
### (the last column is shown to inspect the newly added Label)
genecounts2019[seq_len(8), c(seq_len(5), ncol(genecounts2019))]

### Overall size of the table (rows, columns)
dim(genecounts2019)

### Sum counts across lanes for each label

In [None]:
### Sum every count column over all rows that share a Label.
### summarize_each() has been deprecated since dplyr 0.7.0; summarize_at()
### makes the same selection (every non-grouping column except expid) and
### keeps the original column names because a single function is applied.
cnt2019 <- genecounts2019 %>%
    group_by(Label) %>%
    summarize_at(vars(-expid), sum)

In [None]:
### Peek at the summed counts: first 4 rows, first 5 columns
cnt2019[seq_len(4), seq_len(5)]

### Check that every label in the count file has an entry in the metadata file, and vice versa

In [None]:
### Labels present in the counts but missing from the metadata
### (an empty result means none are missing)
setdiff(cnt2019[["Label"]], mtdf2019[["Label"]])

### Labels present in the metadata but missing from the counts
setdiff(mtdf2019[["Label"]], cnt2019[["Label"]])

In [None]:
### Save objects to image file

In [None]:
### Output locations: the image file lives in the "img" subdirectory of the
### analysis output directory
curdir <- "/home/jovyan/work/scratch/analysis_output"
imgdir <- file.path(curdir, "img")
imgfile <- file.path(imgdir, "pilotcnt2019.RData")

### Display the full path of the image file
imgfile

In [None]:
### save() does not create missing directories, so make sure the target
### directory exists first (no effect if it is already there)
dir.create(dirname(imgfile), recursive = TRUE, showWarnings = FALSE)

### Store the lane-summed counts and the 2019 metadata in one image file
save(cnt2019, mtdf2019, file = imgfile)

### Checksum of the image file that was just written
tools::md5sum(imgfile)

In [None]:
### Record the R version, platform and attached/loaded packages for reproducibility
sessionInfo()