# DESeq2: Create count matrix and metadata data frame

## Objective: create a count matrix and metadata frame for the 2019 pilot data

### Load packages

In [1]:
library(tidyverse)

Registered S3 methods overwritten by 'ggplot2':
  method         from 
  [.quosures     rlang
  c.quosures     rlang
  print.quosures rlang
── Attaching packages ─────────────────────────────────────── tidyverse 1.2.1 ──
✔ ggplot2 3.1.1     ✔ purrr   0.3.2
✔ tibble  2.1.2     ✔ dplyr   0.8.1
✔ tidyr   0.8.3     ✔ stringr 1.4.0
✔ readr   1.3.1     ✔ forcats 0.4.0
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()


### Import Counts

In [2]:
### Due to time constraints, we skip the import of the count files
### (see HTS tidyverse notebook) and restore the previously saved objects.

cntfile <- "/home/jovyan/work/scratch/analysis_output/out/hts-pilot-2019.RData"

### Load the saved objects into a private environment instead of attach()ing
### the file: nothing is added to the search path, and nothing already in the
### workspace can be silently overwritten by an object of the same name.
cnt_env <- new.env()
load(cntfile, envir = cnt_env)

### Checksum of the file that was loaded, for provenance
tools::md5sum(cntfile)

### Names of the objects stored in the file
ls(cnt_env)

### Pull out the count table used by the rest of this notebook;
### get() stops with an error if the file does not contain it.
genecounts <- get("genecounts", envir = cnt_env, inherits = FALSE)

In [3]:
### Preview the first eight libraries (rows): the expid column plus the
### first four gene count columns
genecounts[seq_len(8), seq_len(5)]

### Overall dimensions of the count table
dim(genecounts)

expid,CNAG_00001,CNAG_00002,CNAG_00003,CNAG_00004
<chr>,<int>,<int>,<int>,<int>
1_2019_P_M1_S1_L001_ReadsPerGene.out.tab,0,35,48,223
1_2019_P_M1_S1_L002_ReadsPerGene.out.tab,0,43,46,227
1_2019_P_M1_S1_L003_ReadsPerGene.out.tab,0,46,49,232
1_2019_P_M1_S1_L004_ReadsPerGene.out.tab,0,34,58,222
10_2019_P_M1_S10_L001_ReadsPerGene.out.tab,0,30,36,130
10_2019_P_M1_S10_L002_ReadsPerGene.out.tab,0,37,37,117
10_2019_P_M1_S10_L003_ReadsPerGene.out.tab,0,29,31,135
10_2019_P_M1_S10_L004_ReadsPerGene.out.tab,0,23,27,131


### Filter out 2018 libraries from count file

In [4]:
### Keep only the rows whose expid contains "2019"
genecounts2019 <- filter(genecounts, str_detect(expid, "2019"))

### Dimensions after filtering
dim(genecounts2019)

### Import metadata file

In [5]:
### Location of the tab-separated sample metadata
metadtfile <- "/data/hts_2019_data/hts2019_pilot_rawdata/2019_pilot_metadata.tsv"

### Checksum of the metadata file, for provenance
tools::md5sum(metadtfile)

### Read the metadata; readr guesses the column types and reports them
mtdf <- readr::read_tsv(file = metadtfile)

### Display the metadata table
mtdf

Parsed with column specification:
cols(
  .default = col_character(),
  sample_year = col_double(),
  enrich_rep = col_double(),
  RNA_sample_num = col_double(),
  library_num = col_double(),
  bio_replicate = col_double(),
  Nanodrop_260_280 = col_double(),
  Nanodrop_260_230 = col_double(),
  Nanodrop_concentration_ng_ul = col_double(),
  Bioanalyzer_concentration_ng_ul = col_double(),
  RIN_lowered_threshold = col_double()
)
See spec(...) for full column specifications.


Label,sample_year,group,enrich_rep,RNA_sample_num,genotype,condition,libprep_person,enrichment_method,enrichment_short,⋯,i5_primer,i7_primer,library_num,bio_replicate,Nanodrop_260_280,Nanodrop_260_230,Nanodrop_concentration_ng_ul,Bioanalyzer_concentration_ng_ul,RIN_normal_threshold,RIN_lowered_threshold
<chr>,<dbl>,<chr>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,⋯,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<dbl>
1_2019_P_M1,2019,P,1,1,WT,pH4,C,mRNA,M,⋯,i501,i701,1,1,2.14,1.52,293.0,197,,9.8
2_2019_P_M1,2019,P,1,2,WT,pH4,C,mRNA,M,⋯,i502,i701,2,2,2.12,1.79,290.0,225,,9.9
3_2019_P_M1,2019,P,1,3,WT,pH4,C,mRNA,M,⋯,i503,i701,3,3,2.11,2.49,302.0,241,,9.9
4_2019_P_M1,2019,P,1,4,WT,pH4,P,mRNA,M,⋯,i504,i701,4,4,2.13,1.15,296.0,189,,9.7
5_2019_P_M1,2019,P,1,5,WT,pH4,P,mRNA,M,⋯,i505,i701,5,5,2.09,2.42,337.0,268,10.0,10.0
6_2019_P_M1,2019,P,1,6,WT,pH4,P,mRNA,M,⋯,i506,i701,6,6,2.08,2.4,319.0,276,10.0,10.0
7_2019_P_M1,2019,P,1,7,sre1d,pH4,C,mRNA,M,⋯,i507,i701,7,1,2.13,2.23,232.0,127,,9.9
8_2019_P_M1,2019,P,1,8,sre1d,pH4,C,mRNA,M,⋯,i508,i701,8,2,2.08,2.24,320.0,311,,10.0
9_2019_P_M1,2019,P,1,9,sre1d,pH4,C,mRNA,M,⋯,i501,i702,9,3,2.09,1.46,342.0,326,9.6,9.6
10_2019_P_M1,2019,P,1,10,sre1d,pH4,P,mRNA,M,⋯,i502,i702,10,4,2.16,1.25,262.0,168,10.0,10.0


### Filter out 2018 samples from the metadata

In [6]:
### Keep only the metadata rows whose Label contains "2019"
mtdf2019 <- filter(mtdf, str_detect(Label, "2019"))

### Dimensions after filtering
dim(mtdf2019)

### Add a label column to the counts object so that it can be merged with the metadata

In [7]:
### Suffix that follows the sample label in each count file name:
### "_S<sample number>_L00<lane>_ReadsPerGene.out.tab".
### The dots are escaped so they match literal periods only, and the
### pattern is anchored with "$" so that only a trailing suffix is removed.
myregex <- "_S[1-9][0-9]{0,2}_L00[1-4]_ReadsPerGene\\.out\\.tab$"


### dim before adding label column
dim(genecounts2019)

### Strip the suffix from expid to obtain the Label used in the metadata
genecounts2019 <- genecounts2019 %>%
    mutate(Label = str_replace(expid, myregex, ""))

### dim after adding label column (one more column than before)

dim(genecounts2019)

In [8]:
### Preview the first eight libraries: expid, the first four gene count
### columns, and the last column of the table (the newly added Label)
genecounts2019[seq_len(8), c(seq_len(5), ncol(genecounts2019))]

### Overall dimensions of the labelled count table
dim(genecounts2019)

expid,CNAG_00001,CNAG_00002,CNAG_00003,CNAG_00004,Label
<chr>,<int>,<int>,<int>,<int>,<chr>
1_2019_P_M1_S1_L001_ReadsPerGene.out.tab,0,35,48,223,1_2019_P_M1
1_2019_P_M1_S1_L002_ReadsPerGene.out.tab,0,43,46,227,1_2019_P_M1
1_2019_P_M1_S1_L003_ReadsPerGene.out.tab,0,46,49,232,1_2019_P_M1
1_2019_P_M1_S1_L004_ReadsPerGene.out.tab,0,34,58,222,1_2019_P_M1
10_2019_P_M1_S10_L001_ReadsPerGene.out.tab,0,30,36,130,10_2019_P_M1
10_2019_P_M1_S10_L002_ReadsPerGene.out.tab,0,37,37,117,10_2019_P_M1
10_2019_P_M1_S10_L003_ReadsPerGene.out.tab,0,29,31,135,10_2019_P_M1
10_2019_P_M1_S10_L004_ReadsPerGene.out.tab,0,23,27,131,10_2019_P_M1


### Sum counts across lanes for each label

In [9]:
### Collapse the per-lane rows into a single row per Label by summing every
### remaining column. expid is excluded because it is a per-file character
### column that cannot be summed; the grouping column Label is kept
### automatically. summarise_at() replaces the deprecated summarize_each().
cnt2019 <- genecounts2019 %>%
    group_by(Label) %>%
    summarise_at(vars(-expid), sum)

In [10]:
### Preview the first four labels: Label plus the first four gene columns
cnt2019[seq_len(4), seq_len(5)]

Label,CNAG_00001,CNAG_00002,CNAG_00003,CNAG_00004
<chr>,<int>,<int>,<int>,<int>
1_2019_P_M1,0,158,201,904
10_2019_P_M1,0,119,131,513
11_2019_P_M1,0,90,121,573
12_2019_P_M1,0,81,151,533


### Check that every label in the count table has an entry in the metadata, and vice versa

In [11]:
### Labels present in the counts but absent from the metadata
setdiff(cnt2019[["Label"]], mtdf2019[["Label"]])

### Labels present in the metadata but absent from the counts
setdiff(mtdf2019[["Label"]], cnt2019[["Label"]])

### Save objects to image file

In [13]:
### Directory that will hold the saved count/metadata objects
curdir <- "/home/jovyan/work/scratch/analysis_output"
imgdir <- file.path(curdir, "img")

### save() does not create missing directories, so make sure the output
### directory exists before anything is written into it
if (!dir.exists(imgdir)) {
    dir.create(imgdir, recursive = TRUE)
}

### Full path of the image file
imgfile <- file.path(imgdir, "pilotcnt2019.RData")

imgfile

In [14]:
### Write the summed counts and the 2019 metadata to the image file
save(list = c("cnt2019", "mtdf2019"), file = imgfile)

### Checksum of the file just written, for provenance
tools::md5sum(imgfile)

In [15]:
### Record the R version and package versions used for this run
utils::sessionInfo()

R version 3.6.0 (2019-04-26)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Debian GNU/Linux 9 (stretch)

Matrix products: default
BLAS:   /usr/lib/openblas-base/libblas.so.3
LAPACK: /usr/lib/libopenblasp-r0.2.19.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] forcats_0.4.0   stringr_1.4.0   dplyr_0.8.1     purrr_0.3.2    
[5] readr_1.3.1     tidyr_0.8.3     tibble_2.1.2    ggplot2_3.1.1  
[9] tidyverse_1.2.1

loaded via a namespace (and not attached):
 [1] Rcpp_1.0.1       cellranger_1.1.0 plyr_1.8.4       pillar_1.4.1    
 [5] compiler_3.