In [1]:
library(tidyverse)
library(foreach)
library(ggplot2)
library(RColorBrewer)

Registered S3 methods overwritten by 'ggplot2':
  method         from 
  [.quosures     rlang
  c.quosures     rlang
  print.quosures rlang
── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.2.1 ──
[32m✔[39m [34mggplot2[39m 3.1.1     [32m✔[39m [34mpurrr  [39m 0.3.2
[32m✔[39m [34mtibble [39m 2.1.2     [32m✔[39m [34mdplyr  [39m 0.8.1
[32m✔[39m [34mtidyr  [39m 0.8.3     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 1.3.1     [32m✔[39m [34mforcats[39m 0.4.0
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()

Attaching package: ‘foreach’

The following objects are masked from ‘package:purrr’:

    accumulate, when



In [2]:
# Root folder for this analysis; everything below is derived from it.
curdir <- "/home/jovyan/work/scratch/analysis_output"

# Sub-folders under the root: figures are written to `img`,
# saved R objects are read from and written to `out`.
imgdir <- file.path(curdir, "img")
outdir <- file.path(curdir, "out")

### Import Counts

In [3]:
# Saved workspace holding the count summaries produced earlier.
sum_cntfile <- file.path(outdir, "hts-pilot-2019.RData")

# Put the file's objects on the search path (nothing is copied into the
# global environment).
# NOTE(review): attach() stays on the search path and re-running this cell
# attaches the file again; consider load() into a dedicated environment once
# the full contents of the RData file are confirmed.
attach(what = sum_cntfile)

# Record the checksum of the input for provenance.
tools::md5sum(files = sum_cntfile)

In [4]:
# Report the dimensions (rows, columns) of the gene count table;
# presumably provided by the attached RData file.
dim(genecounts)

In [5]:
# Report the dimensions (rows, columns) of the mapping results table;
# presumably provided by the attached RData file.
dim(mapresults)

### Import metadata file

In [6]:
# Sample metadata table (tab-separated); checksum it for provenance.
metadtfile <- "/data/hts_2019_data/hts2019_pilot_rawdata/2019_pilot_metadata.tsv"
tools::md5sum(files = metadtfile)

# Parse the metadata, letting readr guess the column types.
# NOTE(review): the printed preview shows RIN_normal_threshold parsed as
# character while RIN_lowered_threshold is double -- confirm this is intended.
mtdf <- readr::read_tsv(file = metadtfile)

# Number of samples (rows) and metadata fields (columns).
dim(mtdf)

Parsed with column specification:
cols(
  .default = col_character(),
  sample_year = [32mcol_double()[39m,
  enrich_rep = [32mcol_double()[39m,
  RNA_sample_num = [32mcol_double()[39m,
  library_num = [32mcol_double()[39m,
  bio_replicate = [32mcol_double()[39m,
  Nanodrop_260_280 = [32mcol_double()[39m,
  Nanodrop_260_230 = [32mcol_double()[39m,
  Nanodrop_concentration_ng_ul = [32mcol_double()[39m,
  Bioanalyzer_concentration_ng_ul = [32mcol_double()[39m,
  RIN_lowered_threshold = [32mcol_double()[39m
)
See spec(...) for full column specifications.


In [7]:
# Preview the first rows of the metadata table.
head(mtdf)

Label,sample_year,group,enrich_rep,RNA_sample_num,genotype,condition,libprep_person,enrichment_method,enrichment_short,⋯,i5_primer,i7_primer,library_num,bio_replicate,Nanodrop_260_280,Nanodrop_260_230,Nanodrop_concentration_ng_ul,Bioanalyzer_concentration_ng_ul,RIN_normal_threshold,RIN_lowered_threshold
<chr>,<dbl>,<chr>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,⋯,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<dbl>
1_2019_P_M1,2019,P,1,1,WT,pH4,C,mRNA,M,⋯,i501,i701,1,1,2.14,1.52,293,197,,9.8
2_2019_P_M1,2019,P,1,2,WT,pH4,C,mRNA,M,⋯,i502,i701,2,2,2.12,1.79,290,225,,9.9
3_2019_P_M1,2019,P,1,3,WT,pH4,C,mRNA,M,⋯,i503,i701,3,3,2.11,2.49,302,241,,9.9
4_2019_P_M1,2019,P,1,4,WT,pH4,P,mRNA,M,⋯,i504,i701,4,4,2.13,1.15,296,189,,9.7
5_2019_P_M1,2019,P,1,5,WT,pH4,P,mRNA,M,⋯,i505,i701,5,5,2.09,2.42,337,268,10.0,10.0
6_2019_P_M1,2019,P,1,6,WT,pH4,P,mRNA,M,⋯,i506,i701,6,6,2.08,2.4,319,276,10.0,10.0


In [8]:
# Compact listing of every metadata column with its type and leading values.
str(mtdf)

Classes ‘spec_tbl_df’, ‘tbl_df’, ‘tbl’ and 'data.frame':	36 obs. of  22 variables:
 $ Label                          : chr  "1_2019_P_M1" "2_2019_P_M1" "3_2019_P_M1" "4_2019_P_M1" ...
 $ sample_year                    : num  2019 2019 2019 2019 2019 ...
 $ group                          : chr  "P" "P" "P" "P" ...
 $ enrich_rep                     : num  1 1 1 1 1 1 1 1 1 1 ...
 $ RNA_sample_num                 : num  1 2 3 4 5 6 7 8 9 10 ...
 $ genotype                       : chr  "WT" "WT" "WT" "WT" ...
 $ condition                      : chr  "pH4" "pH4" "pH4" "pH4" ...
 $ libprep_person                 : chr  "C" "C" "C" "P" ...
 $ enrichment_method              : chr  "mRNA" "mRNA" "mRNA" "mRNA" ...
 $ enrichment_short               : chr  "M" "M" "M" "M" ...
 $ i7_index                       : chr  "ATTACTCG" "ATTACTCG" "ATTACTCG" "ATTACTCG" ...
 $ i5_index                       : chr  "AGGCTATA" "GCCTCTAT" "AGGATAGG" "TCAGAGCC" ...
 $ i5_primer                      : chr  "i501"

### Check to see if we can match the labels of counts
### to those in the metadata file

In [9]:
# Suffix carried by each `expid` after the sample label:
#   _<capital letter><1-100>_L00<1-4>_ReadsPerGene.out.tab
# The dots are escaped so they match literal periods only, not any character.
myregex <- "_[A-Z](100|[1-9][0-9]?)_L00[1-4]_ReadsPerGene\\.out\\.tab"

# Metadata labels that have no matching mapping result ...
mtdf$Label %>%
    setdiff(gsub(myregex, "", mapresults$expid))

# ... and labels derived from the mapping results that have no metadata row.
# Both directions matter because the later full join keeps unmatched rows
# from either side.
gsub(myregex, "", mapresults$expid) %>%
    setdiff(mtdf$Label)

### Add the "Label" to the count matrix and mapping results,
### and then merge in the phenotype data (by Label)

### Add "Label" to genecounts

In [11]:
# Derive each row's sample Label by stripping the file suffix from `expid`,
# and keep the result as a new table alongside the original counts.
annogenecnts <- mutate(
    genecounts,
    Label = gsub(myregex, "", c(expid))
)

In [12]:
# Dimensions before adding Label, for comparison with the annotated table.
dim(genecounts)

In [13]:
# Dimensions after adding Label; the mutate above adds the Label column.
dim(annogenecnts)

### Collapse the gene counts within each label

In [14]:
# Collapse the counts to one column per Label:
#   1. drop the per-file identifier,
#   2. sum every remaining column within each Label,
#   3. reshape to long form (Label, gene, value),
#   4. spread back out so genes are rows and Labels are columns.
annogenecnts0 <- annogenecnts %>%
    select(-expid) %>%
    group_by(Label) %>%
    summarize_all(.funs = sum) %>%
    gather(key = gene, value = value, -Label) %>%
    spread(key = Label, value = value)

dim(annogenecnts0)

In [15]:
# Peek at the top-left corner (first 6 rows and columns) of the collapsed table.
annogenecnts0[1:6, 1:6]

gene,1_2019_P_M1,10_2019_P_M1,11_2019_P_M1,12_2019_P_M1,13_2019_P_M1
<chr>,<int>,<int>,<int>,<int>,<int>
CNAG_00001,0,0,0,0,0
CNAG_00002,158,119,90,81,188
CNAG_00003,201,131,121,151,215
CNAG_00004,904,513,573,533,474
CNAG_00005,22,24,18,20,25
CNAG_00006,5964,3712,3353,3631,2088


### Add "Label" to read map results and merge in phenotype data

In [16]:
# Tag every mapping-result row with its sample Label (file suffix removed),
# then full-join the metadata so unmatched rows from either table are kept.
annomapres <- full_join(
    x = mutate(mapresults, Label = str_replace(expid, myregex, "")),
    y = mtdf,
    by = "Label"
)

dim(annomapres)

In [17]:
# Mapping statistics to average.
sumvars <- vars(prob.gene, prob.nofeat, prob.unique, depth)

# Columns that identify a sample and its experimental factors.
grpvars <- vars(
    Label, genotype, condition, sample_year,
    libprep_person, enrichment_method
)

# Mean of each mapping statistic within each combination of the grouping
# columns.
annomapres0 <- summarize_at(
    group_by_at(annomapres, grpvars),
    sumvars,
    mean
)

In [18]:
# Persist the metadata plus the annotated count and mapping tables in a
# single RData file, then record its checksum.
outfile <- file.path(outdir, "HTS-Pilot-Annotated-STAR-counts.RData")
save(
    list = c("mtdf", "annogenecnts0", "annomapres0", "annogenecnts", "annomapres"),
    file = outfile
)
tools::md5sum(files = outfile)

### Figures for mapping results

In [19]:
# Shared building blocks for the mapping-result figures below.

# Points jittered horizontally only, so the y values stay exact.
# Argument names are spelled out instead of relying on partial matching.
mygeom <- geom_point(position = position_jitter(width = 0.3, height = 0))

# Manual colour scale (no legend title) using three colours from the
# RColorBrewer "Set1" palette.
mypal <- scale_colour_manual(name = "", values = brewer.pal(3, "Set1"))

# theme_bw() is a complete theme: depending on the ggplot2 version, adding it
# AFTER theme() can discard the tweaks made before it. Start from the complete
# theme and layer the x-axis label rotation on top so it always applies.
mytheme <- theme_bw() +
    theme(axis.text.x = element_text(angle = 90, hjust = 1))

# One panel per genotype (rows) and group (columns); each panel shows only
# its own x values and is sized to fit them.
myfacet <- facet_grid(genotype ~ group, drop = TRUE, scales = "free_x", space = "free")

In [20]:
# prob.unique for every Label, shaped by genotype and coloured by condition.
p1 <- ggplot(
    data = annomapres,
    mapping = aes(
        x = factor(Label), y = prob.unique,
        shape = genotype, colour = condition
    )
) +
    myfacet +
    mygeom +
    mytheme +
    mypal

# Render to a 960 x 960 PNG and close the graphics devices.
png(filename = file.path(imgdir, "p1.png"), width = 480 * 2, height = 480 * 2)
print(p1)
graphics.off()

“Removed 9 rows containing missing values (geom_point).”

In [21]:
# prob.gene for every Label, shaped by genotype and coloured by condition.
p2 <- ggplot(
    data = annomapres,
    mapping = aes(
        x = Label, y = prob.gene,
        shape = genotype, colour = condition
    )
) +
    myfacet +
    mygeom +
    mytheme +
    mypal

# Render to a 960 x 960 PNG and close the graphics devices.
png(filename = file.path(imgdir, "p2.png"), width = 480 * 2, height = 480 * 2)
print(p2)
graphics.off()

“Removed 9 rows containing missing values (geom_point).”

In [22]:
# prob.nofeat for every Label, shaped by genotype and coloured by condition.
p3 <- ggplot(
    data = annomapres,
    mapping = aes(
        x = Label, y = prob.nofeat,
        shape = genotype, colour = condition
    )
) +
    myfacet +
    mygeom +
    mytheme +
    mypal

# Render to a 960 x 960 PNG and close the graphics devices.
png(filename = file.path(imgdir, "p3.png"), width = 480 * 2, height = 480 * 2)
print(p3)
graphics.off()

“Removed 9 rows containing missing values (geom_point).”

In [23]:
# depth for every Label, shaped by genotype and coloured by condition.
p4 <- ggplot(
    data = annomapres,
    mapping = aes(
        x = Label, y = depth,
        shape = genotype, colour = condition
    )
) +
    myfacet +
    mygeom +
    mytheme +
    mypal

# Render to a 960 x 960 PNG and close the graphics devices.
png(filename = file.path(imgdir, "p4.png"), width = 480 * 2, height = 480 * 2)
print(p4)
graphics.off()

“Removed 9 rows containing missing values (geom_point).”