### Load paths and libraries

In [2]:
library(foreach)
library(tidyverse)

Registered S3 methods overwritten by 'ggplot2':
  method         from 
  [.quosures     rlang
  c.quosures     rlang
  print.quosures rlang
── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.2.1 ──
[32m✔[39m [34mggplot2[39m 3.1.1     [32m✔[39m [34mpurrr  [39m 0.3.2
[32m✔[39m [34mtibble [39m 2.1.2     [32m✔[39m [34mdplyr  [39m 0.8.1
[32m✔[39m [34mtidyr  [39m 0.8.3     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 1.3.1     [32m✔[39m [34mforcats[39m 0.4.0
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mpurrr[39m::[32maccumulate()[39m masks [34mforeach[39m::accumulate()
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m     masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m        masks [34mstats[39m::lag()
[31m✖[39m [34mpurrr[39m::[32mwhen()[39m       masks [34mforeach[39m::when()


In [3]:
data_root_dir <- "/data/hts_2019_data/"
raw_fastq_dir <- paste(data_root_dir, "hts2019_pilot_rawdata", sep = "")
metadata_file <- file.path(raw_fastq_dir, "2019_pilot_metadata.tsv")
count_dir <- paste(data_root_dir, "hts2019_pilot_counts", sep = "")

# Create directories

Below is an illustration of our folder structure.
```
scratch
└── bioinf_intro
└── analysis_output
    ├── out -> the folder to store all our output data files
    └── img -> the folder to store all our images  
```

In [4]:
curdir <- "/home/jovyan/work/scratch/analysis_output"
outdir <- file.path(curdir, "out")
imgdir <- file.path(curdir, "img")

In [5]:
system(paste("mkdir -p", outdir), intern = TRUE)
system(paste("mkdir -p", imgdir), intern = TRUE)

In [6]:
count_suffix <- "_ReadsPerGene.out.tab"

In [7]:
list.files(count_dir, pattern = paste0(count_suffix,"$"), full.names = FALSE) -> countfiles

### Load STAR Count Data

In [6]:
list.files(count_dir, pattern = paste0(count_suffix,"$"), full.names = FALSE) -> countfiles
countfiles

In [8]:
length(countfiles)

So there are 96 files about 2019 pilot data under this folder. Each file is generated from a fastq file using STAR.

In [9]:
mycombine <- function(df1, df2) {
    # Combine two data frames by gene names
    #
    # Args:
    #   df1 (Dataframe): the first count data
    #   df2 (Dataframe): the second count data
    #
    # Returns:
    #   (Dataframe) The combined data frame of df1 and df2
    full_join(df1, df2, by = "gene")
}


mystarfile <- function(rootdir, stardir) {
    # Get the absolute paths of a file
    #
    # Args:
    #   rootdir  (Character): the directory of the folder
    #   stardir (Character): the filename
    #
    # Returns:
    #   (Character) the directory of the input file
    file.path(rootdir, stardir)
}

# Data type for each column
coltypes <- list(col_character(), col_integer(), col_integer(), col_integer())

read the count files and combine them

In [10]:
out <- foreach(file = countfiles, .combine = mycombine) %do% {
    cntfile <- mystarfile(count_dir, file)
    readr::read_tsv(cntfile, col_names = FALSE, col_types = coltypes ) %>%
        dplyr::select(X1, X4) %>%
            dplyr::rename_(.dots=setNames(names(.), c("gene", file)))
}


“rename_() is deprecated. 
Please use rename() instead

The 'programming' vignette or the tidyeval book can help you
to program with rename() : https://tidyeval.tidyverse.org

In [10]:
dim(out)

In [11]:
out[1:6, 1:6]

gene,1_2019_P_M1_S1_L001_ReadsPerGene.out.tab,1_2019_P_M1_S1_L002_ReadsPerGene.out.tab,1_2019_P_M1_S1_L003_ReadsPerGene.out.tab,1_2019_P_M1_S1_L004_ReadsPerGene.out.tab,10_2019_P_M1_S10_L001_ReadsPerGene.out.tab
<chr>,<int>,<int>,<int>,<int>,<int>
N_unmapped,21162,19644,19307,17356,6456
N_multimapping,95540,94070,97787,98723,75280
N_noFeature,26923,26053,27257,26775,26470
N_ambiguous,606,638,615,607,464
CNAG_04548,0,0,0,0,0
CNAG_07303,0,0,0,0,0


### Gather and spread the genes to get a count matrix (genecounts)

In [12]:
out %>%
    slice(-(1:4)) %>%
    gather(expid, value, -gene) %>% 
    spread(gene, value) -> genecounts

genecounts[1:6, 1:6]

expid,CNAG_00001,CNAG_00002,CNAG_00003,CNAG_00004,CNAG_00005
<chr>,<int>,<int>,<int>,<int>,<int>
1_2019_P_M1_S1_L001_ReadsPerGene.out.tab,0,35,48,223,5
1_2019_P_M1_S1_L002_ReadsPerGene.out.tab,0,43,46,227,7
1_2019_P_M1_S1_L003_ReadsPerGene.out.tab,0,46,49,232,8
1_2019_P_M1_S1_L004_ReadsPerGene.out.tab,0,34,58,222,2
10_2019_P_M1_S10_L001_ReadsPerGene.out.tab,0,30,36,130,5
10_2019_P_M1_S10_L002_ReadsPerGene.out.tab,0,37,37,117,7


### Gather and spread the first four rows to nmisc

For nmisc, we will take the first 4 rows of out since those are the summarizing features. Next, we want to transform the data frame so that it is in statistical format (the samples are the rows and the feature types are the columns). Using a combination of gather and spread, we can transpose our matrix into the desired format.

In [13]:
out %>%
    slice(1:4) %>%
    gather(expid, value, -gene) %>% 
    spread(gene, value) %>%
    rename_(.dots=setNames(names(.), c("expid", "namb", "nmulti", "nnofeat","nunmap"))) -> nmisc

nmisc[1:6, ]

expid,namb,nmulti,nnofeat,nunmap
<chr>,<int>,<int>,<int>,<int>
1_2019_P_M1_S1_L001_ReadsPerGene.out.tab,606,95540,26923,21162
1_2019_P_M1_S1_L002_ReadsPerGene.out.tab,638,94070,26053,19644
1_2019_P_M1_S1_L003_ReadsPerGene.out.tab,615,97787,27257,19307
1_2019_P_M1_S1_L004_ReadsPerGene.out.tab,607,98723,26775,17356
10_2019_P_M1_S10_L001_ReadsPerGene.out.tab,464,75280,26470,6456
10_2019_P_M1_S10_L002_ReadsPerGene.out.tab,445,73190,25919,6097


### Gather and spread the gene rows

For each samples, we want to sum up all the counts, so we can create a variable denoting the number of total genes mapped for each sample by summing across the rows.

In [14]:
out %>%
    slice(-(1:4)) %>%
    gather(expid, value, -gene) %>% 
    spread(gene, value) %>%
    mutate(ngenemap=rowSums(.[-1])) %>%
    select(expid, ngenemap) -> ngene

ngene[1:6, ]

expid,ngenemap
<chr>,<dbl>
1_2019_P_M1_S1_L001_ReadsPerGene.out.tab,4660549
1_2019_P_M1_S1_L002_ReadsPerGene.out.tab,4591006
1_2019_P_M1_S1_L003_ReadsPerGene.out.tab,4715846
1_2019_P_M1_S1_L004_ReadsPerGene.out.tab,4681095
10_2019_P_M1_S10_L001_ReadsPerGene.out.tab,3261459
10_2019_P_M1_S10_L002_ReadsPerGene.out.tab,3208423


### merge in the 4 misc counts and add summaries

So far, we can create a comprehensive data frame mapresults which will combine ngene with nmisc. This data frame will have summarizing mapping features in addition to proportion features.

In [15]:
ngene %>%
    full_join(nmisc, by="expid") %>%
    mutate(depth = as.integer(ngenemap + namb + nmulti + nnofeat + nunmap)) %>%
    mutate(prob.gene = ngenemap / depth) %>%
    mutate(prob.nofeat = nnofeat / depth) %>%
    mutate(prob.unique = (ngenemap+nnofeat) / depth) -> mapresults

mapresults[1:6, ]

expid,ngenemap,namb,nmulti,nnofeat,nunmap,depth,prob.gene,prob.nofeat,prob.unique
<chr>,<dbl>,<int>,<int>,<int>,<int>,<int>,<dbl>,<dbl>,<dbl>
1_2019_P_M1_S1_L001_ReadsPerGene.out.tab,4660549,606,95540,26923,21162,4804780,0.9699818,0.005603378,0.9755851
1_2019_P_M1_S1_L002_ReadsPerGene.out.tab,4591006,638,94070,26053,19644,4731411,0.9703249,0.005506391,0.9758313
1_2019_P_M1_S1_L003_ReadsPerGene.out.tab,4715846,615,97787,27257,19307,4860812,0.9701766,0.005607499,0.9757841
1_2019_P_M1_S1_L004_ReadsPerGene.out.tab,4681095,607,98723,26775,17356,4824556,0.9702644,0.005549733,0.9758141
10_2019_P_M1_S10_L001_ReadsPerGene.out.tab,3261459,464,75280,26470,6456,3370129,0.9677549,0.007854299,0.9756092
10_2019_P_M1_S10_L002_ReadsPerGene.out.tab,3208423,445,73190,25919,6097,3314074,0.9681205,0.007820888,0.9759414


In [16]:
outfile <- file.path(outdir, "hts-pilot-2019.RData")
save(mapresults, genecounts, file=outfile)
tools::md5sum(outfile)