# Biostrings and ShortRead

## Setup

In [None]:
library(ShortRead)
library(Biostrings)
library(tidyverse) # for %>%
library(gdata) # for humanReadable
set.seed(100)

In [None]:
### Locate the FASTQ file used throughout this tutorial
myfqdir <- "/data/hts_2019_data/hts2019_pilot_rawdata/"
myfqname <- "4_2018_P_H1_S27_L001_R1_001.fastq.gz"

myfqfile <- file.path(myfqdir, myfqname)
### Print the MD5 checksum so the input file can be verified
tools::md5sum(myfqfile)

### Set up output directory
### recursive = TRUE also creates missing parent directories;
### showWarnings = FALSE keeps this cell quiet when it is re-run and the
### directory already exists
outdir <- path.expand("~/work/scratch/shortreadtutorial")
dir.create(outdir, recursive = TRUE, showWarnings = FALSE)

## FASTQ Basics

In [None]:
### Load the whole FASTQ file into memory
myfq <- readFastq(dirPath = myfqfile)

### Inspect the class of the resulting object
class(myfq)

In [None]:
### Print the object to see its summary
myfq

In [None]:
### Count the reads
myfq %>% length

In [None]:
### Take the first three records
myfq[seq_len(3)]

In [None]:
### Take three records chosen at random (without replacement)
myfq[sample.int(length(myfq), size = 3)]

In [None]:
### Pull reads 1, 2, 3 and then 1 again (the duplicate is deliberate), then
### extract their sequences, phred scores and read ids
myrecs <- myfq[c(1, 2, 3, 1)]

myseqs <- ShortRead::sread(myrecs)

myphreds <- Biostrings::quality(myrecs)

myids <- ShortRead::id(myrecs)

### Look at the class of the full object and of the extracted pieces
class(myfq)
class(myseqs)
class(myphreds)

### Get read ids

In [None]:
### Display the read ids stored in myids
myids

### Get sequencing reads

In [None]:
### Display the read sequences stored in myseqs
myseqs

### Get phred scores

In [None]:
### Display the quality strings stored in myphreds
myphreds

### Explore classes

In [None]:
### The help lookup below is commented out; remove the leading "###" to run it
###? DNAStringSet

### Get slot names for objects

In [None]:
### List the names of the slots of the myseqs object
slotNames(myseqs)

### Bash command to read the first 12 lines from the FASTQ file

In [None]:
### Build a shell pipeline that decompresses the FASTQ file and keeps the
### first 12 lines. shQuote() protects the path in case it contains spaces
### or shell metacharacters.
mycmd <- paste("zcat", shQuote(myfqfile), "| head -n 12")
mycmd

In [None]:
### Run the pipeline and capture its output as a character vector
system(mycmd, intern = TRUE)

### Read Quality Scores

In [None]:
### Show the encoding used by the phred quality strings
encoding(myphreds)

In [None]:
### Convert the quality strings of myrecs to a matrix
as(quality(myrecs), Class = "matrix")


### Read Sequences

In [None]:
### Sequences as plain character strings
myseqs %>% as.character

In [None]:
### Flag reads that duplicate an earlier read
duplicated(myseqs)

In [None]:
### Show the unique reads
unique(myseqs)

In [None]:
### Keep only the unique reads from here on
myseqs <- unique(myseqs)

In [None]:
### Count letters in each read
myseqs %>% Biostrings::alphabetFrequency()
myseqs %>% Biostrings::letterFrequency(c("A", "T"))

### Sequence Transformations

In [None]:
### Reads in reverse order of bases
reverse(myseqs)

In [None]:
### Complement of each read
complement(myseqs)

In [None]:
### Reverse complement of each read
reverseComplement(myseqs)

In [None]:
### Translate each read to amino acids
translate(myseqs)

In [None]:
myseqs

### Subset the sequences

In [None]:
myseqs

In [None]:
### First nine bases of each read
myseqs %>% Biostrings::subseq(start = 1, width = 9)

In [None]:
### Translate to AA, starting at base 7
translate(subseq(myseqs, start = 7))

In [None]:
### Translate to AA, using bases 7 through 75 only
translate(subseq(myseqs, start = 7, end = 75))

## Searching

In [None]:
myseqs %>% as.character

In [None]:
# exact search (no mismatches) for a 5-mer in the first read
mypattern0 <- "CATGA"
matchPattern(mypattern0, subject = myseqs[[1]], max.mismatch = 0)

In [None]:
# same 5-mer with one base changed, searched without allowing mismatches
mypattern1 <- "CAGGA"
matchPattern(mypattern1, subject = myseqs[[1]], max.mismatch = 0)

In [None]:
# repeat the search, tolerating a single mismatch
matchPattern(mypattern1, subject = myseqs[[1]], max.mismatch = 1)

### Search multiple sequences
Use `vmatchPattern` to search multiple sequences.

In [None]:
# exact search across every sequence in myseqs
vmatchPattern(mypattern0, subject = myseqs, max.mismatch = 0)

If we allow mismatches in our pattern, we find hits in other sequences.

In [None]:
# search across every sequence in myseqs, tolerating up to two mismatches
vmatchPattern(mypattern0, subject = myseqs, max.mismatch = 2)

### Count sequence matches

In [None]:
# matches per sequence, no mismatches allowed
vcountPattern(mypattern0, subject = myseqs, max.mismatch = 0)

In [None]:
# matches per sequence, up to one mismatch
vcountPattern(mypattern0, subject = myseqs, max.mismatch = 1)

In [None]:
# matches per sequence, up to two mismatches
vcountPattern(mypattern0, subject = myseqs, max.mismatch = 2)

### Pairwise Alignment

In [None]:
# align the short search pattern against the first read
pairwiseAlignment(mypattern0, subject = myseqs[1])

In [None]:
# align a 20-base pattern against the first read
pairwiseAlignment("ACACGTCTGAACTCCAGTCA", subject = myseqs[1])

In [None]:
# align a 17-base pattern against the second read
pairwiseAlignment("ACACGTCTGAACAGTCA", subject = myseqs[2])

## Streaming and Sampling

In [None]:
### In-memory size of the fully loaded FASTQ object
humanReadable(object.size(myfq))

In [None]:
### On-disk size of the FASTQ file
humanReadable(file.size(myfqfile))

### Streaming
Stream a FASTQ rather than loading the whole thing into memory.

After initializing the streamer, each `yield` call returns the next `n=5` reads from the FASTQ file.

In [None]:
### Open a streamer over the FASTQ file, configured with n = 5
mystream <- FastqStreamer(con = myfqfile, n = 5)
mystream

In [None]:
humanReadable(object.size(mystream))

In [None]:
### First yield: records 1 through 5
cur_reads <- yield(mystream)
cur_reads

In [None]:
humanReadable(object.size(cur_reads))

In [None]:
cur_reads %>% ShortRead::id()

In [None]:
### Stream status after the first yield (0 + 5 = 5 records so far)
mystream

In [None]:
### Second yield: records 6 through 10
cur_reads <- yield(mystream)
cur_reads %>% ShortRead::id()

In [None]:
### Stream status after the second yield (5 + 5 = 10 records so far)
mystream

In [None]:
### Always close the stream when finished with it
close(mystream)

In [None]:
### Stream status after closing
mystream

#### Double Check

In [None]:
### Open a fresh streamer, yield once, show the read ids, then close
mystream <- FastqStreamer(myfqfile, n = 5)
cur_reads <- yield(mystream)
cur_reads %>% ShortRead::id()
close(mystream)

In [None]:
### Same steps again with another fresh streamer
mystream <- FastqStreamer(myfqfile, n = 5)
cur_reads <- yield(mystream)
cur_reads %>% ShortRead::id()
close(mystream)

### Sampling
Random sampling without reading in entire file upfront.

After initializing the sampler, each `yield` call returns `n=5` random reads from the FASTQ file.

In [None]:
### Open a sampler, yield once, show the read ids, then close
mysampler <- FastqSampler(con = myfqfile, n = 5)

sample_reads <- yield(mysampler)
sample_reads %>% ShortRead::id()

close(mysampler)

In [None]:
### Same steps again with a fresh sampler
mysampler <- FastqSampler(con = myfqfile, n = 5)

sample_reads <- yield(mysampler)
sample_reads %>% ShortRead::id()

close(mysampler)

In [None]:
### Fix the random seed before sampling
set.seed(18381)
mysampler <- FastqSampler(con = myfqfile, n = 5)

sample_reads <- yield(mysampler)
sample_reads %>% ShortRead::id()

close(mysampler)

In [None]:
### Same seed and same steps again
set.seed(18381)
mysampler <- FastqSampler(con = myfqfile, n = 5)

sample_reads <- yield(mysampler)
sample_reads %>% ShortRead::id()

close(mysampler)

In [None]:
### Report the in-memory size of the sampler and of one yielded sample
mysampler <- FastqSampler(con = myfqfile, n = 5)
sample_reads <- yield(mysampler)
humanReadable(object.size(mysampler))
humanReadable(object.size(sample_reads))
close(mysampler)

### Iterate over fastq file in chunks of 5000 reads

In [None]:
### Stream through the file 5000 reads at a time, printing each chunk
fq <- FastqStreamer(myfqfile, n = 5000)

repeat {
    reads <- yield(fq)
    # a zero-length yield ends the loop
    if (length(reads) == 0) {
        break
    }
    print(reads)
}

close(fq)

## QC Report

### Get 2019 fastq filenames for the four lanes of sample 3 (2019 libraries)

- /data/hts_2019_data/hts2019_pilot_rawdata//1_2019_P_M1_S1_L001_R1_001.fastq.gz
- /data/hts_2019_data/hts2019_pilot_rawdata//1_2019_P_M1_S1_L002_R1_001.fastq.gz
- /data/hts_2019_data/hts2019_pilot_rawdata//1_2019_P_M1_S1_L003_R1_001.fastq.gz 
- /data/hts_2019_data/hts2019_pilot_rawdata//1_2019_P_M1_S1_L004_R1_001.fastq.gz

In [None]:
### See https://krijnhoetmer.nl/stuff/regex/cheat-sheet/
### "^3_" matches every file name that starts with "3_", so this listing
### also pulls the 2018 samples, not only the 2019 ones
myregex <- "^3_"
fqfiles <- list.files(myfqdir, pattern = myregex, full.names = TRUE) 
fqfiles

In [None]:
### Narrow the pattern to file names that start with "3_2019"
myregex <- "^3_2019"
fqfiles <- list.files(path = myfqdir, pattern = myregex, full.names = TRUE)
fqfiles

In [None]:
### Build the QA report for the listed files (serial execution)
report(
    ShortRead::qa(fqfiles, type = "fastq", BPPARAM = SerialParam()),
    dest = file.path(outdir, "2019tutorial-serial")
)

### Create QA report for the four lanes (parallel execution).
```
# Does not work! Problem with BiocParallel?
fqfiles %>% 
    ShortRead::qa(type = "fastq", BPPARAM = MulticoreParam(workers = 4)) %>% 
        report(dest=file.path(outdir, "2019tutorial-par"))
```

## The End

In [None]:
### Record session details, then end the R session without saving the workspace
sessionInfo()
quit(save = "no")