# File I/O

In [None]:
library(tidyverse)

## Structure of count data from STAR

- column 1: gene ID
- column 2: counts for unstranded RNA-seq
- column 3: counts for the 1st read strand aligned with RNA (htseq-count option -s yes)
- column 4: counts for the 2nd read strand aligned with RNA (htseq-count option -s reverse)

## Get list of files

In [None]:
count_dir <- "/data/hts_2019_data/hts2019_pilot_counts"

In [None]:
# `pattern` is a regular expression, not a shell glob, so anchor on a literal
# ".tab" suffix; this lists only the files whose names end in ".tab".
files <- list.files(path = count_dir, pattern = "\\.tab$", full.names = TRUE)

In [None]:
files[1]

## Read in first 2 files

In [None]:
# Read the first count file with no header row, so columns are named X1, X2, ...
# The argument is spelled `col_types` in full rather than relying on partial
# argument matching; `cols()` lets readr guess the type of every column.
d1 <- read_tsv(files[1], col_names = FALSE, col_types = cols())

In [None]:
# Read the second count file the same way (no header row, guessed column
# types), with `col_types` spelled in full instead of partially matched.
d2 <- read_tsv(files[2], col_names = FALSE, col_types = cols())

In [None]:
d1 %>% head(5)

In [None]:
d2 %>% head(5)

## Combining files

In [None]:
# Column-wise combination: d2's columns are placed beside d1's, pairing rows
# by position (not by gene ID). Show the first 5 rows.
d1 %>%
    bind_cols(d2) %>%
    head(n = 5)

In [None]:
# Row-wise combination: d2's rows are appended below d1's, matching columns
# by name. Show the first 5 rows.
d1 %>%
    bind_rows(d2) %>%
    head(n = 5)

In [None]:
# Join on the first column (X1): keeps only rows whose X1 value occurs in
# both tables, with the remaining columns of d1 and d2 side by side.
d1 %>%
    inner_join(d2, by = "X1") %>%
    head(n = 5)

## Custom function to read files

In [None]:
# Read one count file into a long-format table.
#
# file: path to a tab-separated count file whose base name consists of
#       "_"-separated fields.
#
# Returns a tibble with columns sample, year, p, method, s, lane (taken from
# the first six "_"-separated pieces of the file's base name), gene (file
# column 1) and count (file column 4).
read_file <- function(file) {
    # Names for the pieces of the file name; the seventh piece is discarded.
    fields <- c("sample", "year", "p", "method", "s", "lane", "junk")

    # The first 4 lines are skipped -- presumably summary rows at the top of
    # the STAR output; confirm against the input files.
    counts <- read_tsv(file, col_names = FALSE, skip = 4, col_types = cols())

    # Tag every row with the file's base name, keeping only the gene ID and
    # the fourth-column counts.
    counts <- mutate(counts, source = basename(file))
    counts <- select(counts, source, gene = X1, count = X4)

    # Split the file name into its fields and drop the leftover piece.
    counts <- separate(counts, source, into = fields, sep = "_")
    select(counts, -junk)
}

In [None]:
read_file(files[1]) %>% head(5)