# Intro to DTU

First, let's load the transcript abundance values computed with Salmon

We use the `rtracklayer` library to load and handle GTF files,
and `stringr` to make string operations easier.

In [None]:
library(rtracklayer)
library(stringr)

Create `tx2gene`, an object that holds the mapping between genes and transcripts.

In [None]:
# Path to the GRCh38.96 GTF annotation; adjust it to match your environment.
gtf.path <- "/biodb/genomes/homo_sapiens/GRCh38_96/GRCh38.96.gtf"
# Read the annotation into memory with rtracklayer.
gtf <- rtracklayer::import(con = gtf.path)

In [None]:
# Preview the first records of the imported annotation.
head(gtf)

Here we load the Salmon results using [`tximport`](https://bioconductor.org/packages/release/bioc/html/tximport.html)

In [None]:
# Build the transcript-to-gene lookup table from the GTF annotation.
# * Columns are ordered transcript ID first, gene ID second, which is the
#   layout tximport documents for its `tx2gene` argument.
# * Records without a transcript ID are dropped and the remaining rows are
#   de-duplicated, so a transcript is not repeated for every GTF record that
#   mentions it.
tx2gene <- local({
  annotation <- as.data.frame(gtf)
  has_transcript <- !is.na(annotation$transcript_id)
  id_columns <- c("transcript_id", "gene_id", "gene_name", "transcript_name")
  lookup <- unique(annotation[has_transcript, id_columns])
  rownames(lookup) <- NULL
  lookup
})

In [None]:
library(tximport)

In [None]:
# Collect the Salmon quantification file of every sample directory.
files <- Sys.glob("salmon/*/quant.sf")

# Import at transcript level (txOut = TRUE); counts are generated from the
# abundances with the "scaledTPM" method.
txi <- tximport(
  files = files,
  type = "salmon",
  txOut = TRUE,
  countsFromAbundance = "scaledTPM",
  tx2gene = tx2gene
)

# Keep the count matrix for the downstream steps.
cts <- txi[["counts"]]

If you know R, you can subset or filter transcripts in the next step. **But** be careful: some operations may break the assumptions of the program you are using. Here we only change the condition names, which we extract from the names of the input files.

In [None]:
# Paths have the form "salmon/<sample>/quant.sf"; the second path component
# becomes the column (sample) name.
path_parts <- str_split(files, "/", simplify = TRUE)
colnames(cts) <- path_parts[, 2]

In [None]:
# Keep only the text before the first "." of each transcript identifier.
tx_id_parts <- str_split(rownames(cts), "\\.", simplify = TRUE)
rownames(cts) <- tx_id_parts[, 1]

In [None]:
# Preview the count matrix after renaming its rows and columns.
head(cts)

For this analysis we use DRIMSeq, and the manual is [here](https://www.bioconductor.org/packages/release/bioc/vignettes/DRIMSeq/inst/doc/DRIMSeq.pdf)

In [None]:
# Transcript IDs present both in the count matrix and in the annotation.
quantified_tx <- rownames(cts)
annotated_tx <- tx2gene$transcript_id
matching <- intersect(quantified_tx, annotated_tx)

In [None]:
# Named vector for lookups: names are transcript IDs, values are gene IDs.
gene_ids <- tx2gene$gene_id
names(gene_ids) <- tx2gene$transcript_id

In [None]:
# Assemble the count table: one row per matched transcript, holding its gene
# ID (`gene_id`), its transcript ID (`feature_id`) and the count columns taken
# from `cts`.
# `drop = FALSE` keeps the subset a matrix even when a single transcript
# matches; without it that row collapses to a plain vector and ends up spread
# over rows instead of sample columns.
counts <- base::data.frame(
  gene_id = gene_ids[matching],
  feature_id = matching,
  cts[matching, , drop = FALSE]
)

# Now we write the experimental design matrix

In [None]:
# Preview the assembled count table (gene_id, feature_id, count columns).
head(counts)

In [None]:
# Sample sheet: IDs are the count-matrix column names passed through
# make.names().
sample_ids <- make.names(colnames(cts))
samples <- base::data.frame(sample_id = sample_ids)

# The condition is the fifth "__"-separated field of the sample ID.
id_fields <- str_split(samples$sample_id, "__", simplify = TRUE)
samples[["condition"]] <- id_fields[, 5]

In [None]:
# Display the sample sheet to check the extracted conditions.
samples

In [None]:
library(DRIMSeq)

# Bundle the count table and the sample sheet into a DRIMSeq data object.
d <- dmDSdata(counts = counts, samples = samples)

# Minimum number of samples that must pass each expression threshold.
# NOTE(review): 3 is the value that was already used for the gene-level
# filter; it is reused for the feature-level filters here. Confirm it fits
# the design (e.g. the size of the smallest condition group).
min_samps <- 3

# Filter lowly expressed genes and transcripts.
# The feature-level thresholds (`min_feature_expr`, `min_feature_prop`) only
# take effect when their `min_samps_feature_*` counterparts are greater than
# zero; these default to 0, so they are set explicitly.
d <- dmFilter(
  d,
  min_samps_gene_expr = min_samps,
  min_samps_feature_expr = min_samps,
  min_samps_feature_prop = min_samps,
  min_gene_expr = 10,
  min_feature_expr = 10,
  min_feature_prop = 0.1
)

In [None]:
# Design matrix with an intercept and the condition effect.
design_full <- model.matrix(~condition, data = samples(d))

# Fix the random seed before estimating precision so that repeated runs give
# the same estimates; the DRIMSeq vignette recommends this because the common
# precision is estimated on a random subset of genes.
set.seed(123)
d <- dmPrecision(d, design = design_full)


In [None]:
# Show DRIMSeq's summary plot for the data object `d`.
plotData(d)

In [None]:
# Fit the model described by `design_full`, printing progress messages.
d <- dmFit(
  x = d,
  design = design_full,
  verbose = 1
)

In [None]:
# Test the "conditionEGF" coefficient of the fitted model, printing progress
# messages.
d <- dmTest(
  x = d,
  coef = "conditionEGF",
  verbose = 1
)

In [None]:
# Preview the test results stored in `d`.
head(results(d))

In [None]:
# Keep the results whose adjusted p-value is below 0.05.
all_results <- results(d)
res <- dplyr::filter(all_results, adj_pvalue < 0.05)

In [None]:
# Preview the filtered results.
head(res)

In [None]:
# Keep the results whose raw (unadjusted) p-value is below 0.05.
# NOTE(review): this overwrites any earlier `res` and does not account for
# multiple testing; confirm that the looser filter is intended.
unfiltered_results <- results(d)
res <- dplyr::filter(unfiltered_results, pvalue < 0.05)

In [None]:
# Show the top rows of `res` ordered by increasing adjusted p-value.
res_by_adj_pvalue <- dplyr::arrange(res, adj_pvalue)
head(res_by_adj_pvalue)

In [None]:
# Build a "boxplot2"-type proportion plot for gene ENSG00000160752, grouped
# by condition.
p <- plotProportions(
  d,
  gene_id = "ENSG00000160752",
  group_variable = "condition",
  plot_type = "boxplot2"
)

In [None]:
# Render the proportion plot stored in `p`.
plot(p)