# Intro to DTU

lorem ipsum

First, let's load the transcript abundance values computed with Salmon

We use the `rtracklayer` library to load and handle GTF files,
and `stringr` to make string operations easier.

In [6]:
suppressPackageStartupMessages({ 
    library(rtracklayer)
    library(stringr)
    })

Next we create `tx2gene`, an object that holds the mapping between genes and transcripts. We start by importing the GTF annotation it is built from.

In [7]:
# Path to the GTF annotation file on this machine.
gtf.path <- "/biodb/genomes/homo_sapiens/GRCh38_96/GRCh38.96.gtf"

# Read the whole annotation into memory; the later cells derive the
# transcript/gene lookup table from it.
gtf <- rtracklayer::import(con = gtf.path)

In [None]:
# Preview the first six annotation records.
head(gtf, n = 6L)

Next we build `tx2gene` from the annotation, and then load the Salmon results using [`tximport`](https://bioconductor.org/packages/release/bioc/html/tximport.html)

In [None]:
# Build the transcript <-> gene lookup table from the annotation.
#
# The annotation is converted row by row, so the raw table carries one row per
# GTF record rather than one per transcript: the same transcript shows up many
# times, and records without a transcript (presumably the gene-level ones) have
# an NA transcript_id. Keep exactly one row per annotated transcript so the
# table can be used as a lookup.
tx2gene <- as.data.frame(gtf)[
  , c("gene_id", "transcript_id", "gene_name", "transcript_name")]
tx2gene <- unique(tx2gene[!is.na(tx2gene$transcript_id), ])
# Drop the row names inherited from the original (pre-filter) row positions.
rownames(tx2gene) <- NULL

In [None]:
# Locate the Salmon quantification file of every sample directory.
files <- Sys.glob('salmon/*/quant.sf')

# Fail early with a clear message if the glob matched nothing (wrong working
# directory, Salmon not run yet, ...).
if (length(files) == 0) {
  stop("No Salmon output found matching 'salmon/*/quant.sf'", call. = FALSE)
}

# tximport is called through its namespace because no visible cell attaches
# the package with library().
#
# txOut = TRUE keeps the estimates at transcript level, which is what a
# transcript-usage analysis needs.
#
# NOTE(review): per the tximport documentation, tx2gene is only used for
# gene-level summarization, so it should have no effect while txOut = TRUE.
# tximport also expects transcript IDs in column 1 and gene IDs in column 2,
# whereas tx2gene here starts with gene_id -- reorder the columns before ever
# switching to txOut = FALSE.
txi <- tximport::tximport(
  files,
  type = "salmon",
  tx2gene = tx2gene,
  countsFromAbundance = "scaledTPM",
  txOut = TRUE
)

# Count matrix returned by tximport (scaled-TPM counts, one column per file).
cts <- txi$counts

If you know R, you can subset or filter transcripts in the next step. **But** be careful: some operations may break the assumptions of the program you are using. Here we only rename the count columns, using the sample names that we extract from the paths of the input files.

In [None]:
# Label every count column with the second component of its input path,
# i.e. the <sample> part of 'salmon/<sample>/quant.sf'.
colnames(cts) <- str_split(string = files, pattern = '/', simplify = TRUE)[, 2]

For this analysis we use DRIMSeq, and the manual is [here](https://www.bioconductor.org/packages/release/bioc/vignettes/DRIMSeq/inst/doc/DRIMSeq.pdf)

In [5]:
# Preview the first six rows of the transcript/gene lookup table.
head(tx2gene, n = 6L)

ERROR: Error in head(tx2gene): object 'tx2gene' not found


In [None]:
# Build the two inputs handed to dmDSdata() in the next cell:
#   counts  - one row per transcript: gene_id, feature_id, then one count
#             column per sample
#   samples - one row per sample: sample_id and condition
#
# Note: no biotype filter is applied here; every quantified transcript that is
# present in the annotation is kept.

# One (transcript_id, gene_id) pair per annotated transcript. tx2gene may hold
# repeated rows and NA transcript ids, so clean it before using it as a lookup.
tx_map <- unique(
  tx2gene[!is.na(tx2gene$transcript_id), c("transcript_id", "gene_id")]
)

# Transcripts that are both quantified and annotated.
matching <- intersect(rownames(cts), tx_map$transcript_id)
if (length(matching) == 0) {
  # NOTE(review): a common cause is a version suffix (e.g. "ENST....1") on one
  # side only -- confirm against the Salmon index that was used.
  stop(
    "No transcript id is shared between the Salmon counts and the annotation",
    call. = FALSE
  )
}

# Restrict the count matrix to those transcripts; drop = FALSE keeps it a
# matrix even when a single transcript or sample remains.
cts <- cts[matching, , drop = FALSE]

# Look the gene up by transcript id so that every row of `counts` describes
# the same transcript in all of its columns. check.names = FALSE keeps the
# sample column names untouched, so they still equal samples$sample_id below.
counts <- base::data.frame(
  gene_id = tx_map$gene_id[match(matching, tx_map$transcript_id)],
  feature_id = matching,
  cts,
  check.names = FALSE
)

# Sample sheet; the condition is the part of the sample name before the
# first underscore.
samples <- base::data.frame(sample_id = colnames(cts))
samples$condition <- str_split(samples$sample_id, '_', simplify = TRUE)[, 1]



In [None]:
# Fit the DRIMSeq model on the transcript counts and save one
# transcript-proportion plot per gene.
suppressPackageStartupMessages({
  library(DRIMSeq)
  library(BiocParallel)
  library(ggplot2)
})

# Wrap counts and sample sheet into a DRIMSeq data object, then drop lowly
# expressed transcripts and genes using the thresholds below.
d <- dmDSdata(counts = counts, samples = samples)
d <- dmFilter(
  d,
  min_feature_expr = 10,
  min_feature_prop = 0.1,
  min_samps_gene_expr = 3,
  min_gene_expr = 10
)

# Design with an intercept plus the condition effect.
design_full <- model.matrix(~condition, data = samples(d))

# NOTE(review): the DRIMSeq vignette sets a seed before dmPrecision() so that
# the precision estimates are reproducible between runs; the value itself is
# arbitrary.
set.seed(123)
d <- dmPrecision(
  d,
  design = design_full,
  BPPARAM = BiocParallel::MulticoreParam(10)
)
plotData(d)

d <- dmFit(
  d,
  design = design_full,
  verbose = 1,
  BPPARAM = BiocParallel::MulticoreParam(40)
)
# NOTE(review): this repeats the plotData() call made after dmPrecision();
# confirm whether a different diagnostic plot was intended here.
plotData(d)

# Output directory for the per-gene plots; re-running the cell must not warn
# when the directory already exists.
plot_dir <- 'proportion_plots'
dir.create(plot_dir, showWarnings = FALSE)

for (gene in names(d@counts)) {
  # Progress trace: one gene per line.
  cat(gene, '\n')
  p <- plotProportions(
    d, plot_type = 'boxplot2', gene_id = gene, group_variable = "condition")
  # Save the plot that was just built, not whatever ggplot2 last displayed.
  ggsave(
    file.path(plot_dir, paste0(gene, '.png')),
    plot = p,
    width = 10,
    height = 7
  )
}
