In [1]:
#| label: setup
#| include: false

suppressMessages(library(data.table))
suppressMessages(library(tidyverse))
suppressMessages(library(glue))
suppressMessages(library(furrr))

plan(strategy = "multisession", workers = min(6, availableCores()))

## Load and process Ben's annotation

Ben's file has 458,713 introns. Each row is a unique intron without duplications.

In [2]:
introns_ben.f  <- "/project2/yangili1/cdai/SpliFi/code/resources/IntronAnnotationsFromYang.tsv.gz"

In [3]:
introns_ben <- fread(introns_ben.f)

In [4]:
introns_ben[, end := end - 1] # convert to standard BED format
introns_ben[, iid := paste(chrom, start, end, strand, sep=":")]

In [5]:
introns_ben$iid %>% unique %>% length
dim(introns_ben)

## Load and process gencode (all) annotation

In [7]:
intron_gb_v43.f  <- "/project2/yangili1/cdai/annotations/hg38/use_gtftk/gencode_v43_productive.intron_by_transcript.bed.gz"
intron_gb_v37.f  <- "/project2/yangili1/cdai/annotations/hg38/use_gtftk/gencode_v37_productive.intron_by_transcript.bed.gz"

In [8]:
introns_gb_v43 <- fread(intron_gb_v43.f, sep='\t', col.names = c("chrom", "start", "end", "label", "score", "strand"))
introns_gb_v37 <- fread(intron_gb_v37.f, sep='\t', col.names = c("chrom", "start", "end", "label", "score", "strand"))

In [12]:
introns_gb_v43 <- separate_wider_delim(introns_gb_v43, label, "|", names = c("feature", "gid", "tid", "gene")) %>% as.data.table
introns_gb_v37 <- separate_wider_delim(introns_gb_v37, label, "|", names = c("feature", "gid", "tid", "gene")) %>% as.data.table

In [13]:
introns_gb_v43[, iid := paste(chrom, start, end, strand, sep=":")]
introns_gb_v37[, iid := paste(chrom, start, end, strand, sep=":")]

In [14]:
dim(introns_gb_v43)
introns_gb_v43[1:2]
dim(introns_gb_v37)
introns_gb_v37[1:2]

chrom,start,end,feature,gid,tid,gene,score,strand,iid
<chr>,<int>,<int>,<chr>,<chr>,<chr>,<chr>,<int>,<chr>,<chr>
chr1,65433,65519,intron,ENSG00000186092.7,ENST00000641515.2,OR4F5,1,+,chr1:65433:65519:+
chr1,65573,69036,intron,ENSG00000186092.7,ENST00000641515.2,OR4F5,2,+,chr1:65573:69036:+


chrom,start,end,feature,gid,tid,gene,score,strand,iid
<chr>,<int>,<int>,<chr>,<chr>,<chr>,<chr>,<int>,<chr>,<chr>
chr1,65433,65519,intron,ENSG00000186092.6,ENST00000641515.2,OR4F5,1,+,chr1:65433:65519:+
chr1,65573,69036,intron,ENSG00000186092.6,ENST00000641515.2,OR4F5,2,+,chr1:65573:69036:+


## Agreements between gencode v43 and v37 (basic)

In [15]:
#| label: 'tbl-agreements-between-v43-v37'
#| tbl-caption: 'agreements between V43 and V37'
full_join(
    x = introns_gb_v43[, .(iid, tid)] %>% unique,
    y = introns_gb_v37[, .(iid, tid)] %>% unique,
    by = 'iid',
    suffix = c("_v43", "_v37")
) %>% as.data.table() %>%  
   .[, .(N = length(unique(iid))), by = .(inV43 = !is.na(tid_v43), inV37 = !is.na(tid_v37))] # %>% unique

“[1m[22mDetected an unexpected many-to-many relationship between `x` and `y`.
[36mℹ[39m Row 3 of `x` matches multiple rows in `y`.
[36mℹ[39m Row 3 of `y` matches multiple rows in `x`.
[36mℹ[39m If a many-to-many relationship is expected, set `relationship =


inV43,inV37,N
<lgl>,<lgl>,<int>
True,True,248183
True,False,5631
False,True,2971


### Combine basic v43 and basic v37

In [29]:
introns_gb_combined <- fread('../../hg38/use_gtftk/gencode_v43_plus_v37_productive.intron_by_transcript_BEDlike.txt.gz', 
                             col.names = c('chrom', 'start', 'end', 'strand', 'label'))

In [30]:
introns_gb_combined[1:2]

chrom,start,end,strand,label
<chr>,<int>,<int>,<chr>,<chr>
chr1,65433,65519,+,productive
chr1,65573,69036,+,productive


In [32]:
introns_gb_combined %>% uniqueN

## Compare v43 and v37 productive combined with Ben's

In [35]:
introns_gb_combined[1:2,]

chrom,start,end,strand,label,iid
<chr>,<int>,<int>,<chr>,<chr>,<chr>
chr1,65433,65519,+,productive,chr1:65433:65519:+
chr1,65573,69036,+,productive,chr1:65573:69036:+


In [34]:
introns_gb_combined[, iid := paste(chrom, start, end, strand, sep=":")]

In [36]:
introns_ben[1:2]

chrom,start,end,strand,NewAnnotation,gene,symbol,SuperAnnotation,SemiSupergroupAnnotations,iid
<chr>,<int>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
chr1,14501,15004,-,unprocessed_pseudogene.gencode,ENSG00000227232.5,WASH7P,AnnotatedJunc_NoncodingGene,uniquely psueodgene tag,chr1:14501:15004:-
chr1,14829,14929,-,unprocessed_pseudogene.novel_junctions,ENSG00000227232.5,WASH7P,UnannotatedJunc_NoncodingGene,overlaps pseudogene,chr1:14829:14929:-


- All but 29 of Ben's ~250K Annotated_CodingGene introns are found in gencode V37+V43 productive introns

In [39]:
intersect(introns_ben$iid, introns_gb_combined$iid) %>% uniqueN
uniqueN(introns_ben$iid)
uniqueN(introns_gb_combined$iid)

In [37]:
full_join(x = introns_gb_combined[, .(iid, label)],
          y = introns_ben[str_detect(SuperAnnotation, "Anno"), .(iid, gene, SuperAnnotation)],
          by = c('iid'),
          suffix = c("_gb", '_ben')
         ) %>% 
    .[str_detect(SuperAnnotation, "AnnotatedJunc_ProductiveCodingGene"), .(N=length(unique(iid))), by = .(inGB = !is.na(label), inBEN = !is.na(SuperAnnotation))] %>% 
    .[, .(inGB, inBEN, N, P=N/sum(N))]

inGB,inBEN,N,P
<lgl>,<lgl>,<int>,<dbl>
True,True,248971,0.9998835341
False,True,29,0.0001164659


# Make proper Gencode V37+V43 productive introns for leafcutter2


> Problem: the datatable include 1 intron to many gid or tids

> solution: 
> Since all of these introns are productive, we only need to keep coordinates and column to label productive, then collapse.

In [25]:
introns_gb_combined[1:5]

chrom,start,end,strand,iid,gid,tid,gene
<chr>,<int>,<int>,<chr>,<chr>,<chr>,<chr>,<chr>
chr1,65433,65519,+,chr1:65433:65519:+,ENSG00000186092.7,ENST00000641515.2,OR4F5
chr1,65573,69036,+,chr1:65573:69036:+,ENSG00000186092.7,ENST00000641515.2,OR4F5
chr1,924948,925921,+,chr1:924948:925921:+,ENSG00000187634.13,ENST00000616016.5,SAMD11
chr1,926013,930154,+,chr1:926013:930154:+,ENSG00000187634.13,ENST00000616016.5,SAMD11
chr1,930336,931038,+,chr1:930336:931038:+,ENSG00000187634.13,ENST00000616016.5,SAMD11


In [28]:
introns_gb_combined[, .(chrom, start, end, strand, label = "productive")] %>% unique

chrom,start,end,strand,label
<chr>,<int>,<int>,<chr>,<chr>
chr1,65433,65519,+,productive
chr1,65573,69036,+,productive
chr1,924948,925921,+,productive
chr1,926013,930154,+,productive
chr1,930336,931038,+,productive
chr1,931089,935771,+,productive
chr1,935896,939039,+,productive
chr1,939129,939274,+,productive
chr1,939412,941143,+,productive
chr1,941306,942135,+,productive
