In [3]:
library(tidyverse)
library(data.table)
library(glue)

In [2]:
# Canonical chromosome names: chr1..chr22, then chrX and chrY.
# Stored as a factor whose level order is exactly this order.
chroms <- c(paste0("chr", 1:22), "chrX", "chrY")
chroms <- factor(chroms, levels = chroms)

# Lookup table of gene_id / transcript_id pairs for protein-coding transcripts only

In [4]:
# Path to the transcript BED used to build the gene_id/transcript_id lookup
lookup.f <- "../../hg38/use_ucsc_table_browser/gencode.v43.primary_assembly_proteinOnly.annotation.transcript.bed"

In [5]:
# Build the gene_id/transcript_id lookup:
#   1. keep rows on the chromosomes listed in `chroms`
#   2. split the name column (V4) on "|" into gene_id, gene_name, transcript_id
#   3. strip everything from the first "." onwards (the .1, .2, ... suffixes)
#   4. keep each (gene_id, transcript_id) pair once
lookup <- fread(lookup.f) %>%
    filter(V1 %in% chroms) %>%
    separate_wider_delim(V4, "|", names = c("gene_id", "gene_name", "transcript_id")) %>%
    transmute(
        gene_id = str_remove(gene_id, "\\..*$"),
        transcript_id = str_remove(transcript_id, "\\..*$")
    ) %>%
    distinct() %>%
    as.data.table()

## Introns

In [11]:
# Path to the intron BED file
intron.f <- "../../hg38/use_ucsc_table_browser/gencode.v43_intron.bed"

In [12]:
# Read the intron BED, pull the leading ENST id out of the name column (V4),
# keep rows on the chromosomes in `chroms`, and drop the version suffix
# (everything from the first "." onwards) from the transcript id.
intron <- fread(intron.f) %>%
    separate_wider_regex(V4, patterns = c(transcript_id = "ENST[\\.\\d]+", '_.+')) %>%
    filter(V1 %in% chroms) %>%
    mutate(transcript_id = str_remove(transcript_id, "\\..*")) %>%
    as.data.table()

In [13]:
# Keep only introns whose transcript_id appears in the protein-coding lookup;
# the join also attaches the matching gene_id column
intron <- intron %>%
    inner_join(lookup, by = "transcript_id")

In [14]:
# Assemble the six output columns; the name column (V4) becomes
# "gene_id|transcript_id" and V1 becomes a factor with the levels of `chroms`
intron <- intron[
    ,
    list(
        V1 = factor(V1, levels = chroms),
        V2,
        V3,
        V4 = paste(gene_id, transcript_id, sep = "|"),
        V5,
        V6
    )
]
# Sort in place by chromosome (factor level order), then V2, then V3
setorderv(intron, c("V1", "V2", "V3"))


In [15]:
# Write the protein-coding introns as a BED file: tab-separated, no header row.
# fwrite() defaults to sep = "," and col.names = TRUE, which would produce a
# CSV with a "V1,...,V6" header line instead of a valid BED file.
fwrite(
    intron,
    "../../hg38/use_ucsc_table_browser/gencode.v43_intron_proteinOnly.bed",
    sep = "\t",
    col.names = FALSE
)

## 5' UTR

In [16]:
# Path to the 5' UTR BED file
utr5p.f <- "../../hg38/use_ucsc_table_browser/gencode.v43_UTR5p.bed"

In [17]:
# Read the 5' UTR BED, pull the leading ENST id out of the name column (V4),
# keep rows on the chromosomes in `chroms`, and drop the version suffix
# (everything from the first "." onwards) from the transcript id.
utr5p <- fread(utr5p.f) %>%
    separate_wider_regex(V4, patterns = c(transcript_id = "ENST[\\.\\d]+", '_.+')) %>%
    filter(V1 %in% chroms) %>%
    mutate(transcript_id = str_remove(transcript_id, "\\..*")) %>%
    as.data.table()

In [18]:
# Keep only 5' UTRs whose transcript_id appears in the protein-coding lookup;
# the join also attaches the matching gene_id column
utr5p <- utr5p %>%
    inner_join(lookup, by = "transcript_id")

In [19]:
# Assemble the six output columns; the name column (V4) becomes
# "gene_id|transcript_id" and V1 becomes a factor with the levels of `chroms`
utr5p <- utr5p[
    ,
    list(
        V1 = factor(V1, levels = chroms),
        V2,
        V3,
        V4 = paste(gene_id, transcript_id, sep = "|"),
        V5,
        V6
    )
]
# Sort in place by chromosome (factor level order), then V2, then V3
setorderv(utr5p, c("V1", "V2", "V3"))


In [20]:
# Write the protein-coding 5' UTRs as a BED file: tab-separated, no header row.
# fwrite() defaults to sep = "," and col.names = TRUE, which would produce a
# CSV with a "V1,...,V6" header line instead of a valid BED file.
fwrite(
    utr5p,
    "../../hg38/use_ucsc_table_browser/gencode.v43_UTR5p_proteinOnly.bed",
    sep = "\t",
    col.names = FALSE
)

## 3' UTR

In [21]:
# Path to the 3' UTR BED file
utr3p.f <- "../../hg38/use_ucsc_table_browser/gencode.v43_UTR3p.bed"

In [22]:
# Read the 3' UTR BED, pull the leading ENST id out of the name column (V4),
# keep rows on the chromosomes in `chroms`, and drop the version suffix
# (everything from the first "." onwards) from the transcript id.
utr3p <- fread(utr3p.f) %>%
    separate_wider_regex(V4, patterns = c(transcript_id = "ENST[\\.\\d]+", '_.+')) %>%
    filter(V1 %in% chroms) %>%
    mutate(transcript_id = str_remove(transcript_id, "\\..*")) %>%
    as.data.table()

In [23]:
# Keep only 3' UTRs whose transcript_id appears in the protein-coding lookup;
# the join also attaches the matching gene_id column
utr3p <- utr3p %>%
    inner_join(lookup, by = "transcript_id")

In [24]:
# Assemble the six output columns; the name column (V4) becomes
# "gene_id|transcript_id" and V1 becomes a factor with the levels of `chroms`
utr3p <- utr3p[
    ,
    list(
        V1 = factor(V1, levels = chroms),
        V2,
        V3,
        V4 = paste(gene_id, transcript_id, sep = "|"),
        V5,
        V6
    )
]
# Sort in place by chromosome (factor level order), then V2, then V3
setorderv(utr3p, c("V1", "V2", "V3"))


In [25]:
# Write the protein-coding 3' UTRs as a BED file: tab-separated, no header row.
# fwrite() defaults to sep = "," and col.names = TRUE, which would produce a
# CSV with a "V1,...,V6" header line instead of a valid BED file.
fwrite(
    utr3p,
    "../../hg38/use_ucsc_table_browser/gencode.v43_UTR3p_proteinOnly.bed",
    sep = "\t",
    col.names = FALSE
)

## coding exons (do not include UTRs)

In [26]:
# Path to the coding-exon BED file
codingexon.f <- "../../hg38/use_ucsc_table_browser/gencode.v43_codingExon.bed"

In [27]:
# Read the coding-exon BED, pull the leading ENST id out of the name column
# (V4), keep rows on the chromosomes in `chroms`, and drop the version suffix
# (everything from the first "." onwards) from the transcript id.
codingexon <- fread(codingexon.f) %>%
    separate_wider_regex(V4, patterns = c(transcript_id = "ENST[\\.\\d]+", '_.+')) %>%
    filter(V1 %in% chroms) %>%
    mutate(transcript_id = str_remove(transcript_id, "\\..*")) %>%
    as.data.table()

In [28]:
# Keep only coding exons whose transcript_id appears in the protein-coding
# lookup; the join also attaches the matching gene_id column
codingexon <- codingexon %>%
    inner_join(lookup, by = "transcript_id")

In [29]:
# Assemble the six output columns; the name column (V4) becomes
# "gene_id|transcript_id" and V1 becomes a factor with the levels of `chroms`
codingexon <- codingexon[
    ,
    list(
        V1 = factor(V1, levels = chroms),
        V2,
        V3,
        V4 = paste(gene_id, transcript_id, sep = "|"),
        V5,
        V6
    )
]
# Sort in place by chromosome (factor level order), then V2, then V3
setorderv(codingexon, c("V1", "V2", "V3"))


In [30]:
# Write the protein-coding coding exons as a BED file: tab-separated, no
# header row. fwrite() defaults to sep = "," and col.names = TRUE, which would
# produce a CSV with a "V1,...,V6" header line instead of a valid BED file.
fwrite(
    codingexon,
    "../../hg38/use_ucsc_table_browser/gencode.v43_codingExon_proteinOnly.bed",
    sep = "\t",
    col.names = FALSE
)