In [3]:
rm(list = ls())
library(dplyr)
library(stringr)
library(forcats)
library(ggplot2)
library(ggsci)
library(patchwork)

In [6]:
nov = '../data/human/transcript_novelties.tsv'
cerb_ids = '../data/human/suppa/gtex/cerb_ids.tsv'
psi = '../data/human/cerberus_psi.tsv'
lr_meta = '../ref/human/lr_human_library_data_summary.tsv'

af = '../data/human/suppa/cerberus.events_AF_strict.ioe'
a3 = '../data/human/suppa/cerberus.events_A3_strict.ioe'
a5 = '../data/human/suppa/cerberus.events_A5_strict.ioe'
al = '../data/human/suppa/cerberus.events_AL_strict.ioe'
ri = '../data/human/suppa/cerberus.events_RI_strict.ioe'
mx = '../data/human/suppa/cerberus.events_MX_strict.ioe'
se = '../data/human/suppa/cerberus.events_SE_strict.ioe'

g_af = '../data/human/suppa/gtex/cerberus.events_AF_strict.ioe'
g_a3 = '../data/human/suppa/gtex/cerberus.events_A3_strict.ioe'
g_a5 = '../data/human/suppa/gtex/cerberus.events_A5_strict.ioe'
g_al = '../data/human/suppa/gtex/cerberus.events_AL_strict.ioe'
g_ri = '../data/human/suppa/gtex/cerberus.events_RI_strict.ioe'
g_mx = '../data/human/suppa/gtex/cerberus.events_MX_strict.ioe'
g_se = '../data/human/suppa/gtex/cerberus.events_SE_strict.ioe'

## Proportion of novel transcripts per event

In [5]:
## using new transcript table
transcripts <- read.table(nov, sep = '\t', header = T, check.names = F, comment.char = '')
transcripts <- transcripts %>% filter(source == 'lapa') %>% unique()
transcripts$transcript_id <- gsub(',','_', transcripts$transcript_id)

In [7]:
AF <- read.table(af, header = T)
events <- unlist(lapply(AF$alternative_transcripts, function(x) str_split(x, ',')[[1]]))
AF <- subset(transcripts, transcripts$transcript_id %in% events) %>% mutate(event = 'AF')

A3 <- read.table(a3, header = T)
events <- unlist(lapply(A3$alternative_transcripts, function(x) str_split(x, ',')[[1]]))
A3 <- subset(transcripts, transcripts$transcript_id %in% events) %>% mutate(event = 'A3')

A5 <- read.table(a5, header = T)
events <- unlist(lapply(A5$alternative_transcripts, function(x) str_split(x, ',')[[1]]))
A5 <- subset(transcripts, transcripts$transcript_id %in% events) %>% mutate(event = 'A5')

AL <- read.table(al, header = T)
events <- unlist(lapply(AL$alternative_transcripts, function(x) str_split(x, ',')[[1]]))
AL <- subset(transcripts, transcripts$transcript_id %in% events) %>% mutate(event = 'AL')

MX <- read.table(mx, header = T)
events <- unlist(lapply(MX$alternative_transcripts, function(x) str_split(x, ',')[[1]]))
MX <- subset(transcripts, transcripts$transcript_id %in% events) %>% mutate(event = 'MX')

RI <- read.table(ri, header = T)
events <- unlist(lapply(RI$alternative_transcripts, function(x) str_split(x, ',')[[1]]))
RI <- subset(transcripts, transcripts$transcript_id %in% events) %>% mutate(event = 'RI')

SE <- read.table(se, header = T)
events <- unlist(lapply(SE$alternative_transcripts, function(x) str_split(x, ',')[[1]]))
SE <- subset(transcripts, transcripts$transcript_id %in% events) %>% mutate(event = 'SE')

In [9]:
cols <- c('Known' = '#009E73', 'Novel' = '#fc2003')

tss <- rbind(A3, A5, AF, AL, MX, RI, SE) %>% 
  select(transcript_id, type = tss_novelty, event) %>%
  distinct(transcript_id, type, event,.keep_all = TRUE) %>%
  ggplot(aes(x = fct_infreq(event), fill = type)) + geom_bar(position = 'fill') +  scale_fill_manual(values = cols)  +
  coord_flip() + labs(x = 'Local events', y = 'Proportion of novel transcripts defined by TSS') + theme_classic() +
  theme(legend.title=element_blank())

tes <- rbind(A3, A5, AF, AL, MX, RI, SE) %>% 
  select(transcript_id, type = tes_novelty, event) %>%
  distinct(transcript_id, type, event,.keep_all = TRUE) %>%
  ggplot(aes(x = fct_infreq(event), fill = type)) + geom_bar(position = 'fill') + scale_fill_manual(values = cols) +
  coord_flip() + labs(x = 'Local events', y = 'Proportion of novel transcripts defined by TES') + theme_classic() +
  theme(legend.title=element_blank())

ic <- rbind(A3, A5, AF, AL, MX, RI, SE) %>% 
  select(transcript_id, ic_novelty, event) %>% mutate(type = if_else(ic_novelty == 'Known', 'Known','Novel')) %>%
  distinct(transcript_id, type, event,.keep_all = TRUE) %>%
  ggplot(aes(x = fct_infreq(event), fill = type)) + geom_bar(position = 'fill') + scale_fill_manual(values = cols) +
  coord_flip() + labs(x = 'Local events', y = 'Proportion of novel transcripts defined by IC') + theme_classic() +
  theme(legend.title=element_blank())


p <- tss + tes + ic + plot_layout(guides = 'collect')
ggsave('proportion.pdf', width = 15, height = 3)
## total number of local events in cerberus
count <- rbind(A3, A5, AF, AL, MX, RI, SE) %>% 
  select(transcript_id, type = ic_novelty, event) %>%
  distinct(transcript_id, type, event,.keep_all = TRUE) %>%
  group_by(event) %>% summarise(n = n())
write.table(count, file = 'tot_number_of_localEvents.tsv', quote = F, row.names = F, col.names = F, sep = '\t')


## Sankey plot

In [4]:
rm(list = ls())
library(dplyr)
library(stringr)
library(forcats)
library(ggplot2)
library(ggsci)
library(tidyverse)
library(ggalluvial)
library(ggsankey)

Unnamed: 0_level_0,original_transcript_id,ic,ic_id,tss_id,tss,tes_id,tes,gene_id,gene_name,original_transcript_name,transcript_triplet,transcript_id,transcript_name,source,ic_novelty,tss_novelty,tes_novelty
Unnamed: 0_level_1,<fct>,<int>,<fct>,<fct>,<int>,<fct>,<dbl>,<fct>,<fct>,<fct>,<fct>,<chr>,<fct>,<fct>,<fct>,<fct>,<fct>
1,ENCODEHT000208597,32,ENSG00000228794_32,ENSG00000228794_2,2,ENSG00000228794_7,7,ENSG00000228794,LINC01128,ENCODEHT000208597,"[2,32,7]",ENSG00000228794[2_32_7],"LINC01128[2,32,7]",lapa,Known,Known,Known
2,ENCODEHT000208597#0,32,ENSG00000228794_32,ENSG00000228794_2,2,ENSG00000228794_8,8,ENSG00000228794,LINC01128,ENCODEHT000208597,"[2,32,8]",ENSG00000228794[2_32_8],"LINC01128[2,32,8]",lapa,Known,Known,Known
3,ENCODEHT000208597#1,32,ENSG00000228794_32,ENSG00000228794_2,2,ENSG00000228794_8,8,ENSG00000228794,LINC01128,ENCODEHT000208597,"[2,32,8]",ENSG00000228794[2_32_8],"LINC01128[2,32,8]",lapa,Known,Known,Known
4,ENCODEHT000208597#2,32,ENSG00000228794_32,ENSG00000228794_2,2,ENSG00000228794_19,19,ENSG00000228794,LINC01128,ENCODEHT000208597,"[2,32,19]",ENSG00000228794[2_32_19],"LINC01128[2,32,19]",lapa,Known,Known,Novel
5,ENCODEHT000208825#0,29,ENSG00000187634_29,ENSG00000187634_10,10,ENSG00000187634_5,5,ENSG00000187634,SAMD11,ENCODEHT000208825,"[10,29,5]",ENSG00000187634[10_29_5],"SAMD11[10,29,5]",lapa,ISM,Novel,Known
6,ENCODEHT000208825#1,29,ENSG00000187634_29,ENSG00000187634_10,10,ENSG00000187634_1,1,ENSG00000187634,SAMD11,ENCODEHT000208825,"[10,29,1]",ENSG00000187634[10_29_1],"SAMD11[10,29,1]",lapa,ISM,Novel,Known


In [34]:
df <- transcripts %>% 
  filter(source != 'v40' & source != 'v29') %>% 
  select(transcript_id, source, ic_novelty) %>% distinct() %>%
  mutate(across(where(is.factor), as.character)) %>%
  pivot_wider(names_from = source, values_from = ic_novelty, values_fill = 'Missing') %>%
  group_by(lapa, gtex) %>% summarise(n = n()) %>% as.data.frame()

# relabel
df$lapa <- factor(df$lapa, levels =c('Known', 'ISM', 'NIC', 'NNC', 'Unspliced', 'Missing'))
df$gtex <- factor(df$gtex, levels =c('Known', 'ISM', 'NIC', 'NNC', 'Unspliced', 'Missing'))

cols <- c('Known' = '#009E73', 'ISM' = '#0072B2',
          'NIC' = '#D55E00', 'NNC' = '#E69F00', 
          'Unspliced' = '#F0E442', 'Missing' = 'gray60')

tot <- ggplot(df, aes(y = n, axis1 = lapa, axis2 = gtex,
                      fill = factor(after_stat(stratum), levels = c('Known', 'ISM', 'NIC', 'NNC', 'Unspliced','Missing')))) + 
  stat_alluvium(geom = "flow", lode.guidance = "forward", width = 0.1) + 
  stat_stratum(aes(fill = factor(after_stat(stratum), levels = c('Known', 'ISM', 'NIC', 'NNC', 'Unspliced','Missing'))), size = 0.1, width = 0.1) + 
  scale_x_discrete(limits = c('Observed', 'GTEx'), expand = rep(0.1, 2)) +
  scale_fill_manual(values = cols) + labs(x = 'Source', y = 'Count', title = 'All Transcripts') +
  theme_alluvial(base_size = 14) + theme(legend.title=element_blank()) 
# ggsave('sankey_plot_all_transcripts.pdf', tot, width = 5, height = 5)

## sankey plot of lapa transcripts and gtex transcripts for each event
sankeyPlot_event <- function(transcripts, lapa_ioe_path, gtex_ioe_path, mapping, event_name){
  lapa <- read.table(lapa_ioe_path, header = T)
  lapa_events <- unlist(lapply(lapa$alternative_transcripts, function(x) str_split(x, ',')[[1]]))
  gtex <- read.table(gtex_ioe_path, header = T)
  gtex_events <- unlist(lapply(gtex$alternative_transcripts, function(x) str_split(x, ',')[[1]]))
  gtex_events_mapping <- subset(mapping, mapping$original_transcript_id %in% gtex_events)
  events <- c(lapa_events, gtex_events_mapping$transcript_id)
  
  
  df <- transcripts %>% 
    filter(source != 'v40' & source != 'v29' & transcript_id %in% events) %>% 
    select(transcript_id, source, ic_novelty) %>% distinct() %>%
    mutate(across(where(is.factor), as.character)) %>%
    pivot_wider(names_from = source, values_from = ic_novelty, values_fill = 'Missing') %>%
    group_by(lapa, gtex) %>% summarise(n = n()) %>% as.data.frame()
  
  df$lapa <- factor(df$lapa, levels =c('Known', 'ISM', 'NIC', 'NNC', 'Unspliced', 'Missing'))
  df$gtex <- factor(df$gtex, levels =c('Known', 'ISM', 'NIC', 'NNC', 'Unspliced', 'Missing'))
  
  cols <- c('Known' = '#009E73', 'ISM' = '#0072B2',
            'NIC' = '#D55E00', 'NNC' = '#E69F00', 
            'Unspliced' = '#F0E442', 'Missing' = 'gray60')
  p <- ggplot(df, aes(y = n, axis1 = lapa, axis2 = gtex,
                      fill = factor(after_stat(stratum), levels = c('Known', 'ISM', 'NIC', 'NNC', 'Unspliced','Missing')))) + 
    stat_alluvium(geom = "flow", lode.guidance = "forward", width = 0.1) + 
    stat_stratum(aes(fill = factor(after_stat(stratum), levels = c('Known', 'ISM', 'NIC', 'NNC', 'Unspliced','Missing'))), size = 0.1, width = 0.1) +
    scale_x_discrete(limits = c('Observed', 'GTEx'), expand = rep(0.1, 2)) +
    scale_fill_manual(values = cols) + labs(x = 'Source', y = 'Count', title = event_name) +
    theme_alluvial(base_size = 14) + theme(legend.title=element_blank()) 
  return(p)
}

gtex_to_cerberus <- read.table(cerb_ids, header = T)
gtex_to_cerberus$transcript_id <- gsub(',','_', gtex_to_cerberus$transcript_id)

AF <- sankeyPlot_event(transcripts, 
                       lapa_ioe_path = af, 
                       gtex_ioe_path = g_af, 
                       gtex_to_cerberus, 'AF')

AL <- sankeyPlot_event(transcripts, 
                       lapa_ioe_path = al, 
                       gtex_ioe_path = g_al,
                       gtex_to_cerberus, 'AL')

A3 <- sankeyPlot_event(transcripts, 
                       lapa_ioe_path = a3,
                       gtex_ioe_path = g_a3,
                       gtex_to_cerberus, 'A3')


A5 <- sankeyPlot_event(transcripts, 
                       lapa_ioe_path = a5,
                       gtex_ioe_path = g_a5,
                       gtex_to_cerberus, 'A5')

MX <- sankeyPlot_event(transcripts, 
                       lapa_ioe_path = mx,
                       gtex_ioe_path = g_mx,
                       gtex_to_cerberus, 'MX')

RI <- sankeyPlot_event(transcripts, 
                       lapa_ioe_path = ri,
                       gtex_ioe_path = g_ri,
                       gtex_to_cerberus, 'RI')

SE <- sankeyPlot_event(transcripts, 
                       lapa_ioe_path = se,
                       gtex_ioe_path = g_se,
                       gtex_to_cerberus, 'SE')

library(patchwork)
p <- (tot + AF + AL + A3 + plot_layout(guides = 'collect', ncol = 4)) / (A5 + MX + RI + SE + plot_layout(guides = 'collect', ncol = 4)) + plot_layout(guides = 'collect')
ggsave('sankey_plot.pdf', p, width = 16, height = 8)


[1m[22m`summarise()` has grouped output by 'lapa'. You can override using the
`.groups` argument.
[1m[22m`summarise()` has grouped output by 'lapa'. You can override using the
`.groups` argument.
[1m[22m`summarise()` has grouped output by 'lapa'. You can override using the
`.groups` argument.
[1m[22m`summarise()` has grouped output by 'lapa'. You can override using the
`.groups` argument.
[1m[22m`summarise()` has grouped output by 'lapa'. You can override using the
`.groups` argument.
[1m[22m`summarise()` has grouped output by 'lapa'. You can override using the
`.groups` argument.
[1m[22m`summarise()` has grouped output by 'lapa'. You can override using the
`.groups` argument.
[1m[22m`summarise()` has grouped output by 'lapa'. You can override using the
`.groups` argument.


## Overlap between SUPPA and Cerberus