## Extract RNA sequences around splice sites

In [4]:
library(dplyr)
library(GenomicRanges)
library(Biostrings)
library(parallel)
library(BSgenome.Hsapiens.UCSC.hg38 )
library(caret)

Loading required package: ggplot2

Loading required package: lattice



In [2]:
# Directory from which the .ioe event annotation file is read.
anndir <- "~/lmprojects/altsplicing-methods/suppa2/output/annotations/"

In [3]:
# Full path of the .ioe event annotation file inside `anndir`.
file <- paste0(anndir, "modID.GRCh38.p13.gencode.v37.primary_assembly.annotation_ASALL_strict.ioe")

# Read the event table, keep rows whose seqname contains "chr", and split each
# event_id into a coordinate string (`cords`) and a `strand` column.
ioe <- read.table(file, header = TRUE, stringsAsFactors = FALSE) %>%
  filter(grepl("chr", seqname)) %>%
  # Drop the "<gene_id>;<event_type>:<seqname>" part of event_id.
  # str_remove() is vectorised over the pattern, so no rowwise() is needed.
  mutate(cords = stringr::str_remove(
    event_id,
    stringr::fixed(paste0(gene_id, ";", event_type, ":", seqname))
  )) %>%
  as_tibble() %>%
  select(-alternative_transcripts, -total_transcripts, -constitutive_transcripts) %>%
  # The last character of the remainder is taken as the strand ...
  mutate(strand = stringr::str_sub(cords, -1)) %>%
  # ... and the first character plus the last two are trimmed off the coordinates.
  mutate(cords = stringr::str_sub(cords, 2, -3))

In [4]:
# Expand every event into one row per coordinate, pairing each coordinate in
# `cords` with a position-type label chosen by event type.
ioe.gr <- ioe %>%
  mutate(pos_type = case_when(
    event_type == "SE" ~ paste(c("e1", "s2", "e2", "s3"), collapse = ":"),
    event_type == "MX" ~ paste(c("e1", "s2", "e2", "s4", "e1", "s3", "e3", "s4"), collapse = ":"),
    event_type == "A5" ~ paste(c("e2", "s3", "e1", "s3"), collapse = ":"),
    event_type == "A3" ~ paste(c("e1", "s2", "e1", "s3"), collapse = ":"),
    event_type == "RI" ~ paste(c("s1", "e1", "s2", "e2"), collapse = ":"),
    event_type == "AF" ~ paste(c("s1", "e1", "s3", "s2", "e2", "s3"), collapse = ":"),
    event_type == "AL" ~ paste(c("e1", "s2", "e2", "e1", "s3", "e3"), collapse = ":")
  )) %>%
  # strsplit() is vectorised and returns one list element per row, so the
  # list-columns can be built without rowwise().
  mutate(
    pos = strsplit(cords, "\\:|\\-", fixed = FALSE),
    pos_type = strsplit(pos_type, ":")
  ) %>%
  # Both list-columns are unnested in parallel: the i-th coordinate of an
  # event is paired with its i-th label.
  tidyr::unnest(c(pos, pos_type)) %>%
  as_tibble()

In [13]:
# Reverse the order of the position-type labels within each event for
# negative-strand SE, MX and RI events (the coordinates themselves are kept).
ioe.gr <- ioe.gr %>%
  group_by(event_id) %>%
  mutate(pos_type = ifelse(strand == "-" & event_type %in% c("SE", "MX", "RI"),
                           rev(pos_type),
                           pos_type)) %>%
  # Drop the grouping so it does not leak into later steps.
  ungroup()

In [26]:
# Flank sizes extracted on each side of every position.
flank_sizes <- c(50, 100, 200)

# Build one interval per (position, flank size) on the requested side.
#   sites      table with `pos` and `strand` columns (one row per position)
#   flank_side "up" or "down"
#   sizes      numeric vector of flank sizes
# Returns `sites` repeated once per flank size, with `window`, `region_pos`,
# `side`, `start` and `end` columns added; `end - start` equals `window`.
make_flanks <- function(sites, flank_side, sizes = flank_sizes) {
  # On "+" the upstream flank lies at lower coordinates and the downstream
  # flank at higher ones; the direction is reversed for every other strand.
  plus_step <- if (flank_side == "up") -1 else 1
  sites %>%
    ungroup() %>%
    mutate(pos = as.numeric(pos),
           window = list(sizes)) %>%
    tidyr::unnest(window) %>%
    mutate(region_pos = pos + ifelse(strand == "+", plus_step, -plus_step) * window,
           side = flank_side,
           start = pmin(pos, region_pos),
           end = pmax(pos, region_pos))
}

ioe.gr.up <- make_flanks(ioe.gr, "up")
ioe.gr.dw <- make_flanks(ioe.gr, "down")

ioe.gr.ev <- bind_rows(ioe.gr.dw, ioe.gr.up) %>%
  as_tibble() %>%
  # Format the numbers explicitly: pasting a double turns round coordinates
  # such as 100000 into "1e+05", which would corrupt the identifier.
  mutate(seq_id = paste(event_type, pos_type, sprintf("%.0f", pos), side,
                        sprintf("%.0f", window), event_id, sep = "_")) %>%
  select(seqname, start, end, strand, gene_id, seq_id)

In [29]:
# Sanity check: list the distinct interval spans (end - start) in the table.
ioe.gr.ev %>%
  ungroup() %>%
  distinct(length = end - start)

length
<dbl>
50
100
200


In [30]:
# Turn the interval table into a GRanges object; columns other than the
# seqname/start/end fields named below are kept as extra columns, and start
# coordinates are declared as not 0-based.
ioe.gr.ev <- ioe.gr.ev %>%
  makeGRangesFromDataFrame(
    keep.extra.columns = TRUE,
    seqnames.field = "seqname",
    start.field = "start",
    end.field = "end",
    starts.in.df.are.0based = FALSE
  )

In [31]:
# Persist the GRanges object to disk.
saveRDS(object = ioe.gr.ev, file = "../data/objects/event_cords_gr.RDS")

### Fetch sequences

In [1]:
# Reload the saved interval object.
ioe.gr.ev <- readRDS("../data/objects/event_cords_gr.RDS")

In [4]:
# Order the sequence levels first, then sort the ranges themselves.
ioe.gr.ev <- sort(sortSeqlevels(ioe.gr.ev))

In [5]:
# Genome object used as the sequence source for getSeq().
hg <- BSgenome.Hsapiens.UCSC.hg38::BSgenome.Hsapiens.UCSC.hg38

In [8]:
# Range widths to process; each is reported and written out as window size
# w - 1.
ws <- c(51, 101, 201)
for (w in ws) {

  message("Window size ", w - 1)
  wgr <- ioe.gr.ev[width(ioe.gr.ev) == w]
  wgr.strands <- as.character(strand(wgr))
  message("Fetching ", length(wgr.strands), " sequences")

  # Extract DNA sequences.
  # getSeq() takes the strand from the GRanges query and already returns the
  # reverse complement for ranges on "-". Applying reverseComplement() again
  # would flip those sequences back to the forward genomic strand, so no
  # further conversion is done here.
  message("extracting DNA sequences...")
  dna_seqs <- BSgenome::getSeq(hg, wgr)

  message("splitting sequences by strand...")
  is_plus <- wgr.strands == "+"
  is_minus <- wgr.strands == "-"
  message(sum(is_minus), " minus-strand seqs were returned as reverse complement by getSeq()...")

  # Combine seqs objects: "+" ranges first, then "-" ranges, with names in
  # the same order.
  message("Merging back sequences from both strands...")
  seqs <- DNAStringSet(c(dna_seqs[is_plus], dna_seqs[is_minus]))
  names(seqs) <- toupper(c(wgr$seq_id[is_plus], wgr$seq_id[is_minus]))

  # Create fasta file
  message("Writting fasta file...")
  Biostrings::writeXStringSet(seqs, paste0('../data/fastas/spliceSites_windowSize_', w - 1, "_up_and_down", ".fasta"))
}
message("Finished successfully!")

Window size 50

Fetching 2721368 sequences

extracting DNA sequences...

splitting sequences by strand...

Converting 1346756 seqs to reverse complement sense...

Merging back sequences from both strands...

Writting fasta file...

Window size 100

Fetching 2721368 sequences

extracting DNA sequences...

splitting sequences by strand...

Converting 1346756 seqs to reverse complement sense...

Merging back sequences from both strands...

Writting fasta file...

Window size 200

Fetching 2721368 sequences

extracting DNA sequences...

splitting sequences by strand...

Converting 1346756 seqs to reverse complement sense...

Merging back sequences from both strands...

Writting fasta file...

Finished successfully!

