## Extract RNA sequences around splice sites

In [1]:
library(dplyr)
library(GenomicRanges)
library(Biostrings)
library(parallel)
library(BSgenome.Hsapiens.UCSC.hg38 )
library(caret)


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union


Loading required package: stats4

Loading required package: BiocGenerics


Attaching package: ‘BiocGenerics’


The following objects are masked from ‘package:dplyr’:

    combine, intersect, setdiff, union


The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs


The following objects are masked from ‘package:base’:

    anyDuplicated, append, as.data.frame, basename, cbind, colnames,
    dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
    grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
    order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
    rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
    union, unique, unsplit, which.max, which.min


Loading required package: S4Vectors


Attaching package

In [3]:
# Input: event annotation table read by the cells below.
file <- "../input/event_annotations/GRCh38.p13.gencode.v37.primary_assembly.annotation_ASALL_strict.ioe"

# Output: serialized GRanges holding the splice-site windows.
gr_out_file <- "../input/event_annotations/event_cords_gr.RDS"

# Output: prefix of the FASTA files; window size and suffix are appended later.
fasta_prefix <- "../input/ss_fastas/spliceSites_windowSize_"

### Build a GRanges object with the splice-site coordinates of all events

In [4]:
# Load the event table (presumably a SUPPA-style .ioe file -- confirm) and
# split every event_id into its coordinate string and strand.
#
# event_id is expected to start with "<gene_id>;<event_type>:<seqname>".
# Removing that prefix leaves ":<coordinates>:<strand>", from which:
#   strand = last character
#   cords  = everything between the first character and the last two
#
# The prefix is removed with a vectorized fixed-string match, so no row-wise
# evaluation is needed (base sub() is not vectorized over its pattern).
ioe <- read.table(file, header = TRUE) %>%
  filter(grepl("chr", seqname)) %>%
  mutate(
    cords = stringr::str_remove(
      as.character(event_id),
      stringr::fixed(paste0(gene_id, ";", event_type, ":", seqname))
    )
  ) %>%
  as_tibble() %>%
  select(-alternative_transcripts, -total_transcripts, -constitutive_transcripts) %>%
  mutate(strand = stringr::str_sub(cords, -1)) %>%
  mutate(cords = stringr::str_sub(cords, 2, -3))

In [5]:
# Expand every event into one row per splice-site coordinate.
#
# `pos_type` holds, per event type, the ":"-separated labels of the coordinates
# in the order in which they appear in `cords` (e = exon end, s = exon start --
# presumably following the SUPPA event layout; confirm against its docs).
# Event types not listed below get NA, which unnest() recycles across the
# event's coordinates.
#
# strsplit() is vectorized and returns one character vector per row, so the
# list-columns can be built and unnested without rowwise()/group_by().
ioe.gr <- ioe %>%
  mutate(pos_type = case_when(
    event_type == "SE" ~ "e1:s2:e2:s3",
    event_type == "MX" ~ "e1:s2:e2:s4:e1:s3:e3:s4",
    event_type == "A5" ~ "e2:s3:e1:s3",
    event_type == "A3" ~ "e1:s2:e1:s3",
    event_type == "RI" ~ "s1:e1:s2:e2",
    event_type == "AF" ~ "s1:e1:s3:s2:e2:s3",
    event_type == "AL" ~ "e1:s2:e2:e1:s3:e3"
  )) %>%
  mutate(
    # Coordinates are separated by ":" (between blocks) or "-" (within a block)
    pos = strsplit(cords, "\\:|\\-", fixed = FALSE),
    pos_type = strsplit(pos_type, ":")
  ) %>%
  # pos and pos_type are unnested in parallel: i-th coordinate <-> i-th label
  tidyr::unnest(c(pos, pos_type)) %>%
  as_tibble()

In [6]:
# Flip the coordinate labels of minus-strand SE, MX and RI events.
# Within each event (one group per event_id) the label vector is reversed, so
# the first coordinate receives the last label and so on; all other rows keep
# their label. The result stays grouped by event_id.
ioe.gr <- mutate(
  group_by(ioe.gr, event_id),
  pos_type = ifelse(
    event_type %in% c("SE", "MX", "RI") & strand == "-",
    rev(pos_type),
    pos_type
  )
)

In [7]:
# Build strand-aware flanking windows around every splice-site position.
#
# Arguments
#   events       data frame with one row per position; needs columns `pos`
#                (coercible to numeric) and `strand`.
#   side         "up" or "down", relative to the direction of the strand.
#   window_sizes flank lengths; every input row yields one output row per size
#                (sizes vary fastest, input row order is kept).
#
# Returns `events` (ungrouped) with `pos` converted to numeric and the extra
# columns region_pos, side, window, start and end. start/end always satisfy
# start <= end and include `pos` itself, so a range spans window + 1 bases.
build_flank_windows <- function(events, side = c("up", "down"),
                                window_sizes = c(50, 100, 200)) {
  side_label <- match.arg(side)
  events <- ungroup(events)
  n_events <- nrow(events)

  expanded <- events[rep(seq_len(n_events), each = length(window_sizes)), ]
  flank <- rep(window_sizes, times = n_events)
  pos_num <- as.numeric(expanded$pos)

  # "up" on "+" and "down" on any other strand both extend towards lower
  # genomic coordinates; the remaining two combinations extend towards higher.
  towards_lower <- (expanded$strand == "+") == (side_label == "up")

  expanded %>%
    mutate(pos = pos_num,
           region_pos = ifelse(towards_lower, pos_num - flank, pos_num + flank),
           side = side_label,
           window = flank,
           start = pmin(pos, region_pos),
           end = pmax(pos, region_pos))
}

ioe.gr.up <- build_flank_windows(ioe.gr, "up")
ioe.gr.dw <- build_flank_windows(ioe.gr, "down")

# Downstream windows first, then upstream ones. Position and window are
# formatted with "%.0f" because paste() on a number switches to scientific
# notation for round values (e.g. 100000 -> "1e+05"), which would corrupt the
# sequence identifiers.
ioe.gr.ev <- dplyr::bind_rows(ioe.gr.dw, ioe.gr.up) %>%
             as_tibble() %>%
             mutate(seq_id = paste(event_type,
                                   pos_type,
                                   sprintf("%.0f", pos),
                                   side,
                                   sprintf("%.0f", window),
                                   event_id,
                                   sep = "_")) %>%
             select(seqname, start, end, strand, gene_id, seq_id)

In [8]:
# Turn the window table into a GRanges object. Coordinates are taken as
# 1-based; the columns that are not coordinates are kept as metadata.
ioe.gr.ev <- makeGRangesFromDataFrame(
  ioe.gr.ev,
  seqnames.field = "seqname",
  start.field = "start",
  end.field = "end",
  starts.in.df.are.0based = FALSE,
  keep.extra.columns = TRUE
)

In [9]:
# Persist the GRanges so the sequence-fetching step can start from this file.
saveRDS(ioe.gr.ev, file = gr_out_file)

### Fetch sequences from GRanges object

This section contains the same code as the script `fetch_seqs.r`. If more resources are needed, that script can be submitted as a job with the wrapper `run_fetch.sh`.

In [10]:
# Reload the splice-site windows written by the previous section.
ioe.gr.ev <- readRDS(file = gr_out_file)

In [12]:
# Put the sequence levels in their sorted order first, then sort the ranges.
ioe.gr.ev <- sort(sortSeqlevels(ioe.gr.ev))

In [13]:
# Short handle for the hg38 BSgenome object used to fetch sequences.
hg <- BSgenome.Hsapiens.UCSC.hg38::BSgenome.Hsapiens.UCSC.hg38

In [14]:
# Range widths to process. A window of N flanking bases is stored as a range
# of width N + 1 because the splice-site position itself is included.
ws <- c(51, 101, 201)
for (w in ws) {

    message("Window size ", w - 1)
    wgr <- ioe.gr.ev[width(ioe.gr.ev) == w]
    wgr.strands <- as.character(strand(wgr))
    message("Fetching ", length(wgr.strands), " sequences")

    # Extract DNA sequences
    # getSeq() on a BSgenome is strand-aware for GRanges input: ranges on "-"
    # come back already reverse-complemented. Extract on unstranded ranges so
    # every sequence is plus-strand here and the explicit reverseComplement()
    # below is applied exactly once (doing both would undo the conversion).
    message("extracting DNA sequences...")
    wgr.unstranded <- wgr
    strand(wgr.unstranded) <- "*"
    dna_seqs <- BSgenome::getSeq(hg, wgr.unstranded)

    message("splitting sequences by strand...")
    dna_seqs_neg <- dna_seqs[wgr.strands == "-"]

    # Convert minus-strand sequences to mRNA sense
    message("Converting ", sum(wgr.strands == "-"), " seqs to reverse complement sense...")
    rna_seqs <- reverseComplement(dna_seqs_neg)

    # Combine seqs objects: all "+" records first, then all "-" records;
    # the names below are assembled in the same order.
    message("Merging back sequences from both strands...")
    seqs <- DNAStringSet(c(dna_seqs[wgr.strands == "+"],
                           rna_seqs))
    names(seqs) <- toupper(c(wgr$seq_id[wgr.strands == "+"],
                             wgr$seq_id[wgr.strands == "-"]))

    # Create fasta file
    message("Writting fasta file...")
    Biostrings::writeXStringSet(seqs, paste0(fasta_prefix, w - 1, "_up_and_down", ".fasta"))
}
message("Finished successfully!")

Window size 50

Fetching 2721368 sequences

extracting DNA sequences...

splitting sequences by strand...

Converting 1346756 seqs to reverse complement sense...

Merging back sequences from both strands...

Writting fasta file...

Window size 100

Fetching 2721368 sequences

extracting DNA sequences...

splitting sequences by strand...

Converting 1346756 seqs to reverse complement sense...

Merging back sequences from both strands...

Writting fasta file...

Window size 200

Fetching 2721368 sequences

extracting DNA sequences...

splitting sequences by strand...

Converting 1346756 seqs to reverse complement sense...

Merging back sequences from both strands...

Writting fasta file...

Finished successfully!

