# Functional analysis of relevant RBPs

In [1]:
library(ggplot2)
library(dplyr)
library(data.table)
library(furrr)
library(purrr)
library(dplyr)
library(ggVennDiagram)
library(ggplot2)
library(ggsci)
library(fgsea)
library(ComplexHeatmap)
library(matrixStats)
library(ggpubr)
library(msigdbr)
library(patchwork)


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union



Attaching package: ‘data.table’


The following objects are masked from ‘package:dplyr’:

    between, first, last


Loading required package: future


Attaching package: ‘purrr’


The following object is masked from ‘package:data.table’:

    transpose


Loading required package: grid

ComplexHeatmap version 2.8.0
Bioconductor page: http://bioconductor.org/packages/ComplexHeatmap/
Github page: https://github.com/jokergoo/ComplexHeatmap
Documentation: http://jokergoo.github.io/ComplexHeatmap-reference

If you use it in published research, please cite:
Gu, Z. Complex heatmaps reveal patterns and correlations in multidimensional 
  genomic data. Bioinformatics 2016.

The new InteractiveComplexHeatmap package can directly export static 
complex heatmaps into an interactive Shiny app with ze

### Functions

In [2]:
## Parse a single AffiMx affinity file
#------------------------------------
# file_path: path to a tab-separated, header-less affinity table.
#
# Returns a data.frame whose first two columns are renamed to `Name` and
# `affinity`, plus three derived columns:
#   motif_id - taken from the file name
#   event_id - parsed out of `Name`
#   seq_id   - parsed out of `Name`
process_file <- function(file_path) {

  # Motif identifier: base name of the file with everything from "0.6_"
  # onwards collapsed to "0.6"
  motif <- sub("0\\.6_.*","0.6",basename(file_path))

  aff <- fread(file_path, header = FALSE, sep = "\t",data.table=FALSE)
  aff <- mutate(aff, motif_id = motif)
  colnames(aff)[1:2] <- c("Name","affinity")

  mutate(aff,
         # Keep the text from the last "ENS" onwards and lower-case the first "CHR"
         event_id = sub("CHR","chr",sub(".*ENS","ENS",Name)),
         # Drop the "_ENS..." tail, then the leading upper-case/digit token, then
         # the first "_<number>_" field (plain digits or E+ notation)
         seq_id = sub("_([0-9]*|\\d+\\.\\d+E\\+\\d+)_","_",
                      sub("^([A-Z0-9]+)_","",
                          sub("_ENS.*","",Name))))
}

## Load the affinity tables of several motifs at once
#------------------------------------
# directory_path: folder that holds the affinity files
# motif_ids:      motif identifiers used as file-name prefixes
# window_size,
# event_type:     used to build the file names and added as columns to the result
#
# Returns the row-bound output of process_file() over all files.
process_all_motif_files <- function(directory_path, motif_ids, window_size, event_type) {

  # File names: <motif>_spliceSites_windowSize_<window>_up_and_down.affinity_<event>.txt
  affinity_files <- paste0(motif_ids,"_spliceSites_windowSize_",window_size,"_up_and_down.affinity_",event_type,".txt")
  affinity_paths <- file.path(directory_path, affinity_files)

  all_affinities <- furrr::future_map_dfr(affinity_paths, process_file)

  mutate(all_affinities,
         window_size=window_size,
         event_type=event_type)
}

In [3]:
# Wrapper: load the affinity files of the given motifs and test them for target genes
#
# motif_ids, event_type,
# window_size, dir: forwarded to process_all_motif_files()
# trex:             table with an `event_id` column; only affinities of these
#                   events are tested
# ...:              accepted for compatibility, currently unused
# p:                adjusted p-value cut-off forwarded to test_motif_targets().
#                   Defaults to 0.15, the value that was previously hard-coded.
#                   It comes after `...`, so it must be passed by name.
#
# Returns the table produced by test_motif_targets().
find_targets_all_motifs<-function(motif_ids,event_type,window_size,dir,trex,...,p=0.15){

    rbp.aff <- process_all_motif_files(directory_path = dir,
                                       motif_ids = motif_ids,
                                       event_type = event_type,
                                       window_size = window_size)
    # Filter to test only biologically relevant events (those present in `trex`)
    rbp.aff <- rbp.aff %>%
               filter(event_id %in% trex$event_id)

    rbp.targets <- test_motif_targets(rbp.aff,p=p) %>%
                   mutate(window_size = window_size)

    return(rbp.targets)
}

In [4]:
# Test for target genes
#
# rbp.aff: affinity table with the columns event_id, event_type, window_size,
#          seq_id, motif_id and affinity
# p:       adjusted p-value cut-off; rows with zscore_padj < p are labelled "target"
#
# Returns one row per event_type / window_size / seq_id / motif / gene, sorted by
# zscore_padj, with the gene-level mean log-affinity, its z-score within the
# event_type / window_size / seq_id / motif group, the z-score p-value, its
# FDR-adjusted value and the "target"/"other" label.
test_motif_targets <- function(rbp.aff,p){

    # Gene id = text before the first ";" of event_id, minus an optional
    # ".<version>" suffix. The dot is escaped and the version made optional so
    # that unversioned ids (e.g. "ENSG000123;SE:...") are kept intact; the
    # previous pattern ".\\d+;.*" reduced them to "ENS".
    gene.affinity <- rbp.aff %>%
                     mutate(gene_id = sub("(\\.\\d+)?;.*","",event_id)) %>%
                     group_by(event_type,window_size,seq_id,motif_id,gene_id) %>%
                     # Mean log-affinity per region over all events of the same gene
                     summarize(gene_affinity = mean(log(affinity)),
                               nevents = length(unique(event_id)),
                               .groups = "drop")

    target.tests <- gene.affinity %>%
                    group_by(event_type,window_size,seq_id,motif_id) %>%
                    # NOTE(review): lower.tail = TRUE gives small p-values to genes
                    # whose mean log-affinity is BELOW the group mean - confirm this
                    # is the intended direction.
                    mutate(ma = mean(gene_affinity),
                           zscore = (gene_affinity-ma)/sd(gene_affinity),
                           n = length(zscore),
                           zscore_pvalue = pnorm(zscore,lower.tail = TRUE),
                           zscore_padj = p.adjust(zscore_pvalue,method="fdr")) %>%
                    rename("representative_motif"="motif_id") %>%
                    ungroup()

    target.genes <- target.tests %>%
                    mutate(target_gene_region = as.character(ifelse(zscore_padj<p,"target","other"))) %>%
                    arrange(zscore_padj)

    return(target.genes)
}

### Inputs

In [5]:
# Analysis parameters
analysis <- "condition"        # Analysis type tag (condition / TumorStageContinuous)
voi <- "conditiontumor"        # Variable of interest (conditiontumor / sex / age / stage)
events_used <- c("SE", "RI", "AF", "AL", "A5", "A3")  # Events to include in the analysis
n <- 11                        # Number of cancers to call shared events
set.seed(7)
cancer_order_set <- c(
  "SKCM", "UVM", "SARC", "LAML", "TGCT", "UCEC", "CESC", "PRAD", "READ", "BLCA",
  "COAD", "PAAD", "CHOL", "KIRP", "KIRC", "KICH", "ACC", "PCPG", "STAD", "LIHC",
  "MESO", "LUSC", "LUAD", "BRCA", "THCA", "ESCA", "HNSC"
)

# Event flag information (path depends on the analysis tag)
obj_dir <- "../input/objects"
flags_file <- paste0(obj_dir, "/tcga.", analysis, ".psi.flags.RDS")

# RBP information: affinity directory, motif coefficient object, correlation table
aff_dir <- "../../../chapter2_Methods/2.5_ASregulators/output/affimx"
rbp_obj_file <- "../output/objects/Figure10_rbp_motif_coefs_gex_sig.RDS"
rbp_cors_file <- "../output/data/RBP_significant_correlations.tsv"

# Internal: model variables and the results object for this analysis
vars <- c(voi, "impurity")
res.file <- paste0("../input/objects/tcga.", analysis, ".res.lfcShrink.RDS")

In [6]:
# Run subsequent furrr/future calls on 6 background R sessions
plan(strategy = multisession, workers = 6)

### Load data

In [7]:
# RBP motif object read from `rbp_obj_file`
rbp_motif_coefs_gex_sig <- readRDS(rbp_obj_file)

# Results object read from `res.file`, restricted to the event types in `events_used`
trex.res.lfsh <- readRDS(res.file)
trex.res.lfsh <- filter(trex.res.lfsh, event_type %in% events_used)

# Event flags and the RBP correlation table
flags <- readRDS(flags_file)
signif.cors <- data.table::fread(rbp_cors_file)

### Filter events

In [8]:
# Rows for the "conditiontumor" variable that have an adjusted p-value
trex.res.cond <- filter(trex.res.lfsh, exp_var == "conditiontumor" & !is.na(padj))
# Remove the first ".<digits>" from the gene id
trex.res.cond <- mutate(trex.res.cond, gene_id = sub("\\.\\d+","",gene_id))
# Keep the unique gene / event pairs only
trex.res.cond <- distinct(trex.res.cond, gene_id, event_id)

In [9]:
# Number of distinct cancers in which each flagged event appears
valid_events <- flags %>%
                group_by(event_type,event_id) %>%
                summarize(ncancers=length(unique(cancer)), .groups = "drop") %>%
                # NOTE(review): `n` is not a column of the summarised table, so it
                # resolves to the global `n` (set to 11 in the parameters cell). The
                # condition is therefore always TRUE and no event is removed. A
                # filter on `ncancers` was presumably intended - confirm the
                # threshold before changing it.
                filter(n!=20)

# Keep only events present in `valid_events`. The key is spelled out (it is the
# only column shared by both tables) so the join does not silently change if
# columns are added later.
trex.res.cond <- inner_join(trex.res.cond, valid_events, by = "event_id")
dim(trex.res.cond)

[1m[22m`summarise()` has grouped output by 'event_type'. You can override using the
`.groups` argument.
[1m[22mJoining, by = "event_id"


### Find targets of RBPs with significant correlations between LFC and DSR

In [10]:
# One row per distinct motif / event type / window size combination
sig.rbp.motifs <- distinct(ungroup(rbp_motif_coefs_gex_sig),
                           representative_motif, event_type, wsize)

# `pars` packs the three values of each combination into an unnamed list, to be
# unpacked by position when the targets are searched
sig.rbp.motifs <- group_by(sig.rbp.motifs, representative_motif, event_type, wsize)
sig.rbp.motifs <- mutate(sig.rbp.motifs,
                         pars = list(list(representative_motif, event_type, wsize)))
sig.rbp.motifs <- ungroup(sig.rbp.motifs)

In [11]:
# Run the target search once per parameter set and row-bind the results.
# Each element of `pars` is list(motif, event type, window size).
motif_targets <- future_map_dfr(
  .x = sig.rbp.motifs$pars,
  .f = function(p,...){
    find_targets_all_motifs(motif_ids = p[[1]],
                            event_type = p[[2]],
                            window_size = p[[3]],
                            dir = aff_dir,
                            trex = trex.res.cond)
  }
)
# Drop exact duplicate rows
motif_targets <- distinct(motif_targets)

[1m[22m`summarise()` has grouped output by 'event_type', 'window_size', 'seq_id',
'motif_id'. You can override using the `.groups` argument.
[1m[22m`summarise()` has grouped output by 'event_type', 'window_size', 'seq_id',
'motif_id'. You can override using the `.groups` argument.
[1m[22m`summarise()` has grouped output by 'event_type', 'window_size', 'seq_id',
'motif_id'. You can override using the `.groups` argument.
[1m[22m`summarise()` has grouped output by 'event_type', 'window_size', 'seq_id',
'motif_id'. You can override using the `.groups` argument.
[1m[22m`summarise()` has grouped output by 'event_type', 'window_size', 'seq_id',
'motif_id'. You can override using the `.groups` argument.
[1m[22m`summarise()` has grouped output by 'event_type', 'window_size', 'seq_id',
'motif_id'. You can override using the `.groups` argument.
[1m[22m`summarise()` has grouped output by 'event_type', 'window_size', 'seq_id',
'motif_id'. You can override using the `.groups` argument.

In [12]:
# Keep only the gene/region rows labelled "target"
motif_targets_genes <- filter(motif_targets, target_gene_region == "target")

In [13]:
# Preview the first rows of the target gene/region table
head(motif_targets_genes)

event_type,window_size,seq_id,representative_motif,gene_id,gene_affinity,nevents,ma,zscore,n,zscore_pvalue,zscore_padj,target_gene_region
<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<int>,<dbl>,<dbl>,<int>,<dbl>,<dbl>,<chr>
A3,200,S3_DOWN_200,M012_0.6,ENSG00000104852,-14.17628,1,-4.617702,-4.111184,8669,1.968173e-05,0.08531045,target
A3,200,S3_DOWN_200,M012_0.6,ENSG00000156959,-14.25651,1,-4.617702,-4.145691,8669,1.693954e-05,0.08531045,target
A3,200,S2_DOWN_200,M012_0.6,ENSG00000156959,-14.25651,1,-4.506304,-4.25596,8669,1.04077e-05,0.09022436,target
A3,200,S2_DOWN_200,M012_0.6,ENSG00000198858,-13.75724,1,-4.506304,-4.038029,8669,2.695104e-05,0.11681929,target
A3,200,S3_DOWN_200,M012_0.6,ENSG00000198858,-13.75724,1,-4.617702,-3.930953,8669,4.230485e-05,0.12224692,target
A3,200,E1_DOWN_200,M012_0.6,ENSG00000243710,-13.5679,1,-4.196342,-4.057798,8669,2.476879e-05,0.13289494,target


In [14]:
# Attach the per-cancer RBP information (cancer, Motif_ID, RBP gene name and
# log2FoldChange) to every target row. The join uses the columns shared by both
# tables. RBP_GENENAME is GENENAME with everything from the first "_" removed.
motif_targets_genes <- left_join(
  motif_targets_genes,
  rbp_motif_coefs_gex_sig %>%
    mutate(RBP_GENENAME = sub("_.*","",GENENAME)) %>%
    distinct(cancer,representative_motif,Motif_ID,RBP_GENENAME,log2FoldChange)
)

[1m[22mJoining, by = "representative_motif"


In [15]:
# Preview the target table after the RBP columns were joined in
head(motif_targets_genes)

event_type,window_size,seq_id,representative_motif,gene_id,gene_affinity,nevents,ma,zscore,n,zscore_pvalue,zscore_padj,target_gene_region,cancer,Motif_ID,log2FoldChange,RBP_GENENAME
<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<int>,<dbl>,<dbl>,<int>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<dbl>,<chr>
A3,200,S3_DOWN_200,M012_0.6,ENSG00000104852,-14.17628,1,-4.617702,-4.111184,8669,1.968173e-05,0.08531045,target,BLCA,M012_0.6,-1.4714,CPEB3
A3,200,S3_DOWN_200,M012_0.6,ENSG00000104852,-14.17628,1,-4.617702,-4.111184,8669,1.968173e-05,0.08531045,target,BRCA,M012_0.6,-0.1372552,CPEB3
A3,200,S3_DOWN_200,M012_0.6,ENSG00000104852,-14.17628,1,-4.617702,-4.111184,8669,1.968173e-05,0.08531045,target,CESC,M012_0.6,-0.4280138,CPEB3
A3,200,S3_DOWN_200,M012_0.6,ENSG00000104852,-14.17628,1,-4.617702,-4.111184,8669,1.968173e-05,0.08531045,target,CHOL,M012_0.6,-2.963396,CPEB3
A3,200,S3_DOWN_200,M012_0.6,ENSG00000104852,-14.17628,1,-4.617702,-4.111184,8669,1.968173e-05,0.08531045,target,COAD,M012_0.6,-2.5881499,CPEB3
A3,200,S3_DOWN_200,M012_0.6,ENSG00000104852,-14.17628,1,-4.617702,-4.111184,8669,1.968173e-05,0.08531045,target,ESCA,M012_0.6,-1.4573021,CPEB3


In [16]:
# Number of distinct target genes per RBP
filter(
  summarize(
    group_by(motif_targets_genes, RBP_GENENAME, target_gene_region),
    n = length(unique(gene_id))
  ),
  target_gene_region == "target"
)

[1m[22m`summarise()` has grouped output by 'RBP_GENENAME'. You can override using the
`.groups` argument.


RBP_GENENAME,target_gene_region,n
<chr>,<chr>,<int>
ACO1,target,106
ANKHD1,target,1
ANKRD17,target,1
CELF3,target,138
CELF4,target,138
CIRBP,target,331
CNOT4,target,14
CPEB3,target,106
CPEB4,target,39
DAZAP1,target,289


In [17]:
# Match the targets back to the RBP table. A `seq_id` is rebuilt on the RBP side
# as <position>_<direction>_<wsize>, and the join uses all columns shared by the
# two tables.
rbp.targets <- motif_targets_genes %>%
  ungroup() %>%
  inner_join(mutate(rbp_motif_coefs_gex_sig,
                    seq_id = paste(position,direction,wsize,sep="_"))) %>%
  # RBP name without its "_..." suffix, and the sign of the RBP fold change
  mutate(RBP_GENENAME = sub("_.*","",GENENAME),
         RBP_dgex_group = ifelse(log2FoldChange>0,"up","down")) %>%
  rename(RBP.log2FoldChange = log2FoldChange)

[1m[22mJoining, by = c("event_type", "seq_id", "representative_motif", "cancer",
"Motif_ID", "log2FoldChange")


## Save objects

In [18]:
# Store the annotated target table
saveRDS(object = motif_targets_genes,
        file = "../output/objects/Figure10_all_motif_targets.RDS")

In [19]:
# Store the targets matched to the RBP table
saveRDS(object = rbp.targets,
        file = "../output/objects/Figure10_rbp.targets.RDS")