# Analysis of upstream regulators 
In association with condition

### Libraries

In [1]:
library(dplyr)
library(ggplot2)
library(ggsci)
library(matrixStats)
library(ggpubr)
library(patchwork)
library(readr)
library(data.table)
library(tidyr)
library(purrr)
library(caret)
library(stringr)


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union



Attaching package: ‘matrixStats’


The following object is masked from ‘package:dplyr’:

    count



Attaching package: ‘data.table’


The following objects are masked from ‘package:dplyr’:

    between, first, last



Attaching package: ‘purrr’


The following object is masked from ‘package:data.table’:

    transpose


Loading required package: lattice


Attaching package: ‘caret’


The following object is masked from ‘package:purrr’:

    lift




### Functions

In [2]:
# Read one coefficient table and annotate it with labels parsed from its file name.
#
# The file name is split on "_":
#   cancer     = text before the first "_"
#   event_type = text between the first and the second "_"
#   wsize      = text after the last "_", with the ".tsv" extension removed
#
# Args:
#   infile: path to a delimited text file with a header row. It must contain
#           the columns `Estimate` and "Std. Error"; a "Pr(>|t|)" column is
#           renamed when present.
# Returns:
#   The table read by fread() with "Std. Error" renamed to `StdError`,
#   "Pr(>|t|)" renamed to `coef.pvalue`, `Estimate`/`StdError` coerced to
#   numeric, and the character columns `cancer`, `event_type`, `wsize` added.
load_coef_file<-function(infile){
    fname<-basename(infile)
    cancer_label<-sub("_.*","",fname)
    # fixed = TRUE: remove the literal "<cancer>_" prefix instead of using the
    # cancer label as a regular expression
    event_label<-sub("_.*","",sub(paste0(cancer_label,"_"),"",fname,fixed=TRUE))
    # Escape and anchor the extension; a bare ".tsv" pattern lets "." match any character
    wsize_label<-sub("\\.tsv$","",sub(".*_","",fname))

    coefs<-data.table::fread(header = TRUE,file = infile) %>%
           mutate(cancer = cancer_label,
                  event_type = event_label,
                  wsize = wsize_label)

    colnames(coefs)[colnames(coefs)=="Std. Error"]<-"StdError"
    colnames(coefs)[colnames(coefs)=="Pr(>|t|)"]<-"coef.pvalue"

    coefs<-coefs%>%
            mutate(Estimate=as.numeric(Estimate),
                   StdError=as.numeric(StdError))
    return(coefs)
}

In [3]:
# Bubble plot of per-RBP statistics across cancers for a single event type.
#
# Args:
#   signif.cors: data frame holding (at least) the columns event_type, wsize,
#                position, direction, cancer, GENENAME, cor.estimate,
#                coef.estimate and coef.pvalue. Ordering uses cor.estimate,
#                the aesthetics use coef.estimate / coef.pvalue.
#   event:       event type to keep; also used as the plot title.
# Returns:
#   A ggplot object with cancers on the x axis (top), RBPs on the y axis
#   (right), fill = coef.estimate and point size = -log(coef.pvalue).
plot.rbp.cors<-function(signif.cors,event){
    event_rows<-signif.cors %>%
                filter(event_type==event)
    event_rows<-select(event_rows,-event_type,-wsize,-position,-direction)

    # Cancers ordered by the number of rows they contribute (fewest first)
    cancer_levels<-event_rows %>%
                   group_by(cancer) %>%
                   summarize(n1=sum(cor.estimate)*length(cor.estimate),
                             n2=length(cor.estimate)) %>%
                   arrange(n2) %>%
                   pull(cancer)

    # RBPs ordered by summed correlation weighted by the number of rows
    gene_levels<-event_rows %>%
                 group_by(GENENAME) %>%
                 summarize(n1=sum(cor.estimate)*length(cor.estimate),
                           n2=length(cor.estimate)) %>%
                 arrange(n1) %>%
                 pull(GENENAME)

    plot_df<-mutate(event_rows,
                    cancer=factor(cancer,levels=cancer_levels),
                    GENENAME=factor(GENENAME,levels=gene_levels))

    bubble_plot<-ggplot(plot_df,
                        aes(x=cancer,
                            y=GENENAME,
                            fill=coef.estimate,
                            size=-log(coef.pvalue)))+
        geom_point(pch=21)+
        theme_pubr()+
        theme(legend.position = "bottom",
              panel.background = element_blank(),
              panel.grid.major = element_line(linewidth=0.25,linetype="dashed"),
              panel.grid.minor = element_line(linewidth=0.25,linetype="dashed"),
              axis.ticks = element_blank(),
              axis.text.x = element_text(angle=90,hjust=1,vjust=0.5,size=10),
              axis.line = element_blank(),
              legend.key.height = unit(0.25, 'cm'),
              legend.key.width = unit(0.15, 'cm'),
              legend.spacing.x = unit(0.1, 'cm'),
              legend.margin = margin(t = 0, unit='cm'),
              legend.text = element_text(size=6),
              strip.background = element_blank(),
              strip.text = element_text(),
              panel.spacing.y = unit(0.1, "cm"))+
        scale_fill_gradient2(low = "blue",mid = "white",high = "red",
                             guide = guide_colourbar(direction = "horizontal",
                                                     barheight = 0.25,
                                                     barwidth = 4.5,
                                                     title.hjust = 0.5,
                                                     label.position = "bottom"))+
        scale_y_discrete(position = "right")+
        scale_x_discrete(position = "top")+
        labs(y="",x="",color="",fill=expression(rho),title=event)
    return(bubble_plot)
}

In [4]:
# Heatmap of RBP motif coefficients (rows = RBPs, columns = cancers) for one
# event type, faceted by position and direction.
#
# Args:
#   rb.coefs:    data frame with (at least) the columns event_type, wsize,
#                RBPs, position, direction, cancer, Motif_ID, Estimate and
#                coef.pvalue.
#   event:       event type to plot; also used as the plot title.
#   pval_cutoff: coefficients with coef.pvalue below this value count as
#                significant (default 0.01).
#   min_cancers: an RBP is kept when at least one of its
#                position/direction combinations has this many significant
#                rows (default 3).
# Returns:
#   list(plot = <ggplot>, msr = <one-row table with the position/direction
#   combination that has the most significant coefficients and its count>).
plot.rbp.coefs.filtered<-function(rb.coefs,event,pval_cutoff=0.01,min_cancers=3){

    pdata <- rb.coefs %>%
                filter(event_type==event) %>%
                select(-event_type,-wsize) %>%

                # significant rows per RBP / position / direction
                group_by(RBPs,position,direction) %>%
                mutate(nsig_cancer=sum(coef.pvalue<pval_cutoff)) %>%
                # keep RBPs with at least one combination reaching min_cancers
                group_by(RBPs) %>%
                mutate(nv_reg=sum(nsig_cancer>=min_cancers)) %>%
                filter(nv_reg!=0) %>%

                # drop cancer columns without any significant coefficient in a facet
                group_by(position,direction,cancer) %>%
                mutate(nsig_rbp=sum(coef.pvalue<pval_cutoff)) %>%
                filter(nsig_rbp!=0)

    # Position/direction combination with the most significant coefficients;
    # .groups = "drop" returns an ungrouped table without the summarise() message
    msr <- pdata %>%
             group_by(position,direction) %>%
             summarize(nsig=sum(coef.pvalue<pval_cutoff),.groups="drop") %>%
             slice_max(nsig,n=1,with_ties = FALSE)

    # Row order is taken from that combination: median * sd of the estimates
    rbp_ord <- pdata %>%
               filter(position==msr$position, direction==msr$direction) %>%
               group_by(RBPs) %>%
               summarize(n=median(Estimate),
                        s=sd(Estimate)) %>%
               arrange(n*s)

    pdata<-pdata %>%
            select(-Motif_ID) %>%
            distinct() %>%
            mutate(RBPs=factor(RBPs,levels=rbp_ord$RBPs),
                   coef.significant=coef.pvalue<pval_cutoff)
    g<-pdata %>%
        # within each facet, cancers are ordered by their number of
        # non-significant coefficients
        ggplot(.,aes(x=tidytext::reorder_within(x=cancer,
                                                by=coef.pvalue,
                                                within = list(position,direction),
                                                fun = function(p){sum(p>pval_cutoff)}),
                     y=RBPs))+
        geom_tile(aes(fill=Estimate))+
        # significant tiles are redrawn with a black outline
        geom_tile(data = pdata %>%
                          filter(coef.significant),
                  color="black",
                  aes(fill=Estimate))+
        facet_grid(~position+direction,scales="free_x",space="free")+
        theme_pubr()+
        theme(legend.position = "bottom",
                panel.background = element_blank(),
                panel.grid.major = element_line(linewidth=0.05,linetype="solid"),
                panel.grid.minor = element_line(linewidth=0.05,linetype="solid"),
                axis.ticks = element_blank(),
                axis.text.x = element_text(angle=90,hjust=1,vjust=0.5,size=7),
                axis.text.y = element_text(size=9),
                axis.line = element_blank(),
                legend.key.height= unit(0.25, 'cm'),
                legend.key.width= unit(0.15, 'cm'),
                legend.spacing.x = unit(0.1, 'cm'),
                legend.margin=margin(t = 0, unit='cm'),
                legend.text = element_text(size=6),
                strip.background = element_blank(),
                strip.text = element_text(),
                panel.spacing.y = unit(0.1, "cm"))+
        scale_fill_gradient2(low = "blue",mid = "white",high = "red",
                            guide = guide_colourbar(direction = "horizontal",
                                                         barheight = 0.25,
                                                         barwidth = 6.5,
                                                         title.hjust = 0.5,
                                                         label.position = "bottom"))+
        scale_y_discrete(position = "right")+
        # scale_x_reordered() is the only x scale: a scale_x_discrete() added
        # before it would be replaced and only trigger a
        # "Scale for x is already present" message
        tidytext::scale_x_reordered()+
        labs(y="",x="",color="",fill=expression(beta),title=event)

    return(list("plot"=g,"msr"=msr))
}

In [5]:
# Side panel showing, per RBP, the correlation estimate for each
# position (facet) and direction (x axis).
#
# Args:
#   d.bar: data frame with the columns RBPs, position, direction, ncancers,
#          cor.estimate, cor.pvalue and cor.padj.
#   msr:   one-row table with a `position` column; RBPs are ordered by their
#          mean correlation at that position.
#   event: plot title. When NULL (default) the function falls back to a
#          variable named `event` visible from the environment in which this
#          function was defined (this is how the title was resolved before the
#          argument existed); if there is none, the plot has no title.
# Returns:
#   A ggplot object (fill = cor.estimate, size = ncancers, black outline when
#   cor.padj < 0.05).
plot.corr.sidebar<-function(d.bar,msr,event=NULL){

    if(is.null(event)){
        # Backward-compatible lookup of the loop variable used by existing callers
        event<-get0("event",envir=parent.env(environment()),ifnotfound=NULL)
    }

    pdata<-d.bar %>%
            filter(ncancers>10) %>%
            group_by(RBPs,position,direction) %>%
            slice_min(cor.pvalue,n=1,with_ties=FALSE) %>% # Same RBP with multiple motifs, show the one with the smallest p-value
            ungroup()

    rbp_ord <- pdata %>%
               filter(position==msr$position) %>%
               group_by(RBPs) %>%
               summarize(n=mean(cor.estimate)) %>%
               arrange(n)

    pdata <- pdata %>%
             mutate(signif=cor.padj<0.05,
                    RBPs=factor(RBPs,levels=rbp_ord$RBPs))


    g<-ggplot(pdata,aes(x=direction,y=RBPs,fill = cor.estimate,size=ncancers,color=signif))+
        geom_point(pch=21)+
        facet_grid(~position)+
        theme_pubr()+
        theme(legend.position = "bottom",
                panel.background = element_blank(),
                panel.grid.major = element_blank(),
                panel.grid.minor = element_blank(),
                axis.ticks = element_blank(),
                axis.text.x = element_text(angle=90,hjust=1,vjust=0.5,size=7),
                axis.text.y = element_text(size=10),
                axis.line = element_blank(),
                legend.key.height= unit(0.25, 'cm'),
                legend.key.width= unit(0.15, 'cm'),
                legend.spacing.x = unit(0.1, 'cm'),
                legend.margin=margin(t = 0, unit='cm'),
                legend.text = element_text(size=6),
                strip.background = element_blank(),
                strip.text = element_text(),
                panel.spacing.y = unit(0.1, "cm"))+
        # outline colour: white = not significant, black = significant
        scale_color_manual(values=c("white","black"),guide="none")+
        scale_fill_gradient2(low = "blue",mid = "white",high = "red",
                            guide = guide_colourbar(direction = "horizontal",
                                                         barheight = 0.25,
                                                         barwidth = 6.5,
                                                         title.hjust = 0.5,
                                                         label.position = "bottom"))+
        scale_y_discrete(position = "left")+
        scale_x_discrete(position = "bottom")+
        scale_size(range=c(1,2))+
        labs(y="",x="",color="",fill=expression(rho),title=event,size="n")
    return(g)
}

In [6]:
# Scatter plot of motif coefficient (x) against expression log2 fold change
# (y) for one RBP tag and one event type, one point per row, coloured by cancer.
#
# Args:
#   rbp_motif_coefs_gex: data frame with the columns GENENAME, event_type,
#                        Estimate, log2FoldChange and cancer.
#   gn:                  value of GENENAME to plot; also used as the title.
#   event:               event type to keep.
#   ...:                 accepted for call compatibility; not used.
#   colors:              named colour vector passed to scale_color_manual().
#                        Defaults to the `tcga.colors` object visible from the
#                        environment where this function was defined, which is
#                        what the function used implicitly before.
# Returns:
#   A ggplot object without legend, annotated with ggpubr::stat_cor().
plot.rbp.scater<-function(rbp_motif_coefs_gex,gn,event,...,colors=tcga.colors){

    plot.data<-rbp_motif_coefs_gex %>%
               filter(GENENAME==gn,
                      event_type==event) %>%
               distinct()

    g<-ggplot(plot.data,aes(x=Estimate,y=log2FoldChange,color=cancer))+
        geom_vline(xintercept = 0,linewidth=0.5,linetype="dashed",color="grey")+
        geom_hline(yintercept = 0,linewidth=0.5,linetype="dashed",color="grey")+
        geom_point(size=3)+
        theme_pubr()+
        # own aesthetics without `color`, so one correlation is computed over all points
        stat_cor(aes(x=Estimate,y=log2FoldChange),inherit.aes = FALSE)+
        scale_color_manual(values=colors)+
        labs(x="Differential binding",y="Differential expression",title=gn)+
        theme(legend.position="none")

    return(g)
}

### Inputs

In [7]:
# Input locations (relative to the notebook directory)
gex.file            <- "../input/objects/tcga.condition.gex.res.lfcShrink.RDS"
gmapfile            <- "../input/references/gencode.v37.primary_assembly.annotation.geneIDmap.tsv"
model_res_dir       <- "../input/data/coefficients"
rbp_info_file       <- "../input/references/cisbp_Homo_sapiens_2023_06_RBP_Information_all_motifs.txt"
motif_clusters_file <- "../input/data/25112023_representative_cluster_motifs.tsv"

In [8]:
# Paths to the serialized condition-model result objects
coef.obj.file    <- "../input/objects/tcga.condition.res.RDS"
coef.sh.obj.file <- "../input/objects/tcga.condition.res.lfcShrink.RDS"

In [9]:
# Event types handled by the analysis
events_used <- c("SE", "MX", "RI", "A3", "A5", "AF", "AL")

# Per event type: the position labels in the order used for plot facets
ss_levels <- list(
    SE = c("E1", "S2", "E2", "S3"),
    MX = c("E1", "S2", "E2", "S3", "E3", "S4"),
    A5 = c("E1", "E2", "S3"),
    A3 = c("E1", "S2", "S3"),
    RI = c("S1", "E1", "S2", "E2"),
    AF = c("S1", "E1", "S2", "E2", "S3"),
    AL = c("E1", "S2", "E2", "S3", "E3")
)

### Load data
---

#### RBP info

In [None]:
# Motif cluster table; its `motif` column is renamed to `Motif_ID`
motif_clusters <- rename(data.table::fread(motif_clusters_file, sep = "\t"),
                         Motif_ID = motif)

In [None]:
# RBP annotation table, reduced to the identifier columns and restricted to
# motifs present in `motif_clusters`.
# `header` is spelled out: `head =` only worked through partial argument matching.
rbp_info <- fread(rbp_info_file,header = TRUE,sep = "\t",data.table = FALSE)
rbp_motif <- rbp_info %>%
             select('RBP_ID','Motif_ID',"ENSDBID",'RBP_Name','Family_Name') %>%
             rename("GENEID"="ENSDBID") %>%
             # joins on the columns shared with motif_clusters
             inner_join(.,motif_clusters) %>%
             ungroup()

In [None]:
# Report how many distinct RBP genes, representative motifs and motifs were kept
message("Number of RBPS = ", n_distinct(rbp_motif$GENEID))
message("Number of ref motifs = ", n_distinct(rbp_motif$representative_motif))
message("Number of motifs = ", n_distinct(rbp_motif$Motif_ID))

#### DGE from deseq

In [None]:
# Gene ID map and differential expression results.
# Only the "condition_tumor_vs_normal" rows are kept; GENEID is gene_id with
# everything from the first "." onwards removed, and is then annotated from
# `genemap` (joined on the columns the two tables share).
genemap<-data.table::fread(gmapfile,data.table=FALSE)
gex.res<-readRDS(gex.file) %>%
         filter(exp_var=="condition_tumor_vs_normal")%>%
         mutate(GENEID=sub("\\..*","",gene_id))%>%
         left_join(.,genemap)

#### RBP model coefficients

In [None]:
# Load every coefficient file in `model_res_dir` whose name matches "200"
# and stack them into one table (one row per model coefficient).
model_files<-list.files(path = model_res_dir,full.names = TRUE,pattern = "200")
coefs<-furrr::future_map_dfr(model_files,load_coef_file) %>%
       filter(coefficient!="(Intercept)") %>%
       # Coefficient names are split on "_": fields 1-2 are pasted back together
       # as the motif ID, field 3 is the position and field 4 the direction.
       rowwise() %>% 
       mutate(coefficient_lis=strsplit(coefficient,"_"),
              Motif_ID=paste(coefficient_lis[1:2],collapse="_"),
              position=coefficient_lis[3],
              direction=coefficient_lis[4]) %>%
       as_tibble() %>%
       select(-coefficient_lis) %>%
       # the motif parsed from the coefficient name is stored as `representative_motif`
       rename("representative_motif"="Motif_ID") %>%
       mutate(seq_id = paste(position,direction,wsize,sep="_")) 
dim(coefs)

### Combine tables
---

In [None]:
# Attach RBP annotation to every coefficient (join on the shared columns)
rbp_motif_coefs <- coefs %>%
                   inner_join(rbp_motif)
dim(rbp_motif_coefs)

In [None]:
# Add the differential expression results of each RBP gene and keep one row
# per distinct combination of the listed columns
rbp_motif_coefs_gex <- rbp_motif_coefs %>%
                       inner_join(gex.res) %>%
                       distinct(GENENAME,
                                Motif_ID,
                                representative_motif,
                                cancer,
                                event_type,
                                wsize,
                                position,
                                direction,
                                Estimate,
                                coef.pvalue,
                                log2FoldChange,
                                pvalue,
                                padj)

In [None]:
# Preview the first rows of the combined coefficient / expression table
head(rbp_motif_coefs_gex)

In [None]:
# Compute correlations
all.cors<-rbp_motif_coefs_gex %>%
            mutate(GENENAME=paste(GENENAME,representative_motif,sep="_")) %>%                    
            dplyr::group_by(GENENAME,position,direction,event_type,wsize) %>%
            mutate(n=length(cancer)) %>%
            filter(n>=3) %>%
            dplyr::summarize(ncancers=length(unique(cancer)),
                             cors=list(cor.test(Estimate,log2FoldChange)[c("estimate","p.value")]))  %>%
            rowwise() %>%
            mutate(cor.estimate=cors[[1]][1],
                   cor.pvalue=cors[[2]][1]) %>%
            as_tibble() %>%
            select(-cors) %>%
            dplyr::group_by(wsize,event_type) %>%
            mutate(cor.padj = p.adjust(cor.pvalue,method = "fdr"),
                   ntests = length(cor.pvalue),
                   nrbp = length(unique(GENENAME)),
                   npos = length(unique(position))) 

In [None]:
# Correlations with an FDR-adjusted p-value below 0.1.
# GENENAME is extended with position, window size and direction, giving
# "<gene>_<representative_motif>_<position>_<wsize>_<direction>".
# The result stays grouped by GENENAME, event_type and wsize.
signif.cors<-all.cors %>%
             ungroup() %>%
             filter(cor.padj<0.1) %>%
             group_by(GENENAME,event_type,wsize) %>%
             mutate(GENENAME=paste(GENENAME,position,wsize,direction,sep="_"))
dim(signif.cors)

In [None]:
# Per-cancer coefficient / expression rows of the significantly correlated tags.
# GENENAME is rebuilt as "<gene>_<representative_motif>_<position>_<wsize>_<direction>"
# so that it matches the key used in `signif.cors`; the join uses the columns
# shared by both tables.
# A former second inner_join against this table's own distinct GENENAME values
# was removed: every row trivially matched exactly one key, so it could not
# change the result.
rbp_motif_coefs_gex_sig<-rbp_motif_coefs_gex %>%
                         mutate(GENENAME=paste(GENENAME,representative_motif,position,wsize,direction,sep="_")) %>%
                         inner_join(.,signif.cors) %>%
                         # RBP name = text before the first "_" of the tag
                         mutate(RBP_Name=sub("_.*","",GENENAME))

In [None]:
# Per-cancer coefficient / expression rows joined with ALL correlations
# (significant or not). Both sides get the same
# "<gene>_<representative_motif>_<position>_<wsize>_<direction>" GENENAME key;
# the join uses the columns shared by both tables.
# A former second inner_join against this table's own distinct GENENAME values
# was removed: every row trivially matched exactly one key, so it could not
# change the result.
rbp_motif_coefs_gex_all<-rbp_motif_coefs_gex %>%
                         mutate(GENENAME=paste(GENENAME,representative_motif,position,wsize,direction,sep="_")) %>%
                         inner_join(.,all.cors %>% mutate(GENENAME=paste(GENENAME,position,wsize,direction,sep="_"))) %>%
                         # RBP name = text before the first "_" of the tag
                         mutate(RBP_Name=sub("_.*","",GENENAME))

In [None]:
# Row/column counts of the significant and the complete joined tables
dim(rbp_motif_coefs_gex_sig)
dim(rbp_motif_coefs_gex_all)

In [None]:
# Store the joined tables as RDS objects.
saveRDS(rbp_motif_coefs,"../output/objects/Figure10_rbp_motif_coefs.RDS")
saveRDS(rbp_motif_coefs_gex_sig,"../output/objects/Figure10_rbp_motif_coefs_gex_sig.RDS")
# Uses the same "Figure10_" prefix as the other objects: the visualization
# section reads "Figure10_rbp_motif_coefs_gex_all.RDS", which a "Figure5_"
# file name would never produce.
saveRDS(rbp_motif_coefs_gex_all,"../output/objects/Figure10_rbp_motif_coefs_gex_all.RDS")

In [None]:
# Export the correlation tables as tab-separated text with a header row
write.table(signif.cors,
            file = "../output/data/RBP_significant_correlations.tsv",
            sep = "\t",
            quote = FALSE,
            row.names = FALSE,
            col.names = TRUE)
write.table(all.cors,
            file = "../output/data/RBP_all_correlations.tsv",
            sep = "\t",
            quote = FALSE,
            row.names = FALSE,
            col.names = TRUE)

## Visualize results
---

In [10]:
# Read the joined coefficient / expression tables (significant subset and
# complete set) from ../output/objects
rbp_motif_coefs_gex_sig<-readRDS("../output/objects/Figure10_rbp_motif_coefs_gex_sig.RDS")
rbp_motif_coefs_gex_all<-readRDS("../output/objects/Figure10_rbp_motif_coefs_gex_all.RDS")

In [11]:
# Colour palette for cancer types (31 entries), assigned to the cancer labels
# in alphabetical order.
tcga.colors<-c("#EF9A9A","#F8BBD0","#CE93D8","#B39DDB","#9FA8DA",
               "#90CAF9","#81D4FA","#80DEEA","#80CBC4","#A5D6A7",
               "#C5E1A5","#E6EE9C","#FFF59D","#FFE082","#BCAAA4",
               "#F44336","#E91E63","#9C27B0","#673AB7","#3F51B5",
               "#2196F3","#03A9F4","#00BCD4","#009688","#4CAF50",
               "#8BC34A","#CDDC39","#FFC107","#FF9800","#795548",
               "#B71C1C")
cancer_types<-sort(unique(rbp_motif_coefs_gex_all$cancer))
# Fail with a clear message when there are more cancer types than colours,
# and keep only as many colours as there are cancer types so that no
# palette entry is left with a missing name.
if(length(cancer_types)>length(tcga.colors)){
    stop("tcga.colors has ",length(tcga.colors)," colours but the data contain ",
         length(cancer_types)," cancer types",call.=FALSE)
}
tcga.colors<-setNames(tcga.colors[seq_along(cancer_types)],cancer_types)
events<-unique(rbp_motif_coefs_gex_sig$event_type)

In [12]:
# One row per distinct tag / correlation estimate / adjusted p-value / cancer count
cors <- distinct(rbp_motif_coefs_gex_sig,
                 GENENAME, cor.estimate, cor.padj, ncancers)

In [14]:
# RBP labels excluded from the coefficient heatmaps
# (based on preliminary explorations of coefficients)
outlier.rbps <- c(
    "SRSF1",
    "SRSF2",
    "SRSF7",
    "ANKHD1",
    "SRSF7, ANKHD1"
)

In [15]:
# For every event type: draw the coefficient heatmap of the RBPs with a
# significant correlation plus the correlation side panel, and save both as PDF.
ws<-200
coef_fig_dir<-"../output/figures/rbp_coefficients/"
# Make sure the output directory exists before ggsave() writes into it
dir.create(coef_fig_dir,showWarnings=FALSE,recursive=TRUE)

for(event in events){

    sig_rbp <- rbp_motif_coefs_gex_sig  %>%
               filter(event_type == event) %>%
               distinct(event_type,RBP_Name)

    d<-rbp_motif_coefs_gex_all %>%
        ungroup() %>%
        filter(wsize == ws,
               event_type == event,
               RBP_Name %in% sig_rbp$RBP_Name) %>%
        # Shorten the tag by removing the literal "_0.6_" and "_<ws>_" parts;
        # fixed = TRUE so that "." is not treated as a regex wildcard
        mutate(GENENAME=sub("_0.6_","_",GENENAME,fixed=TRUE) %>%
                        sub(paste0("_",ws,"_"),"_",.,fixed=TRUE))

    if(nrow(d)==0){
        next
    }

    d <- d %>%
          mutate(position=factor(position,ss_levels[[event]]),
                 direction=factor(direction,levels=c("UP","DOWN"))) %>%
          # all RBP names sharing a representative motif are collapsed into one label
          group_by(representative_motif) %>%
          mutate(RBPs=paste(unique(RBP_Name),collapse = ", ")) %>%
          select(-RBP_Name) %>%
          distinct() %>%
          filter(!RBPs%in%outlier.rbps) %>%
          # clamp estimates to the 5%-95% quantile range; the table is still
          # grouped by representative_motif here, so quantiles are per motif
          mutate(Estimate = case_when(Estimate>quantile(Estimate,0.95) ~ quantile(Estimate,0.95),
                                      Estimate<quantile(Estimate,0.05) ~ quantile(Estimate,0.05),
                                      TRUE ~ Estimate))

    if(nrow(d)==0){
        # nothing left to draw once the outlier RBPs are removed
        message(event,": no rows left after removing outlier RBPs, skipping")
        next
    }

    glist <-plot.rbp.coefs.filtered(rb.coefs = d,
                               event = event)

    d.bar<-d %>%
           ungroup() %>%
           # side panel rows are labelled with the text before the first "_" of the tag
           mutate(RBPs=sub("_.*","",GENENAME)) %>%
           distinct(RBPs,position,direction,ncancers,cor.estimate,cor.padj,cor.pvalue)  %>%
           group_by(RBPs) %>%
           filter(sum(cor.padj<0.05)>=1) %>%
           arrange(cor.estimate)
    s<-plot.corr.sidebar(d.bar,msr=glist$msr)

    message(event," ",glist$msr$position," ",glist$msr$direction)

    ggsave(glist$plot,filename=paste0(coef_fig_dir,ws,".",event,".pdf"),height=6,width=14,device=cairo_pdf)
    ggsave(s,filename=paste0(coef_fig_dir,ws,".",event,".correlations.pdf"),height=7,width=3.5,device=cairo_pdf)
}

[1m[22m`summarise()` has grouped output by 'position'. You can override using the
`.groups` argument.
[1m[22mScale for [32mx[39m is already present.
Adding another scale for [32mx[39m, which will replace the existing scale.
A3 S2 DOWN

[1m[22m`summarise()` has grouped output by 'position'. You can override using the
`.groups` argument.
[1m[22mScale for [32mx[39m is already present.
Adding another scale for [32mx[39m, which will replace the existing scale.
A5 E1 DOWN

[1m[22m`summarise()` has grouped output by 'position'. You can override using the
`.groups` argument.
[1m[22mScale for [32mx[39m is already present.
Adding another scale for [32mx[39m, which will replace the existing scale.
AF E1 UP

[1m[22m`summarise()` has grouped output by 'position'. You can override using the
`.groups` argument.
[1m[22mScale for [32mx[39m is already present.
Adding another scale for [32mx[39m, which will replace the existing scale.
AL S2 UP

[1m[22m`summarise()` has g

In [16]:
# For every event type (in reverse order): scatter plot of coefficient vs.
# expression change for the first tag with ncancers >= 5 and cor.padj < 0.1.
ws<-200

# The plotting table does not depend on the event or the tag, so it is built once.
# fixed = TRUE: "_0.6_" and "_<ws>_" are literal strings, not regular expressions.
plot_data<-rbp_motif_coefs_gex_all %>%
           filter(wsize==ws) %>%
           mutate(GENENAME=sub("_0.6_","_",GENENAME,fixed=TRUE) %>%
                           sub(paste0("_",ws,"_"),"_",.,fixed=TRUE))

for(event in rev(events)){
    genetags<-rbp_motif_coefs_gex_sig %>%
              filter(event_type==event,ncancers>=5,cor.padj<0.1) %>%
              mutate(GENENAME=sub("_0.6_","_",GENENAME,fixed=TRUE) %>%
                              sub(paste0("_",ws,"_"),"_",.,fixed=TRUE)) %>%
              distinct(GENENAME)

    # head(x, 1) instead of x[1]: when no tag qualifies this is an empty
    # vector and the loop body is skipped, whereas x[1] would yield NA
    for(gn in head(genetags$GENENAME,1)){

        g<-plot.rbp.scater(rbp_motif_coefs_gex = plot_data,
                           event = event,
                           gn = gn)
        outdir<-paste0("../output/figures/rbp_scatterplots/",event)
        # recursive = TRUE also creates missing parent directories
        dir.create(outdir,showWarnings=FALSE,recursive=TRUE)
        ggsave(g,filename = paste0(outdir,"/",ws,"_",event,"_",gn,".pdf"),width=4,height = 4.25,device="pdf")
    }
}

In [17]:
# Export the cancer colour legend on its own.
# `g` is the last scatter plot assigned by the preceding loop; its legend is
# switched on, extracted with get_legend() and saved as a separate PDF.
g<-g+theme(legend.position="bottom")+labs(color="")
l<-as_ggplot(get_legend(g))
ggsave(l,filename = "../output/figures/legend_all_cancers.pdf",
       width=4,height = 3,device="pdf")