## Heart snATAC topic analysis

In [9]:
suppressPackageStartupMessages(library(ArchR))
library(parallel)
library(viridis)
suppressPackageStartupMessages(library(Seurat))
set.seed(1234)
library(reshape2)
library(dplyr)
library(ComplexHeatmap)
library(viridis)
library(circlize)
addArchRThreads(threads = 8) 
#addArchRGenome("mm10")
#library(BSgenome.Mmusculus.UCSC.mm10)

circlize version 0.4.15
CRAN page: https://cran.r-project.org/package=circlize
Github page: https://github.com/jokergoo/circlize
Documentation: https://jokergoo.github.io/circlize_book/book/

If you use it in published research, please cite:
Gu, Z. circlize implements and enhances circular visualization
  in R. Bioinformatics 2014.

This message can be suppressed by:
  suppressPackageStartupMessages(library(circlize))


Setting default number of Parallel threads to 8.



## Load Topic project

In [2]:
setwd("/share/crsp/lab/seyedam/share/enc4_mouse/snatac/scripts")
setwd("../topics")

hrt = loadArchRProject(path = "ENC4_Mouse_Topics_Heart/")


Successfully loaded ArchRProject!


                                                   / |
                                                 /    \
            .                                  /      |.
            \\\                              /        |.
              \\\                          /           `|.
                \\\                      /              |.
                  \                    /                |\
                  \\#####\           /                  ||
                ==###########>      /                   ||
                 \\##==......\    /                     ||
            ______ =       =|__ /__                     ||      \\\
       \               '        ##_______ _____ ,--,__,=##,__   ///
        ,    __==    ___,-,__,--'#'  ==='      `-'    | ##,-/
        -,____,---'       \\####\\________________,--\\_##,/
           ___      .______        ______  __    __  .______      
          /   \     |   _  \      /      ||  |  |  | |   _ 

## addGroupCoverages

In [None]:
hrt <- addGroupCoverages(ArchRProj = hrt, 
                                minCells = max(table(hrt$Sample)),
                                groupBy = "Sample",
                                force = TRUE)

## addReproduciblePeakSet

In [None]:
pathToMacs2 <- findMacs2()
hrt <- addReproduciblePeakSet(
    ArchRProj = hrt, 
#    minCells = 1000, # didnt work
    groupBy = "Sample", 
    pathToMacs2 = pathToMacs2,
    
)

## addMotifAnnotations

In [None]:
hrt <- addMotifAnnotations(ArchRProj = hrt, 
                                  motifSet="vierstra",
                                  collection = "archetype",
                                  name = "Motif")

## Save

In [None]:
saveArchRProject(ArchRProj = hrt, outputDirectory = "ENC4_Mouse_Topics_Heart")


In [20]:
setwd("/share/crsp/lab/seyedam/share/enc4_mouse/snatac/scripts")
setwd("../topics")

hrt = loadArchRProject(path = "ENC4_Mouse_Topics_Heart/")


Successfully loaded ArchRProject!


                                                   / |
                                                 /    \
            .                                  /      |.
            \\\                              /        |.
              \\\                          /           `|.
                \\\                      /              |.
                  \                    /                |\
                  \\#####\           /                  ||
                ==###########>      /                   ||
                 \\##==......\    /                     ||
            ______ =       =|__ /__                     ||      \\\
       \               '        ##_______ _____ ,--,__,=##,__   ///
        ,    __==    ___,-,__,--'#'  ==='      `-'    | ##,-/
        -,____,---'       \\####\\________________,--\\_##,/
           ___      .______        ______  __    __  .______      
          /   \     |   _  \      /      ||  |  |  | |   _ 

## Make heatmaps

In [21]:
motifs_meta = read.delim("../ref/vierstra_archetype_meta.tsv")
anno = getPeakAnnotation(ArchRProj = hrt, name = "Motif")
peaks_by_motifs = readRDS(anno$Matches)
grn_df = assays(peaks_by_motifs)$matches
grn_meta = rowData(peaks_by_motifs)
grn_meta$uniqueID = paste0(rownames(grn_meta),":",grn_meta$idx)
genes = read.csv("../ref/gene_weights_rLDA_model.csv")

rownames(genes) = genes$X
genes$X = NULL
genes = as.matrix(genes)
genes = t(genes)
genes = genes[rownames(genes) %in% unique(rownames(grn_meta)),]
genes = genes[match(unique(rownames(grn_meta)), rownames(genes)),]


In [22]:
# filter by topic of interest
topic_grn_df_list = list()
for (topic in unique(rownames(grn_meta))){
    topic = "Topic11"
    int_df = grn_df[grep(topic,rownames(grn_df)),] # columns are motifs
    topic_grn_meta = grn_meta[grep(topic,rownames(grn_meta)),]
    
    # format 
    int_df <- 1*as.matrix(int_df)
    int_df = t(int_df) # rows are motifs
    int_df = as.data.frame(int_df)
    
    colnames(int_df) = make.unique(colnames(int_df), sep="_")
    
    int_df$cluster = rownames(int_df)
    int_df$cluster = do.call("rbind", strsplit(as.character(int_df$cluster), "[|]"))[,1]
    
    # change archetype to name
    int_df = left_join(int_df,motifs_meta) 
    topic_grn_df=int_df
    topic_grn_df$cluster= NULL
    topic_grn_df$motif_id= NULL
    topic_grn_df$source_id= NULL
    topic_grn_df$tf_name= NULL
    topic_grn_df$family_name= NULL
    topic_grn_df$motif_type= NULL
    topic_grn_df$PMID = NULL
    topic_grn_df = as.matrix(topic_grn_df)
    
    rownames(topic_grn_df) = str_to_title(int_df$tf_name) 
    
    # filter rows (motifs) for those in topic genes
    # get topic genes
    topic_genes = as.vector(genes[topic,])
    names(topic_genes) = colnames(genes)

    topic_genes = topic_genes[topic_genes > 1] # ????????
    topic_genes = names(topic_genes)

    topic_grn_df = topic_grn_df[rownames(topic_grn_df) %in% topic_genes,]

    # filter columns (peaks) for the ones < 10,000 bp from TSS  
    # LOT OF OPTIONS HERE
    topic_grn_df  = topic_grn_df[,topic_grn_meta$distToTSS < 10000]
    topic_grn_meta = topic_grn_meta[topic_grn_meta$distToTSS < 10000,]


    #topic_grn_df  = topic_grn_df[,topic_grn_meta$score > 50]
    #topic_grn_meta = topic_grn_meta[topic_grn_meta$score > 50,]

    colnames(topic_grn_df) = topic_grn_meta$nearestGene
    

    # filter peaks (peak-associated genes atp) for those in topic_genes
    topic_grn_df = topic_grn_df[,colnames(topic_grn_df) %in% topic_genes]

  
    # sum up duplicate peaks  (columns)
    unique_peaks = colnames(topic_grn_df)
    unique_peaks = unique_peaks[!(duplicated(unique_peaks)|duplicated(unique_peaks, fromLast=TRUE))]
    topic_grn_df_unique = topic_grn_df[,colnames(topic_grn_df) %in% unique_peaks]

    if (length(unique_peaks) != ncol(topic_grn_df)){
        topic_grn_df_dupe = topic_grn_df[,!(colnames(topic_grn_df) %in% unique_peaks)]
        topic_grn_df_dupe = sapply(unique(colnames(topic_grn_df_dupe)), 
                               function(x) rowSums(topic_grn_df_dupe[,grepl(x, colnames(topic_grn_df_dupe))])
                               )
        topic_grn_df = cbind(topic_grn_df_unique,topic_grn_df_dupe)
        } else {
        topic_grn_df = topic_grn_df_unique
        }

# metadata may have duplicated nearest gene have to sum their score?
    topic_grn_meta = as.data.frame(topic_grn_meta)
    topic_grn_meta = topic_grn_meta[,c("nearestGene","score")]
    topic_grn_meta = ddply(topic_grn_meta,"nearestGene",numcolwise(sum))
  topic_grn_meta$gene = topic_grn_meta$nearestGene


    topic_grn_df = unique(topic_grn_df)
    rownames(topic_grn_df) = toupper(rownames(topic_grn_df))

    # average duplicate motifs
    unique_motifs = rownames(topic_grn_df)
    unique_motifs = unique_motifs[!(duplicated(unique_motifs)|duplicated(unique_motifs, fromLast=TRUE))]

    topic_grn_df_unique = topic_grn_df[rownames(topic_grn_df) %in% unique_motifs,]
    if (length(unique_motifs) != nrow(topic_grn_df)){

        topic_grn_df_dupe = topic_grn_df[!(rownames(topic_grn_df) %in% unique_motifs),]
        topic_grn_df_dupe = aggregate(topic_grn_df_dupe, list(row.names(topic_grn_df_dupe)), mean)
        rn = topic_grn_df_dupe$Group.1
        topic_grn_df_dupe$Group.1 = NULL
        topic_grn_df_dupe = as.matrix(topic_grn_df_dupe)
        rownames(topic_grn_df_dupe) = rn
        topic_grn_df = rbind(topic_grn_df_unique,topic_grn_df_dupe)
        } else {
        topic_grn_df = topic_grn_df_unique
        }


topic_genes = as.vector(genes[topic,])
names(topic_genes) = colnames(genes)

topic_genes = as.data.frame(topic_genes)
#topic_genes = as.data.frame(topic_genes[topic_genes > 1]) # ????????
colnames(topic_genes)= "weight"
topic_genes$gene = rownames(topic_genes)

testing_genes = as.data.frame(colnames(topic_grn_df))
colnames(testing_genes) = "gene"

weight_meta = left_join(testing_genes,topic_genes)
weight_meta$weight[is.na(weight_meta$weight)]= 0


score_meta = left_join(testing_genes,topic_grn_meta)

col_fun = colorRamp2(c(min(weight_meta$weight), round((max(weight_meta$weight) + min(weight_meta$weight))/2), max(weight_meta$weight)), c("blue", "white", "red"))
col_fun2 = colorRamp2(c( min(score_meta$score), round((max(score_meta$score) + min(score_meta$score))/2), max(score_meta$score)), c("purple", "black", "yellow"))

ha = HeatmapAnnotation(
    weight = weight_meta$weight, 
    score = score_meta$score,
    col = list(weight = col_fun,
               score = col_fun2),
    which = "row"
    )

  
    pdf(file=paste0("ENC4_Mouse_Topics_Heart/Plots/",topic,"_peaks_in_topicgenes.pdf"),
        width=50,height=50)
    print(Heatmap(t(topic_grn_df),  name = "Motif in peak",
       cluster_rows = T, right_annotation= ha,
        row_dend_width = unit(4, "cm"),
        column_dend_height = unit(4, "cm"),
       cluster_columns = T,col=magma(100)))
    dev.off()  
    
    topic_grn_df_list[[topic]] = t(topic_grn_df)
}

"number of columns of result is not a multiple of vector length (arg 1)"
[1m[22mJoining, by = "cluster"
[1m[22mJoining, by = "gene"
[1m[22mJoining, by = "gene"
"number of columns of result is not a multiple of vector length (arg 1)"
[1m[22mJoining, by = "cluster"
[1m[22mJoining, by = "gene"
[1m[22mJoining, by = "gene"
"number of columns of result is not a multiple of vector length (arg 1)"
[1m[22mJoining, by = "cluster"
[1m[22mJoining, by = "gene"
[1m[22mJoining, by = "gene"
"number of columns of result is not a multiple of vector length (arg 1)"
[1m[22mJoining, by = "cluster"
[1m[22mJoining, by = "gene"
[1m[22mJoining, by = "gene"
"number of columns of result is not a multiple of vector length (arg 1)"
[1m[22mJoining, by = "cluster"
[1m[22mJoining, by = "gene"
[1m[22mJoining, by = "gene"
"number of columns of result is not a multiple of vector length (arg 1)"
[1m[22mJoining, by = "cluster"
[1m[22mJoining, by = "gene"
[1m[22mJoining, by = "gene"
"num

# order by weight

In [35]:
for (topic in unique(rownames(grn_meta))){
    thisdf = topic_grn_df_list$Topic
    
    topic_genes = as.vector(genes[topic,])
    names(topic_genes) = colnames(genes)

    topic_genes = as.data.frame(topic_genes)
    #topic_genes = as.data.frame(topic_genes[topic_genes > 1]) # ????????
    colnames(topic_genes)= "weight"
    topic_genes$gene = rownames(topic_genes)

    testing_genes = as.data.frame(colnames(topic_grn_df))
    colnames(testing_genes) = "gene"

    weight_meta = left_join(testing_genes,topic_genes)
    weight_meta$weight[is.na(weight_meta$weight)]= 0

    weight_meta_sorted = weight_meta[order(weight_meta$weight,decreasing = T),]
    thisdf_sorted = thisdf[weight_meta_sorted$gene,]


    testing_genes = as.data.frame(rownames(thisdf_sorted))
    colnames(testing_genes) = "gene"
    weight_meta = left_join(testing_genes,topic_genes)
    weight_meta$weight[is.na(weight_meta$weight)]= 0
    score_meta = left_join(testing_genes,topic_grn_meta)
    
    col_fun = colorRamp2(c(min(weight_meta$weight), round((max(weight_meta$weight) + min(weight_meta$weight))/2), max(weight_meta$weight)), c("blue", "white", "red"))
    col_fun2 = colorRamp2(c( min(score_meta$score), round((max(score_meta$score) + min(score_meta$score))/2), max(score_meta$score)), c("purple", "black", "yellow"))

ha = HeatmapAnnotation(
    weight = weight_meta$weight, 
    score = score_meta$score,
    col = list(weight = col_fun,
               score = col_fun2),
    which = "row"
    )

pdf(file=paste0("ENC4_Mouse_Topics_Heart/Plots/",topic,"_peaks_in_topicgenes_orderedbyweight.pdf"),
    width=50,height=50)
    print(Heatmap(thisdf_sorted,  name = "Motif in peak",
       cluster_rows = F, right_annotation= ha,
        row_dend_width = unit(4, "cm"),
        column_dend_height = unit(4, "cm"),
       cluster_columns = T,col=magma(100)))
    dev.off() 
    
}

[1m[22mJoining, by = "gene"
[1m[22mJoining, by = "gene"
[1m[22mJoining, by = "gene"
[1m[22mJoining, by = "gene"
[1m[22mJoining, by = "gene"
[1m[22mJoining, by = "gene"
[1m[22mJoining, by = "gene"
[1m[22mJoining, by = "gene"
[1m[22mJoining, by = "gene"
[1m[22mJoining, by = "gene"
[1m[22mJoining, by = "gene"
[1m[22mJoining, by = "gene"
[1m[22mJoining, by = "gene"
[1m[22mJoining, by = "gene"
[1m[22mJoining, by = "gene"
[1m[22mJoining, by = "gene"
[1m[22mJoining, by = "gene"
[1m[22mJoining, by = "gene"
[1m[22mJoining, by = "gene"
[1m[22mJoining, by = "gene"
[1m[22mJoining, by = "gene"
[1m[22mJoining, by = "gene"
[1m[22mJoining, by = "gene"
[1m[22mJoining, by = "gene"
[1m[22mJoining, by = "gene"
[1m[22mJoining, by = "gene"
[1m[22mJoining, by = "gene"
[1m[22mJoining, by = "gene"
[1m[22mJoining, by = "gene"
[1m[22mJoining, by = "gene"


## Gene score vs. gene weight

In [36]:
genes = read.csv("../ref/gene_weights_rLDA_model.csv")
genes$gene_name = genes$X
genes$X = NULL
genes$Topic13 = NULL
genes$Topic9 = NULL
genes$Topic4 = NULL

In [37]:
library(reshape2)
gene_weight <- melt(genes, id.vars = c("gene_name"))
colnames(gene_weight) = c("gene_name","Topic","RNA_weight")

### ATAC gene score matrix

In [38]:
atac_genescore_mat <- getGroupSE(hrt, useMatrix = "GeneScoreMatrix", 
                                 groupBy = "Sample")


ArchR logging to : ArchRLogs/ArchR-getGroupSE-2863f733f12792-Date-2022-09-09_Time-11-21-20.log
If there is an issue, please report to github with logFile!

Getting Group Matrix

2022-09-09 11:21:38 : Successfully Created Group Matrix, 0.253 mins elapsed.

Normalizing by number of Cells

ArchR logging successful to : ArchRLogs/ArchR-getGroupSE-2863f733f12792-Date-2022-09-09_Time-11-21-20.log



In [39]:
atac_genescore_df = as.data.frame(assays(atac_genescore_mat)$GeneScoreMatrix)
atac_genescore_df$gene_name = rowData(atac_genescore_mat)$name
atac_genescore_df = atac_genescore_df[atac_genescore_df$gene_name %in% genes$gene_name,]
atac_genescore_df <- melt(atac_genescore_df, id.vars = c("gene_name"))
colnames(atac_genescore_df) = c("gene_name","Topic","ATAC_score")


In [40]:
df = as.data.frame(full_join(gene_weight, atac_genescore_df))

[1m[22mJoining, by = c("gene_name", "Topic")


In [41]:
library(ggrepel)
p=list()
for (topic in unique(df$Topic)){
    df_topic = df[df$Topic == topic,]
    df_topic$ATAC_score[is.na(df_topic$ATAC_score)] = 0
    df_topic = df_topic[df_topic$RNA_weight > 1,]
    p[[topic]] = ggplot(df_topic, aes(x=log(RNA_weight+1), y=ATAC_score)) + 
          geom_point() +theme(text=element_text(size=21)) +
          ggtitle(topic) +
          geom_text_repel(size = 5,max.overlaps = Inf,data = df_topic %>% 
                          mutate(label = ifelse(log(RNA_weight+1) > 6 & ATAC_score > 3,
                                                gene_name, "")),
                          aes(label = label), 
                          # box.padding = 1,
                          show.legend = FALSE)
}


In [42]:
pdf(file=paste0("ENC4_Mouse_Topics_Heart/Plots/gene_score_vs_weight.pdf"),width=15,height=5)

options(repr.plot.width=10,repr.plot.height=5)
p[["Topic1"]]+p[["Topic2"]]
p[["Topic3"]]+p[["Topic5"]]
p[["Topic6"]]+p[["Topic7"]]
p[["Topic8"]]+p[["Topic10"]]
p[["Topic11"]]+p[["Topic12"]]

dev.off()

## Differential peak marker heatmap

In [None]:
# add peak matrix
hrt <- addPeakMatrix(hrt)


In [None]:
markersPeaks <- getMarkerFeatures(
    ArchRProj = hrt, 
    useMatrix = "PeakMatrix", 
    groupBy = "Sample",
    bias = c("TSSEnrichment", "log10(nFrags)"),
    testMethod = "wilcoxon"
)

In [None]:
enrichMotifs <- peakAnnoEnrichment(
    seMarker = markersPeaks,
    ArchRProj = hrt,
    peakAnnotation = "Motif",
    cutOff = "FDR <= 0.05 & Log2FC >= 0.25" # mlog10Padj?
  )

In [None]:
save(markersPeaks,enrichMotifs,
     file="ENC4_Mouse_Topics_Heart/markersPeaks_enrichMotifs.rda")

In [None]:
heatmapPeaks <- plotMarkerHeatmap(
  seMarker = markersPeaks, 
  cutOff = "FDR <= 0.3 & Log2FC >= 0.25",
  transpose = TRUE
)
plotPDF(heatmapPeaks, name = "Peak-Marker-Heatmap", width = 8, 
        height = 6, ArchRProj = hrt, addDOC = FALSE)


## What peaks are differential between topic 3 and 8?

## Motif erichment vs gene weight

In [43]:
load("ENC4_Mouse_Topics_Heart/markersPeaks_enrichMotifs.rda")

In [55]:
# names(assays(enrichMotifs))
library(tidyverse)
enrichMotifs_log10Padj  = as.data.frame(assays(enrichMotifs)$mlog10Padj) # -log10Padj
enrichMotifs_log10Padj$cluster = rownames(enrichMotifs_log10Padj)
enrichMotifs_log10Padj$cluster = do.call("rbind", strsplit(as.character(enrichMotifs_log10Padj$cluster), "[|]"))[,1]

motifs_meta = read.delim("../ref/vierstra_archetype_meta.tsv")

enrichMotifs_TFs = left_join(enrichMotifs_log10Padj,motifs_meta) # 5193 rows
enrichMotifs_TFs_mat = enrichMotifs_TFs
enrichMotifs_TFs_mat$cluster= NULL
enrichMotifs_TFs_mat$motif_id= NULL
enrichMotifs_TFs_mat$source_id= NULL
enrichMotifs_TFs_mat$tf_name= NULL
enrichMotifs_TFs_mat$family_name= NULL
enrichMotifs_TFs_mat$motif_type= NULL
enrichMotifs_TFs_mat$PMID = NULL
enrichMotifs_TFs_mat = as.matrix(enrichMotifs_TFs_mat)

rownames(enrichMotifs_TFs_mat) = enrichMotifs_TFs$tf_name



enrichMotifs_TFs_mat = unique(enrichMotifs_TFs_mat) # 591  16

rownames(enrichMotifs_TFs_mat) = str_to_title(rownames(enrichMotifs_TFs_mat))

"number of columns of result is not a multiple of vector length (arg 1)"
[1m[22mJoining, by = "cluster"


In [56]:
# average duplicate motifs
unique_motifs = rownames(enrichMotifs_TFs_mat)
unique_motifs = unique_motifs[!(duplicated(unique_motifs)|duplicated(unique_motifs, fromLast=TRUE))]

enrichMotifs_TFs_mat_unique = enrichMotifs_TFs_mat[rownames(enrichMotifs_TFs_mat) %in% unique_motifs,]
enrichMotifs_TFs_mat_dupe = enrichMotifs_TFs_mat[!(rownames(enrichMotifs_TFs_mat) %in% unique_motifs),]
enrichMotifs_TFs_mat_dupe = aggregate(enrichMotifs_TFs_mat_dupe, list(row.names(enrichMotifs_TFs_mat_dupe)), mean)
rn = enrichMotifs_TFs_mat_dupe$Group.1
enrichMotifs_TFs_mat_dupe$Group.1 = NULL
enrichMotifs_TFs_mat_dupe = as.matrix(enrichMotifs_TFs_mat_dupe)
rownames(enrichMotifs_TFs_mat_dupe) = rn
enrichMotifs_TFs_df = as.data.frame(rbind(enrichMotifs_TFs_mat_unique,
                                          enrichMotifs_TFs_mat_dupe))
enrichMotifs_TFs_df$gene_name = rownames(enrichMotifs_TFs_df)

enrichMotifs_TFs_df <- melt(enrichMotifs_TFs_df, id.vars = c("gene_name"))
colnames(enrichMotifs_TFs_df) = c("gene_name","Topic","Motif_enrichment_score")


enrichMotifs_TFs_df = enrichMotifs_TFs_df[enrichMotifs_TFs_df$gene_name %in% genes$gene_name,]



In [57]:
df2 = as.data.frame(full_join(gene_weight, enrichMotifs_TFs_df))

[1m[22mJoining, by = c("gene_name", "Topic")


In [58]:
library(ggrepel)
p2=list()
for (topic in unique(df2$Topic)){
    df_topic = df2[df2$Topic == topic,]
    df_topic$Motif_enrichment_score[is.na(df_topic$Motif_enrichment_score)] = 0
    
    df_topic = df_topic[df_topic$RNA_weight > 1,]
    df_topic = df_topic[df_topic$Motif_enrichment_score > 0,]

    
    p2[[topic]] = ggplot(df_topic, aes(x=log(RNA_weight+1), y=Motif_enrichment_score)) + 
          geom_point() +theme(text=element_text(size=21)) +
          ggtitle(topic) +
          geom_text_repel(size = 5,max.overlaps = Inf,data = df_topic %>% 
                          mutate(label = ifelse(log(RNA_weight+1) > 0 & 
                                                Motif_enrichment_score > 0,
                                                gene_name, "")),
                          aes(label = label), 
                          # box.padding = 1,
                          show.legend = FALSE)
}


In [59]:
options(repr.plot.width=10,repr.plot.height=5)
pdf(file=paste0("ENC4_Mouse_Topics_Heart/Plots/motif_enrichment_vs_weight.pdf"),width=15,height=5)

p2[["Topic1"]]+p2[["Topic2"]]
p2[["Topic3"]]+p2[["Topic5"]]
p2[["Topic6"]]+p2[["Topic7"]]
p2[["Topic8"]]+p2[["Topic10"]]
p2[["Topic11"]]+p2[["Topic12"]]

dev.off()

## Topic gene weight vs. peak gene score

In [60]:
anno = getPeakAnnotation(ArchRProj = hrt, name = "Motif")
peaks_by_motifs = readRDS(anno$Matches)
grn_df = assays(peaks_by_motifs)$matches
grn_meta = rowData(peaks_by_motifs)

genes = read.csv("../ref/gene_weights_rLDA_model.csv")
genes$gene_name = genes$X
genes$X = NULL
genes$Topic13 = NULL
genes$Topic9 = NULL
genes$Topic4 = NULL
library(reshape2)
gene_weight <- melt(genes, id.vars = c("gene_name"))
colnames(gene_weight) = c("gene_name","Topic","RNA_weight")


In [61]:
p=list()
for (topic in unique(rownames(grn_df))){

topic_df = grn_df[grep(topic,rownames(grn_df)),] # columns are motifs
topic_meta = as.data.frame(grn_meta[grep(topic,rownames(grn_meta)),])

# filter peaks by distance to TSS
topic_meta = topic_meta[topic_meta$distToTSS < 10000,]
# get the peak info we care about
topic_meta_scores = topic_meta[,c("score","nearestGene")]

library(plyr)
# sum up scores for duplicate genes
topic_scores = ddply(topic_meta_scores,
                              "nearestGene",numcolwise(sum))
topic_scores$gene_name = topic_scores$nearestGene
topic_scores$nearestGene = NULL
topic_scores$Topic = topic

gene_weight_topic = gene_weight[gene_weight$Topic == topic,]
gene_weight_topic = gene_weight_topic[gene_weight_topic$RNA_weight > 1,]
df = left_join(gene_weight_topic, topic_scores)
df$score[is.na(df$score)]= 0

library(ggrepel)

p[[topic]]=ggplot(df, aes(x=log(RNA_weight+1), y=score)) + 
          geom_point() +theme(text=element_text(size=21)) +
          geom_text_repel(size = 5,max.overlaps = Inf,data = df %>% 
                          mutate(label = ifelse(log(RNA_weight+1) > max(log(RNA_weight+1))/2 & score > max(score)/2,
                                                gene_name, "")),
                          aes(label = label), 
                          # box.padding = 1,
                          show.legend = FALSE)+labs(title=topic,
        x ="log(weight of topic gene + 1)", y = "summed score of peaks\nassociated w/ topic gene")
}


[1m[22mJoining, by = c("gene_name", "Topic")
[1m[22mJoining, by = c("gene_name", "Topic")
[1m[22mJoining, by = c("gene_name", "Topic")
[1m[22mJoining, by = c("gene_name", "Topic")
[1m[22mJoining, by = c("gene_name", "Topic")
[1m[22mJoining, by = c("gene_name", "Topic")
[1m[22mJoining, by = c("gene_name", "Topic")
[1m[22mJoining, by = c("gene_name", "Topic")
[1m[22mJoining, by = c("gene_name", "Topic")
[1m[22mJoining, by = c("gene_name", "Topic")


In [62]:
pdf(file=paste0("ENC4_Mouse_Topics_Heart/Plots/peak_score_vs_weight.pdf"),width=15,height=5)

options(repr.plot.width=10,repr.plot.height=5)
p[["Topic1"]]+p[["Topic2"]]
p[["Topic3"]]+p[["Topic5"]]
p[["Topic6"]]+p[["Topic7"]]
p[["Topic8"]]+p[["Topic10"]]
p[["Topic11"]]+p[["Topic12"]]

dev.off()

## Make DF