/
find_de_novo_motifs_Etv2_chipseq_peaks.Rmd
113 lines (102 loc) · 3.8 KB
/
find_de_novo_motifs_Etv2_chipseq_peaks.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
---
title: "Find de novo Motifs in Etv2 ChIP-seq peaks"
author: "Wuming Gong"
date: "1/4/2020"
output: rmarkdown::github_document
---
We used two ways `Homer` and `Dreme` to identify de novo motifs in a list of sequences. Both of them need a long time to execute. This script should run on MSI.
```{r, include = FALSE}
library(BSgenome.Mmusculus.UCSC.mm10)
library(DECIPHER)
library(chromVARmotifs) # https://github.com/GreenleafLab/chromVARmotifs
library(motifmatchr)
library(BiocParallel)
library(TFBSTools)
library(rGADEM)
library(dplyr)
devtools::load_all('packages/compbio')
register(MulticoreParam(4)) # Use 8 cores
MEF_peak_file <- 'https://s3.msi.umn.edu/gongx030/datasets/dataset=Etv2PioneerChIPseq_version=20191203a/MEF_Dox_d1_Etv2_summits.rds'
EB_peak_file <- 'https://s3.msi.umn.edu/gongx030/datasets/dataset=Etv2PioneerChIPseq_version=20191203a/EB_Dox_3h_Etv2_summits.rds'
mm_file <- sprintf('%s/Etv2_chipseq_peaks_width=200.rds', project_dir('etv2_pioneer'))
core_window_size <- 20
matchMotifs_p_cutoff <- 0.05
```
# Generate a merged dataset
For EB: cluster 1: nucleosome; cluster 2: NFR
For MEF: cluster 1: NFR; cluster 3: nucleosome
```{r}
mef <- readRDS(gzcon(url(MEF_peak_file)))
eb <- readRDS(gzcon(url(EB_peak_file)))
gr_eb <- granges(eb)
gr_eb$type <- NA
gr_eb$type[eb$cluster == 1] <- 'nucleosome'
gr_eb$type[eb$cluster == 2] <- 'NFR'
gr_eb$cell <- 'EB'
gr_mef <- granges(mef)
gr_mef$type <- NA
gr_mef$type[mef$cluster == 1] <- 'NFR'
gr_mef$type[mef$cluster == 3] <- 'nucleosome'
gr_mef$cell <- 'MEF'
gr <- c(gr_eb, gr_mef)
```
# Retrieve the DNA sequences at the core window
```{r}
gr <- resize(gr, fix = 'center', width = core_window_size)
data('homer_pwms')
etv2 <- homer_pwms[['Etv2(ETS)/ES-ER71-ChIP-Seq(GSE59402)/Homer(0.967)']] # a PWMatrix object
mm <- matchMotifs(etv2, gr, genome = 'mm10', p.cutoff = matchMotifs_p_cutoff, out = 'positions')[[1]]
gr$has_Etv2_motif <- gr %over% mm
```
# Show the % of Etv2 peaks that have the canonical Etv2 motifs in each category
```{r}
x <- table(gr$cell, gr$has_Etv2_motif)
barplot(t(x / rowSums(x)))
x <- table(gr$type, gr$has_Etv2_motif)
barplot(t(x / rowSums(x)))
```
# Retrieve the sequence in each category
```{r}
mm <- resize(mm, fix = 'center', width = 200)
mm$cell <- cbind(
MEF = mm %over% gr[gr$cell == 'MEF'],
EB = mm %over% gr[gr$cell == 'EB']
)
mm$type <- cbind(
NFR = mm %over% gr[!is.na(gr$type) & gr$type == 'NFR'],
nucleosome = mm %over% gr[!is.na(gr$type) & gr$type == 'nucleosome']
)
mm$sequence <- getSeq(BSgenome.Mmusculus.UCSC.mm10, mm)
mm$sequence <- reverseComplement(mm$sequence)
saveRDS(mm, mm_file)
s3_upload('projects/etv2_pioneer/', make_public = TRUE)
```
# Option 1: Use DREME to find enriched motifs
And save the results for following analysis (This step takes a few hours)
```{r, include = FALSE}
devtools::load_all('packages/compbio'); pwm_list <- run_dreme(
mm$sequence,
binary = 'dreme',
argument = list('-mink' = 8, '-maxk' = 12, '-m' = 20, '-e' = 1e-50, '-s' = 1)
)
metadata(mm)$dreme <- pwm_list
pwm_list_file <- sprintf('%s/Etv2_chipseq_peaks_width=200_E=1e-50_with_dreme.rds', project_dir('etv2_pioneer'))
saveRDS(mm, pwm_list_file)
s3_upload('projects/etv2_pioneer/', make_public = TRUE)
```
# Option 2: Use Homer to find enriched de novo motifs
This steps takes ~ 30 mins
```{r, include = FALSE}
homer_output_dir <- sprintf('%s/Etv2_chipseq_peaks_width=200_homer', project_dir('etv2_pioneer'))
homer_motif_files <- sprintf('%s/%s', homer_output_dir, c('homerMotifs.motifs8', 'homerMotifs.motifs10', 'homerMotifs.motifs12'))
devtools::load_all('packages/compbio'); pwm_list <- homer_findMotifsGenome(
mm,
genome = 'mm10',
binary = 'findMotifsGenome.pl',
known = FALSE,
nocheck = TRUE,
output_dir = homer_output_dir,
argument = list('-len' = '8,10,12', '-size' = 200, '-S' = 25)
)
s3_upload('projects/etv2_pioneer/', make_public = TRUE)
```