-
Notifications
You must be signed in to change notification settings - Fork 1
/
14-Gene-Enrichment-with-GO-MWU.Rmd
executable file
·418 lines (318 loc) · 19.4 KB
/
14-Gene-Enrichment-with-GO-MWU.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
---
title: "Gene Enrichment with GO-MWU"
author: "Yaamini Venkataraman"
date: "7/30/2019"
output: html_document
---
In this R Markdown document, I will format files for gene enrichment with [GO-MWU](https://github.com/z0on/GO_MWU). GO-MWU needs two input files. The first is a gene list, with all genes used in differential methylation analysis with all associated GOterms. The second is a list of genes with a continuous measure of significance.
**NOTE: MOVE THIS R MARKDOWN FILE INTO THE `2018-12-02-Gene-Enrichment-Analysis` FOLDER BEFORE RUNNING.**
# Set up R Markdown file
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```
# Install packages
```{r}
#install.packages("dichromat")
require(dichromat)
```
# Obtain session information
```{r}
sessionInfo() #Obtain session information
```
# Create master list
## Import gene background
```{r}
geneBackground <- read.csv("2019-06-20-mRNA-Gene-Background-Uniprot.csv") #Import gene background with Uniprot annotations
geneBackground <- geneBackground[,-1] #Remove extra column
head(geneBackground) #Confirm import
```
## Import GOterms
```{r}
uniprotGOTerms <- read.delim("uniprot-reviewed_yes.tab", header = TRUE) #Import tab-delimited file with header
uniprotGOTerms <- uniprotGOTerms[,c(1,8)] #Retain Entry and GOterm columns
colnames(uniprotGOTerms) <- c("Uniprot", "GO") #Rename columns
head(uniprotGOTerms) #Confirm import
```
## Merge gene background with GOterms
```{r}
geneBackgroundGOterms <- merge(x = geneBackground, y = uniprotGOTerms, by = "Uniprot") #Merge files by Uniprot ID
head(geneBackgroundGOterms) #Confirm merge
```
```{r}
geneBackgroundGOterms$DMLB.start <- geneBackgroundGOterms$DMLB.start + 1 #Add 1 to each start position. Normally I would subtract 1 from each end position (dinucleotide), but the geneBackground file doesn't seem to include that.
geneBackgroundGOterms <- geneBackgroundGOterms[,c(1:4, 9, 20, 22:23)] #Remove extra columns
head(geneBackgroundGOterms) #Confirm changes
```
## Merge gene background with all loci tested
```{r}
allTested <- read.csv("../2018-10-25-MethylKit/2018-10-25-Loci-Analysis/2019-06-30-All-Loci-Methylation-Statistic-Filtered-Destrand-50-Cov5.csv", header = TRUE) #Import all loci tested
allTested <- allTested[,-1] #Remove extra column
colnames(allTested) <- c("chr", "DMLB.start", "DMLB.end", "strand", "pvalue", "qvalue", "meth.diff") #Rename columns to match previous naming conventions
head(allTested) #Confirm import
```
```{r}
allTestedGOterms <- merge(x = allTested, y = geneBackgroundGOterms, by = "DMLB.start") #Merge all loci tested with gene background information
head(allTestedGOterms) #Confirm merge
```
```{r}
allTestedGOterms <- allTestedGOterms[,c(2, 1, 3:8, 11:14)] #Reorganize columns
colnames(allTestedGOterms) <- c("chr", "DMLB.start", "DMLB.end", "strand", "pvalue", "qvalue", "meth.diff", "Uniprot", "Genbank", "evalue", "Product", "GO") #Rename columns
head(allTestedGOterms) #Confirm changes
```
## Remove redundant lines
```{r}
attach(allTestedGOterms) #Attach dataframe
allTestedGOterms <- allTestedGOterms[order(Genbank, pvalue),] #Sort by gene ID, then p-value
detach(allTestedGOterms) #Detatch dataframe
head(allTestedGOterms) #Confirm changes
```
```{r}
write.csv(allTestedGOterms, "2019-07-30-All-Tested-Loci-Uniprot-GOTerms.csv", quote = FALSE, row.names = FALSE) #Save ordered file
```
```{bash}
#Sort a file by the Genbank column (--key = 9,9) and only retain unique lines (--unique). Save output to a new file
sort --unique --key=9,9 2019-07-30-All-Tested-Loci-Uniprot-GOTerms.csv \
> 2019-07-30-All-Tested-Loci-Uniprot-GOTerms-Unique.csv
```
```{bash}
head -n4 2019-07-30-All-Tested-Loci-Uniprot-GOTerms-Unique.csv
```
# Create GO-MWU inputs
```{r}
allTestedGOtermsCondensed <- read.csv("2019-07-30-All-Tested-Loci-Uniprot-GOTerms-Unique.csv", header = TRUE) #Import file with one line per gene
head(allTestedGOtermsCondensed, 2) #Confirm import
```
## GO annotations table
The first table needs to be a tab-delimited file with one column for genes and one column for GOterms.
```{r}
goAnnotations <- data.frame(allTestedGOtermsCondensed$Genbank,
allTestedGOtermsCondensed$GO) #Create a new dataframe with Genbank and GO columns
head(goAnnotations) #Confirm format is good. It is.
```
```{r}
write.table(goAnnotations, "2019-07-30-allTested-GO-Annotations-Table.tab", sep = "\t", col.names = FALSE, row.names = FALSE, quote = FALSE) #Save file
```
```{bash}
head 2019-07-30-allTested-GO-Annotations-Table.tab #Confirm formatting was retained
```
## Table of significance measures
This table is a .csv file with one column for gene IDs and another for a continuous significance measure. All genes need to be included, not just those with significant DML. Similar to the GO-MWU example, I will use signed negative log p-values modified from `methylKit` output. Hypomethylated loci will have negative p-values, and hypermethylated loci will have positive p-values.
```{r}
sigMeasures <- data.frame("gene" = allTestedGOtermsCondensed$Genbank,
"logP" = log(allTestedGOtermsCondensed$pvalue),
"meth.diff" = allTestedGOtermsCondensed$meth.diff) #Subset Genbank ID, log(p-value), and meth.diff
head(sigMeasures) #Confirm subset
```
```{r}
sigMeasuresUncorrected <- sigMeasures[,c(1:2)] #Keep first two columns to see if GO-MWU provides different results with uncorrected p-values
head(sigMeasuresUncorrected) #Confirm subset
```
```{r}
write.csv(sigMeasuresUncorrected, "2019-07-30-allTested-Table-of-Significance-Measures-Uncorrected.csv", quote = FALSE, row.names = FALSE) #Save file
```
# Gene enrichment with GO-MWU
These commands were copied from the GO-MWU.R file in the GO-MWU Github repository.
### Biological processes
#### Edit input variables
```{r}
input="2019-07-30-allTested-Table-of-Significance-Measures-Uncorrected.csv" # two columns of comma-separated values: gene id, continuous measure of significance. To perform standard GO enrichment analysis based on Fisher's exact test, use binary measure (0 or 1, i.e., either sgnificant or not).
goAnnotations="2019-07-30-allTested-GO-Annotations-Table.tab" # two-column, tab-delimited, one line per gene, multiple GO terms separated by semicolon.
goDatabase="go.obo" # download from http://www.geneontology.org/GO.downloads.ontology.shtml
goDivision="BP" # BP for biological processes
source("gomwu.functions.R")
```
#### Run GO-MWU
```{r}
# Calculating stats. It might take ~3 min for MF and BP. Do not rerun it if you just want to replot the data with different cutoffs, go straight to gomwuPlot. If you change any of the numeric values below, delete the files that were generated in previos runs first.
gomwuStats(input, goDatabase, goAnnotations, goDivision,
perlPath="perl", # replace with full path to perl executable if it is not in your system's PATH already
largest=0.1, # a GO category will not be considered if it contains more than this fraction of the total number of genes
smallest=5, # a GO category should contain at least this many genes to be considered
clusterCutHeight=0.25) # threshold for merging similar (gene-sharing) terms. See README for details.
# There are no GO terms pass 10% FDR.
```
#### Describe parent GOslim terms
Even though there are no significantly enriched terms, I want to describe the parent GOslim categories GO-MWU used to sort my data.
##### All tested loci
```{r}
parentGOBP <- read.delim("BP_2019-07-30-allTested-Table-of-Significance-Measures.csv", header = TRUE) #Import GO-MWU output file with individual genes and associated GOslim term. Even though it is a .csv extension, it is a tab-delimited file
colnames(parentGOBP) <- c("name", "term", "lev", "Genbank", "value") #Rename gene ID column to match previous naming conventions
head(parentGOBP) #Confirm import
```
###### Map to GOslim terms
```{r}
CVGOslim <- read.delim("../2018-12-02-Gene-Enrichment-Analysis/Blastquery-GOslim-BP.sorted.unique.noOther", sep = "\t", header = FALSE) #Import GOslim information
colnames(CVGOslim) <- c("Genbank", "GOslim") #Add column names
head(CVGOslim) #Confirm import
```
```{r}
parentGOBPGOslim <- unique(merge(x = parentGOBP, y = CVGOslim, by = "Genbank", all.x = TRUE)) #Merge GO-MWU with GOslim information, retaining all GO-MWU entries. Remove any merging artifacts
head(parentGOBPGOslim) #Confirm merge
length(parentGOBPGOslim$GOslim) #Count number of entries
```
```{r}
sum(is.na(parentGOBPGOslim$GOslim)) #Count how many entries do not have GOslim terms. 27/2281 GO-MWU entries do not have GOslim entries.
```
```{r}
parentGOBPGOslimCounts <- as.data.frame(table(parentGOBPGOslim$GOslim)) #Count the frequency of each GOslim term
colnames(parentGOBPGOslimCounts) <- c("CVGOslim", "frequency") #Rename columns
head(parentGOBPGOslimCounts) #Confirm counts
```
```{r}
write.csv(parentGOBPGOslimCounts, "2019-07-30-allTested-CVGOSlim-Frequency-BP.csv", row.names = FALSE, quote = FALSE) #Save file
```
```{r}
parentGOBPGOslimCounts$percentage <- (parentGOBPGOslimCounts$frequency/sum(parentGOBPGOslimCounts$frequency)*100) #Create a new column with percentages of GOSlim terms instead of raw counts
head(parentGOBPGOslimCounts) #Confirm column creation
max(parentGOBPGOslimCounts$percentage)
```
```{r}
parentGOBPGOslimCounts <- parentGOBPGOslimCounts[c(4, 1, 5, 11, 13,
2, 6, 3,
12,
7, 10, 9, 8),] #Reorganize rows based on broader functions. Rows are general activity, development, stress response, and metabolic processes
head(parentGOBPGOslimCounts) #Confirm organization
```
```{r}
parentGOBPGOslimCounts$GOSlim <- c("Cell-Cell Signaling", "Cell Adhesion", "Death", "Signal Transduction", "Transport",
"Cell Cycle and Proliferation", "Developmental Processes", "Cell Organization and Biogenesis",
"Stress Response",
"DNA Metabolism", "RNA Metabolism", "Protein Metabolism", "Other Metabolic Processes") #Capitalize labels
head(parentGOBPGOslimCounts) #Confirm capitalization
```
####### Create figure
```{r}
RColorBrewer::display.brewer.all() #Show all RColorBrewer palettes. I will choose greens.
plotColors <- rev(RColorBrewer::brewer.pal(5, "GnBu")) #Create a color palette for the barplots. Use 5 green-blue shades from RColorBrewer. Reverse theorder so the darkest shade is used first.
barplot(parentGOBPGOslimCounts$percentage,
col = plotColors) #See what plot looks like with new scheme
barplot(parentGOBPGOslimCounts$percentage,
col = dichromat(plotColors)) #Check that the plot colors will be easy to interpret for those with color blindess
```
```{r}
#pdf("2019-11-19-BP-GOSlim-allTested-Gene-Overlaps.pdf", width = 11, height = 8.5) #Save as a pdf
par(mar = c(5, 20, 1, 1)) #Change figure boundaries
barsGOSlim <- barplot((parentGOBPGOslimCounts$percentage),
horiz = TRUE,
axes = FALSE,
xlim = c(0,18),
col = c(rep(plotColors[2], times = 5),
rep(plotColors[4], times = 3),
rep(plotColors[5], times = 1),
rep(plotColors[1], times = 4))) #Create a barplot that is horizontal (horiz = TRUE). Use axes = FALSE to remove all axes. Set xlim based on maximum percentage value. Colors correspond to broader GOSlim groupings. Save plot as a new object.
axis(side = 2, at = barsGOSlim, labels = parentGOBPGOslimCounts$GOSlim, tick = FALSE, las = 2, col = "grey80", cex.axis = 1.5) #Add y-axis with GOSlim terms from BPGOSlimCounts. Place labels at specific barplot values from barsGOSlim. Remove tick marks (tick) and change orientation of labels (las)
axis(side = 1, at = seq(from = 0, to = 18, by = 2), cex = 1.2, col = "grey80") #Add x-axis
mtext(side = 1, "Percent of Genes", line = 3, cex = 1.5) #Add x-axis label
par(fig = c(0, 1, 0, 1), oma = c(0, 0, 0, 0), mar = c(0, 0, 0, 0), new = TRUE) #Create new plot
plot(0, 0, type = "n", bty = "n", xaxt = "n", yaxt = "n") #Add new plot on top of current plot
legend("bottomleft",
xpd = TRUE,
legend = c("General Activity", "Development", "Stress Response", "Metabolic Processes"),
pch = 22,
col = "black",
pt.bg = c(plotColors[2], plotColors[4], plotColors[5], plotColors[1]),
bty = "n") #Place a legend in the bottom left of the figure with no box
#dev.off()
```
##### Differentially methylated loci
GO-MWU takes all loci into account, but there may be some differences in parental GOslim terms if I only look at differentially methylated loci. I will import the DML annotations I made previously and work from there.
```{r}
DMLGeneAnnotation <- read.csv("2019-06-20-DML-Gene-Annotation.csv", header = TRUE, row.names = 1) #Import file. The first column are row names.
head(DMLGeneAnnotation) #Confirm import
```
```{r}
DMLGenbank <- data.frame("Genbank" = DMLGeneAnnotation$Genbank,
"pvalue" = DMLGeneAnnotation$pvalue,
"meth.diff" = DMLGeneAnnotation$meth.diff) #Create a new dataframe with Genbank IDs and pvalues for each DML. Include meth.diff as well
head(DMLGenbank) #Confirm dataframe creation
```
```{r}
parentGOBPDML <- unique(merge(x = DMLGenbank, y = parentGOBP, by = "Genbank", all.x = TRUE)) #Merge parent GOslim terms and DML information by Genbank ID. Keep only unique entries. This will allow me to retain multiple DML per gene, but remove any merging artifacts. Keep rows that don't have matching GOterms from GO-MWU just in case.
head(parentGOBPDML) #Confirm merge. There are several entries without any matching information from GO-MWU.
```
```{r}
sum(is.na(parentGOBPDML$name)) #Count how many genes with DML do not have GO-MWU entries.
```
###### Map to GOslim terms
```{r}
parentGOBPDMLGOslim <- unique(merge(x = parentGOBPDML, y = CVGOslim, by = "Genbank", all.x = TRUE)) #Merge GO-MWU with GOslim information, retaining all GO-MWU entries. Remove any merging artifacts
head(parentGOBPDMLGOslim) #Confirm merge
length(parentGOBPDMLGOslim$GOslim) #Count number of entries
```
```{r}
sum(is.na(parentGOBPDMLGOslim$GOslim)) #Count how many entries do not have GOslim terms. 425/5283 GO-MWU entries do not have GOslim entries.
```
```{r}
parentGOBPDMLGOslimCounts <- as.data.frame(table(parentGOBPDMLGOslim$GOslim)) #Count the frequency of each GOslim term
colnames(parentGOBPDMLGOslimCounts) <- c("CVGOslim", "frequency") #Rename columns
head(parentGOBPDMLGOslimCounts) #Confirm counts
```
```{r}
write.csv(parentGOBPDMLGOslimCounts, "2019-07-30-condensedDML-CVGOSlim-Frequency-BP.csv", row.names = FALSE, quote = FALSE) #Save file
```
```{r}
parentGOBPDMLGOslimCounts$percentage <- (parentGOBPDMLGOslimCounts$frequency/sum(parentGOBPDMLGOslimCounts$frequency)*100) #Create a new column with percentages of GOSlim terms instead of raw counts
head(parentGOBPDMLGOslimCounts) #Confirm column creation
max(parentGOBPDMLGOslimCounts$percentage)
```
```{r}
parentGOBPDMLGOslimCounts <- parentGOBPDMLGOslimCounts[c(4, 1, 5, 11, 13,
2, 6, 3,
12,
7, 10, 9, 8),] #Reorganize rows based on broader functions. Rows are general activity, development, stress response, and metabolic processes
head(parentGOBPDMLGOslimCounts) #Confirm organization
```
```{r}
parentGOBPDMLGOslimCounts$GOSlim <- c("Cell-Cell Signaling", "Cell Adhesion", "Death", "Signal Transduction", "Transport",
"Cell Cycle and Proliferation", "Developmental Processes", "Cell Organization and Biogenesis",
"Stress Response",
"DNA Metabolism", "RNA Metabolism", "Protein Metabolism", "Other Metabolic Processes") #Capitalize labels
head(parentGOBPDMLGOslimCounts) #Confirm capitalization
```
####### Create figure
```{r}
#pdf("2019-11-19-BP-GOSlim-DML-Gene-Overlaps.pdf", width = 11, height = 8.5) #Save as a pdf
par(mar = c(5, 20, 1, 1)) #Change figure boundaries
barsGOSlimDML <- barplot(parentGOBPDMLGOslimCounts$percentage,
horiz = TRUE,
axes = FALSE,
xlim = c(0,18),
col = c(rep(plotColors[2], times = 5),
rep(plotColors[4], times = 3),
rep(plotColors[5], times = 1),
rep(plotColors[1], times = 4))) #Create a barplot that is horizontal (horiz = TRUE). Use axes = FALSE to remove all axes. Set xlim based on maximum percentage value. Colors correspond to broader GOSlim groupings. Save plot as a new object.
axis(side = 2, at = barsGOSlimDML, labels = parentGOBPDMLGOslimCounts$GOSlim, tick = FALSE, las = 2, col = "grey80", cex.axis = 1.5) #Add y-axis with GOSlim terms from BPGOSlimCounts. Place labels at specific barplot values from barsGOSlim. Remove tick marks (tick) and change orientation of labels (las)
axis(side = 1, at = seq(from = 0, to = 18, by = 2), cex = 1.2, col = "grey80") #Add x-axis
mtext(side = 1, "Percent of Genes with DML", line = 3, cex = 1.5) #Add x-axis label
par(fig = c(0, 1, 0, 1), oma = c(0, 0, 0, 0), mar = c(0, 0, 0, 0), new = TRUE) #Create new plot
plot(0, 0, type = "n", bty = "n", xaxt = "n", yaxt = "n") #Add new plot on top of current plot
legend("bottomleft",
xpd = TRUE,
legend = c("General Activity", "Development", "Stress Response", "Metabolic Processes"),
pch = 22,
col = "black",
pt.bg = c(plotColors[2], plotColors[4], plotColors[5], plotColors[1]),
bty = "n") #Place a legend in the bottom left of the figure with no box
#dev.off()
```
##### Figure to compare all tested and DML distributions
```{r}
#pdf("2019-11-19-BP-GOSlim-allTested-Versus-DML.pdf", width = 11, height = 8.5) #Save as a pdf
par(mfrow = c(1, 2), oma = c(5, 20, 0, 3), mar = c(0, 0, 0, 0)) #Set up parameters for multipanel plot
barsGOSlim <- barplot(-1*(parentGOBPGOslimCounts$percentage),
horiz = TRUE,
axes = FALSE,
xlim = c(-18,0),
col = plotColors[5]) #Create a barplot that is horizontal (horiz = TRUE). Use axes = FALSE to remove all axes. Set xlim based on maximum percentage value. Colors correspond to broader GOSlim groupings. Save plot as a new object.
axis(side = 2, at = barsGOSlim, labels = parentGOBPGOslimCounts$GOSlim, tick = FALSE, las = 2, col = "grey80", cex.axis = 1.5) #Add y-axis with GOSlim terms from BPGOSlimCounts. Place labels at specific barplot values from barsGOSlim. Remove tick marks (tick) and change orientation of labels (las)
axis(side = 1, at = seq(from = -18, to = 0, by = 2), labels = c(seq(from = 18, to = 0, by = -2)), cex = 1.2, col = "grey80") #Add x-axis
mtext(side = 1, "% Genes", line = 3, cex = 1.5) #Add x-axis label
barsGOSlimDML <- barplot(parentGOBPDMLGOslimCounts$percentage,
horiz = TRUE,
axes = FALSE,
xlim = c(0,18),
col = plotColors[2]) #Create a barplot that is horizontal (horiz = TRUE). Use axes = FALSE to remove all axes. Set xlim based on maximum percentage value. Colors correspond to broader GOSlim groupings. Save plot as a new object.
axis(side = 1, at = seq(from = 0, to = 18, by = 2), cex = 1.2, col = "grey80") #Add x-axis
mtext(side = 1, "% Genes with DML", line = 3, cex = 1.5) #Add x-axis label
#dev.off()
```