-
Notifications
You must be signed in to change notification settings - Fork 0
/
Figure3.R
470 lines (394 loc) · 21.2 KB
/
Figure3.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
### Figure 3
# the following files are provided in OSF https://osf.io/9xys4/
Counts <- read.delim("Counts.txt",h=T)
Diff_ctr <- read.delim("Diff_ctr.txt",h=T)
colData_RNA_Diff <- data.frame("Sample"=colnames(Diff_ctr)[3:31],
"Donor"=gsub("_.*","",colnames(Diff_ctr)[3:31]),
"Timepoint"=gsub(".*_","",colnames(Diff_ctr)[3:31]))
rownames(colData_RNA_Diff)<-colData_RNA_Diff$Sample
### Figure 3A
# the following files are provided in OSF https://osf.io/9xys4/
ISMARA <- read.delim("ReadyToUse_ISMARA.txt",h=T)
GOI <- c('NFATC1','JUN','ALX1')
for(i in GOI){
tmp <- colData_RNA_Diff
tmp$x <- NA
tmp[tmp$Timepoint=='d0','x'] <- 1
tmp[tmp$Timepoint=='d2','x'] <- 2
tmp[tmp$Timepoint=='d5','x'] <- 3
tmp[tmp$Timepoint=='d9','x'] <- 4
d0 <- unlist(ISMARA[rownames(ISMARA) == i,colData_RNA_Diff[colData_RNA_Diff$Timepoint=='d0','Sample']])
d2 <- unlist(ISMARA[rownames(ISMARA) == i,colData_RNA_Diff[colData_RNA_Diff$Timepoint=='d2','Sample']])
d5 <- unlist(ISMARA[rownames(ISMARA) == i,colData_RNA_Diff[colData_RNA_Diff$Timepoint=='d5','Sample']])
d9 <- unlist(ISMARA[rownames(ISMARA) == i,colData_RNA_Diff[colData_RNA_Diff$Timepoint=='d9','Sample']])
boxplot(d0,d2,d5,d9,ylab=paste("Motif activity",i), xaxt="none")
axis(1,at=c(1:4),labels = c('D0','D2','D5','D9'))
for(k in unique(colData_RNA_Diff$Donor)){
a <- tmp[tmp$Donor==k,'x']
b <- unlist(ISMARA[rownames(ISMARA) == i,colData_RNA_Diff[colData_RNA_Diff$Donor==k,'Sample']])
lines(a,b)
}
}
rm(GOI,a,b,d0,d2,d5,d9,tmp,tmp2,i,ISMARA)
### Figure 3B
# the following files are provided in OSF https://osf.io/9xys4/
ISMARA <- read.delim("ReadyToUse_ISMARA.txt",h=T)
# TFs from GO OC terms and DOI: 10.1002/jbmr.2229 - manual collection
OC_TFs <- c("JUN","CREB1","NRF1","SREBF2","NFYA","NFYB","NFYC","NR3C1","FOS","ATF1","ATF2","SMARCC2","ETS1","SIX5","ATF4","ZBTB33","BCL6","NFATC1","RFX5","JUND","MAFB","MITF","TFE3","CEBPB","JUNB","ESRRA","FOXP1")
# plot only individual motifs not factors
tmp <- ISMARA
tmp$Dup <- duplicated(tmp$Motif)
tmp <- tmp[tmp$Dup=="FALSE",]
rownames(tmp) <- tmp$Motif
# extract motif activities for significant motifs at average level and cluster them
y <- tmp[apply(tmp[,c("TTd2vsd0","TTd5vsd0","TTd9vsd0","TTd5vsd2","TTd9vsd2","TTd9vsd5")],1, min)< 0.001,c('d0','d2','d5','d9')]
y <- t(scale(t(y)))
hr <- hclust(as.dist(1-cor(t(y))))
# extract motif activities for significant motifs at donor level and use clustering from above for visualization
y <- tmp[apply(tmp[,c("TTd2vsd0","TTd5vsd0","TTd9vsd0","TTd5vsd2","TTd9vsd2","TTd9vsd5")],1, min)< 0.001,colData_RNA_Diff[order(colData_RNA_Diff$Timepoint),"Sample"]]
y <- t(scale(t(y)))
y[y< -2] <- -2
y[y > 2] <- 2
dev.off()
library(fields)
library(gplots)
library(RColorBrewer)
Mycol <- rev(designer.colors(100, col=brewer.pal(9,"Spectral")))
# As heatmap.2 does not plot all the names of the rows, the information is given in a txt file
heatp <- heatmap.2(y, trace="none", Colv=F, Rowv=as.dendrogram(hr), col=Mycol, labRow = rownames(y), )
# Reduce the information to the factors included in OC_TFs
# Create list with each motif being split up into TFs that are combined by ISAMRA
label_row <- strsplit(rownames(y),"_")
# get the index of the list elements which contain an exact match with one of the factors from OC_TFs
a <- c()
for (i in OC_TFs){
a <- c(a,which(lapply(label_row, function(x) grep(paste("^",i,"$", sep=""), x))!=0))
}
# replace rownames labels that are not in vector a with nothing
label <- rownames(y)
label[c(1:381)[!c(1:381) %in% a]] <- ""
# putput the labels in excel and combine it with heatmap in illustrator
write.table(rev(label[heatp$rowInd]), file="Motifs_OC_terms_heatmap.txt", sep="\n", quote=F, col.names=F, row.names=F)
rm(a,label_row,label,heatp,ISMARA,tmp,y,hr,Mycol,OC_TFs,i)
### Figure 3C
# Due that the ranking of the genes within a cluster is random, the start and end points of the lines are none-deterministic
library(circlize)
library(gplots)
library(RColorBrewer)
library(biomaRt)
library(ggplot2)
library(fields)
# the following files are provided in OSF https://osf.io/9xys4/
ISMARA <- read.delim("ReadyToUse_ISMARA.txt",h=T)
Targets <- read.delim("ReadyToUse_ISMARA_Targets.txt",h=T)
# Set up Targets by focusing on the factors that are part of the RNA_seq data
tmp <- Targets[Targets$Score > 1 & Targets$Factor %in% Diff_ctr$Symbol & Targets$Target %in% Diff_ctr$Symbol ,c("Factor","Target")]
tmp <- unique(tmp)
# Make a vector with all genes ISAMRA network has information for
vec <- unique(c(as.character(unique(tmp[,1])), as.character(unique(tmp[,2]))))
# Reduce Diff_ctr to all the genes that are part of the vector ablove and change cluster 0 to 9, due to visualization
test <- Diff_ctr[Diff_ctr$Symbol %in% vec,c('Symbol','clust',"d0","d2","d5","d9")]
names(test)[2] <- 'Cl'
test[test$Cl == 0,'Cl'] <- 9
rm(vec)
# Order genes based on cluster information
test <- test[order(test[,2], decreasing=F),]
# Make a help data frame in which the genes are ranked for each individual cluster from 1 to the number of genes in each cluster
# This information is needed in the circos plot to identify the rank of the regulating TF (from which the lines start) to the rank of the targets (where the lines end)
test_final <- data.frame(matrix(NA, ncol=ncol(test)+1, nrow=1))
names(test_final) <- c(names(test),'Rank')
test_final <- test_final[-1,]
for (i in 1:9){
test_1 <- test[test$Cl==i,]
test_1$Rank <- 1:nrow(test_1)
test_final <- rbind(test_final, test_1)
}
rm(test_1,i,test)
# Get the rank information into the network data frame tmp
tmp_net <- merge(tmp, test_final, by.x="Factor", by.y="Symbol")
tmp_net <- merge(tmp_net, test_final[,c(1:2,7)], by.x="Target", by.y="Symbol")
# Reorder the tmp_net
tmp_net <- tmp_net[,c(2,1,4:7,3,8:10)]
names(tmp_net)[c(7:10)] <- c('Cl_Fac','Rank_Fac','Cl_Tar','Rank_Tar')
# Order the network according to the clusters and ranl of the targets
tmp_net <- tmp_net[order(tmp_net$Cl_Fac, tmp_net$Rank_Fac),]
# Plotting using the circos package
circos.clear()
par(mfrow=c(1,2),mar = c(1, 1, 1, 1), pty="s" )
circos.par("gap.degree" = c(rep(2, 8),20))
circos.par(cell.padding = c(0.01, 0.01, 0.01, 0.01))
# Plot for NFATC1 and JUN
GOI <- c('NFATC1','JUN')
for (j in GOI){
tmp_net4 <- tmp_net[tmp_net$Factor== j,]
# Set up the individual segments of the circos plot (number of segments by number of clusters and size of segments by the rank)
circos.initialize(factors = test_final$Cl, x = test_final$Rank )
# Also change Diff_ctr cluster 0 to 9
Diff_ctr_tmp <- Diff_ctr
Diff_ctr_tmp[Diff_ctr_tmp$clust==0,'clust'] <- 9
# Calculate the enrichment of the individual clusters for the target genes of the factor of interest
# The numerical value will be transformed to a color with (blue depletion and red enrichement and white no change)
# Define color scale
mat_col <- designer.colors(n=50, col=c(rgb(0,0,255,max=255),rgb(255,255,255,max=255),rgb(255,0,0,max=255)))
# Store color information in a vector for each cluster
col2 <- c()
for (i in 1:9){
# Number of target genes in the particular cluster
a1 <- nrow(tmp_net4[tmp_net4$Cl_Tar ==i,])
if(a1==0){
a <- 0
} else {
# Numeric enrichment value if there are target genes in the respective cluster
a <- log2((a1/nrow(tmp_net4)/(nrow(Diff_ctr_tmp[Diff_ctr_tmp$clust==i,])/nrow(Diff_ctr_tmp))))
# If enrichment is above 8 or below -8 set to 8 or -8 (below is log scale) to be able to show differences in color scales
a[a< -3] <- -3
a[a> 3] <- 3
}
# Transform the value between -3 and 3 to a scale from 1 to 50 according to the 50 color scale mat_col
b <- round((a+3)/0.125)+1
col2 <- c(col2,b)
}
# Plot enrichment ring
circos.trackPlotRegion(factors = test_final$Cl, x = test_final$Rank, ylim=c(0,1), bg.col = mat_col[col2],bg.lwd=0.1, bg.border = 'black',track.height = 0.1)
# Plot cluster membership color
circos.trackPlotRegion(factors = test_final$Cl ,ylim=c(0,1), bg.col = c(rainbow(8)[1:8],'grey') , bg.border = 'black',bg.lwd=0.1,track.height = 0.05)
# Add a single black line on the cluster membership circle to indicate the rank of the TF of interest
circos.lines(x=c(
test_final[test_final$Symbol==j,'Rank'],
test_final[test_final$Symbol==j,'Rank']),
y=c(0,1),sector.index=test_final[test_final$Symbol==j,'Cl'], lwd=4)
# Add the lines connecting the TF of interst with its target genes, add transparency to help indicate where the lines start
for (i in 1:nrow(tmp_net4)){
circos.link(tmp_net4[i,'Cl_Fac'], tmp_net4[i,'Rank_Fac'],tmp_net4[i,'Cl_Tar'],tmp_net4[i,'Rank_Tar'], directional = 0, col=alpha('black',0.6), lwd=0.5, arr.width = 0.05, arr.length = 0.1)
}
title(paste(j,"Target genes"))
}
rm(a,j,b,a1,GOI,i,tmp,ISMARA,Targets,col2,c, mat_col, tmp_net, tmp_net4, n, Nodes, q, Timepoint, test_final,Diff_ctr_tmp)
### Figure 3D
library(Seurat)
# the following files are provided in OSF https://osf.io/9xys4/
scOC <- readRDS("ReadyToUse_scOC.rds")
# This object has 18 clusters
DimPlot(scOC, label = TRUE, group.by = "SCT_snn_res.0.9") + NoLegend()+ theme(aspect.ratio=1)
rm(scOC)
### Figure 3E, F & G
library(Seurat)
library(gplots)
# the following files are provided in OSF https://osf.io/9xys4/
scOC <- readRDS("ReadyToUse_scOC.rds")
Targets <- read.delim("ReadyToUse_ISMARA_Targets.txt",h=T)
# Use scRNA-seq to check ISMARA network, start with extraction of averaged expression data at cluster level
Matrix <- data.frame(AverageExpression(scOC,group.by = 'SCT_snn_res.0.9', assays = 'SCT')[[1]])
Matrix_scale <- data.frame(t(scale(t(AverageExpression(scOC,group.by = 'SCT_snn_res.0.9', assays = 'SCT')[[1]]))))
# Collect average expression of all TFs of the ISAMARA network in one data frame and make a second for their targets with the same order
sc_TFs <- data.frame(t(Matrix[rownames(Matrix) %in% unique(Targets$Factor),]))
sc_Targets_scaled <- data.frame(matrix(NA,ncol=ncol(sc_TFs), nrow=nrow(sc_TFs)))
colnames(sc_Targets_scaled) <- colnames(sc_TFs)
rownames(sc_Targets_scaled) <- rownames(sc_TFs)
for(i in colnames(sc_TFs)){
tmp <- Targets[Targets$Factor ==i & Targets$Score > 2,'Target']
tmp <- tmp[tmp %in% rownames(scOC)]
sc_Targets_scaled[,i] <- colSums(Matrix[rownames(Matrix) %in% tmp,])
}
# Spearman's correlation between expression of TF and its targets across the clusters of scOC
cor_scOC_TFs_Targets <- c()
for (i in 1:ncol(sc_TFs)){
cor_scOC_TFs_Targets <- c(cor_scOC_TFs_Targets,cor(sc_TFs[,i],sc_Targets_scaled[,i],method = "spearman"))
}
# Name the correlation vector like the TFs, order it and include a rank for gene set enrichment analysis
names(cor_scOC_TFs_Targets) <- names(sc_TFs)
cor_scOC_TFs_Targets <- cor_scOC_TFs_Targets[order(cor_scOC_TFs_Targets)]
cor_scOC_TFs_Targets <- data.frame(cbind(cor_scOC_TFs_Targets,1:length(cor_scOC_TFs_Targets)))
names(cor_scOC_TFs_Targets) <- c('Spearman','Rank')
# Remove TFs for which Spearman's correlation cannot be calculated
cor_scOC_TFs_Targets <- cor_scOC_TFs_Targets[complete.cases(cor_scOC_TFs_Targets),]
# Select example showing TF and Target gene relation at single cell level
cor_scOC_TFs_Targets[order(-cor_scOC_TFs_Targets$Spearman),][1:10,]
# Figure 3E
# Expression data per cell
Matrix <- data.frame(as.matrix(GetAssayData(scOC)))
# Select MYBL2 targets from ISMARA network (Score 2 to increase confidence of target genes due to comparing two different data sets from different labs with different protocols)
tmp <- Targets[Targets$Factor =="MYBL2" & Targets$Score > 2,'Target']
tmp <- tmp[tmp %in% rownames(scOC)]
# add expression sum of MYBL2 targets to seurat object
scOC$MYBL2_scaled <- colSums(Matrix[rownames(Matrix) %in% tmp,])
FeaturePlot(scOC,c('MYBL2','MYBL2_scaled'))
# Figure 3F
# Expression data per cluster including spearmans correlation and coloring of the clusters
dev.off()
par(mfrow=c(1,1),pty="s")
library(scales)
plot(sc_TFs$MYBL2, sc_Targets_scaled$MYBL2,col=hue_pal()(19), pch=16)
title(paste("Spearman's MYBL2:",cor_scOC_TFs_Targets["MYBL2","Spearman"]))
# Figure 3G
# Identify TFs that change expression across clusters
scOC@active.ident <- scOC$SCT_snn_res.0.9
TF_marker <- FindAllMarkers(scOC,features=colnames(sc_TFs),logfc.threshold = 0.5,min.pct = 0.25)
TF_marker <- TF_marker[TF_marker$p_val_adj < 0.05,]
# Perform gene set enrichment analysis
library(fgsea)
ranks <- cor_scOC_TFs_Targets$Spearman
names(ranks) <- rownames(cor_scOC_TFs_Targets)
tmp_list <- list()
tmp_list[[1]] <- unique(TF_marker$gene)
names(tmp_list)[1] <- 'Diff_TF'
fgseaRes <- fgsea(pathways=tmp_list, stats=ranks, nperm=1000)
p <- plotEnrichment(tmp_list$Diff_TF,
ranks) + labs(title=paste("diff TFs in scOC 19cluster:",length(unique(TF_marker$gene)),"padj:", fgseaRes[fgseaRes$pathway=='Diff_TF','padj']))
print(p)
# Add color scale and values for the correlations in illustrator
heatmap.2(as.matrix(cor_scOC_TFs_Targets[,c('Spearman','Spearman')]), col=designer.colors(n=100,col=c('blue','white','red')), breaks=seq(-1,1,length=101), Colv = F, Rowv = F, dendrogram = "none", labRow = F, labCol = F, trace="none")
unique(TF_marker$gene)
rm(Matrix, Matrix_scale,Targets, tmp, tmp_list, scOC,sc_TFs, sc_Targets_scaled,p,fgseaRes,ranks,i,TF_marker,cor_scOC_TFs_Targets)
### Figure 3H
library(tidyr)
library(neat)
library(qgraph)
# the following files are provided in OSF https://osf.io/9xys4/
Targets <- read.delim("ReadyToUse_ISMARA_Targets.txt",h=T)
# Make two list containing the clustered genes
Aset <- list()
Bset <- list()
for (i in c(1:8)){
Bset[[paste("Cl_",i,sep="")]] <- Diff_ctr[Diff_ctr$clust ==i,"Symbol"]
Aset[[paste("Cl_",i,sep="")]] <- Diff_ctr[Diff_ctr$clust ==i,"Symbol"]
}
# Reduce ISMARA network to clustered genes
tmp <- Targets[Targets$Factor %in% Diff_ctr[Diff_ctr$clust !=0,"Symbol"],]
tmp <- tmp[tmp$Target %in% Diff_ctr[Diff_ctr$clust !=0,"Symbol"],]
# Define all nodes in the network
Nodes <- rbind(data.frame("Factor" = unique(tmp$Factor)),data.frame("Factor" = unique(tmp$Target)))
Nodes <- unique(Nodes)
Nodes <- as.vector(Nodes[,1])
# Run NEAT analysis using the ISMARA network (Score above 1 to include more confident genes)
test <- neat(alist = Aset, blist =Bset, network = as.matrix(unique(tmp[tmp$Score>0,c('Factor','Target')])), nettype = 'directed', nodes = Nodes)
test$LogOdds <- test$nab / test$expected_nab
test <- data.frame(test)
# Reduce the network enrichment analysis to significant interactions between the clusters
tmp <- test[test$adjusted_p < 0.01 & test$LogOdds > 1 ,c(1:2,7)]
tmp$LogOdds <- log2(tmp$LogOdds)
# Show spring layout for all clusters - nondeterministic and can result in different visualization
layout(matrix(1:2,ncol=2), width = c(3,1),height = c(1,1))
qg <- qgraph(as.matrix(tmp), pty="s", layout="spring",directed=T, color='white', width=7, height=7, posCol='black', negCol='red', vsize=9.7 )
title(paste("ISMARA"), cex.main=0.7, line=3)
col <- data.frame("col"=qg$graphAttributes$Edges$color,
"value" <- tmp[tmp[,3]>0,3])
col <- col[order(col[,2],decreasing=F),]
col <- col[col[,2]>0,]
legend_image <- as.raster(matrix(rev(col[,1]), ncol=1))
plot(c(0,2),c(0,1),type = 'n', axes = F,xlab = '', ylab = '')
text(x=1.5, y = c(0,0.5,1), labels = c(round(col[1,2],4),round((col[nrow(col),2]+col[1,2])/2,4),round(col[nrow(col),2],4)))
rasterImage(legend_image, 0, 1, 0.5,0)
# Adjust some arrows in illustrator for better visualization and include cluster labels
rm(legend_image, qg, test, tmp,i, legend_image,Nodes,value,Targets,col, Aset, Bset)
### Figure 3G
# the following files are provided in OSF https://osf.io/9xys4/
# Get GWAS causal genes from doi:10.1038/s41588-018-0302-x part of supplementary table 2
Targets <- read.delim("ReadyToUse_ISMARA_Targets.txt",h=T)
GWAS <- read.csv("ebmd_con_indepen.csv",h=T)
names(GWAS)[2] <- "Lead_SNP"
GWAS$Start <- GWAS$BP
# Are the factors regulating causal SNP-eBMD-genes or genes from cluster 4
# Reduce Factors in ISMARA network to TFs that are expressed in the RNA-seq data set Diff_ctr
Targets_tmp <- Targets[Targets$Factor %in% Diff_ctr$Symbol,]
# Set up a matrix
mat <- matrix(NA, ncol=2, nrow=length(unique(Targets_tmp$Factor)))
colnames(mat) <- c("Enrich_GWAS","Enrich_CL4")
rownames(mat) <- unique(Targets_tmp$Factor)
# Collect all genes that are causual eBMD target genes and also predicted targets in the ISAMRA network
tmp <- GWAS[GWAS$P.NI < 5e-8,'C.GENE']
tmp <- tmp[tmp %in% Targets_tmp$Target]
# Collect all genes that are in cluster 4 and also predicted targets in the ISAMRA network
tmp2 <- Diff_ctr[Diff_ctr$Symbol %in% Targets_tmp$Target & Diff_ctr$clust==4,"Symbol"]
# Collect p-values using hypergeometric test
for (i in unique(Targets_tmp$Factor)){
k <- unique(Targets_tmp[Targets_tmp$Factor == i & Targets_tmp$Score >1,'Target'])
q <- unique(k[k %in% tmp])
m <- unique(tmp)
n <- unique(Targets_tmp$Target[!Targets_tmp$Target %in% tmp])
mat[i,1] <- phyper(length(q), length(m), length(n), length(k), lower.tail = FALSE)
q <- unique(k[k %in% tmp2])
m <- tmp2
n <- unique(Targets_tmp$Target[!Targets_tmp$Target %in% tmp2])
mat[i,2] <- phyper(length(q), length(m), length(n), length(k), lower.tail = FALSE)
}
# Combine p-values for the enrichments with nominal p-value from eBMD GWAS
TF_Enrichment <- data.frame(mat)
TF_Enrichment$Factor <- rownames(TF_Enrichment)
TF_Enrichment <- merge(TF_Enrichment,GWAS[,c("C.GENE","P.NI")],by.x="Factor",by.y="C.GENE", all.x=T)
TF_Enrichment[!complete.cases(TF_Enrichment$P.NI),'P.NI'] <- 1
# barplot
# subgroup1: differentially expressed and a causal eBMD gene
y1 <- TF_Enrichment[TF_Enrichment$Factor %in% Diff_ctr[Diff_ctr$clust!=0,'Symbol'] & TF_Enrichment$P.NI < 5e-8, ]
# subgroup2: differentially expressed and NOT a causal eBMD gene
y2 <- TF_Enrichment[TF_Enrichment$Factor %in% Diff_ctr[Diff_ctr$clust!=0,'Symbol'] & TF_Enrichment$P.NI > 5e-8, ]
# subgroup3: NOT differentially expressed and a causal eBMD gene
y3 <- TF_Enrichment[TF_Enrichment$Factor %in% Diff_ctr[Diff_ctr$clust==0,'Symbol'] & TF_Enrichment$P.NI < 5e-8, ]
# subgroup4: NOT differentially expressed and NOT a causal eBMD gene
y4 <- TF_Enrichment[TF_Enrichment$Factor %in% Diff_ctr[Diff_ctr$clust==0,'Symbol'] & TF_Enrichment$P.NI > 5e-8, ]
# Barplot left panel, which is modified later in Illustrator, using the second bar just for upper alignment of the fourth, thrid and fourth bar are then aligned with the first one and the second is deleted
barplot(
c(
# All TFs from the group
nrow(y1),
# This number is useful for visualization, align the next at lower end and second next at upper end
nrow(y1[y1$Enrich_CL4 < 0.01 | y1$Enrich_GWAS < 0.01,]),
# All TFs enriched for cluster 4 genes
nrow(y1[y1$Enrich_CL4 < 0.01,]),
# All TFs enriched for eBMD genes
nrow(y1[y1$Enrich_GWAS < 0.01,]),
nrow(y2),
nrow(y2[y2$Enrich_CL4 < 0.01 | y2$Enrich_GWAS < 0.01,]),
nrow(y2[y2$Enrich_CL4 < 0.01,]),
nrow(y2[y2$Enrich_GWAS < 0.01,]),
nrow(y3),
nrow(y3[y3$Enrich_CL4 < 0.01 | y3$Enrich_GWAS < 0.01,]),
nrow(y3[y3$Enrich_CL4 < 0.01,]),
nrow(y3[y3$Enrich_GWAS < 0.01,]),
nrow(y4),
nrow(y4[y4$Enrich_CL4 < 0.01 | y4$Enrich_GWAS < 0.01,]),
nrow(y4[y4$Enrich_CL4 < 0.01,]),
nrow(y4[y4$Enrich_GWAS < 0.01,])
)
)
rm(y1,y2,y3,y4)
# Scatter plot right panel
# Select TFs that are enriched for target genes in cluster4 or causal eBMD genes
TF_Enrichment$Enrich_GWAS_log10 <- -log10(TF_Enrichment$Enrich_GWAS)
TF_Enrichment$Enrich_CL4_log10 <- -log10(TF_Enrichment$Enrich_CL4)
# Set very low p-values for cluster 4 to 100 and for eBMD to 15 (according to the distribution for each value)
TF_Enrichment[is.infinite(TF_Enrichment$Enrich_CL4_log10),'Enrich_CL4_log10'] <- 100
TF_Enrichment[is.infinite(TF_Enrichment$Enrich_GWAS_log10),'Enrich_GWAS_log10'] <- 15
TF_Enrichment[TF_Enrichment$Enrich_CL4_log10 >100,'Enrich_CL4_log10'] <- 100
TF_Enrichment[TF_Enrichment$Enrich_GWAS_log10 >15,'Enrich_GWAS_log10'] <- 15
# Select for significant enrichments
tmp <- TF_Enrichment[apply(TF_Enrichment[,c('Enrich_CL4_log10','Enrich_GWAS_log10')],1,max)>2,]
# Replace non-significant ones with 0
tmp[tmp$Enrich_CL4_log10 < 2,'Enrich_CL4_log10'] <- 0
tmp[tmp$Enrich_GWAS_log10 < 2,'Enrich_GWAS_log10'] <- 0
# Introduce color code for the eBMD GWAS if the TF is listed as causal gene
tmp[tmp$P.NI > 5e-8,'P.NI'] <- 1
tmp$P.NI <- -log10(tmp$P.NI)
# Add color value to data.frame
# first make breaks similar to a heatmap function
mat_col_breaks <- c(0,seq(-log10(5e-8),max(tmp$P.NI),length=50))
# Get the closest lower index of the break vector mat_col_breaks
tmp$col <- NA
for (i in 1:nrow(tmp)){
tmp[i,'col'] <- which(mat_col_breaks==max(mat_col_breaks[mat_col_breaks <= tmp[i,'P.NI']]))
}
# Replace index with color from color vector
mat_col <- c(alpha('white',0),designer.colors(n=50, col=c('plum1','darkmagenta')))
tmp$col <- mat_col[tmp$col]
# Scatter plot
plot(tmp$Enrich_GWAS_log10,tmp$Enrich_CL4_log10,pch=21, bg=tmp$col)
# Show names of the factors that are themselves eBMD genes or which are enriched for both gene groups
tmp2 <- tmp[tmp$P.NI > -log10(5e-8) | (tmp$Enrich_CL4 < 0.01 & tmp$Enrich_GWAS < 0.01),]
text(tmp2$Enrich_GWAS_log10,tmp2$Enrich_CL4_log10,tmp2$Factor)
# Make a heatmap to get the color scale
library(fields)
library(gplots)
mat_col <- c('white',designer.colors(n=50, col=c('plum1','darkmagenta')))
mat_col_breaks <- c(0,seq(-log10(5e-8),max(tmp$P.NI),length=51))
heatp <- heatmap.2(as.matrix(tmp[,c('P.NI','P.NI')]), col=mat_col, breaks=mat_col_breaks,trace="none",Colv =F, Rowv = F)
rm(Targets_tmp,Targets,tmp, tmp2,a,b,j,k,mat,heatp,mat_col,mat_col_breaks,i,m,n,q,TF_Enrichment,GWAS)