# Integrative analysis of ATAC & RNA - Transcription factor binding site motif enrichment analysis
- goal: find the most probable TFs regulating the genes from the DEA, DEA-cluster & time-series analysis results
- input: DEA & time-series analysis results
- output: Transcription factor binding site motif enrichments

In [1]:
# set correct working directory -> project folder
# (every relative path used below, e.g. 'results' and 'resources', is resolved
# from the folder one level above the directory this notebook starts in;
# the working directory is shown before and after the switch)
getwd()
setwd('..')
getwd()

In [2]:
# load libraries
library(RcisTarget)
library(dplyr)
library(purrr)


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union




In [3]:
# configs
# input folder of the integrative (INT) analysis and output folder of this notebook
data_path <- file.path('results','INT')
results_path <- file.path(data_path,'TF')

# cisTarget gene-motif rankings database: download source and local copy in
# 'resources' (the local file keeps the file name of the download)
gene_motif_db_url <- 'https://resources.aertslab.org/cistarget/databases/mus_musculus/mm10/refseq_r80/mc9nr/gene_based/mm10__refseq-r80__500bp_up_and_100bp_down_tss.mc9nr.feather'
gene_motif_db_dir <- file.path('resources', basename(gene_motif_db_url))

In [4]:
# make directories if not exist
# recursive = TRUE also creates missing parent folders; without it a missing
# parent makes dir.create() fail, and showWarnings = FALSE hides that failure
# until the first write into results_path breaks
dir.create(results_path, showWarnings = FALSE, recursive = TRUE)

# load data

In [5]:
# read the filtered count matrix (first column -> row names);
# its row names are used further down to define the background genes
data <- read.csv(file = file.path(data_path, 'INT_counts.csv'), row.names = 1)
dim(data)
head(data)

Unnamed: 0_level_0,RNA_PT76_R1_C_albicans_2h,RNA_PT82_R1_C_albicans_24h,RNA_PT76_R1_C_albicans_4h,RNA_PT76_R1_C_albicans_6h,RNA_PT82_R1_C_albicans_8h,RNA_PT76_R1_untreated_0h,RNA_PT82_R1_untreated_24h,RNA_PT76_R1_IFN_beta_2h,RNA_PT82_R1_IFN_beta_24h,RNA_PT76_R1_IFN_beta_4h,⋯,PT82_R2_IFN_gamma_24h,PT82_R2_IFN_gamma_8h,PT82_R2_LCMV_Cl13_24h,PT82_R2_LCMV_Cl13_8h,PT82_R2_LO28_24h,PT82_R2_LO28_8h,PT82_R2_LPS_24h,PT82_R2_LPS_8h,PT82_R2_untreated_24h,PT82_R2_untreated_8h
Unnamed: 0_level_1,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,⋯,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
ENSMUSG00000098104,1,6,0,2,1,9,4,6,4,0,⋯,8,7,3,11,31,14,5,12,10,9
ENSMUSG00000033845,219,220,201,169,196,204,226,202,124,153,⋯,210,109,218,185,32,164,127,155,156,123
ENSMUSG00000025903,115,105,106,113,87,144,128,159,98,137,⋯,366,204,351,328,65,192,282,150,255,205
ENSMUSG00000033813,63,57,39,74,51,53,68,58,63,124,⋯,701,414,674,625,113,424,459,357,550,485
ENSMUSG00000033793,315,222,309,250,312,203,277,266,222,218,⋯,0,5,6,4,12,6,3,5,4,9
ENSMUSG00000025907,295,355,225,280,225,285,336,169,354,166,⋯,45,21,18,31,21,26,15,13,36,35


In [6]:
# read the sample annotation table (first column -> row names)
annot <- read.csv(file = file.path(data_path, 'INT_annotations.csv'), row.names = 1)
dim(annot)
head(annot)

Unnamed: 0_level_0,library,treatment,time,experiment,group
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>
RNA_PT76_R1_C_albicans_2h,Quant-seq,C_albicans,2h,PT76,C_albicans_2h
RNA_PT82_R1_C_albicans_24h,Quant-seq,C_albicans,24h,PT82,C_albicans_24h
RNA_PT76_R1_C_albicans_4h,Quant-seq,C_albicans,4h,PT76,C_albicans_4h
RNA_PT76_R1_C_albicans_6h,Quant-seq,C_albicans,6h,PT76,C_albicans_6h
RNA_PT82_R1_C_albicans_8h,Quant-seq,C_albicans,8h,PT82,C_albicans_8h
RNA_PT76_R1_untreated_0h,Quant-seq,untreated,0h,PT76,untreated_0h


In [7]:
# read the tab-separated gene annotation (first column -> row names);
# the 'external_gene_name' column is used below to map gene IDs to symbols
gene_annot <- read.delim(file.path('results','RNA','counts','gene_annotation.tsv'), row.names = 1)
dim(gene_annot)
head(gene_annot)

Unnamed: 0_level_0,version,source,external_gene_name,external_gene_source,description,gene_biotype,length,gc
Unnamed: 0_level_1,<int>,<chr>,<chr>,<chr>,<chr>,<chr>,<int>,<dbl>
ENSMUSG00000000194,13,ensembl_havana,Gpr107,MGI Symbol,G protein-coupled receptor 107 [Source:MGI Symbol;Acc:MGI:2139054],protein_coding,12766,0.494595
ENSMUSG00000000247,11,ensembl_havana,Lhx2,MGI Symbol,LIM homeobox protein 2 [Source:MGI Symbol;Acc:MGI:96785],protein_coding,4393,0.5970863
ENSMUSG00000000544,14,ensembl_havana,Gpa33,MGI Symbol,glycoprotein A33 (transmembrane) [Source:MGI Symbol;Acc:MGI:1891703],protein_coding,2742,0.5309993
ENSMUSG00000000817,10,ensembl_havana,Fasl,MGI Symbol,"Fas ligand (TNF superfamily, member 6) [Source:MGI Symbol;Acc:MGI:99255]",protein_coding,1937,0.4305627
ENSMUSG00000000889,8,ensembl_havana,Dbh,MGI Symbol,dopamine beta hydroxylase [Source:MGI Symbol;Acc:MGI:94864],protein_coding,2692,0.5549777
ENSMUSG00000001138,13,ensembl_havana,Cnnm3,MGI Symbol,cyclin M3 [Source:MGI Symbol;Acc:MGI:2151055],protein_coding,6420,0.5610592


In [8]:
#  prepare list of background genes
# map the row names of the count matrix to gene symbols
background <- gene_annot[rownames(data), 'external_gene_name']
background <- unique(background)
# IDs that are missing from gene_annot come back as NA (and unnamed genes may be
# empty strings); neither is a usable column name for the rankings database
background <- background[!is.na(background) & nzchar(background)]
# background <- toupper(background)
length(background)
head(background)

In [9]:
# distinct treatments present in the sample annotation
treatments <- unique(annot[['treatment']])
treatments

# load reference data

In [10]:
# check if Gene-motif rankings database exists and download if not
if (!file.exists(gene_motif_db_dir)){
    # the target folder has to exist before the download can write into it
    dir.create(dirname(gene_motif_db_dir), showWarnings = FALSE, recursive = TRUE)
    download_status <- download.file(gene_motif_db_url, destfile=gene_motif_db_dir, method='wget')
    if (download_status != 0){
        # a failed download only warns; remove the partial file so that the next
        # run does not mistake it for a complete database
        unlink(gene_motif_db_dir)
        stop('download of the gene-motif rankings database failed (status ', download_status, '): ', gene_motif_db_url, call.=FALSE)
    }
}

# import motif rankings
# motifRankings <- importRankings(gene_motif_db_dir)

# import motif rankings considering background
# (only the background genes are imported as columns, then re-ranked)
rankingsDb <- importRankings(gene_motif_db_dir, columns=background)
motifRankings <- reRank(rankingsDb)

motifRankings
# ranking table whose column names are the genes supported by the database
ranking_df <- getRanking(motifRankings)
dim(ranking_df)
head(ranking_df)

Using the column 'features' as feature index for the ranking database.

“The following columns are missing from the database: Gm6085, 4732440D04Rik, Gm19026, Gm7449, Gm24276, Gm37569, Gm38120, Gm38380, Gm38319, Gm37444, Gm5251, Gm28438, Gm28437, Gm28661, Gm37906, Gm37354, Gm38157, Gm3052, Gm37233, Gm28417, Gm38115, Rpl12-ps1, Gm43213, Gm37309, Gm15832, Gm23722, Gm8251, Poglut2, Rps27a-ps1, Gm36955, A130048G24Rik, Gm17971, Sgo2a, Gm15834, Gm37760, D430013B06Rik, Gm37531, Gm15464, 6030460B20Rik, Gm37198, Rpl18-ps1, Gm37121, Gm20342, Rpl10a-ps1, Gm8805, Gm38387, Gm38162, 6820402A03Rik, Rpl31-ps14, Gm37733, Retreg2, Gm816, 4833412K13Rik, Gm38339, Gm37902, Gm37645, Gm38062, A630081D01Rik, Gm19552, Gm37058, A530040E14Rik, AC147806.2, Gm37914, Gm38021, Gm6136, 4833421G17Rik, Gm38365, Gm15368, D130058E05Rik, Septin2, BC055308, Gm15427, Gm37642, Gm7967, Gm28187, Relch, Gm37566, Gm7160, Gm38235, Gm37510, Gm8451, Gm38248, Rpl28-ps1, Gm37407, Gm15675, Gm37084, Gm37954, Gm29488, Gm38067, 6030442K20

Rankings for RcisTarget.
  Number of genes: 21490 (21490 available in the full DB)
  Number of FEATURES: 24453

 [Source file: mm10__refseq-r80__500bp_up_and_100bp_down_tss.mc9nr.feather]

features,Mrpl15,Lypla1,Tcea1,Atp6v1h,Rb1cc1,St18,Pcmtd1,Rrs1,Adhfe1,⋯,Mir3086,Vax1,Kcnk18,Slc18a2,Emx2os,Emx2,2700089I24Rik,E330013P04Rik,Prlhr,Gm7102
<chr>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,⋯,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
jaspar__MA1023.1,7973,2078,4395,9786,398,10421,4800,4112,7248,⋯,8470,16733,15538,20988,725,337,17050,7902,2624,15858
taipale_cyt_meth__IRX3_NACGYRNNNNNNYGCGTN_eDBD_meth,18856,5458,13735,19592,3253,171,19280,6516,9693,⋯,8037,10563,6028,21101,142,217,18244,1384,5046,17371
taipale__DBP_DBD_NRTTACGTAAYN,1500,12422,21165,17887,13680,373,17322,8600,19574,⋯,14079,14816,3787,456,11944,15493,15285,9719,17125,13636
cisbp__M4240,8721,8205,16416,14348,6252,249,20542,1985,19639,⋯,9830,8531,13937,5062,11229,5284,10224,212,12696,21340
scertf__macisaac.ACE2,7600,923,18949,13228,6860,9033,20642,3703,18363,⋯,5685,12430,11427,14307,9901,5312,8734,12554,8039,21328
hocomoco__CEBPG_MOUSE.H11MO.0.B,8938,18681,14783,14395,2228,1414,20504,7816,16061,⋯,12438,211,18347,6361,19287,20099,10897,10236,18224,19680


In [11]:
# load the motif -> mouse transcription factor annotation data set
# alternative source https://resources.aertslab.org/cistarget/motif2tf
data(list = 'motifAnnotations_mgi')

dim(motifAnnotations_mgi)
head(motifAnnotations_mgi)

motif,TF,directAnnotation,inferred_Orthology,inferred_MotifSimil,annotationSource,description
<chr>,<chr>,<lgl>,<lgl>,<lgl>,<fct>,<chr>
bergman__Abd-B,Hoxa9,False,False,True,inferredBy_MotifSimilarity,"gene is annotated for similar motif cisbp__M1008 ('HOXA6[gene ID: ""ENSG00000106006"" species: ""Homo sapiens"" TF status: ""inferred"" TF family: ""Homeodomain"" DBDs: ""Homeobox""]; HOXB9[gene ID: ""ENSG00000170689"" species: ""Homo sapiens"" TF status: ""inferred"" TF family: ""Homeodomain"" DBDs: ""Homeobox""]; HOXC9[gene ID: ""ENSG00000180806"" species: ""Homo sapiens"" TF status: ""inferred"" TF family: ""Homeodomain"" DBDs: ""Homeobox""]; Hoxa9[gene ID: ""ENSMUSG00000038227"" species: ""Mus musculus"" TF status: ""direct"" TF family: ""Homeodomain"" DBDs: ""Homeobox""]; Hoxb9[gene ID: ""ENSMUSG00000020875"" species: ""Mus musculus"" TF status: ""inferred"" TF family: ""Homeodomain"" DBDs: ""Homeobox""]; NP_032296.2[gene ID: ""NP_032296.2"" species: ""Mus musculus"" TF status: ""inferred"" TF family: ""Homeodomain"" DBDs: ""Homeobox""]'; q-value = 0.0006)"
bergman__Aef1,Zfp128,False,True,False,inferredBy_Orthology,motif is annotated for orthologous gene FBgn0005694 in D. melanogaster (identity = 22%)
bergman__Cf2,Zfp853,False,True,False,inferredBy_Orthology,motif is annotated for orthologous gene FBgn0000286 in D. melanogaster (identity = 16%)
bergman__EcR_usp,Nr1h2,False,True,False,inferredBy_Orthology,gene is orthologous to FBgn0000546 in D. melanogaster (identity = 37%) which is directly annotated for motif
bergman__EcR_usp,Nr1h3,False,True,False,inferredBy_Orthology,gene is orthologous to FBgn0000546 in D. melanogaster (identity = 40%) which is directly annotated for motif
bergman__EcR_usp,Nr1h4,False,True,False,inferredBy_Orthology,gene is orthologous to FBgn0000546 in D. melanogaster (identity = 29%) which is directly annotated for motif


# TF analysis

## DEA results

In [175]:
# configs for plotting
# number of top-ranked TFs taken per gene set for the DEA summary heatmaps
top_n <- 1

In [174]:
# collect the significant genes (adj.P.Val < 0.05) of every group, separately for
# positive ('up') and negative ('down') logFC; empty lists are not stored
gene_lists <- list()

for (treatment in treatments){
    # load DEA analysis results per treatment
    tmp_results <- read.csv(file.path(data_path,'DEA',paste0('INT_DEA_',treatment,'.csv')))

    for (group in unique(tmp_results$group)){
        # significant rows of this group, shared by both directions
        is_hit <- (tmp_results['adj.P.Val']<0.05) & (tmp_results['group']==group)

        for (direction in c('up','down')){
            is_direction <- if (direction=='up') tmp_results['logFC']>0 else tmp_results['logFC']<0
            tmp_genes <- unique(tmp_results[is_hit & is_direction, 'rn'])

            if (length(tmp_genes)>0){
                gene_lists[paste0(group,"_",direction)] <- list(tmp_genes)
            }
        }
    }
}
length(gene_lists)
names(gene_lists)

In [176]:
# translate the gene IDs of every list to gene symbols and keep only the symbols
# that are columns of the motif rankings (i.e. supported by the database)
for (key in names(gene_lists)){
    tmp_symbols <- gene_annot[unname(unlist(gene_lists[key])), 'external_gene_name']
    gene_lists[key] <- list(intersect(colnames(ranking_df), tmp_symbols))
#     gene_lists[key] <- list(toupper(unname(unlist(gene_lists[key]))))
}

In [None]:
# motif enrichment for all DEA gene lists, annotated with mouse TFs
motifEnrichmentTable_wGenes <- cisTarget(geneSets=gene_lists,
                                         motifRankings=motifRankings,
                                         motifAnnot=motifAnnotations_mgi)

dim(motifEnrichmentTable_wGenes)
head(motifEnrichmentTable_wGenes)

In [None]:
# write the enrichment table as unquoted, comma-separated text without row names
write.table(motifEnrichmentTable_wGenes,
            file=file.path(results_path, 'INT_TF_DEA.csv'),
            quote=FALSE, sep=",", row.names=FALSE)

In [192]:
# Summary dataframes and heatmaps - for NES and number of uniquely enriched genes
# NOTE: the normalization below uses `gene_lists`, which has to hold the DEA gene
# lists (symbols) that were passed to cisTarget for INT_TF_DEA.csv

# Split one TF_highConf entry into its TF symbols.
# Entries are expected to look like
#   "TfA; TfB (directAnnotation). TfC (inferredBy_Orthology). "
# Every "(label)." is removed on its own, so TFs listed between two labels are
# kept (a greedy "\\(.*\\)" would swallow them).
split_tf_annotation <- function(x) {
    genes <- gsub("\\s*\\([^)]*\\)\\.?\\s*", "; ", as.character(x))
    genes <- trimws(unlist(strsplit(genes, ";", fixed=TRUE)))
    unique(genes[!is.na(genes) & nzchar(genes)])
}

# Draw a heatmap of the union of the top_n rows (TFs) of every column (gene set)
# of `mat` into the PNG file `path`; returns the selected TFs invisibly.
plot_top_tf_heatmap <- function(mat, top_n, path) {
    top_tfs <- unique(unlist(lapply(seq_len(ncol(mat)), function(j) {
        # head() instead of [1:top_n]: no NA names if there are fewer TFs than top_n
        rownames(mat)[head(sort(unname(mat[, j]), decreasing=TRUE, index.return=TRUE)$ix, top_n)]
    })))
    # heatmap() needs at least 2 rows and 2 columns
    if (length(top_tfs) < 2 || ncol(mat) < 2) {
        warning('skipping heatmap (needs at least 2 TFs and 2 gene sets): ', path, call.=FALSE)
        return(invisible(top_tfs))
    }
    png(filename=path)
    on.exit(dev.off(), add=TRUE)  # close the device even if heatmap() fails
    heatmap(mat[top_tfs, , drop=FALSE])
    invisible(top_tfs)
}

# load TF analysis results
tmp_results <- read.csv(file.path(results_path, 'INT_TF_DEA.csv'), sep=",")

# determine list of unique high Confidence TFs in the results
tf_tokens <- lapply(tmp_results$TF_highConf, split_tf_annotation)
tmp_tfs <- unique(unlist(tf_tokens, use.names=FALSE))
gene_sets <- unique(as.character(tmp_results$geneSet))

# row indices of tmp_results per TF and per gene set; exact matches only, so that
# e.g. TF 'Jun' does not pick up rows annotated with 'Junb'
tf_rows <- split(rep(seq_along(tf_tokens), lengths(tf_tokens)), unlist(tf_tokens, use.names=FALSE))
set_rows <- split(seq_len(nrow(tmp_results)), as.character(tmp_results$geneSet))

# filter for geneSet and TF, and take max per TF and NA if not found
NES_mat <- matrix(NA_real_, nrow=length(gene_sets), ncol=length(tmp_tfs), dimnames=list(gene_sets, tmp_tfs))
nEnr_mat <- matrix(0, nrow=length(gene_sets), ncol=length(tmp_tfs), dimnames=list(gene_sets, tmp_tfs))

for (gene_list in gene_sets){
    for (tf in tmp_tfs){
        hits <- intersect(tf_rows[[tf]], set_rows[[gene_list]])
        if (length(hits)==0){
            next
        }
        # max NES approach
        NES_mat[gene_list, tf] <- max(tmp_results$NES[hits])

        # number of uniquely enriched genes per TF
        nEnr_mat[gene_list, tf] <- length(unique(unlist(strsplit(as.character(tmp_results$enrichedGenes[hits]), ';'))))
    }
}

# check.names=FALSE keeps TF symbols such as 'Nkx2-1' unchanged as column names
NES_df <- data.frame(NES_mat, check.names=FALSE)
nEnrGenes_df <- data.frame(nEnr_mat, check.names=FALSE)

### normalize nEnrGenes by total number of genes in geneSet
missing_sets <- setdiff(gene_sets, names(gene_lists))
if (length(missing_sets) > 0){
    stop('gene_lists does not contain the gene sets of INT_TF_DEA.csv (', paste(missing_sets, collapse=', '),
         '); re-create the DEA gene lists before running this summary', call.=FALSE)
}
# normalize
nGenes <- lapply(gene_lists, length)
nEnrGenes_df_norm <- data.frame(nEnr_mat / unname(unlist(nGenes[gene_sets])), check.names=FALSE)

### save results
write.table(NES_df, file.path(results_path, paste0('INT_TF_DEA_summary_maxNES.csv')), sep=",", row.names=TRUE, quote=FALSE)
write.table(nEnrGenes_df, file.path(results_path, paste0('INT_TF_DEA_summary_nEnrGenes.csv')), sep=",", row.names=TRUE, quote=FALSE)
write.table(nEnrGenes_df_norm, file.path(results_path, paste0('INT_TF_DEA_summary_nEnrGenes_norm.csv')), sep=",", row.names=TRUE, quote=FALSE)

### plot summaries of top_n TFs as Heatmaps (rows: TFs, columns: gene sets)
# nEnrGenes
nEnrGenes_df_norm <- data.frame(t(nEnrGenes_df_norm), check.names=FALSE)
nEnrGenes_df_norm[is.na(nEnrGenes_df_norm)] <- 0
top_tfs <- plot_top_tf_heatmap(as.matrix(nEnrGenes_df_norm), top_n,
                               file.path(results_path, paste0('INT_TF_DEA_summary_nEnrGenes_norm_HM_top',top_n,'.png')))
# NES
NES_df <- data.frame(t(NES_df), check.names=FALSE)
NES_df[is.na(NES_df)] <- 0
top_tfs <- plot_top_tf_heatmap(as.matrix(NES_df), top_n,
                               file.path(results_path, paste0('INT_TF_DEA_summary_maxNES_HM_top',top_n,'.png')))

## DEG cluster results

In [168]:
# configs
# numbers of clusters (k) of the DEG clusterings that are analysed
ks <- 4:12
ks

# for plotting
top_n <- 5

In [None]:
for (k in ks){
    # clustering result for this k: gene IDs in column 'rn', cluster in column 'value'
    DEA_cluster_results <- read.csv(file.path(data_path,'DEA',paste0('Clusters_HM_',k),'Genes.csv'))

    # one vector of gene IDs per cluster, named 'cluster_<value>'
    gene_lists <- split(DEA_cluster_results$rn, DEA_cluster_results$value)
    names(gene_lists) <- paste0('cluster_',names(gene_lists))

    # translate gene IDs to symbols and keep only symbols supported by the rankings
    gene_lists <- lapply(gene_lists, function(ids){
        intersect(colnames(ranking_df), gene_annot[ids, 'external_gene_name'])
    })

    # motif enrichment for all clusters of this k
    motifEnrichmentTable_wGenes <- cisTarget(gene_lists,
                                             motifRankings,
                                             motifAnnot=motifAnnotations_mgi)

    # one result file per k
    write.table(motifEnrichmentTable_wGenes,
                file.path(results_path, paste0('INT_TF_DEGclusters_k',k,'.csv')),
                quote=FALSE, sep=",", row.names=FALSE)
}

In [173]:
# Summary dataframes and heatmaps - for NES and number of uniquely enriched genes

# Split one TF_highConf entry into its TF symbols.
# Entries are expected to look like
#   "TfA; TfB (directAnnotation). TfC (inferredBy_Orthology). "
# Every "(label)." is removed on its own, so TFs listed between two labels are
# kept (a greedy "\\(.*\\)" would swallow them).
split_tf_annotation <- function(x) {
    genes <- gsub("\\s*\\([^)]*\\)\\.?\\s*", "; ", as.character(x))
    genes <- trimws(unlist(strsplit(genes, ";", fixed=TRUE)))
    unique(genes[!is.na(genes) & nzchar(genes)])
}

# Draw a heatmap of the union of the top_n rows (TFs) of every column (gene set)
# of `mat` into the PNG file `path`; returns the selected TFs invisibly.
plot_top_tf_heatmap <- function(mat, top_n, path) {
    top_tfs <- unique(unlist(lapply(seq_len(ncol(mat)), function(j) {
        # head() instead of [1:top_n]: no NA names if there are fewer TFs than top_n
        rownames(mat)[head(sort(unname(mat[, j]), decreasing=TRUE, index.return=TRUE)$ix, top_n)]
    })))
    # heatmap() needs at least 2 rows and 2 columns
    if (length(top_tfs) < 2 || ncol(mat) < 2) {
        warning('skipping heatmap (needs at least 2 TFs and 2 gene sets): ', path, call.=FALSE)
        return(invisible(top_tfs))
    }
    png(filename=path)
    on.exit(dev.off(), add=TRUE)  # close the device even if heatmap() fails
    heatmap(mat[top_tfs, , drop=FALSE])
    invisible(top_tfs)
}

for (k in ks){

    # common prefix of all files belonging to this k
    out_prefix <- file.path(results_path, paste0('INT_TF_DEGclusters_k',k))

    # load TF analysis results
    tmp_results <- read.csv(paste0(out_prefix,'.csv'), sep=",")

    # determine list of unique high Confidence TFs in the results
    tf_tokens <- lapply(tmp_results$TF_highConf, split_tf_annotation)
    tmp_tfs <- unique(unlist(tf_tokens, use.names=FALSE))
    gene_sets <- unique(as.character(tmp_results$geneSet))

    # row indices of tmp_results per TF and per gene set; exact matches only, so
    # that 'cluster_1' does not pick up 'cluster_10' and TF 'Jun' not 'Junb'
    tf_rows <- split(rep(seq_along(tf_tokens), lengths(tf_tokens)), unlist(tf_tokens, use.names=FALSE))
    set_rows <- split(seq_len(nrow(tmp_results)), as.character(tmp_results$geneSet))

    # filter for geneSet and TF, and take max per TF and NA if not found
    NES_mat <- matrix(NA_real_, nrow=length(gene_sets), ncol=length(tmp_tfs), dimnames=list(gene_sets, tmp_tfs))
    nEnr_mat <- matrix(0, nrow=length(gene_sets), ncol=length(tmp_tfs), dimnames=list(gene_sets, tmp_tfs))

    for (gene_list in gene_sets){
        for (tf in tmp_tfs){
            hits <- intersect(tf_rows[[tf]], set_rows[[gene_list]])
            if (length(hits)==0){
                next
            }
            # max NES approach
            NES_mat[gene_list, tf] <- max(tmp_results$NES[hits])

            # number of uniquely enriched genes per TF
            nEnr_mat[gene_list, tf] <- length(unique(unlist(strsplit(as.character(tmp_results$enrichedGenes[hits]), ';'))))
        }
    }

    # check.names=FALSE keeps TF symbols such as 'Nkx2-1' unchanged as column names
    NES_df <- data.frame(NES_mat, check.names=FALSE)
    nEnrGenes_df <- data.frame(nEnr_mat, check.names=FALSE)

    ### normalize nEnrGenes by total number of genes in cluster/group
    # load clustering results
    tmp_cl_results <- read.csv(file.path(data_path,'DEA',paste0('Clusters_HM_',k),'Genes.csv'))
    tmp_cl_results$X1 <- gene_annot[tmp_cl_results$rn, 'external_gene_name']

    # number of genes per cluster that are supported by the rankings, i.e. the
    # size of the gene list that was used in the TF analysis
    cluster_symbols <- split(tmp_cl_results$X1, tmp_cl_results$value)
    nGenes <- vapply(cluster_symbols, function(s) length(intersect(colnames(ranking_df), s)), integer(1))
    names(nGenes) <- paste0('cluster_', names(cluster_symbols))

    missing_sets <- setdiff(gene_sets, names(nGenes))
    if (length(missing_sets) > 0){
        stop('gene sets without matching cluster for k=', k, ': ', paste(missing_sets, collapse=', '), call.=FALSE)
    }

    # normalize; cluster sizes are looked up by name, because clusters without
    # any enriched motif are absent from the TF results
    nEnrGenes_df_norm <- data.frame(nEnr_mat / unname(nGenes[gene_sets]), check.names=FALSE)

    ### save results
    write.table(NES_df, paste0(out_prefix,'_summary_maxNES.csv'), sep=",", row.names=TRUE, quote=FALSE)
    write.table(nEnrGenes_df, paste0(out_prefix,'_summary_nEnrGenes.csv'), sep=",", row.names=TRUE, quote=FALSE)
    write.table(nEnrGenes_df_norm, paste0(out_prefix,'_summary_nEnrGenes_norm.csv'), sep=",", row.names=TRUE, quote=FALSE)

    ### plot summaries of top_n TFs as Heatmaps (rows: TFs, columns: clusters)
    # nEnrGenes
    nEnrGenes_df_norm <- data.frame(t(nEnrGenes_df_norm), check.names=FALSE)
    nEnrGenes_df_norm[is.na(nEnrGenes_df_norm)] <- 0
    top_tfs <- plot_top_tf_heatmap(as.matrix(nEnrGenes_df_norm), top_n,
                                   paste0(out_prefix,'_summary_nEnrGenes_norm_HM_top',top_n,'.png'))
    # NES
    NES_df <- data.frame(t(NES_df), check.names=FALSE)
    NES_df[is.na(NES_df)] <- 0
    top_tfs <- plot_top_tf_heatmap(as.matrix(NES_df), top_n,
                                   paste0(out_prefix,'_summary_maxNES_HM_top',top_n,'.png'))
}

## time-series cluster results

In [165]:
# config
# table of the chosen number of time-series clusters; rows are looked up by
# treatment and the 'INT' column is read below
timeseries_k <- read.csv(file.path('config','BMDM_timeseries_k.csv'), header=TRUE, row.names=1)

# for plotting
top_n <- 5

In [None]:
# motif enrichment of the time-series clusters, per treatment with its configured k
for (treatment in treatments){
    # there is no time-series clustering for the untreated samples
    if (treatment=='untreated'){
        next
    }

    # clustering result of this treatment: genes in column 'X1', cluster in column 'X2'
    ts_k <- timeseries_k[treatment, 'INT']
    tmp_results <- read.csv(file.path(data_path, "time_series", treatment, paste0('k_',ts_k), paste0('clustering_',treatment,'.csv')))

    # one vector of genes per cluster, named 'cluster_<X2>'
    gene_lists <- split(tmp_results$X1, tmp_results$X2)
    names(gene_lists) <- paste0('cluster_',names(gene_lists))

    # keep only genes supported by the rankings (no ID conversion in this step)
    gene_lists <- lapply(gene_lists, function(genes) intersect(colnames(ranking_df), genes))

    # run analysis
    motifEnrichmentTable_wGenes <- cisTarget(gene_lists,
                                             motifRankings,
                                             motifAnnot=motifAnnotations_mgi)

    # one result file per treatment
    write.table(motifEnrichmentTable_wGenes,
                file.path(results_path, paste0('INT_TF_time_series_',treatment,'_k',ts_k,'.csv')),
                quote=FALSE, sep=",", row.names=FALSE)

}

In [166]:
# Summary dataframes and heatmaps - for NES and number of uniquely enriched genes

# Split one TF_highConf entry into its TF symbols.
# Entries are expected to look like
#   "TfA; TfB (directAnnotation). TfC (inferredBy_Orthology). "
# Every "(label)." is removed on its own, so TFs listed between two labels are
# kept (a greedy "\\(.*\\)" would swallow them).
split_tf_annotation <- function(x) {
    genes <- gsub("\\s*\\([^)]*\\)\\.?\\s*", "; ", as.character(x))
    genes <- trimws(unlist(strsplit(genes, ";", fixed=TRUE)))
    unique(genes[!is.na(genes) & nzchar(genes)])
}

# Draw a heatmap of the union of the top_n rows (TFs) of every column (gene set)
# of `mat` into the PNG file `path`; returns the selected TFs invisibly.
plot_top_tf_heatmap <- function(mat, top_n, path) {
    top_tfs <- unique(unlist(lapply(seq_len(ncol(mat)), function(j) {
        # head() instead of [1:top_n]: no NA names if there are fewer TFs than top_n
        rownames(mat)[head(sort(unname(mat[, j]), decreasing=TRUE, index.return=TRUE)$ix, top_n)]
    })))
    # heatmap() needs at least 2 rows and 2 columns
    if (length(top_tfs) < 2 || ncol(mat) < 2) {
        warning('skipping heatmap (needs at least 2 TFs and 2 gene sets): ', path, call.=FALSE)
        return(invisible(top_tfs))
    }
    png(filename=path)
    on.exit(dev.off(), add=TRUE)  # close the device even if heatmap() fails
    heatmap(mat[top_tfs, , drop=FALSE])
    invisible(top_tfs)
}

for (treatment in treatments){
    if (treatment=='untreated'){
        next
    }

    # load clustering k config
    ts_k <- timeseries_k[treatment, 'INT']

    # common prefix of all files belonging to this treatment
    out_prefix <- file.path(results_path, paste0('INT_TF_time_series_',treatment,'_k',ts_k))

    # load TF analysis results
    tmp_results <- read.csv(paste0(out_prefix,'.csv'), sep=",")

    # determine list of unique high Confidence TFs in the results
    tf_tokens <- lapply(tmp_results$TF_highConf, split_tf_annotation)
    tmp_tfs <- unique(unlist(tf_tokens, use.names=FALSE))
    gene_sets <- unique(as.character(tmp_results$geneSet))

    # row indices of tmp_results per TF and per gene set; exact matches only, so
    # that 'cluster_1' does not pick up 'cluster_10' and TF 'Jun' not 'Junb'
    tf_rows <- split(rep(seq_along(tf_tokens), lengths(tf_tokens)), unlist(tf_tokens, use.names=FALSE))
    set_rows <- split(seq_len(nrow(tmp_results)), as.character(tmp_results$geneSet))

    # filter for geneSet and TF, and take max per TF and NA if not found
    NES_mat <- matrix(NA_real_, nrow=length(gene_sets), ncol=length(tmp_tfs), dimnames=list(gene_sets, tmp_tfs))
    nEnr_mat <- matrix(0, nrow=length(gene_sets), ncol=length(tmp_tfs), dimnames=list(gene_sets, tmp_tfs))

    for (gene_list in gene_sets){
        for (tf in tmp_tfs){
            hits <- intersect(tf_rows[[tf]], set_rows[[gene_list]])
            if (length(hits)==0){
                next
            }
            # max NES approach
            NES_mat[gene_list, tf] <- max(tmp_results$NES[hits])

            # number of uniquely enriched genes per TF
            nEnr_mat[gene_list, tf] <- length(unique(unlist(strsplit(as.character(tmp_results$enrichedGenes[hits]), ';'))))
        }
    }

    # check.names=FALSE keeps TF symbols such as 'Nkx2-1' unchanged as column names
    NES_df <- data.frame(NES_mat, check.names=FALSE)
    nEnrGenes_df <- data.frame(nEnr_mat, check.names=FALSE)

    ### normalize nEnrGenes by total number of genes in cluster/group
    # load clustering results
    tmp_cl_results <- read.delim(file=file.path(data_path, "time_series", treatment, paste0('k_',ts_k), paste0('clustering_',treatment,'.csv')), sep=',', header = TRUE)#, row.names=1)

    # number of genes per cluster that are supported by the rankings, i.e. the
    # size of the gene list that was used in the TF analysis
    cluster_symbols <- split(tmp_cl_results$X1, tmp_cl_results$X2)
    nGenes <- vapply(cluster_symbols, function(s) length(intersect(colnames(ranking_df), s)), integer(1))
    names(nGenes) <- paste0('cluster_', names(cluster_symbols))

    missing_sets <- setdiff(gene_sets, names(nGenes))
    if (length(missing_sets) > 0){
        stop('gene sets without matching cluster for ', treatment, ': ', paste(missing_sets, collapse=', '), call.=FALSE)
    }

    # normalize; cluster sizes are looked up by name, because clusters without
    # any enriched motif are absent from the TF results
    nEnrGenes_df_norm <- data.frame(nEnr_mat / unname(nGenes[gene_sets]), check.names=FALSE)

    ### save results
    write.table(NES_df, paste0(out_prefix,'_summary_maxNES.csv'), sep=",", row.names=TRUE, quote=FALSE)
    write.table(nEnrGenes_df, paste0(out_prefix,'_summary_nEnrGenes.csv'), sep=",", row.names=TRUE, quote=FALSE)
    write.table(nEnrGenes_df_norm, paste0(out_prefix,'_summary_nEnrGenes_norm.csv'), sep=",", row.names=TRUE, quote=FALSE)

    ### plot summaries of top_n TFs as Heatmaps (rows: TFs, columns: clusters)
    # nEnrGenes
    nEnrGenes_df_norm <- data.frame(t(nEnrGenes_df_norm), check.names=FALSE)
    nEnrGenes_df_norm[is.na(nEnrGenes_df_norm)] <- 0
    top_tfs <- plot_top_tf_heatmap(as.matrix(nEnrGenes_df_norm), top_n,
                                   paste0(out_prefix,'_summary_nEnrGenes_norm_HM_top',top_n,'.png'))
    # NES
    NES_df <- data.frame(t(NES_df), check.names=FALSE)
    NES_df[is.na(NES_df)] <- 0
    top_tfs <- plot_top_tf_heatmap(as.matrix(NES_df), top_n,
                                   paste0(out_prefix,'_summary_maxNES_HM_top',top_n,'.png'))
}