# Use ChromVAR analyze varability of chromatin accessibility

In [1]:
library(repr)
options(repr.plot.width=4, repr.plot.height=3)

In [2]:
library(tidyverse)
library(TFBSTools)

── [1mAttaching packages[22m ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.2 ──
[32m✔[39m [34mggplot2[39m 3.3.6     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.1.8     [32m✔[39m [34mdplyr  [39m 1.0.9
[32m✔[39m [34mtidyr  [39m 1.2.0     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 2.1.2     [32m✔[39m [34mforcats[39m 0.5.1
── [1mConflicts[22m ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()




In [3]:
library(chromVAR)
library(motifmatchr)
library(Matrix)
library(SummarizedExperiment)
library(BSgenome.Hsapiens.UCSC.hg38)
library(BiocParallel)
register(MulticoreParam(16))
set.seed(2019)


Attaching package: ‘Matrix’


The following object is masked from ‘package:TFBSTools’:

    Matrix


The following objects are masked from ‘package:tidyr’:

    expand, pack, unpack


Loading required package: MatrixGenerics

Loading required package: matrixStats


Attaching package: ‘matrixStats’


The following object is masked from ‘package:dplyr’:

    count



Attaching package: ‘MatrixGenerics’


The following objects are masked from ‘package:matrixStats’:

    colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
    colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
    colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
    colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
    colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
    colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
    colWeightedMeans, colWeightedMedians, colWeightedSds,
    colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
    rowC

In [4]:
library(tictoc)
library(naturalsort)
library(parallel)
library(WriteXLS)

In [5]:
setwd("/gpfs/commons/groups/sanjana_lab/cdai/NeuronReporters/code")

## Varability of Chromatin Accessibility
Use chromVAR and Jaspar 2020

**Inputs**
1. **Peaks**: Peaks are consensus peaks (min 2 overlap), centered at summit with uniform width of 500 bp as recommended by chromVAR. First 3 columns are standard bed, the 4th column is peak intensity as calculated from MACS2. Peak file: <br>
`/gpfs/commons/groups/sanjana_lab/cdai/TFscreen/atac/diffbind_consensu_min2overlap.bed`. <br>
 <br>
2. **Read count per peaks**: 
`/gpfs/commons/groups/sanjana_lab/cdai/TFscreen/atac/diffbind_consensu_min2overlap_readcounts.txt` <br>
See **`DiffBind-R.ipynb`** on how these the consensus peaks dataset is produced. <br>
<br>**Note:** read counts are consenus peak set, where a peak must appear in at least 2 dataset. Only peaks in `chr[0-9XY]+` are saved, meaning peaks that are in contig region or chrM are not kept in chromVAR analysis.<br><br>
2. **Annovation**
: Jaspar 2020 motifs

In [6]:
suppressMessages({samplesheet <- read_csv("../../TFscreen/atac/samplesheet2.csv")})

---

#### Support functions

In [7]:
# custom function to check if a motifname contains a hit gene name or TF gene name
# the reason is motifname sometimes contains "(var.2)" or other characters
mystrfunc <- function(motifname, target_match) {
    # motifname: scalar string value, name of the motif: eg. TFAP4(var.2)
    # target_match: vector, either Tf list of Hit list, or any gene list
    # returns T/F
    checkname <- function(x) {
        if (str_detect(motifname, paste0("^", x, "$"))) {
            check <- TRUE
        } else if (str_detect(motifname, paste0(":", x, "$"))) {
            check <- TRUE
        } else if (str_detect(motifname, paste0("^", x, ":"))) {
            check <- TRUE
        } else {
            check <- FALSE
        }
    }
    test <- map_lgl(target_match, ~ checkname(.x)) %>% any
    #test <- map_lgl(target_match, ~ if (str_detect(motifname, paste0("^", .x, "$"))) {TRUE} else if (str_detect(motifname, paste0(":", .x, "$"))) {TRUE} else if (str_detect(motifname, paste0("^", .x, ":"))) {TRUE} else {FALSE})
    #test <- map_lgl(target_match, ~ str_detect(string = motifname, pattern = paste0("^", .x, "$")) | str_detect(string = motifname, pattern = paste0(":", .x, "$")) | str_detect(string = motifname, pattern = paste0("^", .x, ":")))
    return(test)
}

In [8]:
getTargetsFromMotif <- function(motif_name, motif.matches, genomic.features, min.overlap = 10, genelist) {
    # Given a motif name (JASPAR), find its binding targets with read counts, observed in ATAC-seq
    #--------------- INPUT: -----------------
    # motif_name: string, this is the jaspar motif name, not gene name
    # motif.matches: matchMotif object, e.g. motifMatch() result of peaks and jaspar 2020 motifs
    # genomic.features: GenomicRanges object, e.g. a bed formated promoter region coordinates, converted into GRanges object
    # min.overlap: integer, minimum overlap in base pairs
    # genelist: vector, a list of gene_names. Note these gene names are some times not the same as motif names, hence the need of motif_lookup
    #--------------- OUTPUT: -----------------
    # overlapped.target.readcount: dataframe, containing read counts of genes that are targets of a given regulator. 
    # Read counts sum of all peaks that match to the regulator's motif.
    # -----------------------------------------
    
    # Get motif matches matrix (rows are peak coordianes, columns are each motif name, values are logical values indicating match)
    match.matrix <- motifMatches(motif.matches)
    
    # Get peaks that have binding sites matching given motif, each GRange also include read count columns
    GRanges.index <- which(match.matrix[ , motif_name]) 
    matched.target.GRanges <- rowRanges(motif.matches)[GRanges.index]
    
    # Find overlaps between: 
    # 1. peaks that match binding sites of a given motif, and 
    # 2. coordinates of gene annotation
    target.intersect.feature <- GenomicRanges::findOverlaps(query = matched.target.GRanges, 
                                                            subject = genomic.features,
                                                           minoverlap = min.overlap, ignore.strand = T)
    
    # Get the genes (from annoation) that overlap with peaks (that have binding sites of given motif)
    overlapped.gene_names <- genomic.features[subjectHits(target.intersect.feature), ] %>% as.data.frame %>% pull(gene_name) 
    
    # Summarize the read counts of matching peaks into read counts per gene for a given motif
    overlapped.target.readcount <- matched.target.GRanges[queryHits(target.intersect.feature), ] %>%
                                mcols %>% as.data.frame %>%
                                add_column("target_gene" = overlapped.gene_names) %>% # add gene_name to matched ranges
                                dplyr::select(target_gene,A1:A12) %>% # selecct read count columns only
                                filter(target_gene %in% genelist) %>% # keep rows with genes in the genelist
                                group_by(target_gene) %>% summarise_all(sum) # sum read counts per gene, as multiple peaks can fall into one gene
                                
    return(overlapped.target.readcount) # return a dataframe
}

In [9]:
getRegulatorsOfGene <- function(gene_name, motif.matches, genomic.features, min.overlap = 10, motif.list) {
    ###### INPUTS: #####
    # gene_name: string, gene name
    # motif.matches: matchMotif object, e.g. motifMatch() result of peaks and jaspar 2020 motifs
    # countGRanges: GenomicRanges object, e.g. a normalized read matrix, converted into GRanges object
    # genomic.features: GenomicRanges object, e.g. a bed formated promoter region coordinates, converted into GRanges object
    # min.overlap: integer, minimum overlap in base pairs
    # motif.list: vector, a list of motif names. Only return results if found motifs are part of this list
    
    ##### OUTPUTS: #####
    # regulators.readcount: list, each element's name is a regulator's motif name, the values are
    # summarised readcounts of matching/overlapping peaks for this regulator
    
    match.matrix <- motifMatches(motif.matches) # motif match logic matrix
    match.matrix.GRanges <- rowRanges(motif.matches) # peak coordiantes, including read counts in mcols
    
    # target gene's annotation coordinates
    gene.GRanges <- genomic.features[genomic.features$gene_name == gene_name]
    
    # intersect peak coordinates with target gene's annotation coordinates
    # to get peaks that belong to target gene
    match.intersect.gene <- GenomicRanges::findOverlaps(query = match.matrix.GRanges, subject = gene.GRanges,
                                                       minoverlap = min.overlap, ignore.strand = T)
    # row index of peaks that fall in target gene coordinates
    overlapped.match.index <- queryHits(match.intersect.gene) 
    
    # Once extracted peaks that fall into target gene region, get the motifs that bind to these peaks.
    # Note slight difference in extracting motif names when there are only 1 peak versus more than 1 peaks
    if (length(overlapped.match.index) == 1) { # if only one peak in target gene
        gene.regulators <- match.matrix[overlapped.match.index, ] %>% .[.] %>% names 
    } else { # if multiple peaks in target gene
        gene.regulators <- match.matrix[overlapped.match.index, ] %>% colSums %>% .[.>0] %>% names
    }
    
    # Only keep motifs that are part of a given list
    gene.regulators <- gene.regulators[gene.regulators %in% motif.list] 
    
    # Get the read counts of the binding sites for each motif (regulator)
    regulators.readcount <- list()
    for (regulator in gene.regulators) {
        readcount <- match.matrix.GRanges[overlapped.match.index, ] %>% # readcount of each peak in target gene region
                        as.data.frame %>% select(A1:A12) %>% # extract read counts columns
                        `*`(match.matrix[overlapped.match.index, regulator]) %>% # multiply 0 or 1 based on if the peak has a match to this regulator or not
                        colSums() # sum up all the reads from peaks that match to the regulator binding site
        regulators.readcount[[regulator]] <- readcount
    }
    
    # Return a dataframe, each row gives the name of the motif that binds to the target gene, 
    # along with observed read count assciated with the matching peaks. Some genes may have no matches
    if (length(regulators.readcount) > 0) {
        regulators.readcount <- do.call(rbind, regulators.readcount) %>% 
                                as.data.frame %>% 
                                add_column("target_gene" = gene_name, .before = "A1") %>%
                                rownames_to_column("regulator_motif")
        return(regulators.readcount)
    }
}

---

### Why did you decide to not make each peak width 500?

For ATAC seq, the peak width reflects open chromatin. Naturally, peak width will vary. More open area would have wider peaks. Our next analysis is to look for TF binding sites in the open chromatin region, we would have erroneously lost (when actual peak width is wider than 500) or gained (when actual peak width is less than 500) some TF binding sites if we force consistent peak width.


In [10]:
# Consensus peaks
peak.file <- "/c/groups/sanjana_lab/cdai/TFscreen/atac/diffbind/diffbind_consensu_min2overlap.bed"
peaks <- getPeaks(peak.file, sort_peaks = F)

“Peaks are not equal width!Use resize(peaks, width = x, fix = "center") to make peaks equal in size, where x is the desired size of the peaks)”
“Peaks not sorted”


In [11]:
# Consensus peaks read counts
raw_counts <- read.table("/c/groups/sanjana_lab/cdai/TFscreen/atac/diffbind/diffbind_consensu_min2overlap_NO_RECENTER.txt", 
                               header = T, stringsAsFactor = F)

In [12]:
# each peak must> 15 reads across all samples
keep_rows <- which(rowSums(select(raw_counts, A1:A12)) > 15 ) # changed to 50 in rstudio version 6/10/2020, doesn't change anything though 

In [13]:
raw_counts <- raw_counts[keep_rows, ]

In [14]:
# peak coordinates
peaks <- raw_counts %>% select(Chr, Start, End)

In [15]:
raw_peak_counts <- makeGRangesFromDataFrame(raw_counts, keep.extra.columns = T)

#### Annotations

Get JASPAR2020 motifs. Then construct a motif - gene_name lookup table. In addition, add columns to indicate if a motif is a TF or hit TF.

In [16]:
# New method: use downloaded JASPAR2020
jaspar_2020 <- readJASPARMatrix("../../TFscreen/atac/JASPAR2020_combined_matrices_20191030.txt", matrixClass = "PFM")

# read in hit list and TF list, note the gene names are matched and transformed to be consistent with approved symbols according to HGNC genenames.org
# hit list from crispr screen
hitlist <- read.csv('/c/groups/sanjana_lab/cdai/TFscreen/Hitlist_20191230.csv', stringsAsFactors=F) %>% pull(hgnc_symbol)
# genomewide TF list
tflist <- read.csv('/c/groups/sanjana_lab/cdai/TFscreen/TFlist_20191230.csv', stringsAsFactors=F) %>% pull(hgnc_symbol)

# construct base motif_lookup table
motif_lookup <- TFBSTools::name(jaspar_2020) %>% 
                data.frame(stringsAsFactors = F) %>% 
                rownames_to_column("motif") %>% 
                mutate(gene_name=str_extract(`.`, "[a-zA-Z:0-9\\-]+")) %>%
                select(motif, gene_name)

# add two columns to indicate whether the motif is a TF or is a Hit
motif_lookup <- motif_lookup %>% mutate(is_hit = map_chr(gene_name, ~ if_else(mystrfunc(.x, hitlist), "Yes", "No"))) %>%
        mutate(is_tf = map_chr(gene_name, ~ if_else(mystrfunc(.x, tflist), "Yes", "No")))
# motif names of (almost) all TFs
tf.motif.list <- motif_lookup %>% filter(is_tf == "Yes") %>% pull(motif)

In [17]:
# get motif matches with peaks, (takes about 97 seconds to run)
Jaspar_ix <- matchMotifs(jaspar_2020, raw_peak_counts, 
                        genome = BSgenome.Hsapiens.UCSC.hg38,
                        out = "matches", p.cutoff = 5e-5)

In [18]:
Jaspar_matches <- motifMatches(Jaspar_ix)

In [19]:
raw_peak_counts[, 1:2] %>% head

GRanges object with 6 ranges and 2 metadata columns:
      seqnames        ranges strand |        A1        A2
         <Rle>     <IRanges>  <Rle> | <integer> <integer>
  [1]     chr1 629565-630002      * |     31494     24404
  [2]     chr1 633829-634392      * |     47145     33130
  [3]     chr1 778398-779290      * |       110        92
  [4]     chr1 826900-827802      * |        70        57
  [5]     chr1 869615-870166      * |        37        42
  [6]     chr1 876554-876951      * |        26        22
  -------
  seqinfo: 24 sequences from an unspecified genome; no seqlengths

In [20]:
Jaspar_matches[1:5, 1:5]

5 x 5 sparse Matrix of class "lgCMatrix"
     THRB THAP11 TFAP4(var.2) TFAP2E TGIF2LY
[1,]    .      .            .      .       .
[2,]    .      .            .      .       .
[3,]    .      .            .      .       .
[4,]    .      .            .      .       .
[5,]    |      |            .      .       .

In [21]:
# for each jaspar motif in motif_lookup, get its respective read counts observed in atac-seq data
Jaspar_counts <- map(motif_lookup$motif, ~ raw_peak_counts[which(Jaspar_matches[, .x]), ]) %>% 
    map(~mcols(.x) %>% as.matrix %>% colSums)

In [22]:
# convert list into data frame
Jaspar_counts <- do.call(rbind, Jaspar_counts) %>% as.data.frame %>% 
        add_column(gene_name = motif_lookup$gene_name, .before = "A1") %>%
        group_by(gene_name) %>% summarise_all(mean)

In [23]:
Jaspar_counts %>% column_to_rownames("gene_name") %>% scale %>% head

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12
ALX3,-0.4807611,-0.4922085,-0.4836909,-0.4823163,-0.4950695,-0.4898261,-0.4762696,-0.4869891,-0.4770646,-0.4872151,-0.4435823,-0.4509154
AR,0.158029,0.1715849,0.1833,0.1704066,0.1537694,0.1682365,0.1626164,0.1388816,0.1614267,0.1620886,0.1059482,0.1011546
ARGFX,-0.5245539,-0.5305741,-0.5236492,-0.5127966,-0.539152,-0.5337648,-0.5004463,-0.5141496,-0.5059872,-0.5095521,-0.4662876,-0.4723474
ARNT::HIF1A,-0.77202,-0.764786,-0.7729967,-0.7744975,-0.7630934,-0.773969,-0.7970579,-0.7891908,-0.7930368,-0.7884161,-0.7745124,-0.7691255
ARNT2,-0.5977327,-0.5938041,-0.6056408,-0.5981779,-0.58646,-0.5990379,-0.6191276,-0.5977703,-0.6036864,-0.5943136,-0.5526561,-0.5434807
ASCL1,1.1307654,1.1563968,1.1812053,1.1492826,1.1566828,1.1610696,1.1175597,1.0743644,1.1055702,1.1070546,1.0572784,1.0595204


In [24]:
# get annotated promoter region bed
# note the latest protein coding genes are updated with gene names and gene_id to be consistent with hgnc
gene_region <- read.table("resources/annotations/hs38/gencode_v31_protein_u100k_d1k.bed",
                          header = T, 
                          col.names = c("seqname","start","end","gene_name", "score","strand")) %>% 
    dplyr::select(seqname, start, end, strand, gene_name)
# convert to GRanges object
gene_region <- GenomicRanges::makeGRangesFromDataFrame(gene_region, keep.extra.columns = T)

In [25]:
gene_region

GRanges object with 19943 ranges and 1 metadata column:
          seqnames            ranges strand |   gene_name
             <Rle>         <IRanges>  <Rle> | <character>
      [1]     chr1     449703-551697      - |      OR4F29
      [2]     chr1     684679-786673      - |      OR4F16
      [3]     chr1     823928-945581      + |      SAMD11
      [4]     chr1     860584-966719      + |      KLHL17
      [5]     chr1     866497-976865      + |     PLEKHN1
      ...      ...               ...    ... .         ...
  [19939]     chrY 24507560-24640207      + |       BPY2B
  [19940]     chrY 24733843-24908040      + |        DAZ4
  [19941]     chrY 24762069-24913492      - |        DAZ3
  [19942]     chrY 25029901-25162548      - |       BPY2C
  [19943]     chrY 25522162-25625902      + |        CDY1
  -------
  seqinfo: 24 sequences from an unspecified genome; no seqlengths

---

### Limit search within hit TFs

#### 1. Out of 58 hit TF motifs, find their targets.

Note some motif names such as `BHLHA15(var.2), FOS::JUNB`

In [26]:
hitlist %>% sort %>% print

  [1] "AFF4"     "AHR"      "AIP"      "APBB2"    "ARID1B"   "ARNTL"   
  [7] "ASB1"     "BHLHA15"  "BRWD1"    "CBX4"     "CDX4"     "CEBPA"   
 [13] "CHD5"     "CNOT6"    "CREBZF"   "CRY1"     "CTDP1"    "CTNNB1"  
 [19] "DLX5"     "E2F1"     "EBF1"     "EGR3"     "EHF"      "ELK3"    
 [25] "EMX1"     "FOXN2"    "FOXQ1"    "GATA1"    "GTF2A2"   "H1-4"    
 [31] "H2BC4"    "HIF3A"    "HOXB3"    "HOXC13"   "HSFY1"    "HTATIP2" 
 [37] "INSM1"    "ISL2"     "JADE3"    "JUNB"     "JUND"     "KLF10"   
 [43] "KLF14"    "LHX4"     "MECP2"    "MEOX2"    "MLLT6"    "MPHOSPH8"
 [49] "NBPF24"   "NEUROG1"  "NEUROG2"  "NEUROG3"  "NFAT5"    "NFKBIB"  
 [55] "NKRF"     "NKX3-2"   "NR1D2"    "NR1I2"    "PAX2"     "PAX7"    
 [61] "PHF12"    "PHOX2A"   "POLR2B"   "POLR2J"   "POU2F3"   "POU3F2"  
 [67] "PRDM5"    "PRDM6"    "RAI14"    "RBPJ"     "RCOR2"    "RORB"    
 [73] "SCRT2"    "SIX5"     "SKI"      "SLA2"     "SMAD3"    "SNAI2"   
 [79] "SOX15"    "TAF1L"    "TAF5"     "TBPL1"    "TCEA2"    "TF

#### 2. Out of 120 hit TFs, find their regulators.

#### 1. & 2. write out result

---

### Limit search within TFs (~ 1800 TFs)

### 3. Out of all TF motifs ( ~ 578), find their targets.

In [27]:
# get a list of hit motif names
tf_motifs <- filter(motif_lookup, is_tf == "Yes") %>% pull(motif) %>% sort
names(tf_motifs) <- tf_motifs

In [28]:
tic()

In [29]:
# Given a list of motifs, find their binding target genes
reg2tar_tf <- imap_dfr(tf_motifs, ~ getTargetsFromMotif(.x, Jaspar_ix, gene_region, 100, tflist) %>% 
                            add_column("regulator_motif" = .y, .before = "target_gene"))

# Get gene name from motif name and summarize read counts per interaction (regulator - target combinations)
reg2tar_tf <- left_join(reg2tar_tf, motif_lookup[, 1:2], by = c("regulator_motif" = "motif")) %>% 
                        rename("gene_name" = "regulator_gene") %>%
                        select(regulator_gene, target_gene, A1:A12) %>% 
                        group_by(regulator_gene, target_gene) %>% 
                        summarise_all(sum)

# Combine replicate 1 and replicate 2 by taking the mean
# Result in just ES and 5 time points
rep1 <- paste0(rep("A", 6), seq(1, 11, 2))
rep2 <- paste0(rep("A", 6), seq(2, 12, 2))
reg2tar_tf <- map2_df(reg2tar_tf[, rep1], reg2tar_tf[, rep2], ~ map2_dbl(.x, .y, ~ mean(c(.x, .y))) ) %>% 
    add_column("regulator_gene" = reg2tar_tf$regulator_gene, "target_gene" = reg2tar_tf$target_gene, .before = "A1")
names(reg2tar_tf) <- c("regulator_gene", "target_gene", "ES", "H1", "H4", "H16", "D1", "D4")
# Ungroup
reg2tar_tf <- ungroup(reg2tar_tf)
# Round digits to integers
reg2tar_tf =  mutate_at(reg2tar_tf, .vars = c("ES", "H1", "H4", "H16", "D1", "D4"), .funs = list(round))

In [30]:
toc()

145.997 sec elapsed


### 4. Out of ~ 1800 TFs, find their regulators. (time consuming! 30 min with 16 cores)

In [31]:
tflist %>% str

 chr [1:1889] "ABL1" "ABT1" "ABTB1" "ADNP" "ADNP2" "AEBP1" "AEBP2" "AFF1" ...


In [32]:
tic()

In [33]:
# given a list of gene, find the regulators of each gene
tar2reg_tf <- mclapply(tflist, function(x) getRegulatorsOfGene(x, Jaspar_ix, gene_region, 100, tf.motif.list), mc.cores = 16)

# remove null elements
tar2reg_tf <- map_lgl(tar2reg_tf, ~ ! is_null(.x)) %>% which %>% tar2reg_tf[.]
# combine into one dataframe
tar2reg_tf <- do.call(rbind, tar2reg_tf)
                       
# combine replicate 1 and replicate 2 by taking the mean
# Result in just ES and 5 time points
rep1 <- paste0(rep("A", 6), seq(1, 11, 2))
rep2 <- paste0(rep("A", 6), seq(2, 12, 2))
                       
tar2reg_tf <- map2_df(tar2reg_tf[, rep1], tar2reg_tf[, rep2], ~ map2_dbl(.x, .y, ~ mean(c(.x, .y)))) %>% 
    add_column("regulator_motif" = tar2reg_tf$regulator_motif, "target_gene" = tar2reg_tf$target_gene, .before = "A1")
names(tar2reg_tf) <- c("regulator_motif", "target_gene", "ES", "H1", "H4", "H16", "D1", "D4")

# replace motif names with gene names
tar2reg_tf <- left_join(tar2reg_tf, motif_lookup[, 1:2], by = c("regulator_motif" = "motif")) %>% 
                select(gene_name, target_gene: D4) %>%
                group_by(gene_name, target_gene) %>% summarise_all(sum) %>%
                rename("gene_name" = "regulator_gene")
tar2reg_tf <- ungroup(tar2reg_tf)

In [34]:
tar2reg_tf = mutate_at(tar2reg_tf, c("ES", "H1", "H4", "H16", "D1", "D4"), list(round))

In [35]:
toc()

2770.224 sec elapsed


In [36]:
reg2tar_tf %>% dim
tar2reg_tf %>% dim

In [70]:
reg2tar_tf %>% dim
tar2reg_tf %>% dim

### 3. & 4. write out result

In [73]:
getwd()

### 5. Combine reg2tar and tar2reg dataset?

Identical interactions such as `NEUROG2 -> ZBTB18` should have the same number of read counts in either dataset.

Code cell below checks if identical interactions in reg2tar_tf (reg2tar) and tar2reg_tf(tar2reg) have the same read counts, in general they do. For some reason, there are discrepances in the `*_tf` datasets. But these are mostlly `JUNB:FOS`, which will be filtered out in the end. So not a concern. 

##### merge reg2tar and tar2reg dataset, at this point, we an use either the hit dataset or the all TF dataset

In [37]:
# Concatenate regulator and target, used as key
reg2tar_tf <- mutate(reg2tar_tf, "interaction" = paste0(regulator_gene, "_", target_gene))
tar2reg_tf <- mutate(tar2reg_tf, "interaction" = paste0(regulator_gene, "_", target_gene))

keys1 <- paste0(reg2tar_tf$regulator_gene, "_", reg2tar_tf$target_gene)
keys2 <- paste0(tar2reg_tf$regulator_gene, "_", tar2reg_tf$target_gene)

common_keys <- intersect(keys1, keys2) # interactions shared between 2 datasets
keys1 <- setdiff(keys1, common_keys) # interactions in reg2tar only
keys2 <- setdiff(keys2, common_keys) # interactions in tar2reg only

# Combine the two datasets such that overlapping interactions are only stored once
interactions <- rbind(filter(reg2tar_tf, interaction %in% c(keys1, common_keys)), filter(tar2reg_tf, interaction %in% c(keys2)))

---

### 6. Integrate interaction ATAC read counts with target gene expression, calculate correlations.

Comparing time points are: `ES, H16, D1, D4` *(H16 is H15 in RNA-seq data)*

In [39]:
#### Read in gene expression raw read counts, select only interested time poins, normalize, 
#### then create a nested dataframe making it ready for correlation calculation.
#### Finally calculate correlations

library(readxl)

dge.file <- "../../TFscreen//RNA-seq_timepoint_deseq_result_20200102.xlsx"
# also sheet name for raw readcounts "rawCounts"
sheets <- c("H15_vs_ES", "D1_vs_ES", "D4_vs_ES")

dge <- map(sheets, ~ read_excel(dge.file, .x) %>% select(gene_id, gene_name, log2FoldChange, pvalue, padj))
names(dge) <- factor(c("H16", "D1", "D4"), levels = c("H16", "D1", "D4"))
dge <- map(dge, ~ drop_na(.x, gene_name)) # remove rows with no gene name

In [40]:
dge.rawCounts <- read_excel(dge.file, "rawCounts")

suppressMessages(rna_sample_anno <- read_csv('../../TFscreen/RNASeqSampleNames.csv'))

rna_sample <- c("S01_B1", "S01_B2", "S02_B1", "S02_B2", "S15_B1", "S15_B2", "S16_B1", "S16_B2", 
                "S17_B1", "S17_B2", "S18_B1", "S18_B2", "S07_B1", "S07_B2", "S08_B1", "S08_B2")

In [41]:
# Check rna samples (not needed for analysis)
rna_sample_anno %>% filter(sample %in% rna_sample) %>% select(sample, group) %>% t

0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
sample,S01_B1,S01_B2,S02_B1,S02_B2,S07_B1,S07_B2,S08_B1,S08_B2,S15_B1,S15_B2,S16_B1,S16_B2,S17_B1,S17_B2,S18_B1,S18_B2
group,ES,ES,ES,ES,D4,D4,D4,D4,H15,H15,H15,H15,D1,D1,D1,D1


In [42]:
# Normalize read counts: first calc reads per million then take sqrt
dge.normCounts <- select(dge.rawCounts, - gene_id, - gene_name) %>% apply(2, function(x) sqrt(x * 1e6 / sum(x)) ) %>% as.data.frame

# replicate names
rna.rep1 <- c("S01_B1", "S15_B1", "S17_B1", "S07_B1")
rna.rep2 <- c("S01_B2", "S15_B2", "S17_B2", "S07_B2")
rna.rep3 <- c("S02_B1", "S16_B1", "S18_B1", "S08_B1")
rna.rep4 <- c("S02_B2", "S16_B2", "S18_B2", "S08_B2")

# combine 4 replicates into 1, using average
dge.normCounts <- pmap_df(list(dge.normCounts[, rna.rep1], dge.normCounts[, rna.rep2], dge.normCounts[, rna.rep3], dge.normCounts[, rna.rep4]), 
                            ~ pmap_dbl(list(..1, ..2, ..3, ..4), ~ mean(c(..1, ..2, ..3, ..4)))) %>%
                        add_column("gene_id" = dge.rawCounts$gene_id, "gene_name" = dge.rawCounts$gene_name, .before= "S01_B1")
names(dge.normCounts) <- c("gene_id", "gene_name", "ES", "H16", "D1", "D4")

# some gene_ids correspond to multiple or NA gene names, remove these ~ 211 gene ids
remove.gene.idx <- dge.normCounts[, 1:2] %>% group_by(gene_name) %>% tally %>% filter(n > 1) %>% pull(gene_name)
remove.gene.id <- dge.normCounts %>% filter(gene_name %in% remove.gene.idx) %>% pull(gene_id)
dge.normCounts <- filter(dge.normCounts, ! gene_id %in% remove.gene.id)

# Create nested dataframe with each time point expression values in a list
dge.byGene <- select(dge.normCounts, - gene_id) %>% group_by(gene_name) %>% nest(tar.ge = c(ES, H16, D1, D4))

# do the same for raw counts (useful later for regulator count filters)
dge.rawCounts <- pmap_df(list(dge.rawCounts[, rna.rep1], dge.rawCounts[, rna.rep2], dge.rawCounts[, rna.rep3], dge.rawCounts[, rna.rep4]), 
                            ~ pmap_dbl(list(..1, ..2, ..3, ..4), ~ mean(c(..1, ..2, ..3, ..4)))) %>%
                        add_column("gene_id" = dge.rawCounts$gene_id, "gene_name" = dge.rawCounts$gene_name, .before= "S01_B1")
names(dge.rawCounts) <- c("gene_id", "gene_name", "ES", "H16", "D1", "D4")

dge.rawCounts <- filter(dge.rawCounts, ! gene_id %in% remove.gene.id) %>%
                        select( - gene_id) %>% mutate_at(c("ES", "H16", "D1", "D4"), ~ as.integer(.)) %>% nest(reg.cnt = c(ES, H16, D1, D4))

In [43]:
# Get read counts of ATAC-seq for interactions at intersted time points, store in nested dataframe
interactions <- as.data.frame(interactions) %>% select(-H1, -H4, -interaction) %>% group_by(regulator_gene, target_gene) %>%
            nest(atac = c(ES, H16, D1, D4))

# Combine ATAC-seq and RNA-seq data of selected interactions at time points.
atac_dge <- inner_join(interactions, dge.byGene, by = c("target_gene" = "gene_name"))

# Run correlation test on ATAC-seq read counts and RNA-seq read counts
suppressWarnings(cor_test <- map2(atac_dge$atac, atac_dge$tar.ge, ~ cor.test(unlist(.x), unlist(.y), method = "p")))

# Extract pearson correlation and p-values and store along with read counts for each interactions
inter_corr <- add_column(atac_dge, "corr" = map_dbl(cor_test, ~.x$estimate)) %>%
            add_column("cor_p" = map_dbl(cor_test, ~.x$p.value))

In [44]:
inter_corr %>% filter(cor_p < .05 & abs(corr) > .6 & str_detect(regulator_gene, "NEUROG2")) %>% head

regulator_gene,target_gene,atac,tar.ge,corr,cor_p
<chr>,<chr>,<list>,<list>,<dbl>,<dbl>
NEUROG2,AEBP1,"132, 110, 200, 96","3.164558, 2.780241, 4.209366, 2.022166",0.9674061,0.03259385
NEUROG2,ANKZF1,"200, 262, 454, 360","5.865387, 5.536853, 4.507372, 4.831147",-0.9902439,0.00975615
NEUROG2,ASB6,"424, 453, 786, 836","5.818059, 6.609648, 8.058946, 9.049867",0.9668239,0.033176118
NEUROG2,ASCC1,"232, 244, 534, 264","4.182794, 4.271233, 2.227073, 3.979258",-0.9968055,0.003194537
NEUROG2,BARX2,"171, 188, 384, 374","0.1054361, 0.3021371, 0.6228801, 0.6747453",0.9643166,0.035683447
NEUROG2,BTF3,"110, 121, 169, 265","24.83077, 25.67861, 24.19530, 21.96138",-0.9595156,0.04048444


In [45]:
dim(inter_corr)

### 7. Integrate target gene's DESeq2 differential experession `FC` and `FDR` at (`H16, D1, D4`) against `ES`.

In [46]:
# Remove gene_ids, for the same reason as above for dge.normCounts
# dge stores DESeq2 differential gene expression analysis results
dge <- map(dge, ~ filter(.x, ! gene_id %in% remove.gene.id))

# Extract Log2FoldChange and FDR (padj)
l2fc <- data.frame("gene_name" = dge$H16$gene_name,
                    "H16" = dge$H16$log2FoldChange, 
                   "D1" = dge$D1$log2FoldChange, 
                   "D4" = dge$D4$log2FoldChange,
                    stringsAsFactors = F) %>%
            nest(l2fc = c(H16, D1, D4))
fdr <- data.frame("gene_name" = dge$H16$gene_name,
                    "H16" = dge$H16$padj, 
                  "D1" = dge$D1$padj, 
                  "D4" = dge$D4$padj,
                    stringsAsFactors = F) %>%
            nest(fdr = c(H16, D1, D4))
fc_fdr <- cbind(l2fc, fdr[,2])

# Interaction matrix that include previous 
inter_matrix <- inner_join(inter_corr, fc_fdr, by = c("target_gene" = "gene_name"))

In [47]:
head(inter_matrix)
dim(inter_matrix)

regulator_gene,target_gene,atac,tar.ge,corr,cor_p,l2fc,fdr
<chr>,<chr>,<list>,<list>,<dbl>,<dbl>,<list>,<list>
ALX3,ABL1,"58, 55, 87, 70","5.173194, 6.529207, 5.723459, 8.246673",0.02506234,0.9749377,"0.6320782, 0.2170690, 1.1849099","0.262886079, 0.740634623, 0.001499175"
ALX3,ABT1,"30, 35, 38, 12","7.872756, 7.348605, 7.966006, 7.344911",0.55294242,0.4470576,"0.01694428, 0.17787550, -0.13382753","0.9942557, 0.7615942, 0.7876390"
ALX3,ABTB1,"284, 242, 415, 354","1.593805, 1.140030, 0.914218, 2.234686",-0.04142582,0.9585742,"-0.2050566, -0.6785848, 1.1371586","0.9588951, 0.7137269, 0.3841130"
ALX3,ADNP,"120, 96, 201, 208","10.49540, 10.76583, 10.66034, 12.07038",0.59044179,0.4095582,"0.08503511, 0.01793032, 0.32433211","0.9030131, 0.9736239, 0.2440713"
ALX3,AFF1,"190, 208, 274, 64","6.782081, 7.869602, 5.009183, 5.381708",0.09976717,0.9002328,"0.05826006, -0.82964306, -1.14271842","0.9831020, 0.3664824, 0.1072623"
ALX3,AFF4,"288, 340, 612, 549","9.71868, 10.85117, 11.69826, 14.29276",0.72926826,0.2707317,"0.2163915, 0.4003539, 0.9217811","0.76682514, 0.38390812, 0.00364119"


### 8. Integrate regulator gene expression

##### Note: read counts for regulator gene expression is raw read counts, while for target gene expression is sqrt(reads per million) normalized, because we need to raw read counts to filter regulator while we used normalized reads of target to calculate correlation with ATAC seq read counts.

In [48]:
tic()

In [49]:
# Add regulator gene expression
inter_matrix <- inner_join(inter_matrix, dge.rawCounts, by = c("regulator_gene" = "gene_name"))

In [50]:
# Add explanation for ouput columns
column_explanation <- tribble(~name, ~notes,
       "regulator_gene", "regulator gene name",
       "target_gene","target gene name",
       "atac", "ATAC-seq observed read counts of regulator-target interaction, normalized log2(reads per million)",
       "tar.ge", "gene expression read counts of target gene (normalized to sqrt(reads per million))",
       "corr", "Pearson correlation between atac and tar.ge",
       "cor_p", "p value of correlation", 
       "l2fc", "log2 fold change of target gene expression, all against ES",
       "fdr", "fdr or adjusted p value of target gene being differentially expressed, all against ES",
       "reg.ge", "gene expression read counts of regulator gene (normalized to sqrt(reads per million))",
       "ES, H16, D1, D4", "conditions: ES, 16 Hour, 1 Day, and 4 Day")

In [51]:
toc()

0.882 sec elapsed


In [52]:
inter_matrix %>% head

regulator_gene,target_gene,atac,tar.ge,corr,cor_p,l2fc,fdr,reg.cnt
<chr>,<chr>,<list>,<list>,<dbl>,<dbl>,<list>,<list>,<list>
AR,ABTB1,"284, 242, 415, 354","1.593805, 1.140030, 0.914218, 2.234686",-0.04142582,0.95857418,"-0.2050566, -0.6785848, 1.1371586","0.9588951, 0.7137269, 0.3841130","68, 47, 46, 7"
AR,ADNP,"163, 152, 246, 164","10.49540, 10.76583, 10.66034, 12.07038",-0.25835573,0.74164427,"0.08503511, 0.01793032, 0.32433211","0.9030131, 0.9736239, 0.2440713","68, 47, 46, 7"
AR,ADNP2,"102, 94, 154, 170","9.098495, 9.291882, 11.298799, 5.948209",-0.32093994,0.67906006,"-0.1498182, 0.3723243, -1.5241753","8.564070e-01, 4.118791e-01, 2.430913e-07","68, 47, 46, 7"
AR,AEBP2,"50, 34, 60, 31","8.107459, 6.034633, 5.254064, 4.774380",0.2707649,0.7292351,"-0.7205395, -0.9770126, -1.4987059","0.31629824, 0.09203261, 0.00122416","68, 47, 46, 7"
AR,AFF1,"74, 55, 89, 34","6.782081, 7.869602, 5.009183, 5.381708",-0.15078816,0.84921184,"0.05826006, -0.82964306, -1.14271842","0.9831020, 0.3664824, 0.1072623","68, 47, 46, 7"
AR,AFF3,"136, 150, 242, 348","0.8340168, 2.1015446, 2.5902853, 8.4704284",0.93941698,0.06058302,"1.489448, 2.391791, 5.331861","1.779774e-01, 5.448725e-03, 2.535322e-14","68, 47, 46, 7"


In [53]:
inter_matrix %>% head() %>% unnest( cols = c(atac, tar.ge, fdr, l2fc, reg.cnt), names_sep = ".") 

regulator_gene,target_gene,atac.ES,atac.H16,atac.D1,atac.D4,tar.ge.ES,tar.ge.H16,tar.ge.D1,tar.ge.D4,⋯,l2fc.H16,l2fc.D1,l2fc.D4,fdr.H16,fdr.D1,fdr.D4,reg.cnt.ES,reg.cnt.H16,reg.cnt.D1,reg.cnt.D4
<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<int>,<int>,<int>,<int>
AR,ABTB1,284,242,415,354,1.5938046,1.14003,0.914218,2.234686,⋯,-0.2050566,-0.67858479,1.1371586,0.9588951,0.713726938,0.384113,68,47,46,7
AR,ADNP,163,152,246,164,10.4953998,10.765826,10.660336,12.070383,⋯,0.08503511,0.01793032,0.3243321,0.9030131,0.973623901,0.2440713,68,47,46,7
AR,ADNP2,102,94,154,170,9.0984955,9.291882,11.298799,5.948209,⋯,-0.14981816,0.37232433,-1.5241753,0.856407,0.411879075,2.430913e-07,68,47,46,7
AR,AEBP2,50,34,60,31,8.1074589,6.034633,5.254064,4.77438,⋯,-0.72053945,-0.97701259,-1.4987059,0.3162982,0.092032606,0.00122416,68,47,46,7
AR,AFF1,74,55,89,34,6.7820813,7.869602,5.009183,5.381708,⋯,0.05826006,-0.82964306,-1.1427184,0.983102,0.366482409,0.1072623,68,47,46,7
AR,AFF3,136,150,242,348,0.8340168,2.101545,2.590285,8.470428,⋯,1.48944816,2.39179062,5.3318608,0.1779774,0.005448725,2.535322e-14,68,47,46,7


In [54]:
tic()

In [55]:
# Unnest interaction matrix dataframe to expand the columns, ready for csv write out
inter_matrix_unnested <- unnest(inter_matrix, cols = c(atac, tar.ge, fdr, l2fc, reg.cnt), names_sep = ".")

In [56]:
toc()

16.195 sec elapsed


In [58]:
tic()
WriteXLS(list("interactions" = inter_matrix_unnested, "notes" = column_explanation), 
         "results/notebook_results/Interaction_matrix_nofilter_AllTF-intergenic-raw_20220919.xlsx")
toc()

412.349 sec elapsed


In [59]:
tic()
write.table(inter_matrix_unnested, 
            "results/notebook_results/Interaction_matrix_nofilter_AllTF-intergenic-raw_20220919.csv", 
            sep=",", quote=F, row.names=F, col.names=T)
toc()

11.29 sec elapsed


### 9. Analyze / filter interaction matrix

# Approach 1: NGN initiated TF networks. 

- NGN1/2 start at the top
- Level 1: targets are significantly differentially expressed at H16, includes:
    - targets directly regulated by NGN1/2 (Group A)
- Level 2: targes are significantly differentially expressed at D1, includes:
    - targets directly regulated by NGN1/2 at D1 (Group B)
    - or targets regulated by NGN1/2's level 1 objects (Grouop A1)
- Level 3: targets are significantly differentially expressed at D4, includes:
    - targets directly regulated by NGN1/2 at D4 (Group C)
    - or targets regulated by NGN1/2's level 1 objects (Group A2)
    - or targets regulated by NGN1/2's level 2 objects (Group B1)
    - or targets regulated by other level 2 objects, not directly regulated by NGN1/2 (Group A1B1)

## 1. Find NGN -> targets in: 

- GroupA: interactions started with H16
- GroupB: interactions started with D1
- GroupC: interactions started with D4

In [260]:
# add regulator total read count across H16 - D4, used for filtering regulators
reg.cnt.sum <- apply(select(inter_matrix_unnested, reg.cnt.H16:reg.cnt.D4), 1, sum) # regulator H16 - D4 read sum
base_set <- mutate(inter_matrix_unnested, "reg.cnt.sum" = reg.cnt.sum)

### Group A: NGN initiated regulations starting from H16

Criteria:<br>
`abs(corr) > 0.55` & <br> `corr. p-val < 0.2` & <br> `Target differentially expressed at H16, fdr.H16 < 0.05` & <br> `Target expression change abs(l2fc.H16) > 1` & <br> `reg.cnt.sum > 30`

In [261]:
base_set %>% head(2)

regulator_gene,target_gene,atac.ES,atac.H16,atac.D1,atac.D4,tar.ge.ES,tar.ge.H16,tar.ge.D1,tar.ge.D4,⋯,tar.cnt.D4,reg.l2fc.H16,reg.l2fc.D1,reg.l2fc.D4,reg.fdr.H16,reg.fdr.D1,reg.fdr.D4,reg.isHit,tar.isHit,reg.cnt.sum
<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<lgl>,<lgl>,<dbl>
AR,ABTB1,6.585393,6.045427,6.030358,6.084846,1.593805,1.14003,0.914218,2.234686,⋯,27,-0.507919,-0.4858691,-3.236964,0.7647186,0.6979496,3.529782e-05,False,False,100
AR,ADNP2,3.945854,3.821774,3.67717,4.218503,9.098495,9.291882,11.298799,5.948209,⋯,180,-0.507919,-0.4858691,-3.236964,0.7647186,0.6979496,3.529782e-05,False,False,100


In [262]:
NGN.interaction.H16 <- filter(base_set, regulator_gene %in% c("NEUROG1", "NEUROG2") & # NGN as regulator
       abs(corr) > .55 & cor_p < .2 & # correlation filter
       fdr.H16 < .05 & abs(l2fc.H16 > 1) &
       reg.cnt.sum > 30)  # FDR and FC filter

In [263]:
NGN.interaction.H16$target_gene %>% unique %>% sort %>% print

 [1] "AGAP3"   "CBFA2T2" "CHD7"    "EBF1"    "EBF2"    "EBF3"    "EYA2"   
 [8] "EZH2"    "HES6"    "HIVEP3"  "MSL3"    "NEUROD1" "NHLH1"   "NKX6-1" 
[15] "NR5A2"   "PAX3"    "POU3F2"  "POU6F2"  "PRDM13"  "RCOR2"   "RUNX1T1"
[22] "SIM1"    "ST18"    "TBC1D2B" "TP53BP1" "ZBTB18"  "ZEB1"    "ZEB2"   


of those, these These are hits

In [264]:
NGN.interaction.H16$target_gene %>% intersect(hitlist)  %>% sort %>% print

[1] "EBF1"   "POU3F2" "RCOR2"  "ZBTB18"


### Group B: NGN initiated regulations starting from D1

Criteria:<br>
`abs(corr) > 0.55` & <br> `corr. p-val < 0.2` & <br> `Target differentially expressed at D1: fdr.H16 > 0.05 & fdr.D1 < 0.05` & <br> `Target expression change abs(l2fc.D1) > 1` & <br> `reg.cnt.sum > 30`

In [265]:
NGN.interaction.D1 <- filter(base_set, regulator_gene %in% c("NEUROG1", "NEUROG2") & # NGN as regulator
       abs(corr) > .55 & cor_p < .2 & # correlation filter
       fdr.D1 < .05 & fdr.H16 > .05 & abs(l2fc.D1 > 1) &
       reg.cnt.sum > 30)   # FDR and FC filter

In [266]:
NGN.interaction.D1$target_gene %>% unique %>% sort %>% print

 [1] "AFF3"   "BACH2"  "CELF3"  "GATA6"  "HOMEZ"  "NHLH2"  "POU2F2" "PRDM2" 
 [9] "TCF15"  "TCF4"   "TRIP4"  "VAX2"  


New targets emerge in D1, but not in H16

In [267]:
setdiff(NGN.interaction.D1$target_gene, NGN.interaction.H16$target_gene) %>% sort %>% print

 [1] "AFF3"   "BACH2"  "CELF3"  "GATA6"  "HOMEZ"  "NHLH2"  "POU2F2" "PRDM2" 
 [9] "TCF15"  "TCF4"   "TRIP4"  "VAX2"  


Of D1 targets, these are hits, **`VAX2` is new D1**

In [268]:
NGN.interaction.D1$target_gene %>% intersect(hitlist)  %>% sort %>% print

[1] "VAX2"


### Group C: NGN initiated regulations starting from D4
Criteria:<br>
`abs(corr) > 0.55` & <br> `corr. p-val < 0.2` & <br> `Target differentially expressed at D4: fdr.H16 > 0.05 & fdr.D1 > 0.05 & fdr.D4 < 0.05` & <br> `Target expression change abs(l2fc.D4) > 1` & <br> `reg.cnt.sum > 30`

In [269]:
NGN.interaction.D4 <- filter(base_set, regulator_gene %in% c("NEUROG1", "NEUROG2") & # NGN as regulator
       abs(corr) > .55 & cor_p < .2 & # correlation filter
       fdr.D4 < .05 & fdr.D1 > .05 & fdr.H16 > .05 & abs(l2fc.D4 > 1) &
       reg.cnt.sum > 30)  # FDR and FC filter

In [270]:
NGN.interaction.D4$target_gene %>% unique %>% sort %>% print

 [1] "ARX"    "BTG1"   "CASZ1"  "CBFB"   "DIP2C"  "ELAVL2" "ESRRG"  "GTF2B" 
 [9] "HDAC5"  "KAT2B"  "KLF12"  "KLF7"   "MAML3"  "MEF2C"  "MEIS2"  "MYT1"  
[17] "MYT1L"  "NFIA"   "NKX6-2" "NPAS3"  "PAX6"   "PBX3"   "PBX4"   "PROX1" 
[25] "RB1"    "RORA"   "SATB2"  "SCRT1"  "SIX1"   "SMAD9"  "SSBP2"  "TLE1"  
[33] "TLX1"   "TSHZ2"  "ZNF611" "ZNF83"  "ZNF862"


New targets emerge in D4, but not in H16 or D1. 

In [271]:
setdiff(NGN.interaction.D4$target_gene, union(NGN.interaction.H16$target_gene, NGN.interaction.D1$target_gene)) %>% sort %>% print

 [1] "ARX"    "BTG1"   "CASZ1"  "CBFB"   "DIP2C"  "ELAVL2" "ESRRG"  "GTF2B" 
 [9] "HDAC5"  "KAT2B"  "KLF12"  "KLF7"   "MAML3"  "MEF2C"  "MEIS2"  "MYT1"  
[17] "MYT1L"  "NFIA"   "NKX6-2" "NPAS3"  "PAX6"   "PBX3"   "PBX4"   "PROX1" 
[25] "RB1"    "RORA"   "SATB2"  "SCRT1"  "SIX1"   "SMAD9"  "SSBP2"  "TLE1"  
[33] "TLX1"   "TSHZ2"  "ZNF611" "ZNF83"  "ZNF862"


Of D4 targets, no hit TF targets.

In [272]:
NGN.interaction.D4$target_gene %>% intersect(hitlist)  %>% sort %>% print

character(0)


## 2. Find level 2 targets of NGN H16 targets. These are: NGN -> H16 targets -> level 2 targets

### Group A1: targets differentially expressed at *D1*, regulated by NGN's direct level 1 targets

Criteria:<br>
`regulators are targets of NGN1/2's direct target at level1 (H16)` & <br> `abs(corr) > 0.55` & <br> `corr. p-val < 0.2` & <br> `Target differentially expressed at D1: fdr.H16 > 0.05 & fdr.D1 < 0.05` & <br> `Target expression change abs(l2fc.D1) > 1` & <br> `reg.cnt.sum > 30`

In [273]:
NGN.interaction.H16.toD1 <- filter(base_set, regulator_gene %in% NGN.interaction.H16$target_gene & # targets of TFs that are regulated by NGN at H16
       abs(corr) > .55 & cor_p < .2 & # correlation filter
       fdr.D1 < .05 & fdr.H16 > .05 & abs(l2fc.D1 > 1) &
       reg.cnt.sum > 30)  # FDR and FC filter

Group A1: interactions started with D1, targets of NGN H16 targets

In [274]:
NGN.interaction.H16.toD1$target_gene %>% unique %>% sort %>% print

 [1] "AFF3"    "BACH2"   "CELF3"   "DENND4A" "ESRRA"   "FXR2"    "GATA6"  
 [8] "HOMEZ"   "HSF2"    "INSM2"   "KLF13"   "LHX9"    "LMO1"    "MEIS1"  
[15] "NHLH2"   "NRL"     "ONECUT2" "PHF21B"  "POU2F2"  "PRDM2"   "RARA"   
[22] "TAF13"   "TCF15"   "TCF4"    "TLX2"    "TRIP4"   "TSHZ1"   "VAX2"   
[29] "ZFHX3"   "ZHX3"    "ZNF160"  "ZNF197"  "ZNF491"  "ZNF75D" 


of those, these These are hits

In [275]:
NGN.interaction.H16.toD1$target_gene %>% intersect(hitlist)  %>% sort %>% print

[1] "VAX2"


### Group A2: targets differentially expressed at *D4*, regulated by NGN's direct level 1 targets

Criteria:<br>
`regulators are targets of NGN1/2's direct target at level1 (H16)` & <br> `abs(corr) > 0.55` & <br> `corr. p-val < 0.2` & <br> `Target differentially expressed at D4: fdr.H16 > 0.05 & fdr.D1 > 0.05 & fdr.D4 < 0.05` & <br> `Target expression change abs(l2fc.D4) > 1` & <br> `reg.cnt.sum > 30`

In [276]:
NGN.interaction.H16.toD4 <- filter(base_set, regulator_gene %in% NGN.interaction.H16$target_gene & # targets of TFs that are regulated by NGN at H16
       abs(corr) > .55 & cor_p < .2 & # correlation filter
       fdr.D4 < .05 & fdr.H16 > .05 & fdr.D1 > .05 & abs(l2fc.D4 > 1) &
       reg.cnt.sum > 30) # FDR and FC filter

In [277]:
NGN.interaction.H16.toD4$target_gene %>% unique %>% sort %>% t

0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
ABL1,APBB1,APC,ARNT2,ARX,ASB8,BAZ2B,BIN1,BTG1,CALCOCO1,⋯,ZNF512,ZNF568,ZNF611,ZNF688,ZNF708,ZNF764,ZNF785,ZNF83,ZNF862,ZNHIT3


New targets emerge in D4, but not in  D1. 

### Group B1: targets differentially expressed at D4, regulated by NGN's direct level 2 targets

Criteria:<br>
`regulators are targets of NGN1/2's direct target at level2 (D1)` & <br> `abs(corr) > 0.55` & <br> `corr. p-val < 0.2` & <br> `Target differentially expressed at D4: fdr.H16 > 0.05 & fdr.D1 > 0.05 & fdr.D4 < 0.05` & <br> `Target expression change abs(l2fc.D4) > 1` & <br> `reg.cnt.sum > 30`

In [278]:
NGN.interaction.D1.toD4 <- filter(base_set, regulator_gene %in% NGN.interaction.D1$target_gene & # targets of TFs that are regulated by NGN at H16
       abs(corr) > .55 & cor_p < .2 & # correlation filter
       fdr.D4 < .05 & fdr.H16 > .05 & fdr.D1 > .05 & abs(l2fc.D4 > 1) &
       reg.cnt.sum > 30) # FDR and FC filter

In [279]:
NGN.interaction.D1.toD4$target_gene %>% unique %>% sort %>% t

0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
ABL1,APC,ASB8,BAZ2B,BIN1,CASZ1,CBFB,CBX4,CBX8,CEBPG,⋯,ZNF382,ZNF410,ZNF436,ZNF568,ZNF611,ZNF641,ZNF688,ZNF764,ZNF83,ZNF862


of those, these These are hits

In [280]:
NGN.interaction.D1.toD4$target_gene %>% intersect(hitlist)  %>% sort %>% print

[1] "CBX4"   "LHX4"   "PHOX2A" "PRDM6"  "THRA"   "ZNF250"


### Group A1B11: targets differentially expressed at D4, regulated by (Group A1) NGN -> H16 targets -> D1 targets

Criteria:<br>
`regulators are targets of NGN1/2's level 1 (H16) indirect targets' level 2 (D1) targets` & <br> `abs(corr) > 0.55` & <br> `corr. p-val < 0.2` & <br> `Target differentially expressed at D4: fdr.H16 > 0.05 & fdr.D1 > 0.05 & fdr.D4 < 0.05` & <br> `Target expression change abs(l2fc.D4) > 1` & <br> `reg.cnt.sum > 30`

In [281]:
NGN.interaction.H16.toD1.toD4 <- filter(base_set, regulator_gene %in% NGN.interaction.H16.toD1$target_gene & # targets of TFs that are regulated by NGN at H16
       abs(corr) > .55 & cor_p < .2 & # correlation filter
       fdr.D4 < .05 & fdr.H16 > .05 & fdr.D1 > .05 & abs(l2fc.D4 > 1) &
       reg.cnt.sum > 30) # FDR and FC filter

In [282]:
NGN.interaction.H16.toD1.toD4$target_gene %>% unique %>% sort %>% t

0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
ABL1,APBB1,APC,ARNT2,ARX,ASB8,BAZ2B,BIN1,BTG1,CALCOCO1,⋯,ZNF599,ZNF611,ZNF641,ZNF688,ZNF708,ZNF764,ZNF785,ZNF83,ZNF862,ZNHIT3


## construct interaction table for cyberscope

- lev1: NGN.interaction.H16
- lev2: NGN.interaction.D1   
- lev3: NGN.interaction.D4   
- lev2: NGN.interaction.H16.toD1  
- lev3: NGN.interaction.H16.toD4  
- lev3: NGN.interaction.D1.toD4   
- lev3: NGN.interaction.H16.toD1.toD4   

In [283]:
NGN.started.network <- list("NGN.interaction.H16"=NGN.interaction.H16, "NGN.interaction.D1"=NGN.interaction.D1, 
                            "NGN.interaction.D4"=NGN.interaction.D4, "NGN.interaction.H16.toD1"=NGN.interaction.H16.toD1, 
                            "NGN.interaction.H16.toD4" = NGN.interaction.H16.toD4, "NGN.interaction.D1.toD4"=NGN.interaction.D1.toD4,
                            "NGN.interaction.H16.toD1.toD4"=NGN.interaction.H16.toD1.toD4)
NGN.started.network.levels <- c("lev1", "lev2", "lev3", "lev2", "lev3", "lev3", "lev3")

In [284]:
NGN.started.network.df <- map2_df(NGN.started.network, NGN.started.network.levels, ~ add_column(.x, "lev" = .y))

In [288]:
NGN.started.network.df %>% write.table("../NGN_started_network_20200107.csv", quote=F, sep=",", col.names=T, row.names=F)

In [291]:
NGN.started.network.df %>% filter( tar.isHit)

regulator_gene,target_gene,atac.ES,atac.H16,atac.D1,atac.D4,tar.ge.ES,tar.ge.H16,tar.ge.D1,tar.ge.D4,⋯,reg.l2fc.H16,reg.l2fc.D1,reg.l2fc.D4,reg.fdr.H16,reg.fdr.D1,reg.fdr.D4,reg.isHit,tar.isHit,reg.cnt.sum,lev
<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<lgl>,<lgl>,<dbl>,<chr>
NEUROG1,EBF1,4.591357,6.043470,6.011842,9.017202,0.6696824,1.4371240,3.8997900,20.301545,⋯,8.170744,9.6348898,11.116910,7.875081e-22,5.935372e-31,5.633972e-42,TRUE,TRUE,539,lev1
NEUROG2,EBF1,18.885112,22.733444,22.733617,28.581881,0.6696824,1.4371240,3.8997900,20.301545,⋯,7.088920,7.9901132,9.893931,3.985811e-09,7.743982e-12,8.486055e-19,TRUE,TRUE,207,lev1
NEUROG2,POU3F2,6.351503,7.253477,7.200895,9.575558,0.1609897,2.8551875,4.1576482,12.963583,⋯,7.088920,7.9901132,9.893931,3.985811e-09,7.743982e-12,8.486055e-19,TRUE,TRUE,207,lev1
NEUROG2,RCOR2,10.116665,10.779594,10.950813,11.168362,7.2018173,17.3199422,18.2898296,16.572379,⋯,7.088920,7.9901132,9.893931,3.985811e-09,7.743982e-12,8.486055e-19,TRUE,TRUE,207,lev1
NEUROG2,ZBTB18,3.166601,4.770543,4.652431,5.911644,2.8316239,11.0424247,11.1503921,12.374454,⋯,7.088920,7.9901132,9.893931,3.985811e-09,7.743982e-12,8.486055e-19,TRUE,TRUE,207,lev1
NEUROG2,VAX2,9.887712,9.670370,9.880845,10.416721,0.2078368,0.3528202,1.0458451,3.172412,⋯,7.088920,7.9901132,9.893931,3.985811e-09,7.743982e-12,8.486055e-19,TRUE,TRUE,207,lev2
EBF1,VAX2,12.833548,13.420957,14.064004,15.171356,0.2078368,0.3528202,1.0458451,3.172412,⋯,2.514547,5.0491582,9.726408,5.891332e-05,5.672568e-22,3.335348e-84,TRUE,TRUE,2308,lev2
EBF3,VAX2,10.281116,10.226842,10.327152,10.769148,0.2078368,0.3528202,1.0458451,3.172412,⋯,5.541898,6.6627546,9.147850,1.203497e-24,3.641239e-36,3.460202e-69,FALSE,TRUE,2878,lev2
NEUROD1,VAX2,9.887712,9.670370,9.880845,10.416721,0.2078368,0.3528202,1.0458451,3.172412,⋯,3.176489,0.9474577,11.894240,5.611389e-06,4.596942e-01,1.609459e-99,FALSE,TRUE,3704,lev2
NHLH1,VAX2,10.010747,10.900004,11.482029,13.118007,0.2078368,0.3528202,1.0458451,3.172412,⋯,8.349374,8.5751674,11.586282,4.889849e-58,2.070067e-61,1.834385e-113,FALSE,TRUE,4329,lev2


In [286]:
NGN.started.network.df %>% filter( lev == "lev3") %>% select(-(atac.ES:tar.ge.D4), -(reg.cnt.ES:reg.cnt.D4), -(reg.l2fc.H16:reg.fdr.D4))

regulator_gene,target_gene,corr,cor_p,l2fc.H16,l2fc.D1,l2fc.D4,fdr.H16,fdr.D1,fdr.D4,tar.cnt.ES,tar.cnt.H16,tar.cnt.D1,tar.cnt.D4,reg.isHit,tar.isHit,reg.cnt.sum,lev
<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<int>,<int>,<int>,<int>,<lgl>,<lgl>,<dbl>,<chr>
NEUROG1,CASZ1,0.9491921,0.050807886,-1.10372204,0.5747946,2.082050,0.56178808,0.73938018,4.911382e-02,27,8,29,76,TRUE,FALSE,539,lev3
NEUROG1,CBFB,0.9546171,0.045382920,0.40514723,0.4671792,1.854964,0.52026316,0.34906771,1.124080e-08,307,389,429,923,TRUE,FALSE,539,lev3
NEUROG1,ESRRG,0.8793330,0.120667016,0.69731594,1.8804745,2.889741,0.83248842,0.29323364,3.285278e-02,10,9,21,43,TRUE,FALSE,539,lev3
NEUROG1,KLF12,0.9060244,0.093975637,0.92154959,0.6942953,2.106188,0.43896102,0.52525032,2.728186e-03,25,118,93,248,TRUE,FALSE,539,lev3
NEUROG1,MAML3,0.8707537,0.129246272,-1.12875645,0.5249655,3.403921,0.37111119,0.68125739,1.736630e-06,28,13,47,277,TRUE,FALSE,539,lev3
NEUROG1,MEF2C,0.8507371,0.149262864,-3.36475141,3.2790285,6.020716,0.24142611,0.08977080,5.211778e-05,0,0,28,174,TRUE,FALSE,539,lev3
NEUROG1,MYT1,-0.8621385,0.137861541,-1.01368610,-0.2975204,5.768915,0.68255937,0.89640182,7.977172e-09,7,2,5,256,TRUE,FALSE,539,lev3
NEUROG1,MYT1L,0.9809500,0.019049992,0.03255222,0.5141626,2.935981,0.99893057,0.77365234,2.733817e-03,5,8,12,59,TRUE,FALSE,539,lev3
NEUROG1,PAX6,-0.8354846,0.164515440,-0.82201666,0.2225106,3.200247,0.82325780,0.94130854,3.028417e-02,5,3,7,55,TRUE,FALSE,539,lev3
NEUROG1,RORA,-0.9526933,0.047306714,0.47126498,0.6080620,2.047648,0.82977118,0.67585612,2.302845e-02,34,36,42,98,TRUE,FALSE,539,lev3


---

## Find TF targets or regulators

Current method: 

1. Use `motifMatch` to match motif to each of the 116K peaks (each peak has its respective read counts).
2. Select only ranges that have a match to motifs
3. Get the read counts of a given regulator or target:
    - For a given regulator, get read counts of all the peaks (ranges) that has this binding motif, sum normalized counts
    - For a given target, get read counts of all the peaks (ranges) that ca bind to the motif, sum normalized counts

---

---

## Differential analysis on targets:

In [19]:
library(DESeq2)

In [158]:
FOUND_TARGETS.deseq <- FOUND_TARGETS %>% mutate("tf.pair" = paste(regulator,gene_name, sep = ":")) %>%
                select(tf.pair, A1:A12)

In [159]:
head(FOUND_TARGETS.deseq)

tf.pair,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
NEUROG2:ABL1,59.612027,60.728975,49.15846,69.372078,52.360238,53.93496,41.17851,54.68986,44.86193,52.234566,61.353611,63.027849
NEUROG2:AEBP2,12.656092,9.656551,13.60999,6.383934,7.993929,11.54743,16.51793,11.85229,10.38811,6.512002,4.893233,4.563102
NEUROG2:AFF1,26.229292,33.261453,31.89206,38.516399,26.979512,30.55845,25.59116,26.92163,28.22257,27.710645,24.340696,23.671093
NEUROG2:AFF3,135.915421,142.487772,137.31866,144.276899,141.292702,140.68152,146.56756,165.2548,153.06377,153.655526,150.686477,160.99195
NEUROG2:AHR,28.063508,36.265713,36.36101,39.367591,35.173289,28.16447,30.01145,16.25457,32.08364,23.554048,11.417543,11.122562
NEUROG2:ALX3,8.070551,14.806711,11.98491,12.129474,12.990135,13.51894,14.65676,7.61933,10.93969,10.945705,8.02992,9.553995


In [160]:
FOUND_TARGETS.deseq[,2:13] <- apply(FOUND_TARGETS.deseq[,2:13], 2, as.integer)

In [161]:
sample.anno <- data.frame("sample" = paste0(rep("A",12),1:12), 
                          "condition" = c(rep("ES",2), rep("H1",2), rep("H4",2), rep("H16",2), rep("H24",2), rep("D5",2)))

In [162]:
sample.anno %>% t

0,1,2,3,4,5,6,7,8,9,10,11,12
sample,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12
condition,ES,ES,H1,H1,H4,H4,H16,H16,H24,H24,D5,D5


In [163]:
dds.targets <- DESeqDataSetFromMatrix(countData = FOUND_TARGETS.deseq[,2:13], colData = sample.anno, design = ~ condition)

In [164]:
mcols(dds.targets) <- cbind(mcols(dds.targets), "tf.pair"=FOUND_TARGETS.deseq$tf.pair)

In [165]:
dds.targets <- DESeq(dds.targets)

estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
-- note: fitType='parametric', but the dispersion trend was not well captured by the
   function: y = a/x + b, and a local regression fit was automatically substituted.
   specify fitType='local' or 'mean' to avoid this message next time.
“Estimated rdf < 1.0; not estimating variance”final dispersion estimates
fitting model and testing


In [166]:
resultsNames(dds.targets)

In [169]:
res.targets.a <- results(dds.targets, contrast = c("condition", "D5", "ES"))

In [171]:
fdr <- 0.05
res.targets.a[which(res.targets.a$padj < fdr), ] %>% as.data.frame %>% add_column("tf.pair"=FOUND_TARGETS.deseq$tf.pair[which(res.targets.a$padj < fdr)])

baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,tf.pair
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>
146.6622431,0.4286594,0.1232722,3.477340,5.064160e-04,4.088833e-03,NEUROG2:AFF3
26.3350362,-1.2742763,0.3579525,-3.559904,3.709909e-04,3.238035e-03,NEUROG2:AHR
8.0479035,-2.9036434,1.0757727,-2.699123,6.952240e-03,3.262962e-02,NEUROG2:ANKRD42
8.8090902,1.6142051,0.5143416,3.138391,1.698779e-03,1.012147e-02,NEUROG2:ANKS1A
47.6285014,1.4059792,0.2286268,6.149669,7.764504e-10,8.116781e-08,NEUROG2:ARID1A
20.2296244,-0.9561344,0.3533861,-2.705637,6.817353e-03,3.261186e-02,NEUROG2:ASCC3
4.6876782,2.1407404,0.6652061,3.218161,1.290154e-03,8.228867e-03,NEUROG2:ATF3
1.0132321,4.7088744,1.5305313,3.076627,2.093569e-03,1.197187e-02,NEUROG2:ATF6
35.6538932,0.6310421,0.2272533,2.776823,5.489310e-03,2.772520e-02,NEUROG2:BAZ1B
16.6357106,-1.1260717,0.4250101,-2.649518,8.060677e-03,3.612781e-02,NEUROG2:BIN1


---

## Get normalized read counts per gene from consensus peaks

Wrote a script for this: 
- path: `/gpfs/commons/groups/sanjana_lab/cdai/TFscreen/atac`
- Rscript: `getATACseqReadcountsPerGene.R`
- Resulted read counts in: `ATAC-seq_readcounts_per_Gene.csv`

In [84]:
atac.counts.perGene <- read.csv("ATAC-seq_readcounts_per_Gene.csv")

In [138]:
atac.counts.perGene %>% head


gene_id,gene_name,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,is.hit,is.tf
<fct>,<fct>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<fct>,<fct>
ENSG00000187634.12,SAMD11,37.05117,48.28275,48.34592,46.38992,57.55629,36.61381,44.668209,40.636427,47.89563,51.5418,102.757889,88.552702,No,No
ENSG00000188976.11,NOC2L,30.63141,29.18424,39.61115,41.06997,41.96813,23.09486,28.382925,30.646639,37.87524,33.66843,54.703833,58.036956,No,No
ENSG00000187961.14,KLHL17,23.29455,20.17146,27.21997,31.91967,28.1786,17.46197,21.636164,20.995487,24.3615,20.64443,19.321996,24.954465,No,No
ENSG00000187583.10,PLEKHN1,11.55556,17.38179,15.23506,13.61906,12.79029,10.84332,8.142642,14.900023,11.49127,12.46979,5.896973,5.418684,No,No
ENSG00000187642.9,PERM1,14.85715,13.30458,17.2664,16.38543,13.58968,10.56168,15.819991,8.465922,10.75583,13.024,12.421283,5.703878,No,No
ENSG00000188290.10,HES4,46.03883,51.5016,43.87697,46.38992,54.55857,38.86697,41.876446,45.885299,41.27666,44.05993,38.142122,39.64195,No,Yes


In [139]:
sample.anno <- data.frame("sample" = paste0(rep("A",12),1:12), 
                          "condition" = c(rep("ES",2), rep("H1",2), rep("H4",2), rep("H16",2), rep("H24",2), rep("D5",2)))

In [140]:
atac.counts <- select(atac.counts.perGene, A1:A12) %>% apply(., 2, as.integer)

In [147]:
dds.atac <- DESeqDataSetFromMatrix(countData = atac.counts, colData = sample.anno, design = ~ condition)

In [148]:
dds.atac <- DESeq(dds.atac)

estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing


In [153]:
sample.anno %>% t

0,1,2,3,4,5,6,7,8,9,10,11,12
sample,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12
condition,ES,ES,H1,H1,H4,H4,H16,H16,H24,H24,D5,D5


In [156]:
res.atac.a <- results(dds.atac, contrast = c("condition", "D5", "ES"))

In [157]:
fdr <- 0.01
res.atac.a[which(res.atac.a$padj < fdr), ] %>% as.data.frame %>% 
        add_column("gene_name"=atac.counts.perGene$gene_name[which(res.atac.a$padj < fdr)]) %>%
        cbind(counts(dds.atac)[which(res.atac.a$padj < fdr),])

baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,gene_name,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<fct>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
53.96597,1.2859206,0.2555272,5.032421,4.843235e-07,4.171176e-05,SAMD11,37,48,48,46,57,36,44,40,47,51,102,88
36.98419,1.0500669,0.3035926,3.458802,5.425834e-04,7.509406e-03,NOC2L,30,29,39,41,41,23,28,30,37,33,54,58
85.93123,1.3913771,0.2120123,6.562719,5.283528e-11,2.881085e-08,ACAP3,61,60,50,66,95,75,67,79,89,93,141,150
41.82490,1.0333660,0.2768952,3.731975,1.899846e-04,3.502703e-03,PRXL2B,42,31,33,37,37,35,31,40,35,42,69,68
43.65750,-1.3623490,0.3017304,-4.515121,6.328074e-06,2.844690e-04,ARHGEF16,67,62,54,57,51,46,40,36,36,36,21,25
49.64993,0.9224558,0.2622441,3.517546,4.355565e-04,6.426952e-03,PARK7,37,43,47,37,38,39,54,51,50,61,69,70
186.76433,0.5528684,0.1481239,3.732473,1.896094e-04,3.502703e-03,RERE,171,174,187,171,167,175,165,180,189,204,239,225
44.46515,1.0083677,0.2684577,3.756152,1.725463e-04,3.269921e-03,PGD,37,40,29,37,38,34,50,43,37,45,70,72
27.16807,1.1758202,0.3428771,3.429276,6.051942e-04,8.100341e-03,UBIAD1,24,18,22,24,26,24,24,23,27,26,44,43
99.67772,-1.0762933,0.2093694,-5.140642,2.738011e-07,2.617207e-05,DHRS3,130,123,127,114,119,115,102,97,82,93,50,60


In [134]:
res.atac.b <- results(dds.atac, contrast = c("condition", "D5", "ES"))

In [137]:
fdr <- 0.01
res.atac.b[which(res.atac.b$padj < fdr), ] %>% as.data.frame %>% 
        add_column("gene_name"=atac.counts.perGene$gene_name[which(res.atac.b$padj < fdr)]) %>%
        cbind(counts(dds.atac)[which(res.atac.b$padj < fdr),])

baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,gene_name,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<fct>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
53.96597,1.2859206,0.2555272,5.032421,4.843235e-07,4.171176e-05,SAMD11,37,48,48,46,57,36,44,40,47,51,102,88
36.98419,1.0500669,0.3035926,3.458802,5.425834e-04,7.509406e-03,NOC2L,30,29,39,41,41,23,28,30,37,33,54,58
85.93123,1.3913771,0.2120123,6.562719,5.283528e-11,2.881085e-08,ACAP3,61,60,50,66,95,75,67,79,89,93,141,150
41.82490,1.0333660,0.2768952,3.731975,1.899846e-04,3.502703e-03,PRXL2B,42,31,33,37,37,35,31,40,35,42,69,68
43.65750,-1.3623490,0.3017304,-4.515121,6.328074e-06,2.844690e-04,ARHGEF16,67,62,54,57,51,46,40,36,36,36,21,25
49.64993,0.9224558,0.2622441,3.517546,4.355565e-04,6.426952e-03,PARK7,37,43,47,37,38,39,54,51,50,61,69,70
186.76433,0.5528684,0.1481239,3.732473,1.896094e-04,3.502703e-03,RERE,171,174,187,171,167,175,165,180,189,204,239,225
44.46515,1.0083677,0.2684577,3.756152,1.725463e-04,3.269921e-03,PGD,37,40,29,37,38,34,50,43,37,45,70,72
27.16807,1.1758202,0.3428771,3.429276,6.051942e-04,8.100341e-03,UBIAD1,24,18,22,24,26,24,24,23,27,26,44,43
99.67772,-1.0762933,0.2093694,-5.140642,2.738011e-07,2.617207e-05,DHRS3,130,123,127,114,119,115,102,97,82,93,50,60


In [141]:
gene.name.lookup <- read_delim("../gene_names_lookup.txt", delim="\t")

Parsed with column specification:
cols(
  `Approved symbol` = [31mcol_character()[39m,
  `Previous symbols` = [31mcol_character()[39m,
  Synonyms = [31mcol_character()[39m,
  `RefSeq IDs` = [31mcol_character()[39m,
  `Ensembl gene ID` = [31mcol_character()[39m
)



In [143]:
gene.name.lookup %>% head

Approved symbol,Previous symbols,Synonyms,RefSeq IDs,Ensembl gene ID
<chr>,<chr>,<chr>,<chr>,<chr>
A1BG,,,NM_130786,ENSG00000121410
A1BG-AS1,"NCRNA00181, A1BGAS, A1BG-AS",FLJ23569,NR_015380,ENSG00000268895
A1CF,,"ACF, ASP, ACF64, ACF65, APOBEC1CF",NM_014576,ENSG00000148584
A1S9T,,,,
A2M,,"FWP007, S863-7, CPAMD5",NM_000014,ENSG00000175899
A2M-AS1,,,NR_026971,ENSG00000245105


In [144]:
motif_lookup %>% head

motif,gene_name,is_hit,is_tf
<chr>,<chr>,<chr>,<chr>
THRB,THRB,No,Yes
THAP11,THAP11,No,Yes
TFAP4(var.2),TFAP4,No,Yes
TFAP2E,TFAP2E,No,Yes
TGIF2LY,TGIF2LY,No,Yes
TGIF2LX,TGIF2LX,No,Yes


In [167]:
map_lgl(hitlist, ~ ! .x %in% gene.name.lookup$`Approved symbol`) %>% hitlist[.]

In [234]:
?mutate_at

0,1
mutate_all {dplyr},R Documentation

0,1
.tbl,A tbl object.
.funs,"A function fun, a quosure style lambda ~ fun(.) or a list of either form."
...,"Additional arguments for the function calls in .funs. These are evaluated only once, with tidy dots support."
.predicate,A predicate function to be applied to the columns or a logical vector. The variables for which .predicate is or returns TRUE are selected. This argument is passed to rlang::as_function() and thus supports quosure-style lambda functions and strings representing function names.
.vars,"A list of columns generated by vars(), a character vector of column names, a numeric vector of column positions, or NULL."
.cols,This argument has been renamed to .vars to fit dplyr's terminology and is deprecated.
