# Overview: DiffBind analysis of KLF6a in young and aged HSPC
- Will perform differential analysis on H3K4me1, H3K4me3, and H3K27ac
- Have 2 replicates for each group
- Will use the narrowPeak files called by MACS2
- Will perform the following comparisons:
    - Yng_NTC vs Yng_Act
    - Aged_NTC vs Aged_Act
- Note that the "fold" reported by DiffBind is on a log2 scale; it is not directly equivalent to conc2 - conc1, because DiffBind corrects the fold change

In [None]:
library(DiffBind)
library(ggplot2)
library(tidyr)
library(ggpubr)
library(gridExtra)
library(chipenrich)
library(genomation)
library(GenomicRanges)

In [None]:
# Fix the random seed so that any randomised steps give the same result on re-run.
set.seed(123)

In [None]:
# diffbind_difPeaks() (defined below) writes its result tables into the working
# directory and puts the chipenrich output folder next to it, so set it up front.
setwd("/ChIPseq/")

In [2]:
# Record the R version and attached package versions used for this analysis.
sessionInfo()

R version 4.0.2 (2020-06-22)
Platform: x86_64-conda_cos6-linux-gnu (64-bit)
Running under: Debian GNU/Linux 10 (buster)

Matrix products: default
BLAS/LAPACK: /opt/conda/envs/diffbind/lib/libopenblasp-r0.3.10.so

locale:
 [1] LC_CTYPE=C.UTF-8       LC_NUMERIC=C           LC_TIME=C.UTF-8       
 [4] LC_COLLATE=C.UTF-8     LC_MONETARY=C.UTF-8    LC_MESSAGES=C.UTF-8   
 [7] LC_PAPER=C.UTF-8       LC_NAME=C              LC_ADDRESS=C          
[10] LC_TELEPHONE=C         LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C   

attached base packages:
 [1] grid      parallel  stats4    stats     graphics  grDevices utils    
 [8] datasets  methods   base     

other attached packages:
 [1] genomation_1.22.0           chipenrich_2.14.0          
 [3] gridExtra_2.3               ggpubr_0.4.0               
 [5] tidyr_1.1.2                 ggplot2_3.3.3              
 [7] DiffBind_3.0.9              SummarizedExperiment_1.20.0
 [9] Biobase_2.50.0              MatrixGenerics_1.2.0       
[11] matrixStats_

In [None]:
# Load the sample sheet; it is split by Factor (IP) and Condition further down.
samples <- read.csv("/ChIPseq/diffBind_KLF6a_sampleSheet.csv")

In [None]:
# Store Condition as a factor with an explicit level order
# (young before aged, NTC before Act within each age group).
samples$Condition <- factor(
  samples$Condition,
  levels = c("Yng_NTC", "Yng_Act", "Aged_NTC", "Aged_Act")
)

### Make object for each IP

In [None]:
# One sample table per histone mark, selected on the Factor column.
samples_H3K4me3 <- samples[samples$Factor == "H3K4me3", ]
samples_H3K27ac <- samples[samples$Factor == "H3K27ac", ]
samples_H3K4me1 <- samples[samples$Factor == "H3K4me1", ]

In [None]:
# Split each mark's table into the two comparisons:
#   *_YNYA keeps the young samples (Yng_NTC, Yng_Act)
#   *_ANAA keeps the aged samples  (Aged_NTC, Aged_Act)

# H3K4me3
samples_H3K4me3_YNYA <- samples_H3K4me3[
  samples_H3K4me3$Condition %in% c("Yng_NTC", "Yng_Act"),
]
samples_H3K4me3_ANAA <- samples_H3K4me3[
  samples_H3K4me3$Condition %in% c("Aged_NTC", "Aged_Act"),
]

# H3K4me1
samples_H3K4me1_YNYA <- samples_H3K4me1[
  samples_H3K4me1$Condition %in% c("Yng_NTC", "Yng_Act"),
]
samples_H3K4me1_ANAA <- samples_H3K4me1[
  samples_H3K4me1$Condition %in% c("Aged_NTC", "Aged_Act"),
]

# H3K27ac
samples_H3K27ac_YNYA <- samples_H3K27ac[
  samples_H3K27ac$Condition %in% c("Yng_NTC", "Yng_Act"),
]
samples_H3K27ac_ANAA <- samples_H3K27ac[
  samples_H3K27ac$Condition %in% c("Aged_NTC", "Aged_Act"),
]

# Main Function

In [None]:
### Helper for diffbind_difPeaks(): handle one direction (up or down) of the
### differential peaks.
###   peaks_df     rows of the DESeq2 report for this direction
###   table_file   tab-separated file the rows are written to
###   enrich_name  out_name handed to chipenrich()
###   enrich_path  out_path handed to chipenrich()
###   empty_msg    message printed when peaks_df has no rows
### When there are no rows nothing is written and chipenrich() is not run.
.diffbind_export_peaks <- function(peaks_df, table_file, enrich_name,
                                   enrich_path, empty_msg) {
    if (nrow(peaks_df) == 0) {
        print(empty_msg)
        return(invisible(FALSE))
    }
    write.table(peaks_df, table_file, quote = FALSE, sep = "\t",
                row.names = FALSE)
    # NOTE(review): chipenrich() receives the data frame built with
    # as.data.frame() from the DiffBind report -- confirm it accepts those
    # column names as peak coordinates.
    chipenrich(peaks_df, out_name = enrich_name, out_path = enrich_path,
               genome = "hg19_refseq",
               genesets = c("kegg_pathway", "panther_pathway",
                            "biocarta_pathway", "GOBP", "GOCC", "GOMF"),
               method = "chipenrich", fisher_alt = "two.sided",
               qc_plots = TRUE, max_geneset = 500, num_peak_threshold = 1,
               n_cores = 2, locusdef = "nearest_tss")
    invisible(TRUE)
}

### Run a DiffBind differential analysis for one sample table, export the
### DESeq2 report, and annotate the up/down peaks with chipenrich.
###
###   config_table  sample sheet (data frame) passed to dba(sampleSheet = ...)
###   output_name   prefix for every output file
###
### Files written to the working directory:
###   <output_name>_all.txt      full DESeq2 report (written even when empty)
###   <output_name>_dn_peak.txt  rows with Fold < 0 (only if there are any)
###   <output_name>_up_peak.txt  rows with Fold > 0 (only if there are any)
### chipenrich output goes to a "chipenrich" folder next to the working
### directory, named <output_name>_deseq_dn / <output_name>_deseq_up.
###
### Returns (invisibly) list(up_peaks = n, down_peaks = n), or the string
### "No significant peaks" when the report has no rows.
###
### This will use bUseSummarizeOverlaps=TRUE
### Need to modify if have multiple contrasts to choose from
### Also assumes minMembers=2, if have a larger dataset, modify this
diffbind_difPeaks <- function(config_table, output_name){
    suppressMessages(library(DiffBind))

    # chipenrich output folder: sibling of the current working directory.
    chipenrich_path <- file.path(dirname(getwd()), "chipenrich")
    print(chipenrich_path)
    # The function is called once per mark/comparison; only create the folder
    # the first time so later calls do not warn that it already exists.
    if (!dir.exists(chipenrich_path)) {
        dir.create(chipenrich_path)
    }

    output_prefix <- as.character(output_name)
    all_name <- paste0(output_prefix, "_all.txt")
    dn_name <- paste0(output_prefix, "_dn_peak.txt")
    up_name <- paste0(output_prefix, "_up_peak.txt")

    ###Make sample sheet
    dbObj <- dba(sampleSheet = config_table)
    ###Affinity binding matrix
    dbObj_abm <- dba.count(dbObj, bUseSummarizeOverlaps = TRUE)
    ###Establish contrast
    dbObj_con <- dba.contrast(dbObj_abm, categories = DBA_CONDITION,
                              minMembers = 2)
    ###Perform differential analysis
    dbObj_diff <- dba.analyze(dbObj_con, method = DBA_ALL_METHODS)
    ###Extract results for DESeq2 method and export as table
    res_deseq <- dba.report(dbObj_diff, method = DBA_DESEQ2)
    res_deseq_df <- as.data.frame(res_deseq)
    write.table(res_deseq_df, all_name, quote = FALSE, sep = "\t",
                row.names = FALSE)

    if (nrow(res_deseq_df) == 0) {
        print("No significant peaks")
        return(invisible("No significant peaks"))
    }

    ###Extract peaks that go down or up and export as .txt file
    ### Annotate to hg19 using chipenrich
    # Rows with Fold exactly 0 belong to neither direction.
    res_deseq_dn <- res_deseq_df[res_deseq_df$Fold < 0, ]
    res_deseq_up <- res_deseq_df[res_deseq_df$Fold > 0, ]

    .diffbind_export_peaks(res_deseq_dn, dn_name,
                           paste0(output_prefix, "_deseq_dn"),
                           chipenrich_path, "no downpeaks")
    .diffbind_export_peaks(res_deseq_up, up_name,
                           paste0(output_prefix, "_deseq_up"),
                           chipenrich_path, "no up peaks")

    dim_list <- list("up_peaks" = nrow(res_deseq_up),
                     "down_peaks" = nrow(res_deseq_dn))
    print(dim_list)
    invisible(dim_list)
}



In [None]:
# Run every mark/comparison in turn (aged first, then young). Each list name is
# the output prefix passed as the second argument of diffbind_difPeaks().
invisible(local({
  comparisons <- list(
    H3K27ac_ANvsAA = samples_H3K27ac_ANAA,
    H3K4me1_ANvsAA = samples_H3K4me1_ANAA,
    H3K4me3_ANvsAA = samples_H3K4me3_ANAA,
    H3K27ac_YNvsYA = samples_H3K27ac_YNYA,
    H3K4me1_YNvsYA = samples_H3K4me1_YNYA,
    H3K4me3_YNvsYA = samples_H3K4me3_YNYA
  )
  for (prefix in names(comparisons)) {
    diffbind_difPeaks(comparisons[[prefix]], prefix)
  }
}))