# 2-2_TileR_replicateAnalysis_DESeq_diffIntFrags_nonIced_matrices_justWT_200324.R

### TILED ANALYSIS ###

library(tidyverse)
library(RColorBrewer)
library(gplots)
library(DESeq2)


#### taking raw interaction matrices calculated by Marieke's script 
# from individual samples and I am data cleaning and putting through DESeq to:

# 1. cluster > PCA, heatmaps
# 2. find out differentially interacting fragments between genotypes and cell types

## all done on RAW CONTACT MATRICES

#################
### FUNCTIONS ###
#################

# Run one pairwise DESeq2 contrast on an already fitted dds object, write the
# significant fragments to a tab-separated table and draw the matching MA plot.
pairTestDESeq <- function(dds,design,contrast,pair1,pair2,FDRLevel,outDirData,outDirMA){
  # inputs:
  #
  # - dds: a dds object generated by the DESeq function
  #
  # - design: a character string describing the design used to generate the dds
  #   object (eg, "cellType+genoType"); only used to build output file names
  #
  # - contrast: a character string containing the name of the colData factor to
  #   contrast (eg "cellType")
  #
  # - pair1: a character string for the first level in the pairwise comparison
  #   (must match a level of the contrast factor in colData of dds!)
  #
  # - pair2: a character string for the second level in the pairwise comparison
  #
  # - FDRLevel: the false discovery rate threshold; passed to results() as alpha
  #   and used to filter the output table on padj
  #
  # - outDirData: output directory (with trailing "/") for the results tables
  #
  # - outDirMA: output directory (with trailing "/") for the MA plots
  #
  # output: the filtered results table (padj <= FDRLevel) is written to
  # outDirData and also returned invisibly; the MA plot is written to outDirMA.

  # perform the results function from DESeq
  res <- results(dds,
                 contrast = c(contrast, pair1, pair2),
                 alpha = FDRLevel)

  # add -log transformed p-values (natural log, as in the original analysis)
  res$logpvalue <- -log(res$pvalue)
  res$logpadj <- -log(res$padj)

  # keep only fragments at or below the chosen FDR; rows with NA padj are
  # dropped by filter()
  resData <- res %>% as.data.frame() %>% rownames_to_column(var = "frags")
  diffInt <- resData %>% dplyr::filter(padj <= FDRLevel)

  # format the name of this comparison
  compName <- paste(pair1, "vs", pair2, design, contrast, "FDR", FDRLevel, sep = "_")

  # write the table
  write.table(diffInt,
              file = paste0(outDirData, "DESeq2_", compName, ".txt"),
              eol = "\n", sep = "\t", row.names = FALSE, quote = FALSE)

  # MA plot for this comparison.
  # paste0() is used for the directory so that no space ends up between the
  # directory and the file name (paste() would insert its default " " separator).
  maFile <- paste0(outDirMA,
                   paste("MA", design, contrast, pair1, "vs", pair2,
                         FDRLevel, "qc-pca.pdf", sep = "_"))
  pdf(maFile, 10, 10)
  # make sure the device is closed even if plotting fails
  on.exit(dev.off(), add = TRUE)

  # Default DESeq2 MA plot; this draws directly on the open device, so there is
  # nothing to print()
  DESeq2::plotMA(res, alpha = FDRLevel)

  invisible(diffInt)
}

#### PLOTTING QC FUNCTION ####
# Draw two QC plots for a fitted dds object: a sample-to-sample distance
# heatmap and the default DESeq2 PCA plot, both on rlog-transformed counts.
plotting_QC <- function(dds,design,contrast,condition,outDirPCAb,heatmapDir=outDirHeatmaps){
  # inputs:
  #
  # - dds: a DESeq object
  # - design: a character string of the design used to make the dds object
  #   (eg "cellType+genoType"); only used to build output file names
  # - contrast: a character string naming the colData variable used to group
  #   samples in the PCA (eg "cellType")
  # - condition: a factor variable used to perform DESeq, given to colData;
  #   one entry per sample, used to colour the heatmap side bars
  # - outDirPCAb: a directory path (with trailing "/") to output the PCA plot
  # - heatmapDir: a directory path (with trailing "/") to output the heatmap;
  #   defaults to the script-level variable outDirHeatmaps, which is what this
  #   function previously read directly
  #
  # output: two PDF files; returns NULL invisibly

  # rlog is used here; vst() would be a much faster transformation if run time
  # becomes a problem
  dat <- rlog(dds)

  ## sample distance matrix
  # Specify colours: one per factor level, so that the side bars (indexed by
  # level code) and the legend (listed in level order) always agree
  condition <- droplevels(factor(condition))
  groupNames <- levels(condition)
  if (length(groupNames) <= 8) {
    mycols <- brewer.pal(8, "Set3")[seq_along(groupNames)]
  } else {
    # Set3 is only requested with 8 colours here; interpolate if more groups
    mycols <- colorRampPalette(brewer.pal(8, "Set3"))(length(groupNames))
  }
  sideCols <- mycols[as.integer(condition)]

  # Create distance matrix
  sampleDists <- as.matrix(dist(t(assay(dat))))

  # Plot heatmap
  pdf(paste0(heatmapDir, design, "_qc-heatmap-samples.pdf"), 10, 10)
  tryCatch({
    hmap <- heatmap.2(sampleDists, key = FALSE, trace = "none",  # Plot distance matrix
                      col = colorpanel(100, "black", "white"),   # Distance coloured black-white
                      ColSideColors = sideCols, RowSideColors = sideCols,  # Side annotations = Groups
                      margins = c(19, 19), main = "Sample Distance Matrix")
    # legends are drawn straight onto the open device
    legend("bottom", groupNames, col = mycols, pch = 15)
    legend("bottomleft", paste0("maxDist=", as.character(round(max(hmap$carpet)))))
  }, finally = dev.off())

  # Default DESeq2 PCA.
  # paste0() so that no spaces are inserted into the output path.
  pdf(paste0(outDirPCAb, design, "_", contrast, "_justWT_small_qc-pca.pdf"), 3.3, 3.3)
  tryCatch({
    pca_b <- plotPCA(dat, intgroup = contrast)
    # plotPCA returns a ggplot object, which must be printed to be drawn
    print(pca_b)
  }, finally = dev.off())

  invisible(NULL)
}

##############
### INPUTS ###
##############

## public directory (not used further in the visible part of this script)
publicDir <- "http://sara.molbiol.ox.ac.uk/public/dowens/CTCF-KO/Tiled/CTCF-KO_virtCapC/"

## contact e-mail (not used further in the visible part of this script)
email <- "dominic.owens@imm.ox.ac.uk"

## genome build
genome <- "mm9"

## the main directory (must end with "/": paths below are built with paste0)
base <- "C:/Users/Dominic/Desktop/Work/Paper/Bioinformatics/Tile-C/CTCF-KO/"

# the significance level to use for DESeq
FDRLevel <- 0.1


# are we using iced or raw matrices?
matrixType <- "raw" # one of "iced" or "raw"


####  relative to base folder, or could be specified elsewhere

## the bait file
baitFile <- paste0(base, "baitsRightIDs.txt")

## the fragments genome file
fragFile <- paste0(base, "fragData_2kb.txt")

## the data folder
# An unknown matrixType is a hard error: continuing would leave dataFolder
# undefined and fail later with a much less helpful message.
dataFolder <- switch(matrixType,
                     iced = paste0(base, "iced_matrix/"),
                     raw = paste0(base, "matrix/"),
                     stop("matrixType must be set to either \"iced\" or \"raw\"",
                          call. = FALSE))


###############
### OUTPUTS ###
###############


## folder for saved intermediate R objects
variablesFolder <- paste0(base, "savedVariables_", matrixType, "/")

## DESeq directories: one top-level folder per FDR level / matrix type, with
## sub-folders for tables, bed files and each kind of plot
DESeqDir <- paste0(base, "DESeq_FDR_", FDRLevel, "_", matrixType, "/")
outDirData <- paste0(DESeqDir, "/data_", matrixType, "/")
outDirPlots <- paste0(DESeqDir, "/plots_", matrixType, "/")
outDirBeds <- paste0(DESeqDir, "/bedFiles_", matrixType, "/")
outDirHeatmaps <- paste0(outDirPlots, "heatmaps_", matrixType, "/")
outDirPCAb <- paste0(outDirPlots, "PCA_basic_", matrixType, "/")
outDirPCAf <- paste0(outDirPlots, "PCA_fancy_", matrixType, "/")
outDirMA <- paste0(outDirPlots, "MA_plots_", matrixType, "/")

## create every folder, parents before children; already existing folders are
## silently left alone (showWarnings = FALSE)
invisible(vapply(
  c(variablesFolder,
    DESeqDir,
    outDirPlots,
    outDirBeds,
    outDirData,
    outDirHeatmaps,
    outDirPCAb,
    outDirPCAf,
    outDirMA),
  dir.create,
  logical(1),
  showWarnings = FALSE
))

##########################
### LOAD THE VARIABLES ###
##########################

cat("Gathering the input files \n")

## load the bait file, get names of baits, and make into format useful for joining to later
## need to match the wrong DpnII bait IDs
baits <- data.table::fread(baitFile)
colnames(baits) <- c("bait_chr", "bait_start", "bait_end", "baitID", "baitName")
baitNames <- baits$baitName
baits <-
  baits %>%
  mutate(bait_frag = paste0(bait_chr, ":", bait_start, "-", bait_end)) %>%
  dplyr::select(bait_frag, baitID, baitName)


## load the frag file and make into format useful for joining to later
fragData <- data.table::fread(fragFile)
colnames(fragData) <- c("prey_chr", "prey_start", "prey_end", "preyID")
# the fragment ID is duplicated under the names baitID and fragID so the table
# can be joined on either
fragData <-
  fragData %>%
  mutate(baitID = preyID, fragID = preyID)


# get the samples: one entry per item found in the data folder
samples <- data.frame(sample = list.files(path = dataFolder),
                      stringsAsFactors = FALSE)
if (nrow(samples) == 0L) {
  stop("No samples found in data folder: ", dataFolder, call. = FALSE)
}

# get the matrix location (sampleFiles) and add to the table
# along with
# tissue types, genotypes, and clone names (split from the sample name on "_").
# Plain assignment with %>% is used because the %<>% operator comes from
# magrittr, which library(tidyverse) does not attach.
samples <-
  samples %>%
  mutate(sampleFiles = paste0(dataFolder, sample, "/raw/2000/tiled_", sample, "_2000.matrix"),
         sample2 = sample) %>%
  separate(sample2, into = c("tissue", "genotype", "clone"), sep = "_")


# get the tissue types
tissues <- levels(factor(word(samples$sample, 1, sep = "_")))
# geno types
genotypes <- levels(factor(word(samples$sample, 2, sep = "_")))
# exchange the - for . to prevent problems with column naming
genotypes <- gsub("-", ".", genotypes)

# clone names (reps effectively)
clones <- levels(factor(word(samples$sample, 3, sep = "_")))


###########################
#### DATA GATHER LOOP #####
###########################


# read every sample's interaction matrix and stack them into one long table

# fail early, and name the culprits, if any expected matrix file is missing
missingFiles <- samples$sampleFiles[!file.exists(samples$sampleFiles)]
if (length(missingFiles) > 0L) {
  stop("Missing matrix file(s):\n", paste(missingFiles, collapse = "\n"),
       call. = FALSE)
}
if (nrow(samples) == 0L) {
  stop("No samples to load", call. = FALSE)
}

# one table per sample, each tagged with the sample it came from;
# seq_len() keeps this safe for any number of samples
perSampleInteractions <- lapply(seq_len(nrow(samples)), function(i) {
  data.table::fread(samples$sampleFiles[i], fill = TRUE) %>%
    mutate(sample = as.character(samples$sample[i]))
})

# combine all in one table in a single step rather than growing it with
# rbind() inside the loop
data <- data.table::rbindlist(perSampleInteractions, use.names = TRUE)


colnames(data) <- c("baitID", "preyID", "reads", "sampleName")

# sum as double: the sum of an integer vector overflows to NA above ~2.1e9
cat("Total Usable Reporters:", sum(as.numeric(data$reads)), "\n")


#######################
### DATA WRANGLING  ###
#######################


# Build one wide count table: a row per bait-prey combination, a column of
# read counts per sample. This works for any number of samples (it was
# previously hard-coded to exactly 33).

sampleNames <- as.character(samples$sample)
if (length(sampleNames) == 0L) {
  stop("No samples available to build the count matrix", call. = FALSE)
}

# one two-column table (combo, <sample name>) per sample
perSampleCounts <- lapply(sampleNames, function(thisName) {
  oneSample <- data %>% dplyr::filter(sampleName == thisName)
  counts <- data.frame(combo = paste0(oneSample$baitID, "-", oneSample$preyID),
                       reads = oneSample$reads,
                       stringsAsFactors = FALSE)
  # name the read column after the sample; set after construction so that
  # characters such as "-" in sample names are kept as they are
  colnames(counts)[2] <- thisName
  counts
})

# now do a full join by bait and prey IDs combo
# so that any sample with a read for that bait-prey combo
# will be assigned to it in each row
union <- Reduce(function(x, y) full_join(x, y, by = "combo"), perSampleCounts)

# convert NAs (combo not seen in that sample) to 0
union[is.na(union)] <- 0

# make sample names not a factor
samples$sample <- as.character(samples$sample)


## split up the bait and prey IDs
union <- separate(union, combo, into = c("baitID", "preyID"), sep = "-", remove = TRUE)

# make into integers to allow joining
union$baitID <- as.integer(union$baitID)
union$preyID <- as.integer(union$preyID)
fragData$baitID <- as.integer(fragData$baitID)
fragData$preyID <- as.integer(fragData$preyID)


## this variable is then taken into normalisation, or DESeq, for all BAITs
# rows are named "<baitID>-<preyID>", columns are the samples
DESeqData <-
  union %>%
  mutate(combo = paste0(baitID, "-", preyID)) %>%
  dplyr::select(-baitID, -preyID) %>%
  tibble::column_to_rownames("combo")

## DESeqData, POSSIBLY USEFUL so saving
# save this table for use later
saveRDS(DESeqData, paste0(variablesFolder, "/DESeqData_", matrixType, ".rds"))
# the text copy carries the bait-prey ID as its first column; writing with
# row.names = FALSE alone would drop the IDs and leave counts unidentifiable
write.table(tibble::rownames_to_column(DESeqData, "baitID-preyID"),
            file = paste0(variablesFolder, "/DESeqData_", matrixType, ".txt"),
            col.names = TRUE,
            row.names = FALSE,
            quote = FALSE,
            sep = "\t")


##############
###  DESeq ###
##############

## I am performing DEseq on ALL the interactions together, not just bait by bait!!!
cat("Moving onto statistical testing...", "\n")

## load data cleaned above
DESeqData <- readRDS(paste0(variablesFolder, "/DESeqData_", matrixType, ".rds"))

# found better clustering (PCA) with filtered DESeqData
# also much faster calculations at each step with filtered DESeqData

##  fitType=local on filtered data gives most diff int frags

# keep only interactions with at least one read in EVERY sample.
# NOTE: this is applied across all samples in DESeqData, i.e. before the
# WT columns are selected below.
filtered <- DESeqData[rowSums(DESeqData == 0) == 0, , drop = FALSE]
# clean up colnames
colnames(filtered) <- gsub("-", ".", colnames(filtered))


## get just the wt samples before doing DESeq
# (plain assignment: the %<>% operator is not attached by library(tidyverse))
filtered <-
  filtered %>%
  dplyr::select(matches("WT"))

if (ncol(filtered) == 0L) {
  stop("No sample columns matching \"WT\" were found", call. = FALSE)
}


#### SIMPLE ONE VARIABLE TESTS ####

## working 24th March 2020

# good between cell types (within genotype) diff Int Frags
# not so good between genotypes (within cell types)


### setting up variables used to define the DESeq comparison
# sample names are split on "_"; the third piece is discarded (NA) and the
# first two are combined into the grouping variable
cond_df <-
  data.frame(conditions = colnames(filtered)) %>%
  separate(conditions, into = c("cellType", "genoType", NA, "XP"), sep = "_") %>%
  mutate(combined = paste0(cellType, "_", genoType))

# set up the condition factor variable used in DESeq
condition <- factor(cond_df$combined)


# set up the colData for DESeq
colData <- data.frame(row.names = colnames(filtered),
                      combined = condition,
                      libType = "Tiled CapC")


# do the DESeq calculations
dds <- DESeqDataSetFromMatrix(countData = filtered,
                              colData = colData,
                              design = ~ combined)

dds <- DESeq(dds, fitType = "local")

# save this variable for use later
#saveRDS(dds, paste0(variablesFolder, "/Combined_filtered_local_dds.rds"))

## to load it back
#dds <- readRDS(paste0(variablesFolder, "/Combined_filtered_local_dds.rds"))

# use a custom function to plot the sample-distance heatmap and PCA
plotting_QC(dds, "~combined_filtered_local", "combined", condition, outDirPCAb)


#### some more checking

#plotDispEsts(dds)


#### PAIRWISE TESTS ####

# set up a table of conditions: one row per level of the "combined" factor,
# split back into its tissue and genotype parts
tests <-
  data.frame(test = levels(factor(cond_df$combined))) %>%
  mutate(test2 = test) %>%
  separate(test2, into = c("Tissues", "Genotypes"), sep = "_")

# Run pairTestDESeq for every pair of the given condition levels.
# Groups with fewer than two levels are skipped: there is nothing to compare,
# and passing a missing level on to results() would be an error. For a group
# of three levels the pairs are tested in the order 1v2, 1v3, 2v3.
runAllPairs <- function(conditionLevels) {
  conditionLevels <- as.character(conditionLevels)
  if (length(conditionLevels) < 2L) {
    return(invisible(NULL))
  }
  for (thisPair in combn(conditionLevels, 2L, simplify = FALSE)) {
    pairTestDESeq(dds, "~combined_filtered_local", "combined",
                  thisPair[1], thisPair[2], FDRLevel, outDirData, outDirMA)
  }
  invisible(NULL)
}


#### testing within each tissue but between genotypes
# (nothing is tested for a tissue that has only one genotype in dds, as is
# expected when only WT samples were kept)

for (tissue in unique(tests$Tissues)){
  runAllPairs(tests$test[tests$Tissues == tissue])
}


#### testing within each genotype but between tissues

for (genotype in unique(tests$Genotypes)){
  runAllPairs(tests$test[tests$Genotypes == genotype])
}