# GoShifter

**Created**: 1 July 2022

## Environment

In [1]:
library(tidyverse)
library(data.table)
library(ComplexHeatmap)
library(circlize)

# Work from the project root so that the relative paths used in this notebook resolve.
setwd("~/eQTL_pQTL_Characterization/")

# Run the project's ggplot theme script (presumably defines the shared plotting theme).
source("03_Functional_Interpretation/scripts/utils/ggplot_theme.R")

── [1mAttaching packages[22m ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.1 ──

[32m✔[39m [34mggplot2[39m 3.3.6     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.1.7     [32m✔[39m [34mdplyr  [39m 1.0.8
[32m✔[39m [34mtidyr  [39m 1.2.0     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 2.1.1     [32m✔[39m [34mforcats[39m 0.5.1

── [1mConflicts[22m ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()


Attaching package: ‘data.table’


The following objects are masked from ‘package:dplyr’

## Load Data

In [2]:
# Comma-separated metadata table (ATAC-seq reads, judging by the file name);
# the path is relative to the working directory.
meta <- read.csv(
    file=file.path("03_Functional_Interpretation", "metadata", "reads_atac_seq.txt")
)

In [3]:
# Directory that holds the GoShifter output tables read below.
files.dir <- "/nfs/users/nfs_n/nm18/gains_team282/epigenetics/enrichment/go_shifter/conditional_snps_ld/"

# Only the files whose name contains "overlap_scores" are of interest.
files <- list.files(files.dir, pattern="overlap_scores")

# Read every overlap score table, tag its rows with the group encoded in the
# file name, and stack all tables into one data frame.
overlap.scores <- lapply(files, function(file) {
    # Group label = file name without the constant prefix and suffix. The
    # patterns are matched literally (fixed=TRUE) so that the "." in ".tsv"
    # is not interpreted as a regular-expression wildcard.
    group.label <- gsub("_overlap_scores.tsv", "", file, fixed=TRUE)
    group.label <- gsub("conditional_snps_ld_", "", group.label, fixed=TRUE)

    fread(file.path(files.dir, file)) %>%
        as.data.frame() %>%
        dplyr::mutate(Group=group.label)
}) %>%
    do.call(rbind, .) %>%
    # Rows without an observed overlap (Overlap != 1) get a score of 1.
    dplyr::mutate(Overlap_Score=ifelse(Overlap == 1, Overlap_Score, 1))

In [4]:
# Whitespace-delimited SNP list (located under go_shifter/snp_lists); the first
# line of the file holds the column names. TRUE is spelled out because `T` is
# an ordinary variable that can be reassigned.
loci <- read.table(
    "/nfs/users/nfs_n/nm18/gains_team282/epigenetics/enrichment/go_shifter/snp_lists/conditional_snps_ld.txt",
    header=TRUE
)

In [5]:
# Conditional cis-eQTL results table (per the file name), read with the
# read.table() defaults.
c.cis.eqtl <- read.table(
    file="/nfs/users/nfs_n/nm18/gains_team282/eqtl/cisresults/conditionalanalysis/conditional_eQTL_results_final.txt"
)

In [6]:
# Gene annotation table reduced to the identifier and name columns.
gene.info <- dplyr::select(
    read.table("/nfs/team282/data/gains_team282/gene_info_864_20412_hla.txt"),
    gene_id, gene_name
)

In [7]:
# Cell type -> lineage table from the CSV, with one extra row appended that
# assigns Neutrophils to the MYELOID lineage.
lineages <- dplyr::bind_rows(
    read.csv("03_Functional_Interpretation/data/Calderon_et_al_lineages.csv"),
    c(Lineage="MYELOID", Cell_Type="Neutrophils")
)

In [8]:
# Preview the first rows of the lineage table.
head(lineages)

Unnamed: 0_level_0,Lineage,Cell_Type
Unnamed: 0_level_1,<chr>,<chr>
1,B,Bulk_B
2,B,Mem_B
3,B,Naive_B
4,B,Plasmablasts
5,CD8,CD8pos_T
6,CD8,Central_memory_CD8pos_T


## Identify Specificity of Peaks

The "overlap score" is the probability that an observed overlap for a locus would occur by chance. GoShifter calculates it empirically from the number of permutations that also generate the overlap. Thus, a lower overlap score suggests that the overlap occurring in the observed annotation is more unique.

I take the complementary score (so that more interesting loci have higher values). I then use the specificity method implemented in CHEERS (Euclidean normalisation) to identify peaks that are uniquely important to one annotation.

In [33]:
# Locus x cell type matrix for the "-Control" groups, Neutrophil groups excluded.
# Each entry is the complementary score 1 - Overlap_Score.
control.score.mtx <- overlap.scores %>%
    dplyr::filter(grepl("-Control", Group) & !grepl("Neutrophils", Group)) %>%
    dplyr::transmute(Locus, Cell_Type=gsub("-Control", "", Group), Overlap_Score) %>%
    tidyr::spread(Cell_Type, Overlap_Score) %>%
    dplyr::arrange(Locus) %>%
    # Locus moves from a column into the row names, leaving only score columns.
    tibble::column_to_rownames("Locus") %>%
    as.matrix()

control.score.mtx <- 1 - control.score.mtx

In [34]:
# Locus x cell type matrix for the "-Stimulated" groups, Neutrophil groups excluded.
# Each entry is the complementary score 1 - Overlap_Score.
treated.score.mtx <- overlap.scores %>%
    dplyr::filter(grepl("-Stimulated", Group) & !grepl("Neutrophils", Group)) %>%
    dplyr::transmute(Locus, Cell_Type=gsub("-Stimulated", "", Group), Overlap_Score) %>%
    tidyr::spread(Cell_Type, Overlap_Score) %>%
    dplyr::arrange(Locus) %>%
    # Locus moves from a column into the row names, leaving only score columns.
    tibble::column_to_rownames("Locus") %>%
    as.matrix()

treated.score.mtx <- 1 - treated.score.mtx

In [35]:
# Locus x treatment matrix for the Neutrophil groups. The treatment label is the
# group name with the "Neutrophils-" prefix removed.
score.mtx <- overlap.scores %>%
    dplyr::filter(grepl("Neutrophils-", Group)) %>%
    dplyr::mutate(Treatment = gsub("Neutrophils-", "", Group)) %>%
    dplyr::select(Locus, Treatment, Overlap_Score) %>%
    tidyr::spread(Treatment, Overlap_Score) %>%
    # Sort rows by Locus explicitly, exactly as control.score.mtx and
    # treated.score.mtx do: the three matrices are later combined row by row,
    # so their row order must not depend on spread()'s implicit ordering.
    dplyr::arrange(Locus)

# Move Locus into the row names so that only score columns remain.
rownames(score.mtx) <- score.mtx$Locus
score.mtx$Locus <- NULL
# Complementary score: higher values mean a less likely chance overlap.
score.mtx <- 1 - as.matrix(score.mtx)

In [36]:
# Keep a locus only if none of the three matrices has a row sum of zero for it.
# NOTE(review): this compares rows by position, so it assumes the three matrices
# list the same loci in the same order - confirm.
mask <- !(
    rowSums(control.score.mtx) == 0 |
    rowSums(treated.score.mtx) == 0 |
    rowSums(score.mtx) == 0
)

In [37]:
# Restrict all three matrices to the loci selected by `mask`.
# drop=FALSE keeps the result a matrix even when a single row or column
# remains; without it the result would collapse to a plain vector.
control.score.mtx <- control.score.mtx[mask, , drop=FALSE]
treated.score.mtx <- treated.score.mtx[mask, , drop=FALSE]
score.mtx <- score.mtx[mask, , drop=FALSE]

In [45]:
# Hierarchically cluster the loci on their scores across all three matrices
# (columns bound side by side), using the dist() and hclust() defaults.
h <- cbind(control.score.mtx, treated.score.mtx, score.mtx) %>%
    dist() %>%
    hclust()

In [48]:
# Put the rows of every matrix into the leaf order of the clustering `h`, so the
# same row refers to the same locus in all three.
# drop=FALSE keeps the result a matrix even if a matrix has a single column.
control.score.mtx <- control.score.mtx[h$order, , drop=FALSE]
treated.score.mtx <- treated.score.mtx[h$order, , drop=FALSE]
score.mtx <- score.mtx[h$order, , drop=FALSE]

In [49]:
# Named character vector mapping each neutrophil treatment label to the
# experiment group it is displayed under.
neutrophil.map <- local({
    # Group label -> treatment labels belonging to that group.
    groups <- list(
        "Stimulation" = c("BGP", "Control", "FLAG", "HMGB1", "LPS", "LTA", "R848"),
        "S. aureus" = c("SA-1", "SA-3", "SA-5", "WB"),
        "E. coli" = c("EC1h", "EC4h", "noEC1h", "noEC4h")
    )

    # Repeat each group label once per member and name the result by treatment.
    setNames(
        rep(names(groups), times=lengths(groups)),
        unlist(groups, use.names=FALSE)
    )
})

In [54]:
options(repr.plot.width=15, repr.plot.height=24)

# Colour scale shared by all panels: white at score 0, dark blue at score 1.
col_fun <- colorRamp2(c(0, 1), c("white", "royalblue4"))

# Build one score heatmap with the settings common to all three panels.
#   mtx          - numeric score matrix (loci in rows).
#   column.split - one grouping value per column of `mtx`, used to split columns.
# Rows are kept in the order given (no row clustering), so panels placed side
# by side stay aligned. All panels keep the name "Score"; ComplexHeatmap warns
# about the duplicated name when they are combined.
score.heatmap <- function(mtx, column.split) {
    Heatmap(
        mtx, name="Score",
        use_raster=TRUE, col=col_fun,
        cluster_rows=FALSE, show_column_dend=FALSE, show_row_names=FALSE,
        # Reserve enough height for the longest column label.
        column_names_max_height=max_text_width(
            colnames(mtx),
            gp=gpar(fontsize=12)
        ),
        column_split=column.split
    )
}

# Control and stimulated cell types: columns grouped by lineage.
h1 <- score.heatmap(
    control.score.mtx,
    lineages$Lineage[match(colnames(control.score.mtx), lineages$Cell_Type)]
)

h2 <- score.heatmap(
    treated.score.mtx,
    lineages$Lineage[match(colnames(treated.score.mtx), lineages$Cell_Type)]
)

# Neutrophil treatments: columns grouped by experiment.
h3 <- score.heatmap(
    score.mtx,
    neutrophil.map[colnames(score.mtx)]
)

pdf("03_Functional_Interpretation/results/goshifter_score_matrix.pdf", width=15, height=24)
# Draw explicitly: a bare `h1 + h2 + h3` is only rendered through top-level
# auto-printing, which does not happen when this code is sourced or wrapped in
# a function, and would leave the PDF empty.
draw(h1 + h2 + h3)
dev.off()

“Heatmap/annotation names are duplicated: Score”
“Heatmap/annotation names are duplicated: Score, Score”
