In [None]:
import functools
import operator
import os

import anndata
import faiss
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy as sc
import scipy.stats
import seaborn as sns
import sklearn.cluster
import sklearn.linear_model
import sklearn.preprocessing
import sklearn.metrics
import yaml
from matplotlib import rcParams
from networkx.algorithms.bipartite import biadjacency_matrix

import scglue
import utils

In [None]:
# Notebook-wide configuration: plotting style, distance bins, output dir, seed.
scglue.plot.set_publication_params()

# Bin edges for peak-gene genomic distance, in KB (0, 25, ..., 150).
DIST_BINS = list(range(0, 175, 25))

# Every figure saved by this notebook is written below PATH.
PATH = "s03_peak_gene_validation"
os.makedirs(PATH, exist_ok=True)

# Seed NumPy's global RNG so the np.random.choice draw further down is
# repeatable on a fresh Restart & Run All.
np.random.seed(0)

In [None]:
# Load the display palette from the evaluation config, then register colours
# for the two extra methods compared in this notebook.
# NOTE(review): yaml.Loader can construct arbitrary Python objects. That is
# tolerable only because the file is a repo-local config; consider
# yaml.safe_load if the palette section is plain scalars.
with open("../../evaluation/config/display.yaml", "r") as f:
    display_config = yaml.load(f, Loader=yaml.Loader)
palette = display_config["palette"]
palette.update({"Cicero": "#8C564B", "Spearman": "#17BECF"})

# Read data

In [None]:
# Load the RNA and ATAC AnnData objects written under s01_preprocessing.
rna, atac = (
    anndata.read_h5ad("s01_preprocessing/rna.h5ad"),
    anndata.read_h5ad("s01_preprocessing/atac.h5ad"),
)

In [None]:
# Keep only features flagged `d_highly_variable`, carrying each feature's name
# along as a `name` column before wrapping the table as a Bed object.
rna_selected = rna.var.assign(name=rna.var_names).query("d_highly_variable")
atac_selected = atac.var.assign(name=atac.var_names).query("d_highly_variable")
genes = scglue.genomics.Bed(rna_selected)
peaks = scglue.genomics.Bed(atac_selected)

# Promoter intervals are built from the strand-aware gene start sites.
# NOTE(review): expand(2000, 0) presumably means 2 kb upstream / 0 bp
# downstream of the TSS -- confirm against scglue.genomics.Bed.expand.
tss = genes.strand_specific_start_site()
promoters = tss.expand(2000, 0)

In [None]:
# Load the three gene-peak graphs (distance, pcHi-C, eQTL) from s01_preprocessing.
dist_graph, pchic_graph, eqtl_graph = map(nx.read_graphml, (
    "s01_preprocessing/dist.graphml.gz",
    "s01_preprocessing/pchic.graphml.gz",
    "s01_preprocessing/eqtl.graphml.gz",
))

# Gene-peak linkage

## Distance

In [None]:
# Dense genes x peaks table holding the `dist` edge attribute of the distance
# graph, with rows/columns ordered like `genes` / `peaks`.
dist_sparse = biadjacency_matrix(
    dist_graph, row_order=genes.index, column_order=peaks.index,
    weight="dist", dtype=np.float32
)
dist = pd.DataFrame(dist_sparse.toarray(), index=genes.index, columns=peaks.index)

## pcHi-C

In [None]:
# Dense genes x peaks table built from the pcHi-C graph; weight=None means the
# edge attributes are not used as values.
pchic_sparse = biadjacency_matrix(
    pchic_graph, row_order=genes.index, column_order=peaks.index,
    weight=None, dtype=np.float32
)
pchic = pd.DataFrame(pchic_sparse.toarray(), index=genes.index, columns=peaks.index)

## eQTL

In [None]:
# Dense genes x peaks table built from the eQTL graph; weight=None means the
# edge attributes are not used as values.
eqtl_sparse = biadjacency_matrix(
    eqtl_graph, row_order=genes.index, column_order=peaks.index,
    weight=None, dtype=np.float32
)
eqtl = pd.DataFrame(eqtl_sparse.toarray(), index=genes.index, columns=peaks.index)

## Correlation

In [None]:
# Gene-peak correlation over 200 pseudocells, using the "X_pca" representation.
# The helper receives throwaway copies so `rna` / `atac` themselves are never
# handed to it; the copies are released as soon as the call returns.
# Missing correlations are replaced by 0 and the result is stored as float32.
corr = (
    utils.pseudocell_corr(
        rna.copy(), atac.copy(), "X_pca",
        n_pseudocells=200, genes=genes.index, peaks=peaks.index, paired=True
    )
    .fillna(0)
    .astype(np.float32)
)

## GLUE

In [None]:
# One genes x peaks score matrix per GLUE training seed, then their mean.
glue_list = []
for i in range(4):
    feature_embedding = pd.read_csv(
        f"s02_glue/prior:d/seed:{i}/feature_embeddings.csv",
        header=None, index_col=0
    )
    # Rows are scaled to unit L2 norm, so the dot products below are cosine
    # similarities between gene and peak embeddings.
    feature_embedding /= np.linalg.norm(feature_embedding, axis=1, keepdims=True)
    gene_vectors = feature_embedding.loc[genes.index].to_numpy()
    peak_vectors = feature_embedding.loc[peaks.index].to_numpy()
    glue_list.append(
        pd.DataFrame(
            gene_vectors @ peak_vectors.T, index=genes.index, columns=peaks.index
        ).astype(np.float32)
    )

# Element-wise mean across seeds (summed left to right, then divided).
glue = glue_list[0]
for item in glue_list[1:]:
    glue = glue + item
glue = glue / len(glue_list)
glue.iloc[:5, :5]

In [None]:
# Histogram of 10000 GLUE scores drawn with replacement from the full matrix.
glue_sample = np.random.choice(glue.to_numpy().ravel(), size=10000, replace=True)
ax = sns.histplot(glue_sample)

## Cicero

In [None]:
def _cicero_peak_to_region(peak_id):
    """Convert a Cicero peak ID ``chrom_start_end`` into ``chrom:start-end``.

    The ID is split from the right so that chromosome / contig names which
    themselves contain underscores (e.g. ``chrUn_KI270742v1``) stay intact;
    splitting from the left and taking fields 0-2 would mangle those IDs.

    Raises ValueError if the ID has fewer than three ``_``-separated fields.
    """
    chrom, start, end = peak_id.rsplit("_", 2)
    return f"{chrom}:{start}-{end}"


# Load Cicero co-accessibility links, dropping rows with missing values and
# links whose score is exactly zero.
cicero = pd.read_csv("e01_cicero/cicero_conns.csv.gz").dropna().query("coaccess != 0")

# Rewrite both peak columns from Cicero's `chrom_start_end` naming to the
# `chrom:start-end` naming.
for column in ("Peak1", "Peak2"):
    cicero[column] = cicero[column].map(_cicero_peak_to_region)
cicero.head()

In [None]:
# Peaks overlapping a gene promoter (window size 0), as a Peak1 -> Gene table.
# Wrapping the window graph in nx.DiGraph leaves at most one edge per pair.
promoter_overlap_graph = nx.DiGraph(
    scglue.genomics.window_graph(peaks, promoters, 0)
)
promoter_overlap_edges = nx.to_pandas_edgelist(
    promoter_overlap_graph, source="Peak1", target="Gene"
)
# Only the two endpoint columns are needed; edge attributes are discarded.
peak_gene_mapping = promoter_overlap_edges[["Peak1", "Gene"]]

In [None]:
# Turn peak-peak Cicero links into a genes x peaks score matrix: Peak1 is
# replaced by the gene(s) whose promoter it overlaps, and Peak2 becomes the
# candidate regulatory peak.
# NOTE(review): this cell consumes the link table `cicero` and rebinds the same
# name to the final matrix, so it cannot be re-run on its own without first
# re-running the cell that loads the Cicero links.
cicero_links = pd.merge(cicero, peak_gene_mapping).dropna()

# NOTE(review): with a DiGraph, repeated (Gene, Peak2) pairs collapse to a
# single edge, so only one `coaccess` value survives per pair -- confirm that
# is the intended aggregation.
cicero_graph = nx.from_pandas_edgelist(
    cicero_links, source="Gene", target="Peak2",
    edge_attr="coaccess", create_using=nx.DiGraph
)
cicero_sparse = biadjacency_matrix(
    cicero_graph, row_order=genes.index, column_order=peaks.index,
    weight="coaccess", dtype=np.float32
)
cicero = pd.DataFrame(cicero_sparse.toarray(), index=genes.index, columns=peaks.index)
del cicero_links, cicero_graph, cicero_sparse

# Windowing

In [None]:
# Sparse genes x peaks mask of the pairs connected in the distance graph,
# converted to float32 COO so its `.row` / `.col` arrays enumerate the
# retained pairs.
window_adjacency = biadjacency_matrix(
    dist_graph, row_order=genes.index, column_order=peaks.index, weight=None
)
window = window_adjacency.astype(np.float32).tocoo()

In [None]:
# Restrict every dense score table to the gene-peak pairs present in `window`
# by element-wise multiplication with the sparse mask.
dist_mat = window.multiply(dist)
pchic_mat = window.multiply(pchic)
eqtl_mat = window.multiply(eqtl)
cicero_mat = window.multiply(cicero)
corr_mat = window.multiply(corr)

glue_mat = window.multiply(glue)
glue_mat_list = [window.multiply(item) for item in glue_list]

# The `.data` vectors of these matrices are later placed side by side in one
# table, so each matrix must list exactly the same (row, col) pairs in the same
# order as `window`.  The per-seed GLUE matrices are read the same way and are
# therefore checked too.  np.array_equal also fails cleanly if the number of
# stored entries differs, instead of relying on `==` broadcasting.
for mat in (
    dist_mat, pchic_mat, eqtl_mat, cicero_mat, corr_mat, glue_mat, *glue_mat_list
):
    assert np.array_equal(window.row, mat.row)
    assert np.array_equal(window.col, mat.col)

In [None]:
# One row per windowed gene-peak pair; each column is the `.data` vector of
# the corresponding masked matrix.
pair_columns = {
    "dist": dist_mat.data.astype(int),
    "pchic": pchic_mat.data.astype(bool),
    "eqtl": eqtl_mat.data.astype(bool),
    "cicero": cicero_mat.data,
    "corr": corr_mat.data,
    "glue": glue_mat.data,
}
# Per-seed GLUE scores follow the seed-averaged score as glue0, glue1, ...
pair_columns.update(
    (f"glue{i}", item.data) for i, item in enumerate(glue_mat_list)
)
df = pd.DataFrame(pair_columns)

# Fixed category order so False/True are both always present, False first.
for evidence in ("pchic", "eqtl"):
    df[evidence] = pd.Categorical(df[evidence], categories=[False, True])
df["dist_bin"] = utils.make_dist_bins(df["dist"], bins=DIST_BINS)

# Comparisons

## Different random seeds

In [None]:
def corrfunc(x, y, ax=None, **kwargs):
    r"""
    Write the Pearson correlation of ``x`` and ``y`` onto an axes as ``r = ...``.

    The text is placed at (0.1, 0.9) in axes coordinates. When ``ax`` is not
    given, the current matplotlib axes is used. Extra keyword arguments are
    accepted and ignored, so the function can be used as a grid-mapping
    callback.

    Adapted from https://stackoverflow.com/questions/50832204/show-correlation-values-in-pairplot-using-seaborn-in-python
    """
    if not ax:
        ax = plt.gca()
    # Only the correlation coefficient is needed; the p-value is discarded.
    r = scipy.stats.pearsonr(x, y)[0]
    ax.annotate(f'r = {r:.3f}', xy=(0.1, 0.9), xycoords=ax.transAxes)

In [None]:
# Pairwise comparison of the per-seed GLUE scores on a 30% random subsample,
# with the Pearson r annotated on every off-diagonal panel.
seed_columns = [f"glue{i}" for i in range(4)]
seed_scores = (
    df.loc[:, seed_columns]
    .rename(columns=lambda x: x.replace("glue", "Seed = "))
    .sample(frac=0.3)
)
g = sns.pairplot(
    seed_scores,
    diag_kind="kde", height=2,
    plot_kws={"s": 1, "edgecolor": None, "alpha": 0.2, "rasterized": True}
)
g.map_lower(corrfunc)
g.map_upper(corrfunc)
g.fig.savefig(f"{PATH}/glue_pairplot.pdf")

## GLUE vs genomic distance

In [None]:
# GLUE score per genomic-distance bin (30% random subsample of the pairs),
# drawn as grey boxes with black outlines and a white triangle at the mean.
ax = sns.boxplot(
    data=df.sample(frac=0.3), x="dist_bin", y="glue",
    color="grey", width=0.7, showmeans=True,
    meanprops={"marker": "^", "markerfacecolor": "white", "markeredgecolor": "black"},
    boxprops={"edgecolor": "black"},
    medianprops={"color": "black"},
    whiskerprops={"color": "black"},
    capprops={"color": "black"},
    flierprops={
        "marker": ".", "markerfacecolor": "black",
        "markeredgecolor": "none", "markersize": 3
    }
)
ax.set(xlabel="Genomic distance", ylabel="GLUE regulatory score")
plt.setp(ax.get_xticklabels(), rotation=67.5)
for side in ("top", "right"):
    ax.spines[side].set_visible(False)
ax.get_figure().savefig(f"{PATH}/dist_binned_glue.pdf")

## GLUE vs correlation

In [None]:
# Joint distribution of correlation vs GLUE score, split by pcHi-C support:
# per-group KDE contours with the raw points scattered on the joint axes.
pchic_sample = df.sample(frac=0.3)
g = sns.jointplot(
    data=pchic_sample, x="corr", y="glue", hue="pchic",
    kind="kde", height=5,
    joint_kws={"common_norm": False, "levels": 6},
    marginal_kws={"common_norm": False, "fill": True}
)
g.plot_joint(sns.scatterplot, s=1, edgecolor=None, alpha=0.5, rasterized=True)
g.ax_joint.set(xlabel="Spearman correlation", ylabel="GLUE regulatory score")
g.ax_joint.get_legend().set_title("pcHi-C")
g.fig.savefig(f"{PATH}/corr_glue_pchic.pdf")

In [None]:
# Joint distribution of correlation vs GLUE score, split by eQTL support:
# per-group KDE contours with the raw points scattered on the joint axes.
eqtl_sample = df.sample(frac=0.3)
g = sns.jointplot(
    data=eqtl_sample, x="corr", y="glue", hue="eqtl",
    kind="kde", height=5,
    joint_kws={"common_norm": False, "levels": 6},
    marginal_kws={"common_norm": False, "fill": True}
)
g.plot_joint(sns.scatterplot, s=1, edgecolor=None, alpha=0.5, rasterized=True)
g.ax_joint.set(xlabel="Spearman correlation", ylabel="GLUE regulatory score")
g.ax_joint.get_legend().set_title("eQTL")
g.fig.savefig(f"{PATH}/corr_glue_eqtl.pdf")

In [None]:
# Rank correlation between the pseudocell correlation and the GLUE score over
# all windowed gene-peak pairs (displayed as the cell output).
scipy.stats.spearmanr(df.loc[:, "corr"], df.loc[:, "glue"])

## GLUE vs pcHi-C

In [None]:
# GLUE score per distance bin, split by pcHi-C support; the legend is placed
# outside the axes, to the right.
g = utils.boxplot(data=df, x="dist_bin", y="glue", hue="pchic")
joint_ax = g.ax_joint
joint_ax.legend(
    loc="center left", bbox_to_anchor=(1.25, 0.5), frameon=False, title="pcHi-C"
)
joint_ax.set(xlabel="Genomic distance", ylabel="GLUE regulatory score")
plt.setp(joint_ax.get_xticklabels(), rotation=67.5)
g.fig.savefig(f"{PATH}/dist_binned_glue_pchic.pdf")

## GLUE vs eQTL

In [None]:
# GLUE score per distance bin, split by eQTL support; the legend is placed
# outside the axes, to the right.
g = utils.boxplot(data=df, x="dist_bin", y="glue", hue="eqtl")
joint_ax = g.ax_joint
joint_ax.legend(
    loc="center left", bbox_to_anchor=(1.25, 0.5), frameon=False, title="eQTL"
)
joint_ax.set(xlabel="Genomic distance", ylabel="GLUE regulatory score")
plt.setp(joint_ax.get_xticklabels(), rotation=67.5)
g.fig.savefig(f"{PATH}/dist_binned_glue_eqtl.pdf")

# ROC

In [None]:
# Default figure size for figures created from here on (the ROC panels).
rcParams.update({"figure.figsize": (4, 4)})

In [None]:
# ROC comparison of Cicero, Spearman correlation and GLUE at predicting
# pcHi-C-supported gene-peak pairs.
pchic_true = df["pchic"].astype(bool)

cicero_auc = sklearn.metrics.roc_auc_score(pchic_true, df["cicero"])
corr_auc = sklearn.metrics.roc_auc_score(pchic_true, df["corr"])
glue_auc = sklearn.metrics.roc_auc_score(pchic_true, df["glue"])

# All three curves share one axes: the first call creates it, the others reuse it.
ax = scglue.plot.roc(
    pchic_true, df["cicero"],
    label=f"Cicero (AUC = {cicero_auc:.3f})", color=palette["Cicero"]
)
ax = scglue.plot.roc(
    pchic_true, df["corr"],
    label=f"Spearman (AUC = {corr_auc:.3f})", color=palette["Spearman"], ax=ax
)
ax = scglue.plot.roc(
    pchic_true, df["glue"],
    label=f"GLUE (AUC = {glue_auc:.3f})", color=palette["GLUE"], ax=ax
)

ax.set_title("pcHi-C prediction")
# Dashed diagonal through (0, 0) and (1, 1) as the chance-level reference.
ax.axline((0, 0), (1, 1), ls="--", c="grey")
for side in ("top", "right"):
    ax.spines[side].set_visible(False)
ax.legend(loc="center left", bbox_to_anchor=(1.05, 0.5), frameon=False)
ax.get_figure().savefig(f"{PATH}/pchic_roc.pdf")

In [None]:
# ROC comparison of Cicero, Spearman correlation and GLUE at predicting
# eQTL-supported gene-peak pairs.
eqtl_true = df["eqtl"].astype(bool)

cicero_auc = sklearn.metrics.roc_auc_score(eqtl_true, df["cicero"])
corr_auc = sklearn.metrics.roc_auc_score(eqtl_true, df["corr"])
glue_auc = sklearn.metrics.roc_auc_score(eqtl_true, df["glue"])

# All three curves share one axes: the first call creates it, the others reuse it.
ax = scglue.plot.roc(
    eqtl_true, df["cicero"],
    label=f"Cicero (AUC = {cicero_auc:.3f})", color=palette["Cicero"]
)
ax = scglue.plot.roc(
    eqtl_true, df["corr"],
    label=f"Spearman (AUC = {corr_auc:.3f})", color=palette["Spearman"], ax=ax
)
ax = scglue.plot.roc(
    eqtl_true, df["glue"],
    label=f"GLUE (AUC = {glue_auc:.3f})", color=palette["GLUE"], ax=ax
)

ax.set_title("eQTL prediction")
# Dashed diagonal through (0, 0) and (1, 1) as the chance-level reference.
ax.axline((0, 0), (1, 1), ls="--", c="grey")
for side in ("top", "right"):
    ax.spines[side].set_visible(False)
ax.legend(loc="center left", bbox_to_anchor=(1.05, 0.5), frameon=False)
ax.get_figure().savefig(f"{PATH}/eqtl_roc.pdf")