In [None]:
import os
import json
import collections
import numpy as np
import pandas as pd
import scipy.stats
import matplotlib.pyplot as plt
import seaborn as sns
import Cell_BLAST as cb
import exputils

In [None]:
# Restrict CUDA to the device returned by exputils.pick_gpu_lowest_memory()
# (presumably the GPU with the least memory in use -- see exputils to confirm).
os.environ["CUDA_VISIBLE_DEVICES"] = exputils.pick_gpu_lowest_memory()

# Cell BLAST global settings: fixed seed and number of parallel jobs.
cb.config.RANDOM_SEED = 0
cb.config.N_JOBS = 4

# Figure settings shared by every plot in this notebook.
plt.rcParams['svg.fonttype'] = "none"
plt.rcParams['font.family'] = "Arial"

# Reference datasets (also used as the label pool for the entropy computation)
# and the query datasets whose hits are analysed.
REFS = ["Baron_human", "Xin_2016", "Lawlor"]
QUERIES = ["Muraro", "Segerstolpe", "Enge"]

# Output directory for the figures. `exist_ok=True` keeps the cell re-runnable
# and avoids the check-then-create race of `if not os.path.exists(...)`.
PATH = "./query_bias"
os.makedirs(PATH, exist_ok=True)

In [None]:
def entropy(x, pool):
    """Entropy of the label composition of every row of ``x``.

    For each row, the entries equal to each label in ``pool`` are counted and
    the entropy of the resulting distribution over ``pool`` is computed with
    ``scipy.stats.entropy`` (natural logarithm).

    Parameters
    ----------
    x : array-like, shape (n_rows, n_hits)
        Labels of the hits obtained for each row. Entries that do not appear
        in ``pool`` do not contribute to the distribution.
    pool : sequence
        Labels to count.

    Returns
    -------
    numpy.ndarray, shape (n_rows,)
        One entropy value per row; NaN for a row in which no entry matches
        any label of ``pool``. An input with zero rows yields an empty array.
    """
    x = np.expand_dims(x, axis=2)                # (n_rows, n_hits, 1)
    pool = np.array(pool).reshape((1, 1, -1))    # (1, 1, n_pool)
    counts = (x == pool).sum(axis=1)             # (n_rows, n_pool)
    if counts.shape[0] == 0:
        # np.apply_along_axis raises when an iteration dimension is empty.
        return np.zeros(0, dtype=float)
    # Divide by the number of hits per row (not by the pool size) so that
    # `prob` really is a per-row label frequency. scipy.stats.entropy
    # renormalises its input, so the returned values are unaffected.
    n_hits = max(x.shape[1], 1)
    prob = counts / n_hits
    return np.apply_along_axis(scipy.stats.entropy, axis=1, arr=prob)


# Per-method entropy vectors, kept in insertion order for plotting.
entropy_dict = collections.OrderedDict()

In [None]:
# Load the saved BLAST object. Its reference data (`blast.ref`) supplies the
# "dataset_name" of reference cells when hit indices are looked up later on.
blast = cb.blast.BLAST.load("../../Results/Cell_BLAST/Pancreas/seed_0/blast")

## scmap

In [None]:
# Collect the scmap_cell similarities of every query dataset against each
# reference dataset, plus a same-shaped array naming the reference dataset each
# similarity came from. Rows presumably correspond to query cells (the arrays
# are joined along axis 1 across references and along axis 0 across queries).
scmap_sim, scmap_ds = [], []
for qds in QUERIES:
    # Iterate over REFS instead of repeating the dataset names, so the
    # references read here cannot drift from the pool given to `entropy`.
    _scmap_sim = collections.OrderedDict([
        (ref, cb.data.read_hybrid_path(
            "../../Results/scmap/Pancreas/seed_0/%s/result.h5//scmap_cell/nn/%s/similarities" % (qds, ref)))
        for ref in REFS
    ])
    _scmap_ds = collections.OrderedDict([
        (key, np.tile(key, val.shape))
        for key, val in _scmap_sim.items()
    ])
    scmap_sim.append(np.concatenate(list(_scmap_sim.values()), axis=1))
    scmap_ds.append(np.concatenate(list(_scmap_ds.values()), axis=1))
scmap_sim = np.concatenate(scmap_sim, axis=0)
scmap_ds = np.concatenate(scmap_ds, axis=0)

# Rank the candidates of each row by decreasing similarity and keep the dataset
# names of the 10 most similar ones.
# NOTE(review): np.argsort places NaN last, so after the reversal a NaN
# similarity would rank first -- confirm the scmap output contains no NaN.
_scmap_order = np.argsort(scmap_sim, axis=1)[:, ::-1][:, :10]
_scmap_rows = np.arange(scmap_ds.shape[0])[:, np.newaxis]
scmap_ds = scmap_ds[_scmap_rows, _scmap_order]

In [None]:
# Sanity check: show the ranked dataset labels of the scmap hits and the array shape.
scmap_ds, scmap_ds.shape

In [None]:
# Entropy of the reference-dataset labels among the scmap hits, one value per row.
entropy_dict["scmap"] = entropy(scmap_ds, REFS)

## CellFishing.jl

In [None]:
# Read the CellFishing.jl results of every query dataset and stack them:
# `cf_dist` holds the stored Hamming distances, `cf_ds` the reference dataset
# name of each returned hit.
_ref_dataset_names = blast.ref.obs["dataset_name"].values

cf_dist = np.concatenate([
    cb.data.read_hybrid_path(
        "../../Results/CellFishing.jl/Pancreas/seed_0/%s/result.h5//hammingdistances" % qds)
    for qds in QUERIES
], axis=0)

# The stored indexes are shifted down by one before indexing the reference
# (presumably because CellFishing.jl reports 1-based positions -- TODO confirm,
# and confirm they refer to the same cell order as `blast.ref`).
cf_ds = np.concatenate([
    _ref_dataset_names[cb.data.read_hybrid_path(
        "../../Results/CellFishing.jl/Pancreas/seed_0/%s/result.h5//indexes" % qds) - 1]
    for qds in QUERIES
], axis=0)

In [None]:
# Sanity check: show the dataset labels of the CellFishing.jl hits and the array shape.
cf_ds, cf_ds.shape

In [None]:
# Entropy of the reference-dataset labels among the CellFishing.jl hits, one value per row.
entropy_dict["CellFishing.jl"] = entropy(cf_ds, REFS)

## Cell BLAST

In [None]:
# For every query dataset: query the BLAST object, order each cell's hits by
# increasing distance, and keep the reference dataset names of the first 10.
_ref_dataset_names = blast.ref.obs["dataset_name"].values
_cb_ds_per_query = []
for qds in QUERIES:
    query = cb.data.ExprDataSet.read_dataset("../../Datasets/data/%s/data.h5" % qds)
    hits = blast.query(query, n_neighbors=10).reconcile_models()

    # Reorder the hit indices of each cell in place, closest hit first.
    n_cells = hits.dist.shape[0]
    for row in range(n_cells):
        by_distance = np.argsort(hits.dist[row])
        hits.hits[row] = hits.hits[row][by_distance]

    # Translate the leading 10 hit indices of every cell into dataset names.
    top_names = [_ref_dataset_names[cell_hits[0:10]] for cell_hits in hits.hits]
    _cb_ds_per_query.append(np.stack(top_names, axis=0))
cb_ds = np.concatenate(_cb_ds_per_query, axis=0)

In [None]:
# Sanity check: show the dataset labels of the Cell BLAST hits and the array shape.
cb_ds, cb_ds.shape

In [None]:
# Entropy of the reference-dataset labels among the Cell BLAST hits, one value per row.
entropy_dict["Cell BLAST"] = entropy(cb_ds, REFS)

## Comparison

In [None]:
# Long-format table for plotting: one row per entropy value, labelled with the
# method that produced it.
# `ignore_index=True` gives the combined frame a unique 0..n-1 index instead of
# repeating each per-method index, which downstream alignment could trip over.
df = pd.concat([
    pd.DataFrame({
        "Method": np.repeat(key, val.size),
        "Query hit entropy": val
    })
    for key, val in entropy_dict.items()
], ignore_index=True)

# Fix the category order to the insertion order of `entropy_dict`; a concrete
# list is passed rather than a dict keys view.
df["Method"] = pd.Categorical(df["Method"], categories=list(entropy_dict.keys()))

In [None]:
# Read the palette used for the `palette=` argument of the violin plot.
_palette_file = "../../Evaluation/palette_method.json"
with open(_palette_file, "r") as palette_handle:
    palette = json.load(palette_handle)

In [None]:
# Violin plot (with inner box plot) of the query hit entropy for each method,
# saved as a PDF in the output directory.
fig, ax = plt.subplots(figsize=(4.0, 4.0))
_violin_style = dict(inner="box", linewidth=0.6, palette=palette)
ax = sns.violinplot(
    data=df, x="Method", y="Query hit entropy",
    ax=ax, **_violin_style
)

# Hide the right and top frame lines.
for _side in ("right", "top"):
    ax.spines[_side].set_visible(False)

_entropy_pdf = os.path.join(PATH, "entropy.pdf")
fig.savefig(_entropy_pdf, bbox_inches="tight")