# Case study: tracheal ionocytes

## Preparation

In [None]:
import collections
import functools
import json
import os
import subprocess
import sys

import Cell_BLAST as cb
import community
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import plotly.io
import scipy.cluster
import scipy.sparse
import scipy.stats
import seaborn as sns
import sklearn.cluster
import sklearn.decomposition
import sklearn.manifold
import sklearn.neighbors

sys.path.insert(0, "../../../Evaluation")
import utils

In [None]:
# Restrict CUDA to the device reported by utils.pick_gpu_lowest_memory().
os.environ["CUDA_VISIBLE_DEVICES"] = utils.pick_gpu_lowest_memory()
# Global Cell BLAST settings: fixed random seed; N_JOBS presumably sets the
# number of parallel workers (confirm against the Cell BLAST documentation).
cb.config.RANDOM_SEED = 0
cb.config.N_JOBS = 4
# Figure defaults: keep text as text in SVG output and use Arial everywhere.
plt.rcParams['svg.fonttype'] = "none"
plt.rcParams['font.family'] = "Arial"

In [None]:
def snn(x, n_neighbors=10, metric="minkowski"):
    """Build a shared-nearest-neighbor (SNN) graph.

    Parameters
    ----------
    x
        Samples to build the graph on; passed unchanged to
        ``sklearn.neighbors.kneighbors_graph``.
    n_neighbors
        Number of nearest neighbors per sample in the underlying k-NN graph.
    metric
        Distance metric used for the k-NN search.

    Returns
    -------
    scipy.sparse.lil_matrix
        Symmetric integer matrix whose entry ``[i, j]`` is the number of
        nearest neighbors shared by samples ``i`` and ``j``. The diagonal
        holds each sample's own neighbor count.
    """
    knn_graph = sklearn.neighbors.kneighbors_graph(x, n_neighbors=n_neighbors, metric=metric)
    # The k-NN graph is built in the default connectivity mode (0/1 entries),
    # so the dot product of row i with row j counts their common neighbors.
    # A single sparse product yields every pair at once and replaces the
    # quadratic Python loop over sparse rows.
    shared = knn_graph.dot(knn_graph.T)
    # Builtin ``int`` replaces the ``np.int`` alias, which no longer exists in
    # current NumPy releases.
    return scipy.sparse.lil_matrix(shared.astype(int))

def louvain(g, resolution=1.0, random_state=None):
    """Cluster the nodes of a graph with Louvain community detection.

    Parameters
    ----------
    g
        Anything accepted by the ``networkx.Graph`` constructor (here: the
        sparse matrix returned by ``snn``). Nodes are expected to be labeled
        ``0 .. n-1``, because labels are looked up by position.
    resolution
        Resolution passed to ``community.best_partition``.
    random_state
        Random state passed to ``community.best_partition``.

    Returns
    -------
    numpy.ndarray
        Integer community label of every node, ordered by node index.
    """
    g = nx.Graph(g)
    partition = community.best_partition(
        g, resolution=resolution, random_state=random_state)
    n_nodes = g.number_of_nodes()
    # Explicit integer dtype: unlike np.vectorize, this also works when the
    # graph has no nodes, and the result type does not depend on the first
    # label encountered.
    return np.fromiter(
        (partition[i] for i in range(n_nodes)), dtype=int, count=n_nodes)

In [None]:
# Load the marker genes of each known cell type, preserving the key order of
# the JSON file.
with open("markers.json", "r") as f:
    known_markers = json.load(f, object_pairs_hook=collections.OrderedDict)
# Same mapping without the ionocyte entry. The copy is shallow: the per-type
# values are shared with known_markers, only the key is removed here.
known_markers_noi = known_markers.copy()  # shallow
del known_markers_noi["ionocyte"]

# Cell type names, with and without "ionocyte".
CTs = np.array(list(known_markers.keys()))  # numpy arrays are easier to subset
CTs_noi = np.array(list(known_markers_noi.keys()))  # numpy arrays are easier to subset

In [None]:
# One default seaborn color per known cell type, in marker-file order.
ct_palette = collections.OrderedDict(
    zip(known_markers.keys(), sns.color_palette())
)

# Colors for the non-cell-type outcomes; "unassigned" and "rejected" share one
# color, "ambiguous" gets its own.
rj_palette = collections.OrderedDict()
rj_palette["unassigned"] = sns.color_palette()[8]
rj_palette["rejected"] = sns.color_palette()[8]
rj_palette["ambiguous"] = sns.color_palette()[9]

# Combined lookup: cell types first, then the rejection categories.
palette = collections.OrderedDict(ct_palette)
palette.update(rj_palette)

In [None]:
with open("../../../Evaluation/palette_method.json", "r") as f:
    method_palette = json.load(f)

### Read data

We first use Montoro_10x with ionocytes removed (`Montoro_10x_noi`) as the reference and Plasschaert as the query.

In [None]:
# Reference
# Read the ionocyte-free Montoro_10x dataset, clean it with
# utils.clean_dataset on "cell_ontology_class", and save a local copy
# (presumably the file the scmap run further below reads -- confirm the path).
montoro_10x_noi = cb.data.ExprDataSet.read_dataset("../../../Datasets/data/Montoro_10x_noi/data.h5")
montoro_10x_noi = utils.clean_dataset(montoro_10x_noi, "cell_ontology_class")
montoro_10x_noi.write_dataset("./montoro_10x_noi.h5")

# Query
# Same treatment for the Plasschaert query dataset.
plasschaert = cb.data.ExprDataSet.read_dataset("../../../Datasets/data/Plasschaert/data.h5")
plasschaert = utils.clean_dataset(plasschaert, "cell_ontology_class")
plasschaert.write_dataset("./plasschaert.h5")

# Table passed as the expectation argument to
# cb.metrics.mean_balanced_accuracy when predictions are scored below.
plasschaert2montoronoi_expectation = pd.read_csv("./plasschaert2montoronoi_expectation.csv", index_col=0)

# Displayed as the cell output: shapes of reference and query.
montoro_10x_noi.shape, plasschaert.shape

In [None]:
ionocyte_enrichment = dict()

## Cell BLAST

### Train model

In [None]:
# Train four DIRECTi models on the reference that differ only in their random
# seed (0-3). Each model is written to its own randomly named directory under
# /tmp/cb, and all of them use the reference's "seurat_genes" gene set.
montoro_10x_noi_models = []
for i in range(4):
    path = "/tmp/cb/%s" % cb.utils.rand_hex()
    print("==== Training model %d at %s ====" % (i, path))
    montoro_10x_noi_models.append(cb.directi.fit_DIRECTi(
        montoro_10x_noi, montoro_10x_noi.uns["seurat_genes"],
        latent_dim=10, cat_dim=20, epoch=1000, patience=30,
        random_seed=i, path=path
    ))

### Querying

In [None]:
montoro_10x_noi_blast = cb.blast.BLAST(
    montoro_10x_noi_models, montoro_10x_noi)
montoro_10x_noi_blast.save("./montoro_10x_noi_cb/blast")

In [None]:
montoro_10x_noi_blast = cb.blast.BLAST.load("./montoro_10x_noi_cb/blast")

In [None]:
hits = montoro_10x_noi_blast.query(plasschaert)

In [None]:
# Reconcile the hits across models, filter them on "pval" with a 0.05 cutoff,
# and annotate every query cell with a "cell_ontology_class" prediction.
plasschaert.obs["cb_pred"] = hits.reconcile_models().filter("pval", 0.05).annotate(
    "cell_ontology_class"
)["cell_ontology_class"]
# Cache the predictions on disk; the next cell reloads them from this file.
cb.data.write_hybrid_path(plasschaert.obs["cb_pred"].values, "./plasschaert_cb_pred.h5//prediction")

In [None]:
plasschaert.obs["cb_pred"] = cb.data.read_hybrid_path("./plasschaert_cb_pred.h5//prediction")

In [None]:
fig = cb.blast.sankey(
    plasschaert.obs["cell_ontology_class"], plasschaert.obs["cb_pred"],
    title="Cell BLAST prediction: Plasschaert → Montoro_10x without ionocyte",
    width=660, height=600, tint_cutoff=5, font_size=11
)
plotly.io.write_image(fig, "./plasschaert_cb_sankey.pdf")

In [None]:
mba = cb.metrics.mean_balanced_accuracy(
    plasschaert.obs["cell_ontology_class"],
    plasschaert.obs["cb_pred"],
    plasschaert2montoronoi_expectation
)
print(f"MBA = {mba:.3f}")

In [None]:
# Enrichment ratio of ionocytes among Cell BLAST rejections:
# (share of ionocytes among rejected cells) / (share of ionocytes overall).
ionocyte_mask = plasschaert.obs["cell_ontology_class"] == "ionocyte"
rejected_mask = plasschaert.obs["cb_pred"] == "rejected"
ionocyte_enrichment["cb"] = (
    (ionocyte_mask & rejected_mask).sum() / rejected_mask.sum()
) / (ionocyte_mask.sum() / plasschaert.shape[0])

### Check rejections on each cell type

#### Fraction

In [None]:
# Number of Cell BLAST-rejected cells per annotated cell type (ionocytes
# excluded), normalized so that the values sum to 1.
rejected_fractions = plasschaert.obs.query(
    "cb_pred == 'rejected' & cell_ontology_class != 'ionocyte'"
).groupby("cell_ontology_class").apply(lambda x: x.shape[0])
rejected_fractions /= rejected_fractions.sum()
rejected_fractions

In [None]:
all_fractions = plasschaert.obs.query(
    "cell_ontology_class != 'ionocyte'"
).groupby("cell_ontology_class").apply(lambda x: x.shape[0])
all_fractions /= all_fractions.sum()
all_fractions

In [None]:
fig, ax = plt.subplots(figsize=(5, 3))
ax = sns.barplot(
    x="fraction", y="cell_ontology_class", hue="type",
    data=pd.DataFrame({
        "overall": all_fractions,
        "rejected": rejected_fractions
    }).reset_index().melt(
        id_vars=["cell_ontology_class"],
        value_vars=["overall", "rejected"],
        var_name="type", value_name="fraction"
    ), ax=ax
)
_ = ax.set_xlabel("Fraction")
_ = ax.set_ylabel("Cell ontology class")
_ = ax.legend(
    bbox_to_anchor=(1.05, 0.5), loc="center left",
    borderaxespad=0.0, frameon=False
)
ax.spines["right"].set_visible(False)
ax.spines["top"].set_visible(False)

Rarer cell types seem overrepresented in rejected cells.

#### Dimension reduction

In [None]:
# Non-ionocyte cell types that actually occur in the query annotation, kept in
# marker-file order. np.isin replaces np.in1d, which is deprecated in NumPy 2.
CTs_use = CTs_noi[np.isin(CTs_noi, plasschaert.obs["cell_ontology_class"])]
CTs_use

In [None]:
# One row of panels per annotated cell type: density boxplot on the left,
# PCA scatter colored by Cell BLAST prediction on the right.
# squeeze=False keeps `axes` two-dimensional, so the row-wise unpacking below
# also works when only a single cell type is present.
fig, axes = plt.subplots(
    nrows=len(CTs_use), ncols=2, figsize=(4 * 2, 4 * len(CTs_use)),
    gridspec_kw=dict(wspace=0.4, hspace=0.4), squeeze=False
)
for ct, (ax_density, ax_latent) in zip(CTs_use, axes):
    # Query cells annotated as `ct`, restricted to the reference gene set.
    plasschaert_subset = plasschaert[
        plasschaert.obs.query(
            "cell_ontology_class == '%s'" % ct
        ).index,
        np.intersect1d(montoro_10x_noi.uns["seurat_genes"], plasschaert.var_names)  # same set of genes used for BLAST
    ]

    # Two-component PCA of the log-normalized expression, stored as the
    # subset's latent space so that visualize_latent can plot it directly.
    mat = plasschaert_subset.normalize().exprs.log1p().toarray()
    pca = sklearn.decomposition.PCA(n_components=2).fit(mat)
    plasschaert_subset.latent = pca.transform(mat)

    # Gaussian kernel density estimate of each cell in the PCA plane.
    plasschaert_subset.obs["Density"] = scipy.stats.gaussian_kde(
        plasschaert_subset.latent.T
    ).evaluate(plasschaert_subset.latent.T)

    # Predicted labels present in this subset: cell types first (in marker
    # order), then the non-cell-type outcomes. np.isin replaces the deprecated
    # np.in1d.
    hue_order = CTs_use[np.isin(CTs_use, plasschaert_subset.obs["cb_pred"])].tolist()
    for item in ("ambiguous", "rejected"):
        if np.isin(item, plasschaert_subset.obs["cb_pred"]):
            hue_order.append(item)
    plasschaert_subset.obs = plasschaert_subset.obs.rename({"cb_pred": "Prediction"}, axis=1)
    # Boxplot labels carry the number of cells per prediction, so palette and
    # ordering are re-keyed by the "label\n(n = count)" strings.
    pred_n_map = {
        item: f"{item}\n(n = {count})" for item, count in
        zip(*np.unique(plasschaert_subset.obs["Prediction"], return_counts=True))
    }
    plasschaert_subset.obs["Prediction (n)"] = plasschaert_subset.obs["Prediction"].map(pred_n_map)
    palette_n = collections.OrderedDict([
        (pred_n_map[key], val) for key, val in palette.items()
        if key in pred_n_map
    ])
    hue_order_n = [
        pred_n_map[item] for item in hue_order
        if item in pred_n_map
    ]

    ax_density = sns.boxplot(
        x="Density", y="Prediction (n)",
        data=plasschaert_subset.obs,
        fliersize=2, palette=palette_n, order=hue_order_n,
        ax=ax_density
    )
    ax_density.set_ylabel("Prediction")
    ax_density.spines["right"].set_visible(False)
    ax_density.spines["top"].set_visible(False)
    # method=None: plot the stored latent (the PCA coordinates) as is.
    ax_latent = plasschaert_subset.visualize_latent(
        "Prediction", method=None, size=9,
        scatter_kws=dict(palette=palette, hue_order=hue_order, rasterized=True),
        ax=ax_latent
    )
    _ = ax_latent.set_xlabel("PC1 (%.2f%%)" % (pca.explained_variance_ratio_[0] * 100))
    _ = ax_latent.set_ylabel("PC2 (%.2f%%)" % (pca.explained_variance_ratio_[1] * 100))

fig.savefig("plasschaert_cb_ctvis.pdf", dpi=300, bbox_inches="tight")

### Clustering rejected cells

Let's take a closer look at the rejected cells, which should include the ionocytes.

In [None]:
# All query cells that Cell BLAST rejected.
plasschaert_unassigned = plasschaert[plasschaert.obs.query("cb_pred == 'rejected'").index, :]
# Embed them with the first model of the BLAST object only.
plasschaert_unassigned.latent = montoro_10x_noi_blast.models[0].inference(plasschaert_unassigned)
# Per-cell summary statistic: geometric mean over all p-values stored for that
# cell's hits. Cells are addressed by their position in plasschaert.obs.
# NOTE(review): assumes hits.pval is ordered like plasschaert.obs and that the
# subset above keeps that order -- confirm.
plasschaert_unassigned.obs["Average Cell BLAST p-value"] = np.array([
    scipy.stats.gmean(hits.pval[i], axis=None)
    for i in np.where(plasschaert.obs["cb_pred"] == "rejected")[0]
])
# Human-readable column name for figure legends.
plasschaert_unassigned.obs = plasschaert_unassigned.obs.rename({
    "cell_ontology_class": "Cell ontology class"
}, axis=1)

In [None]:
# tSNE of the rejected cells, colored by their annotated cell type.
# reuse=False requests a fresh embedding rather than a previously stored one
# (NOTE(review): per the parameter name -- confirm in Cell BLAST docs).
# np.isin replaces np.in1d, which is deprecated in NumPy 2.
ax = plasschaert_unassigned.visualize_latent(
    "Cell ontology class", method="tSNE", dr_kws=dict(perplexity=10), reuse=False,
    size=15, width=4, height=4, scatter_kws=dict(
        rasterized=True, palette=palette,
        hue_order=CTs[np.isin(CTs, plasschaert_unassigned.obs["Cell ontology class"])]
    ), random_seed=0
)
ax.get_figure().savefig("./plasschaert_cb_unassigned_tsne_cl.pdf", dpi=300, bbox_inches="tight")

In [None]:
snn_graph = snn(plasschaert_unassigned.latent, n_neighbors=10)

In [None]:
# Louvain clustering of the rejected cells on the SNN graph.
plasschaert_unassigned.obs["Cluster"] = louvain(snn_graph, resolution=3.0, random_state=0)
# Integer label -> display name, in ascending label order (np.unique returns
# the distinct labels already sorted).
cluster_map = collections.OrderedDict(
    (label, "cluster %d" % label)
    for label in np.unique(plasschaert_unassigned.obs["Cluster"])
)
# Store the display names instead of the raw integers.
plasschaert_unassigned.obs["Cluster"] = plasschaert_unassigned.obs["Cluster"].map(cluster_map)

In [None]:
# Hand-picked mapping: cluster number -> color slot j of the "tab20" palette.
# NOTE(review): only clusters 0-5 are listed; a clustering that yields other
# labels would fail later where colors are looked up per cluster.
cluster_approx = collections.OrderedDict([
    (0, 5),
    (1, 0),
    (2, 2),
    (3, 8),
    (4, 3),
    (5, 1)
])

# tab20 colors come in pairs: entry 2*j is used for the cluster itself ...
cluster_palette = collections.OrderedDict([
    ("cluster %d" % i, sns.color_palette("tab20")[2 * j])
    for i, j in cluster_approx.items()
])
# ... and entry 2*j + 1 (the paired lighter shade) as its muted variant.
cluster_palette_muted = collections.OrderedDict([
    ("cluster %d" % i, sns.color_palette("tab20")[2 * j + 1])
    for i, j in cluster_approx.items()
])

In [None]:
ax = plasschaert_unassigned.visualize_latent(
    "Cluster", method="tSNE",
    size=15, width=4, height=4,
    scatter_kws=dict(
        rasterized=True, palette=cluster_palette,
        hue_order=cluster_map.values()
    )
)
ax.get_figure().savefig("./plasschaert_cb_unassigned_tsne_cluster.pdf", dpi=300, bbox_inches="tight")

In [None]:
ax = plasschaert_unassigned.visualize_latent(
    "Average Cell BLAST p-value", method="tSNE", sort=True,
    size=15, width=4, height=4,
    scatter_kws=dict(rasterized=True))
ax.get_figure().savefig("./plasschaert_cb_unassigned_tsne_pval.pdf", dpi=300, bbox_inches="tight")

In [None]:
# Per-cluster distribution of the average Cell BLAST p-value: density
# histograms on the left axis, cumulative KDE curves on a twin right axis.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases.
fig, ax = plt.subplots(figsize=(4, 4))
ax2 = ax.twinx()
legend_handles = []
for i in cluster_map.values():
    # Histogram only (no KDE), normalized to a density.
    ax = sns.distplot(plasschaert_unassigned.obs.loc[
        plasschaert_unassigned.obs["Cluster"] == i, "Average Cell BLAST p-value"
    ], color=cluster_palette[i], kde=False, hist_kws=dict(density=True), ax=ax)
    # Cumulative KDE only (no histogram) on the twin axis.
    ax2 = sns.distplot(plasschaert_unassigned.obs.loc[
        plasschaert_unassigned.obs["Cluster"] == i, "Average Cell BLAST p-value"
    ], color=cluster_palette[i], hist=False, kde_kws=dict(cumulative=True), ax=ax2)
    # Manual legend entries, one patch per cluster.
    legend_handles.append(mpatches.Patch(color=cluster_palette[i], label=i))
_ = plt.legend(handles=legend_handles, frameon=False, bbox_to_anchor=(1.17, 0.5), loc="center left", title="Cluster")
_ = ax.set(xlabel="Average Cell BLAST p-value", ylabel="Density")
_ = ax2.set(ylabel="Cumulative probability")
plt.xlim(-0.03, 0.45)
fig.savefig("./plasschaert_cb_unassigned_pval_distribution.pdf", bbox_inches="tight")

### Check other rejected clusters

#### One-vs-rest differential expression

In [None]:
# Drop cells with an ambiguous prediction, then replace the "rejected" label of
# every rejected cell by its cluster name, so that each cluster is handled as
# its own group in the differential expression below.
plasschaert_use = plasschaert[plasschaert.obs.query("cb_pred != 'ambiguous'").index, :]
plasschaert_use.obs.loc[
    plasschaert_unassigned.obs.index, "cb_pred"
] = plasschaert_unassigned.obs["Cluster"].values

In [None]:
plasschaert_cb_pred_markers = plasschaert_use.fast_markers("cb_pred")
plasschaert_cb_pred_markers = collections.OrderedDict(sorted(
    plasschaert_cb_pred_markers.items(),
    key=lambda x: "z" + x[0] if x[0].startswith("cluster") else x[0]
))  # Prepend a z so that rejected clusters sort to last

In [None]:
# Write one CSV of marker statistics per predicted group.
# exist_ok=True makes directory creation a single step, replacing the separate
# existence check followed by mkdir.
os.makedirs("plasschaert_cb_pred_markers", exist_ok=True)
for item, marker_table in plasschaert_cb_pred_markers.items():
    marker_table.to_csv(f"plasschaert_cb_pred_markers/{item}.csv")

In [None]:
plasschaert_cb_reject_markers = {
    key: val.query("padj < 0.05 & logfc > 0").index.values.tolist()
    for key, val in plasschaert_cb_pred_markers.items()
    if key.startswith("cluster")
}

In [None]:
# Keep at most the first ten marker genes of every rejected cluster.
plasschaert_cb_reject_markers_head = {
    cluster: genes[:10]
    for cluster, genes in plasschaert_cb_reject_markers.items()
}

#### Heatmap

In [None]:
# Marker genes shown in the heatmap: known markers of every cell type that was
# actually predicted, followed by the top markers of each rejected cluster.
# np.isin replaces np.in1d, which is deprecated in NumPy 2.
combined_markers = collections.OrderedDict(
    [
        (ct, genes) for ct, genes in known_markers_noi.items()
        if np.isin(ct, plasschaert_use.obs["cb_pred"])
    ] +
    list(plasschaert_cb_reject_markers_head.items())
)

In [None]:
combined_markers.keys()

In [None]:
# Temporary, unnamed gene annotation column holding the group each marker gene
# belongs to. It starts as an object column so that string labels can be
# written into it without an implicit dtype change (assigning strings into a
# float NaN column is deprecated in recent pandas).
plasschaert_use.var[""] = pd.Series(
    np.nan, index=plasschaert_use.var.index, dtype=object)
for ct in reversed(list(combined_markers)):  # rejected first, so that they are overwritten by known markers
    plasschaert_use.var.loc[combined_markers[ct], ""] = ct
# Genes without any label are dropped; what remains are the marker genes.
marker_df = plasschaert_use.var.dropna()
del plasschaert_use.var[""]  # Prevent interference with next heatmap
# Order the genes by group, following the order of combined_markers.
marker_df[""] = pd.Categorical(marker_df[""], categories=list(combined_markers))
marker_df = marker_df.sort_values("")

In [None]:
cell_df = plasschaert_use.obs.loc[:, ["cb_pred"]]
cell_df["cb_pred"] = pd.Categorical(cell_df["cb_pred"], categories=combined_markers.keys())

In [None]:
# Subsample at most 30 cells per predicted group (without replacement, fixed
# seed) so that the heatmap is not dominated by large groups.
random_state = np.random.RandomState(0)
subset_idx = np.concatenate([
    random_state.choice(
        np.where(cell_df["cb_pred"] == item)[0],
        size=min(np.sum(cell_df["cb_pred"] == item), 30),
        replace=False
    ) for item in np.unique(cell_df["cb_pred"])
])
# subset_idx holds positional indices, hence iloc.
cell_df = cell_df.iloc[subset_idx, :]
cell_df = cell_df.sort_values("cb_pred")
# Column name displayed next to the color bar in the heatmap.
cell_df.columns = ["Cells"]

In [None]:
# Log-normalized expression of the marker genes (columns) in the subsampled
# cells (rows).
mat = pd.DataFrame(
    plasschaert_use.normalize()[cell_df.index, marker_df.index].exprs.log1p().toarray(),
    index=cell_df.index, columns=marker_df.index
)
# Drop genes with zero total expression in the subsample and keep marker_df
# aligned with the remaining columns.
mask = mat.sum(axis=0) > 0
mat = mat.loc[:, mask]
marker_df = marker_df.loc[mask, :]

In [None]:
color_lut = collections.OrderedDict(list(palette.items()) + list(cluster_palette_muted.items()))
ct_legend_patch = [
    mpatches.Patch(color=color_lut[k], label=k)
    for k in combined_markers.keys() if not k.startswith("cluster")
]
cluster_legend_patch = [
    mpatches.Patch(color=color_lut[k], label=k)
    for k in combined_markers.keys() if k.startswith("cluster")
]

In [None]:
for column in cell_df.columns:
    cell_df[column] = cell_df[column].map(color_lut)
for column in marker_df.columns:
    marker_df[column] = marker_df[column].map(color_lut)

In [None]:
# Heatmap of genes (rows, kept in marker order) by cells (columns, ordered by
# average-linkage hierarchical clustering). standard_scale=0 rescales each
# gene; row/column color bars show the marker group and predicted group.
g = sns.clustermap(
    mat.T, standard_scale=0,
    row_cluster=False, col_cluster=True,
    col_linkage=scipy.cluster.hierarchy.linkage(mat, method="average"),
    row_colors=marker_df, col_colors=cell_df,
    cbar_kws=dict(label="Standardized expression"),
    xticklabels=False, vmax=0.9, rasterized=True,
    figsize=(20, 20)
)
g.ax_col_dendrogram.set_xlim([0, 0])  # hide dendrogram
# Two legends share the (now empty) dendrogram axes: cell types on the left,
# rejected clusters on the right.
l1 = g.ax_col_dendrogram.legend(
    loc="upper left", bbox_to_anchor=(0.0, 1.0), ncol=2, fontsize="x-large", title_fontsize="x-large",
    handles=ct_legend_patch, frameon=False, title="Cell ontology predictions of matched cells"
)
l2 = g.ax_col_dendrogram.legend(
    loc="upper left", bbox_to_anchor=(0.7, 1.0), ncol=2, fontsize="x-large", title_fontsize="x-large",
    handles=cluster_legend_patch, frameon=False, title="Clustering of rejected cells"
)
# The second legend() call replaces the first on the axes, so the first one is
# re-attached as a separate artist.
g.ax_col_dendrogram.add_artist(l1)
g.fig.savefig("plasschaert_cb_pred_expr_heatmap.pdf", dpi=300, bbox_inches="tight")

#### Pairwise differential expression

##### Cluster 1 vs Basal

In [None]:
plasschaert_pairwise = plasschaert_use[plasschaert_use.obs.query(
    "cb_pred == 'basal cell of epithelium of trachea' | cb_pred == 'cluster 1'"
).index, :]
np.unique(plasschaert_pairwise.obs["cb_pred"], return_counts=True)

In [None]:
pairwise_markers = plasschaert_pairwise.fast_markers("cb_pred")

In [None]:
# Export gene lists for external enrichment analysis.
# NOTE: these fixed file names are reused by the other pairwise comparisons in
# this notebook, so each run overwrites the previous lists.
# Genes significantly higher in cluster 1 (padj < 0.05, positive log fold change).
tmp = pairwise_markers["cluster 1"].query("padj < 0.05 & logfc > 0").index
np.savetxt("sighigh.txt", tmp, fmt="%s")
# Genes significantly lower in cluster 1 (padj < 0.05, negative log fold change).
tmp = pairwise_markers["cluster 1"].query("padj < 0.05 & logfc < 0").index
np.savetxt("siglow.txt", tmp, fmt="%s")
# Genes with non-zero total expression among the compared cells.
tmp = plasschaert_pairwise.var_names[np.array(plasschaert_pairwise.exprs.sum(axis=0)).ravel() > 0]
np.savetxt("sigbg.txt", tmp, fmt="%s")

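Proceed on Metascape: run the enrichment analysis on the gene lists saved above (`sighigh.txt` / `siglow.txt`, with the expressed genes in `sigbg.txt` as background), and place the results for the `siglow` list in `plasschaert_cb_cluster1_vs_basal_siglow/` before continuing.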

In [None]:
df = pd.read_csv(
    "plasschaert_cb_cluster1_vs_basal_siglow/Enrichment_GO/_FINAL_GO.csv"
).query("FirstInGroupByLogP == 1").loc[:, ["GO", "Description", "Log(q-value)"]]
df = df.assign(
    neg_log_q=-df["Log(q-value)"],
    go_term=df["GO"] + " " + df["Description"]
).query("neg_log_q > 2")
df

In [None]:
fig, ax = plt.subplots(figsize=(6, 3))
ax = sns.barplot(
    x="neg_log_q", y="go_term", data=df,
    color=sns.color_palette()[0]
)
ax.axvline(x=2, c="red")
ax.set_xlabel("-log10 FDR")
ax.set_ylabel("GO term")
ax.yaxis.tick_right()
fig.savefig("plasschaert_cb_cluster1_vs_basal_siglow.pdf", bbox_inches="tight")

##### Cluster 5 vs Club

In [None]:
plasschaert_pairwise = plasschaert_use[plasschaert_use.obs.query(
    "cb_pred == 'club cell' | cb_pred == 'cluster 5'"
).index, :]
np.unique(plasschaert_pairwise.obs["cb_pred"], return_counts=True)

In [None]:
pairwise_markers = plasschaert_pairwise.fast_markers("cb_pred")

In [None]:
tmp = pairwise_markers["cluster 5"].query("padj < 0.05 & logfc > 0").index
np.savetxt("sighigh.txt", tmp, fmt="%s")
tmp = pairwise_markers["cluster 5"].query("padj < 0.05 & logfc < 0").index
np.savetxt("siglow.txt", tmp, fmt="%s")
tmp = plasschaert_pairwise.var_names[np.array(plasschaert_pairwise.exprs.sum(axis=0)).ravel() > 0]
np.savetxt("sigbg.txt", tmp, fmt="%s")

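Proceed on Metascape: run the enrichment analysis on the gene lists saved above (`sighigh.txt` / `siglow.txt`, with the expressed genes in `sigbg.txt` as background), and place the results for the `siglow` list in `plasschaert_cb_cluster5_vs_club_siglow/` before continuing.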

In [None]:
df = pd.read_csv(
    "plasschaert_cb_cluster5_vs_club_siglow/Enrichment_GO/_FINAL_GO.csv"
).query("FirstInGroupByLogP == 1").loc[:, ["GO", "Description", "Log(q-value)"]]
df = df.assign(
    neg_log_q=-df["Log(q-value)"],
    go_term=df["GO"] + " " + df["Description"]
).query("neg_log_q > 2")
df

In [None]:
fig, ax = plt.subplots(figsize=(6, 3))
ax = sns.barplot(
    x="neg_log_q", y="go_term", data=df,
    color=sns.color_palette()[0]
)
ax.axvline(x=2, c="red")
ax.set_xlabel("-log10 FDR")
ax.set_ylabel("GO term")
ax.yaxis.tick_right()
fig.savefig("plasschaert_cb_cluster5_vs_club_siglow.pdf", bbox_inches="tight")

##### Cluster 0 vs Rest

In [None]:
# The query condition is true for every cell ("!= X or == X"), so no cell is
# filtered out: cluster 0 is compared against all remaining cells.
plasschaert_pairwise = plasschaert_use[plasschaert_use.obs.query(
    "cb_pred != 'cluster 0' | cb_pred == 'cluster 0'"
).index, :]
# Group sizes, displayed as the cell output.
np.unique(plasschaert_pairwise.obs["cb_pred"], return_counts=True)

In [None]:
pairwise_markers = plasschaert_pairwise.fast_markers("cb_pred")

In [None]:
tmp = pairwise_markers["cluster 0"].query("padj < 0.05 & logfc > 0").index
np.savetxt("sighigh.txt", tmp, fmt="%s")
tmp = pairwise_markers["cluster 0"].query("padj < 0.05 & logfc < 0").index
np.savetxt("siglow.txt", tmp, fmt="%s")
tmp = plasschaert_pairwise.var_names[np.array(plasschaert_pairwise.exprs.sum(axis=0)).ravel() > 0]
np.savetxt("sigbg.txt", tmp, fmt="%s")

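Proceed on Metascape: run the enrichment analysis on the gene lists saved above (`sighigh.txt` / `siglow.txt`, with the expressed genes in `sigbg.txt` as background), and place the results for the `sighigh` list in `plasschaert_cb_cluster0_vs_rest_sighigh/` before continuing.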

In [None]:
df = pd.read_csv(
    "plasschaert_cb_cluster0_vs_rest_sighigh/Enrichment_GO/_FINAL_GO.csv"
).query("FirstInGroupByLogP == 1").loc[:, ["GO", "Description", "Log(q-value)"]]
df = df.assign(
    neg_log_q=-df["Log(q-value)"],
    go_term=df["GO"] + " " + df["Description"]
).query("neg_log_q > 2")
df

In [None]:
fig, ax = plt.subplots(figsize=(6, 3))
ax = sns.barplot(
    x="neg_log_q", y="go_term", data=df,
    color=sns.color_palette()[0]
)
ax.axvline(x=2, c="red")
ax.set_xlabel("-log10 FDR")
ax.set_ylabel("GO term")
ax.yaxis.tick_right()
fig.savefig("plasschaert_cb_cluster0_vs_rest_sighigh.pdf", bbox_inches="tight")

## scmap

### Querying

In [None]:
# Run scmap through the evaluation R script. The script is started from the
# Evaluation directory, so all data paths are given relative to it. Two
# rejection thresholds (0.0 and 0.5) are evaluated in the same run.
p = subprocess.Popen([
    "Rscript", "run_scmap.R",
    "-r", "../Notebooks/Case/Trachea/montoro_10x_noi.h5", "-n", "Montoro_10x_noi",
    "-q", "../Notebooks/Case/Trachea/plasschaert.h5",
    "-o", "../Notebooks/Case/Trachea/plasschaert_scmap_pred.h5",
    "-g", "scmap_genes", "-s", "0", "--threshold", "0.0", "0.5",
    "--cluster-col", "cell_ontology_class"
], cwd="../../../Evaluation", stdout=subprocess.PIPE, stderr=subprocess.PIPE)
(output, err) = p.communicate()
exit_code = p.returncode  # set by communicate(); no extra wait() needed
print(output.decode())
print(err.decode())
# Fail loudly: otherwise the following cells would silently read a missing or
# stale prediction file.
if exit_code != 0:
    raise RuntimeError(f"run_scmap.R failed with exit code {exit_code}")

In [None]:
plasschaert.obs["scmap_pred"] = cb.data.read_hybrid_path("./plasschaert_scmap_pred.h5//prediction/0.0")
plasschaert.obs["Average scmap similarity"] = scipy.stats.gmean(cb.data.read_hybrid_path(
    "./plasschaert_scmap_pred.h5//scmap_cell/nn/Montoro_10x_noi/similarities"
), axis=1)

In [None]:
fig = cb.blast.sankey(
    plasschaert.obs["cell_ontology_class"], plasschaert.obs["scmap_pred"],
    title="scmap prediction: Plasschaert → Montoro_10x without ionocyte",
    width=660, height=600, tint_cutoff=5, font_size=11
)
plotly.io.write_image(fig, "./plasschaert_scmap_sankey_permissive.pdf")

In [None]:
# Switch to the predictions made with the 0.5 threshold; these are the ones
# used for the rest of the scmap analysis.
plasschaert.obs["scmap_pred"] = cb.data.read_hybrid_path("./plasschaert_scmap_pred.h5//prediction/0.5")
# Per-cell geometric mean of the stored nearest-neighbor similarities. The
# source path does not depend on the threshold, so this recomputes the same
# values as the cell that loaded the 0.0-threshold predictions.
plasschaert.obs["Average scmap similarity"] = scipy.stats.gmean(cb.data.read_hybrid_path(
    "./plasschaert_scmap_pred.h5//scmap_cell/nn/Montoro_10x_noi/similarities"
), axis=1)

In [None]:
fig = cb.blast.sankey(
    plasschaert.obs["cell_ontology_class"], plasschaert.obs["scmap_pred"],
    title="scmap prediction: Plasschaert → Montoro_10x without ionocyte",
    width=660, height=600, tint_cutoff=5, font_size=11
)
plotly.io.write_image(fig, "./plasschaert_scmap_sankey.pdf")

In [None]:
mba = cb.metrics.mean_balanced_accuracy(
    plasschaert.obs["cell_ontology_class"],
    plasschaert.obs["scmap_pred"],
    plasschaert2montoronoi_expectation
)
print(f"MBA = {mba:.3f}")

In [None]:
# Enrichment ratio of ionocytes among scmap rejections ("unassigned"):
# (share of ionocytes among rejected cells) / (share of ionocytes overall).
ionocyte_mask = plasschaert.obs["cell_ontology_class"] == "ionocyte"
rejected_mask = plasschaert.obs["scmap_pred"] == "unassigned"
ionocyte_enrichment["scmap"] = (
    (ionocyte_mask & rejected_mask).sum() / rejected_mask.sum()
) / (ionocyte_mask.sum() / plasschaert.shape[0])

### Check rejections on each cell type

#### Fraction

In [None]:
rejected_fractions = plasschaert.obs.query(
    "scmap_pred == 'unassigned' & cell_ontology_class != 'ionocyte'"
).groupby("cell_ontology_class").apply(lambda x: x.shape[0])
rejected_fractions /= rejected_fractions.sum()
rejected_fractions

In [None]:
all_fractions = plasschaert.obs.query(
    "cell_ontology_class != 'ionocyte'"
).groupby("cell_ontology_class").apply(lambda x: x.shape[0])
all_fractions /= all_fractions.sum()
all_fractions

In [None]:
fig, ax = plt.subplots(figsize=(5, 3))
ax = sns.barplot(
    x="fraction", y="cell_ontology_class", hue="type",
    data=pd.DataFrame({
        "overall": all_fractions,
        "rejected": rejected_fractions
    }).reset_index().melt(
        id_vars=["cell_ontology_class"],
        value_vars=["overall", "rejected"],
        var_name="type", value_name="fraction"
    ), ax=ax
)
_ = ax.set_xlabel("Fraction")
_ = ax.set_ylabel("Cell ontology class")
_ = ax.legend(
    bbox_to_anchor=(1.05, 0.5), loc="center left",
    borderaxespad=0.0, frameon=False
)
ax.spines["right"].set_visible(False)
ax.spines["top"].set_visible(False)

#### Dimension reduction

In [None]:
# Non-ionocyte cell types that actually occur in the query annotation, kept in
# marker-file order. np.isin replaces np.in1d, which is deprecated in NumPy 2.
CTs_use = CTs_noi[np.isin(CTs_noi, plasschaert.obs["cell_ontology_class"])]
CTs_use

In [None]:
# One row of panels per annotated cell type: density boxplot on the left,
# PCA scatter colored by scmap prediction on the right.
# squeeze=False keeps `axes` two-dimensional, so the row-wise unpacking below
# also works when only a single cell type is present.
fig, axes = plt.subplots(
    nrows=len(CTs_use), ncols=2, figsize=(4 * 2, 4 * len(CTs_use)),
    gridspec_kw=dict(wspace=0.4, hspace=0.4), squeeze=False
)
for ct, (ax_density, ax_latent) in zip(CTs_use, axes):
    # Query cells annotated as `ct`, restricted to the reference gene set.
    # Alternative gene sets considered earlier: the query's own
    # "seurat_genes", all known markers, or only the markers of `ct`.
    plasschaert_subset = plasschaert[
        plasschaert.obs.query(
            "cell_ontology_class == '%s'" % ct
        ).index,
        np.intersect1d(montoro_10x_noi.uns["seurat_genes"], plasschaert.var_names)  # same set of genes used for BLAST
    ]

    # Two-component PCA of the log-normalized expression, stored as the
    # subset's latent space so that visualize_latent can plot it directly.
    mat = plasschaert_subset.normalize().exprs.log1p().toarray()
    pca = sklearn.decomposition.PCA(n_components=2).fit(mat)
    plasschaert_subset.latent = pca.transform(mat)

    # Gaussian kernel density estimate of each cell in the PCA plane.
    plasschaert_subset.obs["Density"] = scipy.stats.gaussian_kde(
        plasschaert_subset.latent.T
    ).evaluate(plasschaert_subset.latent.T)

    # Predicted labels present in this subset: cell types first (in marker
    # order), then "unassigned". np.isin replaces the deprecated np.in1d.
    hue_order = CTs_use[np.isin(CTs_use, plasschaert_subset.obs["scmap_pred"])].tolist()
    for item in ("unassigned", ):
        if np.isin(item, plasschaert_subset.obs["scmap_pred"]):
            hue_order.append(item)
    plasschaert_subset.obs = plasschaert_subset.obs.rename({"scmap_pred": "Prediction"}, axis=1)

    ax_density = sns.boxplot(
        x="Density", y="Prediction",
        data=plasschaert_subset.obs,
        fliersize=2, palette=palette, order=hue_order,
        ax=ax_density
    )
    ax_density.spines["right"].set_visible(False)
    ax_density.spines["top"].set_visible(False)
    # method=None: plot the stored latent (the PCA coordinates) as is.
    ax_latent = plasschaert_subset.visualize_latent(
        "Prediction", method=None, size=9,
        scatter_kws=dict(palette=palette, hue_order=hue_order),
        ax=ax_latent
    )
    _ = ax_latent.set_xlabel("PC1 (%.2f%%)" % (pca.explained_variance_ratio_[0] * 100))
    _ = ax_latent.set_ylabel("PC2 (%.2f%%)" % (pca.explained_variance_ratio_[1] * 100))

fig.savefig("plasschaert_scmap_ctvis.pdf", bbox_inches="tight")

### Clustering rejected cells

In [None]:
plasschaert_unassigned = plasschaert.normalize()[
    plasschaert.obs.query("scmap_pred == 'unassigned'").index,
    plasschaert.uns["scmap_genes"]
]
plasschaert_unassigned.obs = plasschaert_unassigned.obs.rename({
    "cell_ontology_class": "Cell ontology class"
}, axis=1)

In [None]:
# tSNE of the scmap-unassigned cells, computed directly on log-transformed
# expression with correlation distance (no learned latent space is used here).
plasschaert_unassigned_tsne = sklearn.manifold.TSNE(
    perplexity=15, metric="correlation", random_state=0
).fit_transform(plasschaert_unassigned.exprs.log1p().toarray())
# Store the coordinates in obs.
# NOTE(review): presumably visualize_latent(method="tSNE") picks up these
# "tSNE1"/"tSNE2" columns instead of recomputing -- confirm.
plasschaert_unassigned.obs["tSNE1"] = plasschaert_unassigned_tsne[:, 0]
plasschaert_unassigned.obs["tSNE2"] = plasschaert_unassigned_tsne[:, 1]

In [None]:
# tSNE of the scmap-unassigned cells, colored by their annotated cell type.
# np.isin replaces np.in1d, which is deprecated in NumPy 2.
ax = plasschaert_unassigned.visualize_latent(
    "Cell ontology class", method="tSNE",
    size=10, width=4, height=4,
    scatter_kws=dict(
        rasterized=True, palette=palette,
        hue_order=CTs[np.isin(CTs, plasschaert_unassigned.obs["Cell ontology class"])]
    ), random_seed=0
)
ax.get_figure().savefig("./plasschaert_scmap_unassigned_cl_tsne.pdf", dpi=300, bbox_inches="tight")

In [None]:
snn_graph = snn(
    plasschaert_unassigned.exprs.log1p().toarray(),
    n_neighbors=5, metric="cosine"
)

In [None]:
plasschaert_unassigned.obs["Cluster"] = louvain(snn_graph, resolution=2.0, random_state=0)
cluster_map = collections.OrderedDict([
    (i, "cluster %d" % i) for i in
    sorted(np.unique(plasschaert_unassigned.obs["Cluster"]))
])
plasschaert_unassigned.obs["Cluster"] = plasschaert_unassigned.obs["Cluster"].map(cluster_map)

In [None]:
cluster_approx = collections.OrderedDict([
    (0, 9),
    (1, 0),
    (2, 5),
    (3, 3),
    (4, 8)
])

cluster_palette = collections.OrderedDict([
    ("cluster %d" % i, sns.color_palette("tab20")[2 * j])
    for i, j in cluster_approx.items()
])
cluster_palette_muted = collections.OrderedDict([
    ("cluster %d" % i, sns.color_palette("tab20")[2 * j + 1])
    for i, j in cluster_approx.items()
])

In [None]:
ax = plasschaert_unassigned.visualize_latent(
    hue="Cluster", method="tSNE",
    size=10, width=4, height=4,
    scatter_kws=dict(
        rasterized=True, palette=cluster_palette,
        hue_order=cluster_map.values()
    )
)
ax.get_figure().savefig("./plasschaert_scmap_unassigned_cluster_tsne.pdf", dpi=300, bbox_inches="tight")

In [None]:
ax = plasschaert_unassigned.visualize_latent(
    "Average scmap similarity", method="tSNE", sort=True,
    size=10, width=4, height=4,
    scatter_kws=dict(rasterized=True)
)
ax.get_figure().savefig("./plasschaert_scmap_unassigned_sim_tsne.pdf", dpi=300, bbox_inches="tight")

In [None]:
fig, ax = plt.subplots(figsize=(4, 4))
ax2 = ax.twinx()
legend_handles = []
for i in cluster_map.values():
    ax = sns.distplot(plasschaert_unassigned.obs.loc[
        plasschaert_unassigned.obs["Cluster"] == i, "Average scmap similarity"
    ], color=cluster_palette[i], kde=False, hist_kws=dict(density=True), ax=ax)
    ax2 = sns.distplot(plasschaert_unassigned.obs.loc[
        plasschaert_unassigned.obs["Cluster"] == i, "Average scmap similarity"
    ], color=cluster_palette[i], hist=False, kde_kws=dict(cumulative=True), ax=ax2)
    legend_handles.append(mpatches.Patch(color=cluster_palette[i], label=i))
_ = plt.legend(handles=legend_handles, frameon=False, bbox_to_anchor=(1.17, 0.5), loc="center left", title="Cluster")
_ = ax.set(xlabel="Average scmap similarity", ylabel="Density")
_ = ax2.set(ylabel="Cumulative probability")
fig.savefig("./plasschaert_scmap_unassigned_sim_distribution.pdf", bbox_inches="tight")

### Check other rejected clusters

#### One-vs-rest differential expression

In [None]:
# Work on a deep copy so the relabeling below does not touch `plasschaert`.
plasschaert_use = plasschaert.copy(deep=True)
# Replace the "unassigned" label of each unassigned cell by its cluster name,
# so that each cluster is handled as its own group in the analysis below.
plasschaert_use.obs.loc[
    plasschaert_unassigned.obs.index, "scmap_pred"
] = plasschaert_unassigned.obs["Cluster"].values

In [None]:
plasschaert_scmap_pred_markers = plasschaert_use.fast_markers("scmap_pred")
plasschaert_scmap_pred_markers = collections.OrderedDict(sorted(
    plasschaert_scmap_pred_markers.items(),
    key=lambda x: "z" + x[0] if x[0].startswith("cluster") else x[0]
))  # Prepend a z so that rejected clusters sort to last

In [None]:
# Write one CSV of marker statistics per predicted group.
# exist_ok=True makes directory creation a single step, replacing the separate
# existence check followed by mkdir.
os.makedirs("plasschaert_scmap_pred_markers", exist_ok=True)
for item, marker_table in plasschaert_scmap_pred_markers.items():
    marker_table.to_csv(f"plasschaert_scmap_pred_markers/{item}.csv")

In [None]:
plasschaert_scmap_reject_markers = {
    key: val.query("padj < 0.05 & logfc > 0").index.values.tolist()
    for key, val in plasschaert_scmap_pred_markers.items()
    if key.startswith("cluster")
}

In [None]:
# Keep at most the first ten marker genes of every unassigned cluster.
plasschaert_scmap_reject_markers_head = {
    cluster: genes[:10]
    for cluster, genes in plasschaert_scmap_reject_markers.items()
}

#### Heatmap

In [None]:
# Marker genes shown in the heatmap: known markers of every cell type that was
# actually predicted, followed by the top markers of each unassigned cluster.
# np.isin replaces np.in1d, which is deprecated in NumPy 2.
combined_markers = collections.OrderedDict(
    [
        (ct, genes) for ct, genes in known_markers_noi.items()
        if np.isin(ct, plasschaert_use.obs["scmap_pred"])
    ] +
    list(plasschaert_scmap_reject_markers_head.items())
)

In [None]:
combined_markers.keys()

In [None]:
# Temporary, unnamed gene annotation column holding the group each marker gene
# belongs to. It starts as an object column so that string labels can be
# written into it without an implicit dtype change (assigning strings into a
# float NaN column is deprecated in recent pandas).
plasschaert_use.var[""] = pd.Series(
    np.nan, index=plasschaert_use.var.index, dtype=object)
for ct in reversed(list(combined_markers)):  # rejected first, so that they are overwritten by known markers
    plasschaert_use.var.loc[combined_markers[ct], ""] = ct
# Genes without any label are dropped; what remains are the marker genes.
marker_df = plasschaert_use.var.dropna()
del plasschaert_use.var[""]  # Prevent interference with next heatmap
# Order the genes by group, following the order of combined_markers.
marker_df[""] = pd.Categorical(marker_df[""], categories=list(combined_markers))
marker_df = marker_df.sort_values("")

In [None]:
cell_df = plasschaert_use.obs.loc[:, ["scmap_pred"]]
cell_df["scmap_pred"] = pd.Categorical(cell_df["scmap_pred"], categories=combined_markers.keys())

In [None]:
# Subsample cells for the heatmap: up to 30 cells per scmap prediction group,
# drawn without replacement from a generator seeded with 0.
random_state = np.random.RandomState(0)
subset_idx = np.concatenate([
    random_state.choice(
        np.where(cell_df["scmap_pred"] == item)[0],  # positional indices of this group
        size=min(np.sum(cell_df["scmap_pred"] == item), 30),  # take all cells if fewer than 30
        replace=False
    ) for item in np.unique(cell_df["scmap_pred"])
])
cell_df = cell_df.iloc[subset_idx, :]
cell_df = cell_df.sort_values("scmap_pred")
# Rename the single column to "Cells"
cell_df.columns = ["Cells"]

In [None]:
# Dense matrix (sampled cells x marker genes) of log1p-transformed values
# taken from the normalized dataset.
mat = pd.DataFrame(
    plasschaert_use.normalize()[cell_df.index, marker_df.index].exprs.log1p().toarray(),
    index=cell_df.index, columns=marker_df.index
)
# Drop genes whose summed value over the sampled cells is zero, and keep the
# gene annotation table aligned with the remaining columns.
mask = mat.sum(axis=0) > 0
mat = mat.loc[:, mask]
marker_df = marker_df.loc[mask, :]

In [None]:
color_lut = collections.OrderedDict(list(palette.items()) + list(cluster_palette_muted.items()))
ct_legend_patch = [
    mpatches.Patch(color=color_lut[k], label=k)
    for k in combined_markers.keys() if not k.startswith("cluster")
]
cluster_legend_patch = [
    mpatches.Patch(color=color_lut[k], label=k)
    for k in combined_markers.keys() if k.startswith("cluster")
]

In [None]:
for column in cell_df.columns:
    cell_df[column] = cell_df[column].map(color_lut)
for column in marker_df.columns:
    marker_df[column] = marker_df[column].map(color_lut)

In [None]:
# Heatmap of marker genes (rows) against sampled cells (columns). Genes keep
# their given order; cells are ordered by average-linkage hierarchical
# clustering computed on the rows of `mat` (one row per cell).
g = sns.clustermap(
    mat.T, standard_scale=0,
    row_cluster=False, col_cluster=True,
    col_linkage=scipy.cluster.hierarchy.linkage(mat, method="average"),
    row_colors=marker_df, col_colors=cell_df,
    cbar_kws=dict(label="Standardized expression"),
    xticklabels=False, vmax=0.9, rasterized=True,
    figsize=(20, 20)
)
g.ax_col_dendrogram.set_xlim([0, 0])  # hide dendrogram
# The (hidden) dendrogram axes are reused to host the two legends.
l1 = g.ax_col_dendrogram.legend(
    loc="upper left", bbox_to_anchor=(0.0, 1.0), ncol=2, fontsize="x-large", title_fontsize="x-large",
    handles=ct_legend_patch, frameon=False, title="Cell ontology predictions of matched cells"
)
l2 = g.ax_col_dendrogram.legend(
    loc="upper left", bbox_to_anchor=(0.7, 1.0), ncol=2, fontsize="x-large", title_fontsize="x-large",
    handles=cluster_legend_patch, frameon=False, title="Clustering of rejected cells"
)
# A second legend() call replaces the first legend on the axes, so the first
# one is added back explicitly.
g.ax_col_dendrogram.add_artist(l1)
g.fig.savefig("plasschaert_scmap_pred_expr_heatmap.pdf", dpi=300, bbox_inches="tight")

#### Pairwise differential expression

In [None]:
plasschaert_pairwise = plasschaert_use[plasschaert_use.obs.query(
    "scmap_pred == 'lung neuroendocrine cell' | scmap_pred == 'cluster 2'"
).index, :]
np.unique(plasschaert_pairwise.obs["scmap_pred"], return_counts=True)

In [None]:
pairwise_markers = plasschaert_pairwise.fast_markers("scmap_pred")

In [None]:
# Export three gene lists, one name per line, from the "cluster 2" marker table.
# sighigh.txt: rows with padj < 0.05 and logfc > 0
tmp = pairwise_markers["cluster 2"].query("padj < 0.05 & logfc > 0").index
np.savetxt("sighigh.txt", tmp, fmt="%s")
# siglow.txt: rows with padj < 0.05 and logfc < 0
tmp = pairwise_markers["cluster 2"].query("padj < 0.05 & logfc < 0").index
np.savetxt("siglow.txt", tmp, fmt="%s")
# sigbg.txt: background set, i.e. genes with a non-zero expression sum in the
# pairwise subset
tmp = plasschaert_pairwise.var_names[np.array(plasschaert_pairwise.exprs.sum(axis=0)).ravel() > 0]
np.savetxt("sigbg.txt", tmp, fmt="%s")

Proceed with Metascape...

### CellFishing.jl

#### Querying

In [None]:
# Run the CellFishing.jl evaluation script (montoro_10x_noi.h5 and
# plasschaert.h5 as inputs, plasschaert_cf_pred.h5 as the last path argument),
# capturing its stdout and stderr.
p = subprocess.Popen([
    "julia", "run_CellFishing.jl.jl",
    "--annotation=cell_ontology_class",
    "--gene=cf_genes", "--seed=0", "--cutoff=120",
    "../Notebooks/Case/Trachea/montoro_10x_noi.h5",
    "../Notebooks/Case/Trachea/plasschaert.h5",
    "../Notebooks/Case/Trachea/plasschaert_cf_pred.h5"
], cwd="../../../Evaluation", stdout=subprocess.PIPE, stderr=subprocess.PIPE)
(output, err) = p.communicate()
exit_code = p.returncode  # communicate() has already waited for the process
print(output.decode())
print(err.decode())
if exit_code != 0:
    # Report failures explicitly: the following cell would otherwise read a
    # missing or stale prediction file without any hint of what went wrong.
    print("run_CellFishing.jl.jl exited with code %d" % exit_code, file=sys.stderr)

In [None]:
plasschaert.obs["cf_pred"] = cb.data.read_hybrid_path("plasschaert_cf_pred.h5//prediction/120")

In [None]:
fig = cb.blast.sankey(
    plasschaert.obs["cell_ontology_class"], plasschaert.obs["cf_pred"],
    title="CellFishing.jl prediction: Plasschaert → Montoro_10x without ionocyte",
    width=660, height=600, tint_cutoff=5, font_size=11
)
plotly.io.write_image(fig, "./plasschaert_cf_sankey.pdf")

In [None]:
mba = cb.metrics.mean_balanced_accuracy(
    plasschaert.obs["cell_ontology_class"],
    plasschaert.obs["cf_pred"],
    plasschaert2montoronoi_expectation
)
print(f"MBA = {mba:.3f}")

In [None]:
# Enrichment ratio for CellFishing.jl: share of ionocytes among the rejected
# cells, divided by the share of ionocytes among all cells.
ionocyte_mask = plasschaert.obs["cell_ontology_class"] == "ionocyte"
rejected_mask = plasschaert.obs["cf_pred"] == "rejected"
n_rejected_ionocytes = np.logical_and(ionocyte_mask, rejected_mask).sum()
rejected_ionocyte_fraction = n_rejected_ionocytes / rejected_mask.sum()
overall_ionocyte_fraction = ionocyte_mask.sum() / plasschaert.shape[0]
ionocyte_enrichment["cf"] = rejected_ionocyte_fraction / overall_ionocyte_fraction

Too many false negative rejections.

## Ionocyte enrichment score

In [None]:
# One row per method key, with a single "Ionocyte enrichment ratio" column.
df = pd.DataFrame(ionocyte_enrichment, index=["Ionocyte enrichment ratio"]).T
# Short method keys -> display names used on the plot axis.
method_mapping = collections.OrderedDict(
    scmap="scmap",
    cf="CellFishing.jl",
    cb="Cell BLAST"
)
df["Method"] = [method_mapping[item] for item in df.index]
# Ascending sort by enrichment ratio
df = df.sort_values("Ionocyte enrichment ratio")

In [None]:
# Horizontal bar plot of the ionocyte enrichment ratio per method.
fig, ax = plt.subplots(figsize=(3.5, 3.5))
ax = sns.barplot(
    x="Ionocyte enrichment ratio", y="Method",
    data=df, palette=method_palette, ax=ax
)
for side in ("right", "top"):
    ax.spines[side].set_visible(False)
# Shrink every bar to a height of 0.7 and shift it by half the removed
# height so that it stays centered on its original position.
for bar in ax.patches:
    removed = bar.get_height() - 0.7
    bar.set_height(0.7)
    bar.set_y(bar.get_y() + removed * 0.5)
fig.savefig("plasschaert_ionocyte_enrichment.pdf", bbox_inches="tight")

## Examine problematic ionocytes

We are going to focus on Cell BLAST and scmap's prediction with regard to cells originally labeled as ionocytes.

In [None]:
plasschaert_ionocyte = plasschaert[plasschaert.obs.query("cell_ontology_class == 'ionocyte'").index, :]
del plasschaert_ionocyte.obs["cell_ontology_class"]

In [None]:
plasschaert_ionocyte.obs.loc[:, ["cb_pred", "scmap_pred"]]

In [None]:
CTs_use = CTs_noi[
    np.in1d(CTs_noi, plasschaert_ionocyte.obs.loc[:, ["cb_pred", "scmap_pred"]].values)
].tolist() + ["ionocyte"]
CTs_use

In [None]:
montoro_10x = cb.data.ExprDataSet.read_dataset("../../../Datasets/data/Montoro_10x/data.h5")

In [None]:
# Combine a fixed-seed sample of reference cells with the query ionocytes.
random_state = np.random.RandomState(0)

# Just to line-up with the query dataset slots
montoro_10x.obs["cb_pred"] = montoro_10x.obs["cell_ontology_class"]
montoro_10x.obs["scmap_pred"] = montoro_10x.obs["cell_ontology_class"]

# Up to 20 reference cells per cell type in CTs_use, drawn without replacement.
subset_idx = np.concatenate([
    random_state.choice(
        np.where(montoro_10x.obs["cell_ontology_class"] == ct)[0],
        size=min(20, np.sum(montoro_10x.obs["cell_ontology_class"] == ct)),
        replace=False
    ) for ct in CTs_use
])
# The dictionary keys become the values of the "Dataset" column (they are
# used as categories for that column further below).
combined_dataset = cb.data.ExprDataSet.merge_datasets({
    "reference cells (in Montoro_10x)": montoro_10x[subset_idx, :],
    "query ionocytes (in Plasschaert)": plasschaert_ionocyte
}, meta_col="Dataset")
# Keep only the three annotation columns and give the prediction columns
# human-readable names.
combined_dataset.obs = combined_dataset.obs.loc[
    :, ["Dataset", "cb_pred", "scmap_pred"]
].rename({
    "cb_pred": "Cell BLAST prediction",
    "scmap_pred": "scmap prediction"
}, axis=1)

### Expression of known markers

In [None]:
ct_uniq = CTs_use + ["rejected"]
role_uniq = ["reference cells (in Montoro_10x)", "query ionocytes (in Plasschaert)"]

#### Heatmap

In [None]:
# Use a single label for rejected cells ("unassigned" -> "rejected") so both
# prediction columns share the categories in ct_uniq.
combined_dataset.obs.replace("unassigned", "rejected", inplace=True)
for column in combined_dataset.obs.columns:
    # "Dataset" uses the role categories; the prediction columns use ct_uniq.
    categories = role_uniq if column == "Dataset" else ct_uniq
    combined_dataset.obs[column] = pd.Categorical(
        combined_dataset.obs[column],
        categories=categories
    )
# Sort cells by all obs columns, in column order
cell_df = combined_dataset.obs.sort_values(combined_dataset.obs.columns.values.tolist())

In [None]:
# Tag the known marker genes of every cell type in CTs_use, using a var
# column whose name is the empty string.
combined_dataset.var[""] = np.nan
for ct in CTs_use:
    combined_dataset.var.loc[known_markers[ct], ""] = ct
# Keep rows without missing values, then reorder them to follow the
# concatenated marker lists (in known_markers order) of the cell types in CTs_use.
marker_df = combined_dataset.var.dropna().loc[
    functools.reduce(list.__add__, [val for key, val in known_markers.items() if key in CTs_use]), :
]

In [None]:
mat = pd.DataFrame(
    combined_dataset.normalize()[cell_df.index, marker_df.index].exprs.log1p().toarray(),
    index=cell_df.index, columns=marker_df.index
)

In [None]:
extended_palette = palette.copy()
extended_palette["rejected"] = (0.1, 0.1, 0.1)
extended_palette["reference cells (in Montoro_10x)"] = sns.color_palette()[9]
extended_palette["query ionocytes (in Plasschaert)"] = sns.color_palette()[8]

ct_legend_patch = [mpatches.Patch(color=extended_palette[k], label=k) for k in ct_uniq]
role_legend_patch = [mpatches.Patch(color=extended_palette[k], label=k) for k in role_uniq]

In [None]:
for column in cell_df.columns:
    cell_df[column] = cell_df[column].map(extended_palette)
for column in marker_df.columns:
    marker_df[column] = marker_df[column].map(extended_palette)

In [None]:
g = sns.clustermap(
    mat.T, standard_scale=0,
    row_cluster=False, col_cluster=False,
    row_colors=marker_df, col_colors=cell_df,
    cbar_kws=dict(label="Standardized expression"),
    xticklabels=False, vmax=0.9, figsize=(14, 12.5), rasterized=True
)
l1 = g.ax_col_dendrogram.legend(
    loc="upper left", bbox_to_anchor=(0.0, 1.2), fontsize="large", title_fontsize="large",
    handles=ct_legend_patch, frameon=False, title="Cell ontology"
)
l2 = g.ax_col_dendrogram.legend(
    loc="upper left", bbox_to_anchor=(0.6, 1.2), fontsize="large", title_fontsize="large",
    handles=role_legend_patch, frameon=False, title="Dataset"
)
g.ax_col_dendrogram.add_artist(l1)
g.fig.savefig("plasschaert_problematic_expr_heatmap.pdf", dpi=300, bbox_inches="tight")

### Potential doublets

In [None]:
# Per-cell QC statistics: total counts and number of genes with non-zero counts.
# If exprs is a scipy sparse matrix (it is densified with .toarray() elsewhere
# in this notebook), sum(axis=1) returns an (n_cells, 1) np.matrix, so the
# result is flattened to 1-D before being stored as an obs column - the same
# treatment the column sums get when writing sigbg.txt.
plasschaert.obs["total_umi"] = np.asarray(plasschaert.exprs.sum(axis=1)).ravel()
plasschaert.obs["detected_genes"] = np.asarray((plasschaert.exprs > 0).sum(axis=1)).ravel()

In [None]:
# Flag cells labeled as ionocytes that both Cell BLAST and scmap predict to
# be club cells.
labeled_ionocyte = plasschaert.obs["cell_ontology_class"] == "ionocyte"
cb_says_club = plasschaert.obs["cb_pred"] == "club cell"
scmap_says_club = plasschaert.obs["scmap_pred"] == "club cell"
plasschaert.obs["potential doublets"] = functools.reduce(
    np.logical_and, [labeled_ionocyte, cb_says_club, scmap_says_club]
)

In [None]:
plasschaert.obs["potential doublets"].sum()

In [None]:
# Compare total UMI counts of club cells, ionocytes and the flagged cells.
fig, ax = plt.subplots(figsize=(4, 4))
# The first call passes no ax and therefore draws on the current axes; the
# following calls reuse the returned axes explicitly.
ax = sns.distplot(plasschaert.obs.loc[plasschaert.obs["cell_ontology_class"] == "club cell", "total_umi"])
ax = sns.distplot(plasschaert.obs.loc[plasschaert.obs["cell_ontology_class"] == "ionocyte", "total_umi"], ax=ax)
ax = sns.distplot(plasschaert.obs.loc[plasschaert.obs["potential doublets"], "total_umi"], ax=ax)
# Legend patches take palette colors 0-2, in the order the groups were plotted.
_ = plt.legend(handles=[
    mpatches.Patch(color=sns.color_palette()[0], label="club cell"),
    mpatches.Patch(color=sns.color_palette()[1], label="ionocyte"),
    mpatches.Patch(color=sns.color_palette()[2], label="potential doublets")
], frameon=False, bbox_to_anchor=(1.05, 0.5), loc="center left", title="Cell groups")
_ = ax.set(xlabel="Total UMI count", ylabel="Density")
ax.spines["right"].set_visible(False)
ax.spines["top"].set_visible(False)
fig.savefig("./plasschaert_potential_doublet_umi.pdf", bbox_inches="tight")

In [None]:
# Compare the number of detected genes of club cells, ionocytes and the
# flagged cells.
fig, ax = plt.subplots(figsize=(4, 4))
# The first call passes no ax and therefore draws on the current axes; the
# following calls reuse the returned axes explicitly.
ax = sns.distplot(plasschaert.obs.loc[plasschaert.obs["cell_ontology_class"] == "club cell", "detected_genes"])
ax = sns.distplot(plasschaert.obs.loc[plasschaert.obs["cell_ontology_class"] == "ionocyte", "detected_genes"], ax=ax)
ax = sns.distplot(plasschaert.obs.loc[plasschaert.obs["potential doublets"], "detected_genes"], ax=ax)
# Legend patches take palette colors 0-2, in the order the groups were plotted.
_ = plt.legend(handles=[
    mpatches.Patch(color=sns.color_palette()[0], label="club cell"),
    mpatches.Patch(color=sns.color_palette()[1], label="ionocyte"),
    mpatches.Patch(color=sns.color_palette()[2], label="potential doublets")
], frameon=False, bbox_to_anchor=(1.05, 0.5), loc="center left", title="Cell groups")
_ = ax.set(xlabel="Number of detected genes", ylabel="Density")
ax.spines["right"].set_visible(False)
ax.spines["top"].set_visible(False)
fig.savefig("./plasschaert_potential_doublet_ngenes.pdf", bbox_inches="tight")

So these do not look like doublets.

---

## Querying the other way around

Read data

In [None]:
# Reference for this direction: Plasschaert_noi (the plot titles below
# describe it as "Plasschaert without ionocyte").
plasschaert_noi = cb.data.ExprDataSet.read_dataset("../../../Datasets/data/Plasschaert_noi/data.h5")
# NOTE(review): utils.clean_dataset is defined outside this notebook; it
# presumably removes cells lacking a usable cell_ontology_class - confirm.
plasschaert_noi = utils.clean_dataset(plasschaert_noi, "cell_ontology_class")
# Copy of the label column under a display-friendly name (used for plot coloring).
plasschaert_noi.obs["Cell ontology class"] = plasschaert_noi.obs["cell_ontology_class"]

# Query for this direction: the full Montoro_10x dataset, re-read from disk.
montoro_10x = cb.data.ExprDataSet.read_dataset("../../../Datasets/data/Montoro_10x/data.h5")
montoro_10x = utils.clean_dataset(montoro_10x, "cell_ontology_class")
montoro_10x.obs["Cell ontology class"] = montoro_10x.obs["cell_ontology_class"]
# Written to disk so the external scmap / CellFishing.jl scripts can read it.
montoro_10x.write_dataset("./montoro_10x.h5")

# Expectation table passed to mean_balanced_accuracy in the cells below.
montoro2plasschaertnoi_expectation = pd.read_csv("./montoro2plasschaertnoi_expectation.csv", index_col=0)

Many of the genes selected in Plasschaert are either not detected or highly differentially expressed in Montoro_10x; we filter them out here.

In [None]:
pm = cb.data.ExprDataSet.merge_datasets({"montoro_10x": montoro_10x, "plasschaert": plasschaert_noi})
pm = pm[:, np.intersect1d(montoro_10x.var_names, plasschaert_noi.var_names)]

In [None]:
pmdiff = pm.normalize().fast_markers("dataset_name")

In [None]:
pmdiff_head = pmdiff["Plasschaert_noi"].sort_values("z", ascending=False).index[:100]
pmdiff_head

In [None]:
len(plasschaert_noi.uns["seurat_genes"])

In [None]:
# Cleaned gene set: "seurat_genes" minus the genes in pmdiff_head, restricted
# to genes that are also present in montoro_10x.var_names.
plasschaert_noi.uns["seurat_genes_clean"] = np.intersect1d(
    np.setdiff1d(
        plasschaert_noi.uns["seurat_genes"], pmdiff_head,
    ), montoro_10x.var_names
)

In [None]:
# Cleaned gene set: "scmap_genes" minus the genes in pmdiff_head, restricted
# to genes that are also present in montoro_10x.var_names.
plasschaert_noi.uns["scmap_genes_clean"] = np.intersect1d(
    np.setdiff1d(
        plasschaert_noi.uns["scmap_genes"], pmdiff_head,
    ), montoro_10x.var_names
)

In [None]:
plasschaert_noi.uns["seurat_genes_clean"].size, plasschaert_noi.uns["scmap_genes_clean"].size

In [None]:
plasschaert_noi.write_dataset("./plasschaert_noi.h5")

In [None]:
ionocyte_enrichment = dict()

### Cell BLAST

#### Train model

In [None]:
# Train four DIRECTi models on the cleaned gene set; the runs differ only in
# their random seed and in the randomly named output directory.
plasschaert_noi_models = []
for i in range(4):
    path = f"/tmp/cb/{cb.utils.rand_hex()}"
    print(f"==== Training model {i:d} at {path} ====")
    model = cb.directi.fit_DIRECTi(
        plasschaert_noi, plasschaert_noi.uns["seurat_genes_clean"],
        latent_dim=10, cat_dim=20, epoch=1000, patience=30,
        random_seed=i, path=path
    )
    plasschaert_noi_models.append(model)

#### Querying

In [None]:
plasschaert_noi_blast = cb.blast.BLAST(
    plasschaert_noi_models, plasschaert_noi)
plasschaert_noi_blast.save("./plasschaert_noi_cb/blast")

In [None]:
plasschaert_noi_blast = cb.blast.BLAST.load("./plasschaert_noi_cb/blast")

In [None]:
hits = plasschaert_noi_blast.query(montoro_10x)

In [None]:
# Turn the query hits into per-cell predictions: reconcile the models, filter
# on "pval" with cutoff 0.05, then annotate with the reference
# "cell_ontology_class".
montoro_10x.obs["cb_pred"] = hits.reconcile_models().filter("pval", 0.05).annotate(
    "cell_ontology_class"
)["cell_ontology_class"]
# Cache the predictions on disk; the next cell reads them back from this path.
cb.data.write_hybrid_path(montoro_10x.obs["cb_pred"].values, "./montoro_10x_cb_pred.h5//prediction")

In [None]:
montoro_10x.obs["cb_pred"] = cb.data.read_hybrid_path("./montoro_10x_cb_pred.h5//prediction")

In [None]:
fig = cb.blast.sankey(
    montoro_10x.obs["cell_ontology_class"], montoro_10x.obs["cb_pred"],
    title="Cell BLAST prediction: Montoro_10x → Plasschaert without ionocyte",
    width=660, height=600, tint_cutoff=5, font_size=11
)
plotly.io.write_image(fig, "./montoro_10x_cb_sankey.pdf")

In [None]:
mba = cb.metrics.mean_balanced_accuracy(
    montoro_10x.obs["cell_ontology_class"],
    montoro_10x.obs["cb_pred"],
    montoro2plasschaertnoi_expectation
)
print(f"MBA = {mba:.3f}")

In [None]:
# Enrichment ratio for Cell BLAST: share of ionocytes among the rejected
# cells, divided by the share of ionocytes among all cells.
ionocyte_mask = montoro_10x.obs["cell_ontology_class"] == "ionocyte"
rejected_mask = montoro_10x.obs["cb_pred"] == "rejected"
n_rejected_ionocytes = np.logical_and(ionocyte_mask, rejected_mask).sum()
rejected_ionocyte_fraction = n_rejected_ionocytes / rejected_mask.sum()
overall_ionocyte_fraction = ionocyte_mask.sum() / montoro_10x.shape[0]
ionocyte_enrichment["cb"] = rejected_ionocyte_fraction / overall_ionocyte_fraction

#### Cluster rejected cells

In [None]:
# Subset of query cells that Cell BLAST rejected.
montoro_10x_unassigned = montoro_10x[montoro_10x.obs["cb_pred"] == "rejected", :]
# Latent coordinates come from the first model only.
montoro_10x_unassigned.latent = plasschaert_noi_blast.models[0].inference(montoro_10x_unassigned)
# Per-cell geometric mean over all entries of that cell's hit p-values
# (axis=None flattens hits.pval[i] first). The positions come from the same
# mask as the subset above, so the values line up with its rows.
montoro_10x_unassigned.obs["Average Cell BLAST p-value"] = np.array([
    scipy.stats.gmean(hits.pval[i], axis=None) for i in np.where(
        montoro_10x.obs["cb_pred"] == "rejected"
    )[0]
])

In [None]:
ax = montoro_10x_unassigned.visualize_latent(
    "Cell ontology class", method="tSNE", dr_kws=dict(perplexity=15), reuse=False,
    size=15, width=4, height=4,
    scatter_kws=dict(rasterized=True), random_seed=0
)
ax.get_figure().savefig("./montoro_10x_cb_unassigned_tsne_cl.pdf", dpi=300, bbox_inches="tight")

In [None]:
snn_graph = snn(montoro_10x_unassigned.latent, n_neighbors=15)

In [None]:
# Louvain clustering on the SNN graph; integer labels become "cluster <id>".
montoro_10x_unassigned.obs["Cluster"] = np.vectorize(
    lambda x: "cluster %d" % x
)(louvain(snn_graph, resolution=1.5, random_state=0))
# Store as categorical with categories ordered by the numeric id, so that
# e.g. "cluster 10" comes after "cluster 2" rather than before it.
montoro_10x_unassigned.obs["Cluster"] = pd.Categorical(
    montoro_10x_unassigned.obs["Cluster"],
    categories=sorted(np.unique(
        montoro_10x_unassigned.obs["Cluster"]
    ), key=lambda x: int(x.split(" ")[1]))
)

In [None]:
ax = montoro_10x_unassigned.visualize_latent(
    "Cluster", method="tSNE",
    size=15, width=4, height=4,
    scatter_kws=dict(rasterized=True)
)
ax.get_figure().savefig("./montoro_10x_cb_unassigned_tsne_cluster.pdf", dpi=300, bbox_inches="tight")

In [None]:
ax = montoro_10x_unassigned.visualize_latent(
    "Average Cell BLAST p-value", method="tSNE", sort=True,
    size=15, width=4, height=4,
    scatter_kws=dict(rasterized=True)
)
ax.get_figure().savefig("./montoro_10x_cb_unassigned_tsne_pval.pdf", dpi=300, bbox_inches="tight")

In [None]:
# Per-cluster distribution of the average Cell BLAST p-value among rejected
# cells: density histogram on the left axis, cumulative KDE on the right axis.
fig, ax = plt.subplots(figsize=(4, 4))
ax2 = ax.twinx()
legend_handles = []
n_clusters = np.unique(montoro_10x_unassigned.obs["Cluster"]).size
# Ask seaborn for exactly n_clusters colors: it cycles the current palette
# when more colors are requested than it holds, so indexing by cluster id
# cannot run out of range (plain sns.color_palette()[i] would).
cluster_colors = sns.color_palette(n_colors=n_clusters)
for i in range(n_clusters):
    # Select this cluster's p-values once and reuse them for both plots.
    cluster_pval = montoro_10x_unassigned.obs.loc[
        montoro_10x_unassigned.obs["Cluster"] == "cluster %d" % i, "Average Cell BLAST p-value"
    ]
    ax = sns.distplot(
        cluster_pval, color=cluster_colors[i],
        kde=False, hist_kws=dict(density=True), ax=ax
    )
    ax2 = sns.distplot(
        cluster_pval, color=cluster_colors[i],
        hist=False, kde_kws=dict(cumulative=True), ax=ax2
    )
    legend_handles.append(mpatches.Patch(color=cluster_colors[i], label="cluster %d" % i))
_ = plt.legend(handles=legend_handles, frameon=False, bbox_to_anchor=(1.17, 0.5), loc="center left", title="Cluster")
_ = ax.set(xlabel="Average Cell BLAST p-value", ylabel="Density")
_ = ax2.set(ylabel="Cumulative probability")
plt.xlim(-0.03, 0.45)
fig.savefig("./montoro_10x_cb_unassigned_pval_distribution.pdf", bbox_inches="tight")

### scmap

#### Querying

In [None]:
# Run the scmap evaluation script with plasschaert_noi.h5 as reference (-r),
# montoro_10x.h5 as query (-q) and montoro_10x_scmap_pred.h5 as output (-o),
# capturing its stdout and stderr.
p = subprocess.Popen([
    "Rscript", "run_scmap.R",
    "-r", "../Notebooks/Case/Trachea/plasschaert_noi.h5", "-n", "Plasschaert_noi",
    "-q", "../Notebooks/Case/Trachea/montoro_10x.h5",
    "-o", "../Notebooks/Case/Trachea/montoro_10x_scmap_pred.h5",
    "-g", "scmap_genes_clean", "-s", "0", "--threshold", "0.5",
    "--cluster-col", "cell_ontology_class"
], cwd="../../../Evaluation", stdout=subprocess.PIPE, stderr=subprocess.PIPE)
(output, err) = p.communicate()
exit_code = p.returncode  # communicate() has already waited for the process
print(output.decode())
print(err.decode())
if exit_code != 0:
    # Report failures explicitly: the following cell would otherwise read a
    # missing or stale prediction file without any hint of what went wrong.
    print("run_scmap.R exited with code %d" % exit_code, file=sys.stderr)

In [None]:
montoro_10x.obs["scmap_pred"] = cb.data.read_hybrid_path("./montoro_10x_scmap_pred.h5//prediction/0.5")

In [None]:
fig = cb.blast.sankey(
    montoro_10x.obs["cell_ontology_class"], montoro_10x.obs["scmap_pred"],
    title="scmap prediction: Montoro_10x → Plasschaert without ionocyte",
    width=660, height=600, tint_cutoff=5, font_size=11
)
plotly.io.write_image(fig, "./montoro_10x_scmap_sankey.pdf")

In [None]:
mba = cb.metrics.mean_balanced_accuracy(
    montoro_10x.obs["cell_ontology_class"],
    montoro_10x.obs["scmap_pred"],
    montoro2plasschaertnoi_expectation
)
print(f"MBA = {mba:.3f}")

In [None]:
# Enrichment ratio for scmap (which labels rejected cells "unassigned"):
# share of ionocytes among the rejected cells, divided by the share of
# ionocytes among all cells.
ionocyte_mask = montoro_10x.obs["cell_ontology_class"] == "ionocyte"
rejected_mask = montoro_10x.obs["scmap_pred"] == "unassigned"
n_rejected_ionocytes = np.logical_and(ionocyte_mask, rejected_mask).sum()
rejected_ionocyte_fraction = n_rejected_ionocytes / rejected_mask.sum()
overall_ionocyte_fraction = ionocyte_mask.sum() / montoro_10x.shape[0]
ionocyte_enrichment["scmap"] = rejected_ionocyte_fraction / overall_ionocyte_fraction

There are only 4 unassigned.

### CellFishing.jl

#### Querying

In [None]:
# Run the CellFishing.jl evaluation script (plasschaert_noi.h5 and
# montoro_10x.h5 as inputs, montoro_10x_cf_pred.h5 as the last path argument),
# capturing its stdout and stderr.
p = subprocess.Popen([
    "julia", "run_CellFishing.jl.jl",
    "--annotation=cell_ontology_class",
    "--gene=cf_genes", "--seed=0", "--cutoff=120",
    "../Notebooks/Case/Trachea/plasschaert_noi.h5",
    "../Notebooks/Case/Trachea/montoro_10x.h5",
    "../Notebooks/Case/Trachea/montoro_10x_cf_pred.h5"
], cwd="../../../Evaluation", stdout=subprocess.PIPE, stderr=subprocess.PIPE)
(output, err) = p.communicate()
exit_code = p.returncode  # communicate() has already waited for the process
print(output.decode())
print(err.decode())
if exit_code != 0:
    # Report failures explicitly: the following cell would otherwise read a
    # missing or stale prediction file without any hint of what went wrong.
    print("run_CellFishing.jl.jl exited with code %d" % exit_code, file=sys.stderr)

In [None]:
montoro_10x.obs["cf_pred"] = cb.data.read_hybrid_path("montoro_10x_cf_pred.h5//prediction/120")

In [None]:
fig = cb.blast.sankey(
    montoro_10x.obs["cell_ontology_class"], montoro_10x.obs["cf_pred"],
    title="CellFishing.jl prediction: Montoro_10x → Plasschaert without ionocyte",
    width=660, height=600, tint_cutoff=5, font_size=11
)
plotly.io.write_image(fig, "./montoro_10x_cf_sankey.pdf")

In [None]:
mba = cb.metrics.mean_balanced_accuracy(
    montoro_10x.obs["cell_ontology_class"],
    montoro_10x.obs["cf_pred"],
    montoro2plasschaertnoi_expectation
)
print(f"MBA = {mba:.3f}")

In [None]:
# Enrichment ratio for CellFishing.jl: share of ionocytes among the rejected
# cells, divided by the share of ionocytes among all cells.
ionocyte_mask = montoro_10x.obs["cell_ontology_class"] == "ionocyte"
rejected_mask = montoro_10x.obs["cf_pred"] == "rejected"
n_rejected_ionocytes = np.logical_and(ionocyte_mask, rejected_mask).sum()
rejected_ionocyte_fraction = n_rejected_ionocytes / rejected_mask.sum()
overall_ionocyte_fraction = ionocyte_mask.sum() / montoro_10x.shape[0]
ionocyte_enrichment["cf"] = rejected_ionocyte_fraction / overall_ionocyte_fraction

Too many false negative rejections.

## Ionocyte enrichment score

In [None]:
# One row per method key, with a single "Ionocyte enrichment ratio" column.
df = pd.DataFrame(ionocyte_enrichment, index=["Ionocyte enrichment ratio"]).T
# Short method keys -> display names used on the plot axis.
method_mapping = collections.OrderedDict(
    scmap="scmap",
    cf="CellFishing.jl",
    cb="Cell BLAST"
)
df["Method"] = [method_mapping[item] for item in df.index]
# Ascending sort by enrichment ratio
df = df.sort_values("Ionocyte enrichment ratio")

In [None]:
# Horizontal bar plot of the ionocyte enrichment ratio per method.
fig, ax = plt.subplots(figsize=(3.5, 3.5))
ax = sns.barplot(
    x="Ionocyte enrichment ratio", y="Method",
    data=df, palette=method_palette, ax=ax
)
for side in ("right", "top"):
    ax.spines[side].set_visible(False)
# Shrink every bar to a height of 0.7 and shift it by half the removed
# height so that it stays centered on its original position.
for bar in ax.patches:
    removed = bar.get_height() - 0.7
    bar.set_height(0.7)
    bar.set_y(bar.get_y() + removed * 0.5)
fig.savefig("montoro_10x_ionocyte_enrichment.pdf", bbox_inches="tight")