In [None]:
import anndata
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from scipy import linalg, spatial, stats
from sklearn.linear_model import LinearRegression

from rp2 import get_data_path, get_output_path, GeneSymbolMap

In [None]:
# Read one sheet of the Hagai et al. (2018) supplementary workbook into a DataFrame.
# NOTE(review): the sheet name is passed to pandas verbatim; its spelling presumably
# matches the workbook, so confirm against the file before "correcting" it.
phagocyte_xlsx_path = get_data_path("hagai_2018", "41586_2018_657_MOESM4_ESM.xlsx")
phagocyte_genes_df = pd.read_excel(phagocyte_xlsx_path, sheet_name="phagocytes_FC_diveregnce")
display(phagocyte_genes_df)

In [None]:
# Keep the genes whose `mouse_padj` value (presumably an adjusted p-value — confirm
# against the source table) is below 0.01, and report how many remain.
is_responsive = phagocyte_genes_df["mouse_padj"] < 0.01
responsive_phagocyte_genes = phagocyte_genes_df.loc[is_responsive, "gene"]
print(f"{len(responsive_phagocyte_genes):,} responsive phagocyte genes")

Load Hagai *et al.* (2018) UMI counts and prepare them for mean-variance calculations

In [None]:
# Selection used for the mean-variance calculations: which genes, replicates,
# treatments and time points are retained.
mv_genes = responsive_phagocyte_genes
mv_replicates = ["1", "2", "3"]
mv_treatments = ["unst", "lps"]
mv_time_points = ["0", "2", "4", "6"]

ae_path = get_data_path("ArrayExpress")
adata = anndata.read_h5ad(ae_path.joinpath("E-MTAB-6754.processed.2.mouse.h5ad"))

# Subset the variables (genes) first, then apply each observation filter in turn;
# every filter is evaluated against the already-subset object.
adata = adata[:, mv_genes]
for obs_column, allowed_values in (
    ("replicate", mv_replicates),
    ("treatment", mv_treatments),
    ("time_point", mv_time_points),
):
    adata = adata[adata.obs[obs_column].isin(allowed_values), :]

# Finish with an explicit copy of the fully filtered selection.
adata = adata.copy()

In [None]:
# Per-gene variance of the expression matrix over all retained observations,
# sorted from most to least variable.
# `.toarray()` replaces the `.A` shorthand: `.A` only exists on the legacy SciPy
# `spmatrix` classes, whereas `.toarray()` is available on both sparse matrices
# and sparse arrays and returns the same dense ndarray.
gene_variability = pd.Series(
    index=adata.var.index,
    data=adata.X.toarray().var(axis=0),
).sort_values(ascending=False)
print(gene_variability)

In [None]:
# Load the BioMart mouse gene table (no header row in the file: column names are
# supplied here, and the first column, "id", becomes the index), then build the
# gene-symbol lookup from it.
mouse_genes_path = get_data_path("BioMart", "mouse_genes.tsv")
mouse_genes_df = pd.read_table(
    mouse_genes_path,
    names=["id", "symbol", "description"],
    index_col=0,
)
symbol_map = GeneSymbolMap(mouse_genes_df)

In [None]:
# Descriptive statistics for every gene within each
# (replicate, treatment, time point) group of observations.
#
# The per-group frames are collected in a list and concatenated once at the end:
# `DataFrame.append` was removed in pandas 2.0, and appending inside the loop
# re-copied the accumulated frame on every iteration.
group_stats_frames = []

for (replicate, treatment, time_point), group_df in adata.obs.groupby(["replicate", "treatment", "time_point"]):
    print(f"Calculating mean and variance for replicate={replicate}, treatment={treatment}, time={time_point}h")

    cell_view = adata[group_df.index, :]

    # Densify once per group instead of once per statistic. `.toarray()` is used
    # rather than `.A` because it exists on both sparse matrices and sparse arrays.
    group_values = cell_view.X.toarray()

    group_stats_frames.append(
        pd.DataFrame(
            data={
                "gene": mv_genes,
                "replicate": replicate,
                "treatment": treatment,
                "time_point": time_point,
                "n_barcodes": cell_view.n_obs,
                "mean": group_values.mean(axis=0),
                # ddof=1: sample (n - 1) variance / standard deviation.
                "variance": group_values.var(axis=0, ddof=1),
                "std_dev": group_values.std(axis=0, ddof=1),
            },
        )
    )

# ignore_index=True gives every row a unique label, which the later label-based
# outlier assignment relies on.
gene_stats_df = pd.concat(group_stats_frames, ignore_index=True)
gene_stats_df = gene_stats_df.sort_values(["gene", "replicate", "time_point", "treatment"])
display(symbol_map.added_to(gene_stats_df))

Determine outliers based on Mahalanobis distance (equivalently, a squared distance above the 95% chi-squared quantile with 2 degrees of freedom)

In [None]:
# Cut-off on the Mahalanobis distance: square root of the 95% quantile of a
# chi-squared distribution with 2 degrees of freedom (one per coordinate).
outlier_distance_threshold = np.sqrt(stats.chi2.ppf(0.95, 2))

gene_stats_df["outlier"] = False

for _, per_gene_stats in gene_stats_df.groupby("gene"):
    # One (mean, variance) point per row of this gene.
    points = per_gene_stats.loc[:, ["mean", "variance"]].to_numpy()
    center = points.mean(axis=0)
    inverse_cov = linalg.inv(np.cov(points, rowvar=False))

    # Row labels whose point lies further from the gene's centroid than the cut-off.
    flagged_rows = [
        row_label
        for row_label, point in zip(per_gene_stats.index, points)
        if spatial.distance.mahalanobis(point, center, inverse_cov) > outlier_distance_threshold
    ]

    gene_stats_df.loc[flagged_rows, "outlier"] = True

print(f"Outliers: {gene_stats_df.outlier.sum():,}")

Plot histograms of mean and variance for all non-outliers

In [None]:
# Log-scaled histograms restricted to rows not flagged as outliers: one for the
# per-row mean, one for the per-row variance, then one for each gene's largest mean.
non_outlier_stats = gene_stats_df.loc[~gene_stats_df.outlier]

for column_name, axis_label in (("mean", "Mean"), ("variance", "Variance")):
    hist_ax = non_outlier_stats[column_name].plot.hist(log=True)
    hist_ax.set(xlabel=axis_label)
    plt.show()

max_mean_per_gene = non_outlier_stats.groupby("gene")["mean"].max()
max_mean_ax = max_mean_per_gene.plot.hist(log=True)
max_mean_ax.set(xlabel="Max mean")
plt.show()

Fit linear regression model to mean-variance relationship of all genes

In [None]:
def fit_mean_var_regression(gene_df):
    """Fit variance as a linear function of mean, ignoring flagged outliers.

    Parameters
    ----------
    gene_df : pandas.DataFrame
        Rows to fit; must provide ``mean`` and ``variance`` columns and a boolean
        ``outlier`` column. Rows with ``outlier`` set are excluded from the fit.

    Returns
    -------
    pandas.Series
        ``slope`` and ``intercept`` of the fitted line, and ``r2``, the score of
        the model on the same non-outlier rows it was fitted to.
    """
    keep = ~gene_df.outlier.to_numpy()
    retained = gene_df[["mean", "variance"]].to_numpy()[keep]

    # Single-column 2-D arrays of shape (n_rows, 1) for both predictor and target.
    means = retained[:, [0]]
    variances = retained[:, [1]]

    model = LinearRegression().fit(means, variances)
    fit_score = model.score(means, variances)

    # coef_ / intercept_ carry singleton dimensions for a 2-D target; squeeze them away.
    return pd.Series(data={
        "slope": np.squeeze(model.coef_),
        "intercept": np.squeeze(model.intercept_),
        "r2": fit_score,
    })


# Fit one regression per gene; each gene's Series of results becomes one row.
stats_grouped_by_gene = gene_stats_df.groupby("gene")
gene_regression_df = stats_grouped_by_gene.apply(fit_mean_var_regression)
display(symbol_map.added_to(gene_regression_df))

Save the descriptive statistics and results of fitting the regression models

In [None]:
# Make sure the output directory exists, then write both tables as CSV.
output_path = get_output_path()
output_path.mkdir(parents=True, exist_ok=True)

stats_csv_path = output_path.joinpath("mouse_stats_per_condition_per_gene.csv")
regression_csv_path = output_path.joinpath("mouse_lr_fit_per_gene.csv")

# The statistics table is written without its index; the regression table keeps
# its index (the default for to_csv).
gene_stats_df.to_csv(stats_csv_path, index=False)
gene_regression_df.to_csv(regression_csv_path)

Show mean-variance plots for the 10 most variable genes

In [None]:
# Toggle: when True, outlier conditions are drawn (as crosses) and the fitted line
# spans their means too; when False, only non-outlier conditions are used.
plot_outliers = True

for gene in gene_variability.index[:10]:
    gene_df = gene_stats_df.loc[gene_stats_df.gene == gene]

    mean_and_var = gene_df[["mean", "variance"]].to_numpy()
    outlier_mask = gene_df.outlier.to_numpy()

    slope, intercept = gene_regression_df.loc[gene, ["slope", "intercept"]]

    # x-values at which the fitted line is evaluated. They are kept 1-D so that
    # np.sort really orders them: on an (n, 1) column, np.sort's default
    # axis=-1 sorts along the length-1 axis and leaves the order unchanged.
    lr_plot_x = mean_and_var[:, 0]
    if not plot_outliers:
        lr_plot_x = lr_plot_x[~outlier_mask]
    lr_plot_x = np.sort(lr_plot_x)
    lr_plot_y = (lr_plot_x * slope) + intercept

    if plot_outliers:
        plt.scatter(*mean_and_var[outlier_mask].T, marker="x")
    plt.scatter(*mean_and_var[~outlier_mask].T, marker="o")
    plt.plot(lr_plot_x, lr_plot_y)
    plt.title(f"{symbol_map.lookup(gene)} / {gene}")
    plt.xlabel("Mean")
    plt.ylabel("Variance")
    # Make sure the origin is inside the axes even when the line stays positive.
    plt.xlim(left=min(0, np.min(lr_plot_x)))
    plt.ylim(bottom=min(0, np.min(lr_plot_y)))
    plt.show()