In [None]:
import anndata
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression

from rp2 import get_data_path

In [None]:
# Read one sheet of the workbook stored under the "hagai_2018" data directory.
# NOTE(review): the sheet name spelling ("diveregnce") presumably matches the tab
# name inside the workbook -- confirm against the file before "correcting" it.
phagocyte_workbook_path = get_data_path("hagai_2018", "41586_2018_657_MOESM4_ESM.xlsx")
phagocyte_genes_df = pd.read_excel(phagocyte_workbook_path, sheet_name="phagocytes_FC_diveregnce")
display(phagocyte_genes_df)

In [None]:
# Keep the `gene` entries of rows whose `mouse_padj` is below 0.01
# (presumably an adjusted p-value -- confirm against the table's documentation).
# Rows with a missing `mouse_padj` compare as False and are therefore dropped.
is_responsive = phagocyte_genes_df["mouse_padj"] < 0.01
responsive_phagocyte_genes = phagocyte_genes_df.loc[is_responsive, "gene"]
print(f"{len(responsive_phagocyte_genes):,} responsive phagocyte genes")

Load Hagai *et al.* (2018) UMI counts and prepare them for mean-variance calculations

In [None]:
ae_path = get_data_path("ArrayExpress")

# Selection used for the mean-variance analysis: which genes to keep, and which
# labels are accepted in the corresponding `adata.obs` columns.
mv_genes = responsive_phagocyte_genes
mv_replicates = ["1", "2", "3"]
mv_treatments = ["unst", "lps"]
mv_time_points = ["0", "2", "4", "6"]

adata = anndata.read_h5ad(ae_path.joinpath("E-MTAB-6754.processed.2.mouse.h5ad"))

# Restrict the variables to the selected genes first, then narrow the observations
# one obs column at a time (same order as listed in the mapping below).
adata = adata[:, mv_genes]
obs_filters = {
    "replicate": mv_replicates,
    "treatment": mv_treatments,
    "time_point": mv_time_points,
}
for obs_column, allowed_values in obs_filters.items():
    adata = adata[adata.obs[obs_column].isin(allowed_values), :]

# Each subscript above yields a subset of the previous object; finish with an
# explicit copy so later cells work on a standalone AnnData object.
adata = adata.copy()

In [None]:
# Rank genes by the variance of their values across all selected cells, most
# variable first. NumPy's default (ddof=0) is used here; the result only serves
# to order the genes.
# `.toarray()` replaces the `.A` shorthand: it is the explicit densification call
# and is available on both SciPy sparse matrices and sparse arrays.
# Assumes `adata.X` is a SciPy sparse container (as the original `.A` access did).
expression_matrix = adata.X.toarray()
gene_variability = pd.Series(
    index=adata.var.index,
    data=expression_matrix.var(axis=0),
).sort_values(ascending=False)
print(gene_variability)

In [None]:
# Per-gene mean / sample variance / sample standard deviation for every
# (replicate, treatment, time point) group of cells.
#
# The per-group frames are collected in a list and concatenated once at the end:
# `DataFrame.append` was removed in pandas 2.0, and growing a frame inside a loop
# re-copies all previous rows on every iteration.
group_stats_frames = []

for (replicate, treatment, time_point), group_df in adata.obs.groupby(["replicate", "treatment", "time_point"]):
    print(f"Calculating mean and variance for replicate {replicate} treatment {treatment} time point {time_point}")

    cell_view = adata[group_df.index, :]

    # Densify the group's matrix once and reuse it for all three statistics.
    # Assumes `X` is a SciPy sparse container (the original code used `.A`).
    group_counts = cell_view.X.toarray()

    group_stats_frames.append(pd.DataFrame(data={
        # `mv_genes` was used to subset the variables of `adata`, so its order
        # matches the columns of `group_counts`.
        "gene": mv_genes,
        "replicate": replicate,
        "treatment": treatment,
        "time_point": time_point,
        "n_barcodes": cell_view.n_obs,
        "mean": group_counts.mean(axis=0),
        # ddof=1: sample (unbiased) estimators.
        "variance": group_counts.var(axis=0, ddof=1),
        "std_dev": group_counts.std(axis=0, ddof=1),
    }))

# `pd.concat` rejects an empty list; fall back to an empty frame in that case,
# which is what the original accumulation would have produced.
gene_stats_df = pd.concat(group_stats_frames) if group_stats_frames else pd.DataFrame()

display(gene_stats_df)

Show mean-variance regression plots for the 10 most variable genes

In [None]:
# One figure per gene: scatter of per-group (mean, variance) pairs plus the
# least-squares line of variance on mean.
for gene in gene_variability.index[:10]:
    gene_df = gene_stats_df[gene_stats_df["gene"] == gene]

    # LinearRegression expects 2-D inputs, hence the single-column reshape.
    mean_values = gene_df["mean"].values.reshape(-1, 1)
    var_values = gene_df["variance"].values.reshape(-1, 1)

    lr = LinearRegression()
    lr.fit(mean_values, var_values)
    lr_y = lr.predict(mean_values)

    fig, ax = plt.subplots()
    ax.scatter(mean_values, var_values)
    ax.plot(mean_values, lr_y)
    ax.set_title(gene)
    ax.set_xlabel("Mean")
    ax.set_ylabel("Variance")
    # Anchor both axes at zero, extending below it only if the data or the
    # fitted line go negative.
    ax.set_xlim(left=min(0, np.min(mean_values)))
    ax.set_ylim(bottom=min(0, np.min(lr_y)))
    plt.show()