In [None]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn import metrics
from IPython.display import display

import rp2.data
from rp2 import hagai_2018

# Project-provided check run before any data is loaded; presumably it verifies
# the setup this notebook relies on — TODO confirm against the rp2 package.
rp2.check_environment()

In [None]:
# Columns whose unique combinations are treated as distinct conditions below.
condition_columns = [
    "replicate",
    "treatment",
    "time_point",
]

# Time points retained when filtering the datasets (stored as strings).
time_points = [
    "0",
    "2",
    "4",
    "6",
]

# Mouse gene annotations; later cells use the index as gene IDs and the
# `symbol` column for symbol lookups.
gene_info_df = rp2.load_biomart_gene_symbols_df("mouse")

In [None]:
# Load the UMI counts and keep only the cells at the selected time points.
mouse_umi_adata = hagai_2018.load_umi_counts("mouse")
mouse_umi_adata = mouse_umi_adata[mouse_umi_adata.obs["time_point"].isin(time_points)].copy()

print(
    "Full Hagai mouse dataset has:",
    f"  {mouse_umi_adata.n_obs:,} cells",
    f"  {mouse_umi_adata.n_vars:,} genes",
    sep="\n",
)

# Pin the expected dimensions so a change in the input data is noticed immediately.
assert mouse_umi_adata.n_obs == 53_086
assert mouse_umi_adata.n_vars == 22_048

# Only needed for the summary above; drop the reference.
del mouse_umi_adata

In [None]:
# Load the median-scaled counts and apply the same time point filter as for the UMI data.
mouse_counts_adata = hagai_2018.load_counts("mouse", scaling="median")
mouse_counts_adata = mouse_counts_adata[mouse_counts_adata.obs["time_point"].isin(time_points)].copy()

print(
    "Scaled Hagai mouse dataset has:",
    f"  {mouse_counts_adata.n_vars:,} genes",
    sep="\n",
)

# Pin the expected dimensions so a change in the input data is noticed immediately.
assert mouse_counts_adata.n_obs == 53_086
assert mouse_counts_adata.n_vars == 16_798

In [None]:
# IDs of the genes whose `lps_responsive` flag is set in the gene annotations.
lps_responsive_gene_ids = mouse_counts_adata.var.index[mouse_counts_adata.var["lps_responsive"]]
print(f"{len(lps_responsive_gene_ids):,} genes are LPS-responsive")

assert len(lps_responsive_gene_ids) == 2_336

In [None]:
# Genes analysed in addition to the LPS-responsive set, looked up by symbol.
additional_gene_symbols = ["Tnf"]
additional_gene_ids = gene_info_df.index[gene_info_df["symbol"].isin(additional_gene_symbols)]

# Sorted, de-duplicated union of both gene ID collections.
analysis_gene_ids = sorted(set(lps_responsive_gene_ids) | set(additional_gene_ids))

print(f"{len(analysis_gene_ids):,} genes to be used in analysis")

assert len(analysis_gene_ids) == 2_337

In [None]:
# Distinct combinations of the condition columns present in the filtered dataset.
condition_df = mouse_counts_adata.obs[condition_columns].drop_duplicates()

print(f"{len(condition_df)} conditions per gene")
print(f"{len(condition_df) * len(analysis_gene_ids)} conditions overall")

display(condition_df.replicate.value_counts().sort_index())

assert len(condition_df) == 20
# Check the computed product instead of comparing two literals, so the
# assertion can actually fail if either count changes.
assert len(condition_df) * len(analysis_gene_ids) == 46_740

# Only needed for the summary above; drop the reference.
del condition_df

In [None]:
# Restrict the scaled counts to the analysis genes before computing the statistics.
analysis_count_adata = mouse_counts_adata[:, analysis_gene_ids].copy()
gene_condition_stats_df = hagai_2018.calculate_counts_condition_stats(analysis_count_adata)

# Expected row count equals (number of conditions) x (number of analysis genes).
assert len(gene_condition_stats_df) == 46_740

In [None]:
def fit_mean_variance_trends(df):
    """Fit a robust linear trend of variance against mean.

    The model is ``variance ~ const + mean``, fitted with statsmodels' ``RLM``
    using a Huber T norm (``t=1.345``).

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``mean`` and ``variance`` columns; each row is one
        observation used in the fit.

    Returns
    -------
    pandas.Series
        ``intercept`` and ``slope`` of the fitted line, their p-values
        (``intercept_pval``, ``slope_pval``), and the R2 of the fitted values
        computed without weights (``r2_unweighted``) and with the final RLM
        weights as sample weights (``r2_weighted``).
    """
    x = sm.add_constant(df["mean"])
    y = df["variance"]
    rlm_results = sm.RLM(y, x, M=sm.robust.norms.HuberT(t=1.345)).fit()

    # Read coefficients by position through plain arrays. With pandas inputs the
    # results are Series labelled by exog column name, and integer keys with
    # `[]` on such a Series rely on a positional fallback that pandas deprecates.
    coefficients = np.asarray(rlm_results.params)
    p_values = np.asarray(rlm_results.pvalues)
    fitted = rlm_results.fittedvalues

    results = {
        "intercept": coefficients[0],
        "slope": coefficients[1],
        "intercept_pval": p_values[0],
        "slope_pval": p_values[1],
        "r2_unweighted": metrics.r2_score(y, fitted),
        "r2_weighted": metrics.r2_score(y, fitted, sample_weight=rlm_results.weights),
    }
    return pd.Series(results)


# Named groups of treatments over which the per-gene trends are fitted.
treatment_sets = {
    "all": ["unst", "lps", "pic"],
    # Per-stimulus subsets, currently disabled:
    # "lps": ["unst", "lps"],
    # "pic": ["unst", "pic"],
}

# For every treatment set: keep the matching rows, then fit one trend per gene.
mv_fit_map = {
    name: (
        gene_condition_stats_df
        .loc[gene_condition_stats_df["treatment"].isin(treatments)]
        .groupby("gene")
        .apply(fit_mean_variance_trends)
    )
    for name, treatments in treatment_sets.items()
}

In [None]:
# Derive the acceptance flags on a fresh frame (`assign` returns a new object,
# so the fits stored in `mv_fit_map` are left untouched).
all_treatment_mv_fit = mv_fit_map["all"].assign(
    accept_intercept=lambda fit: fit["intercept_pval"] < 0.05,
    accept_slope=lambda fit: fit["slope_pval"] < 0.05,
    # A fit is accepted only when both coefficients pass their threshold.
    accept_fit=lambda fit: fit["accept_intercept"] & fit["accept_slope"],
    accept_r2=lambda fit: fit["r2_unweighted"] > 0.6,
)

# Number of genes passing each acceptance criterion.
display(
    all_treatment_mv_fit
    .loc[:, all_treatment_mv_fit.columns.str.startswith("accept_")]
    .agg(np.count_nonzero)
)

print(f"{np.count_nonzero(all_treatment_mv_fit['accept_fit']):,} mean-variance trends are significant")

In [None]:
# Keep the genes whose fit is accepted and whose unweighted R2 passes the threshold.
all_treatment_good_mv_fit = all_treatment_mv_fit[
    all_treatment_mv_fit["accept_fit"] & all_treatment_mv_fit["accept_r2"]
]

print(
    f"{len(all_treatment_good_mv_fit):,} mean-variance trends have a good fit (based on unweighted R2)",
    f"i.e. {100 * (len(all_treatment_good_mv_fit) / len(analysis_gene_ids)):.1f}%",
    sep="\n",
)

In [None]:
# Load the txburst results and apply the same time point filter as for the counts.
txburst_df = rp2.data.load_txburst_results("mouse", condition_columns, "median")
txburst_df = txburst_df[txburst_df["time_point"].isin(time_points)].copy()

# Same totals as the count-based statistics: 46,740 rows over 20 distinct conditions.
assert len(txburst_df) == 46_740
assert len(txburst_df.drop_duplicates(subset=condition_columns)) == 20

In [None]:
# Flag rows where both values of each pair are present (non-null).
txburst_df["valid_points"] = txburst_df[["bs_point", "bf_point"]].notna().all(axis=1)
txburst_df["valid_bs_interval"] = txburst_df[["bs_lower", "bs_upper"]].notna().all(axis=1)
txburst_df["valid_bf_interval"] = txburst_df[["bf_lower", "bf_upper"]].notna().all(axis=1)
txburst_df["valid_intervals"] = txburst_df["valid_bs_interval"] & txburst_df["valid_bf_interval"]

# Count of rows flagged by `keep` and by each validity column.
display(
    txburst_df[["keep", *(col for col in txburst_df.columns if col.startswith("valid_"))]]
    .agg(np.count_nonzero)
)