# Comparing simulations against the data from Mitchell
Simulated data generated by `simulations.sh parameters.txt`.

In [None]:
# Enable IPython's autoreload extension in mode 2, so that imported modules
# are reloaded before each cell is executed.
%load_ext autoreload
%autoreload 2

In [None]:
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from matplotlib.lines import Line2D
import numpy as np
import pandas as pd
import socket
import seaborn as sns
import subprocess
from pathlib import Path

from hscpy import mitchell, realisation, parameters
from hscpy.figures import PlotOptions, simulations, ToCellFrequency
from hscpy.figures import burden as burden_fig
from hscpy.figures import sfs as sfs_fig
from hscpy import variant

from futils import parse_version, snapshot

# When True the version is parsed from the output of the local `hsc` binary,
# otherwise a pinned version string is used.
LATEST = True
# Forwarded to PlotOptions(save=...); the SFS cells save figures only when set.
SAVEFIG = True
# Switches to a smaller figure size.
BIGLABELS = False
FIGSIZE = [5, 3] if BIGLABELS else [6.4, 4.8]  # default matplotlib
# File extension appended to the name of the saved figures.
EXTENSION = ".svg"
# Directory under which the `target/release/hsc` binary is looked up.
PATH2HSC = Path("~").expanduser() / "hsc"
PLOT_OPTIONS = PlotOptions(figsize=FIGSIZE, extension=EXTENSION, save=SAVEFIG)
# Passed as `pop_size` when computing the sampling correction of the SFS.
NCELLS = 100_000

In [None]:
# The location of Mitchell's data depends on the machine running the notebook;
# hosts that are not listed fall back to the home directory.
_hostname = socket.gethostname()
_known_hosts = {
    "5X9ZYD3": "/mnt/c/Users/terenz01/Documents/SwitchDrive/PhD/hsc",
    "LAPTOP-CEKCHJ4C": "/mnt/c/Users/fra_t/Documents/PhD/hsc",
}
PATH2MITCHELL = (
    Path(_known_hosts[_hostname])
    if _hostname in _known_hosts
    else Path("~").expanduser()
)

In [None]:
%%bash -s "$PATH2HSC" --out version
# $1 is PATH2HSC (passed through -s); --out stores the script's stdout in the
# Python variable `version`.
$1/target/release/hsc  --version

In [None]:
# Either take the version reported by the local binary or pin a fixed one; the
# simulation outputs are read from directories named after this version.
VERSION = parse_version(version) if LATEST else "sfs/v3.0.6"
PATH2SAVE = Path(f"./{VERSION}")

print("Running hsc with version:", VERSION)

In [None]:
# Donor table; the cells below read its `name`, `age`, `cells` and `clones`
# columns.
donors = mitchell.donors()
donors

## Single-cell mutational burden

In [None]:
%%time
# mitchell's donors
burden_donors = list()
for donor in donors.itertuples():
    print("loading burden for donor", donor.name)
    burden_donors.append(
        mitchell.burden_donor_mitchell(donor.name, donor.age, PATH2MITCHELL, False)
    )

In [None]:
# DONORS only: overlay the normalised burden of every donor on one axis and
# collect the per-donor mean and variance for the summary plots below.
fig, ax = plt.subplots(1, 1)
means, variances = [], []
# zip stops at the shorter input, so at most one donor per Tableau colour
for b, c in zip(burden_donors, mcolors.TABLEAU_COLORS.values()):
    donor_name, donor_age, hist = b[0], b[1], b[3]
    donor_row = donors[donors.name == donor_name]

    # the histogram counts must add up to the donor's number of cells
    tot_cells = sum(hist.values())
    assert tot_cells == donor_row.cells.iloc[0]

    array = snapshot.array_from_hist(hist)
    means.append((donor_name, donor_age, array.mean()))
    variances.append((donor_name, donor_age, array.var()))

    style = dict(ls="-", marker=".", mew=3, alpha=0.5, color=c)
    burden_fig.plot_burden(
        ax,
        hist,
        normalise=True,
        options=PLOT_OPTIONS,
        label=f"{donor_row['age'].iloc[0]} y.o.",
        **style,
    )
ax.legend(fontsize="small", ncols=2)
plt.show()

In [None]:
# For every donor, read the simulated burdens stored under the
# `<cells>cells/burden` directory and keep only those taken at the donor's age.
burden_sims = {
    donor.name: realisation.load_all_burden_by_age(
        Path(f"{VERSION}/{donor.cells}cells/burden")
    )[donor.age]
    for donor in donors.itertuples()
}

In [None]:
# check that the burden in the sims match the data: one figure per donor with
# the pooled simulated burden (grey) and the donor's burden (orange)
means_var_s = []
for b in burden_donors:
    donor_name, donor_age, hist = b[0], b[1], b[3]
    print(donor_name)
    fig, ax = plt.subplots(1, 1)

    # sims: pool all the runs of this donor into one distribution
    sims = burden_sims[donor_name]
    pooled = snapshot.Uniformise.pooled_distribution([bur.burden for bur in sims])
    m_, v_ = realisation.compute_mean_variance(pooled)
    burden_fig.plot_burden(
        ax,
        pooled,
        normalise=False,
        color="grey",
        marker=".",
        alpha=0.5,
        label=f"{len(sims)} sims",
        options=PLOT_OPTIONS,
    )
    means_var_s.append((donor_name, donor_age, m_, v_))

    # data: the histogram counts must add up to the donor's number of cells
    tot_cells = sum(hist.values())
    assert tot_cells == donors[donors.name == donor_name].cells.iloc[0]
    burden_fig.plot_burden(
        ax,
        hist,
        normalise=True,
        color="#d95f0e",
        marker=".",
        bins=10,
        alpha=0.5,
        label=f"{donor_age} y.o.",
        options=PLOT_OPTIONS,
    )
    ax.legend(fontsize="small")
    plt.show()

In [None]:
# Mean single-cell burden against age: donors versus pooled simulations.
means_df = pd.DataFrame(means, columns=["name", "age", "mean"])
means_var_df_s = pd.DataFrame(means_var_s, columns=["name", "age", "mean", "variance"])

fig, ax = plt.subplots(1, 1)
means_df.plot(
    x="age", y="mean", ax=ax, marker=".", color="#d95f0e", label="Mitchell data"
)
means_var_df_s.plot(x="age", y="mean", ax=ax, color="grey", label="pooled sims")
ax.set(title="Mean", xlabel="Time [years]", ylabel="Single-cell burden")
plt.show()

In [None]:
# Variance of the single-cell burden against age: donors versus pooled
# simulations (the simulated values come from `means_var_df_s`).
variances_df = pd.DataFrame(variances, columns=["name", "age", "variance"])

fig, ax = plt.subplots(1, 1)
variances_df.plot(
    x="age", y="variance", ax=ax, marker=".", color="#d95f0e", label="Mitchell data"
)
means_var_df_s.plot(x="age", y="variance", ax=ax, color="grey", label="pooled sims")
ax.set(title="Variance", xlabel="Time [years]", ylabel="Single-cell burden")
plt.show()

In [None]:
# Compare mean and variance of the burden for the first two donors (reported
# below as the two neonates) with the simulated values at age 0.
first_array = snapshot.array_from_hist(burden_donors[0][3])
second_array = snapshot.array_from_hist(burden_donors[1][3])
sims_at_birth = means_var_df_s.loc[means_var_df_s["age"] == 0]

m1, m2 = first_array.mean(), second_array.mean()
print(
    "The mean single-cell mut burden of the two neoborns computed "
    f"from the genotype matrix is: {m1:.2f}, {m2:.2f}",
)
print("from the sims:", sims_at_birth["mean"].to_numpy())

m1, m2 = first_array.var(), second_array.var()
print(
    "The variance single-cell mut burden of the two neoborns computed "
    f"from the genotype matrix is: {m1:.2f}, {m2:.2f}",
)
print("from the sims:", sims_at_birth["variance"].to_numpy())

In [None]:
# regress neutral donors: least-squares line of the mean burden against age,
# fitted on the neutral donors only and drawn over the burden of all donors
fig, ax = plt.subplots(1, 1)
# neutral donors have no detected exp clone
neutral_donors = {"CB002", "KX001", "SX001"}
is_neutral = means_df.name.isin(neutral_donors)
x, y = means_df.loc[is_neutral, "age"], means_df.loc[is_neutral, "mean"]
# design matrix [age, 1] so that the solution is (slope m, intercept c)
A = np.vstack([x, np.ones(len(x))]).T
m, c = np.linalg.lstsq(A, y, rcond=None)[0]
ax.plot(donors.age, m * donors.age + c, "black", linewidth=2, linestyle="--")
for donor in donors.itertuples():
    # first loaded burden whose name matches this donor
    d_burden = next(d for d in burden_donors if d[0] == donor.name)
    array = snapshot.array_from_hist(d_burden[3])
    # one dot per entry of the expanded histogram, all at the donor's age
    ax.plot([d_burden[1]] * array.shape[0], array, ls="", marker=".", alpha=0.3)
    # black cross on the donor's mean burden
    ax.plot([d_burden[1]], array.mean(), ls="", marker="x", mew=2, color="black")
print(m, c)
ax.set_ylabel("Single-cell burden")
ax.set_xlabel("Time [years]")
# the slope is annotated at fixed data coordinates
ax.text(x=1, y=1500, s=f"m={m:.2f}")
plt.show()

## Expanded clones

In [None]:
%%time
counts_sims = dict()

for donor in donors[["name", "age", "cells"]].itertuples():
    print(
        f"\tloading sims variant counts for donor {donor.name} with {donor.cells} cells"
    )
    counts_sims.update(
        variant.load_all_detected_var_counts_by_age(
            Path(f"{VERSION}/{donor.cells}cells/variant_fraction"), 0.01
        )
    )

In [None]:
# Detected variant counts in the simulations (line with a min-max band)
# against the number of clones listed for each donor.
counts = variant.variant_counts_detected_df(counts_sims)


def _min_max(values):
    """Return the (min, max) pair used as the error band of the line plot."""
    return (np.min(values), np.max(values))


fig, ax = plt.subplots(1, 1)
sns.lineplot(
    data=counts,
    x="age",
    y="variant counts detected",
    errorbar=_min_max,
    ax=ax,
    color="grey",
    alpha=0.3,
    label="min-max",
)
ax.plot(donors.age, donors.clones, marker=".", color="#d95f0e", label="Mitchell")
ax.legend()
plt.show()
summary_by_age = counts[["variant counts detected", "age"]].groupby("age").describe()
print(summary_by_age)

## SFS 
Combine different data for this plot:
1. 1/f^2 sampled prediction (computed here in python)
2. Mitchell's SFS (loaded and computed here in python)
3. 1/f sampled prediction from Nate's (loaded from external file)
4. SFS from simulations (need to generate them)

### Generate/load/compute the data
#### 1. 1/f^2 predictions


In [None]:
%%time
# compute the correction for the sims' SFS with sampled distributions from
# https://www.biorxiv.org/content/10.1101/2022.11.07.515470v2
corrected_variants_one_over_1_squared = dict()
for donor in donors.itertuples():
    print(
        f"apply sampling correction to SFS of donor {donor.name} with age {donor.age} with sample size {donor.cells}"
    )
    corrected_variants_one_over_1_squared[donor.name] = realisation.compute_variants(
        realisation.Correction.ONE_OVER_F_SQUARED,
        pop_size=NCELLS,
        sample_size=donor.cells,
    )

#### 2. Mitchell's SFS

In [None]:
%%time
# there are two donors with the same age 0
mitchell_sfs = {
    donor.name: mitchell.sfs_donor_mitchell(
        donor.name, donor.age, PATH2MITCHELL, remove_indels=False
    )
    for donor in donors.itertuples()
}

In [None]:
# Sanity check: entry 2 of every loaded SFS record must equal the number of
# cells listed for that donor in `donors`.
assert all(
    m[2] == donors.loc[donors.name == name, "cells"].squeeze()
    for name, m in mitchell_sfs.items()
), "number of cells loaded for the SFS does not match those in donors"

#### 3. 1/f sample predictions

In [None]:
# Theoretical homeostatic neutral SFS, from Nate's paper in eLife: for each
# patient (skipping the neonates) the system was evolved until the patient's
# age and then sampled to the same size as in the data.
# The first unique age is dropped and the remaining ones are paired with the
# prediction files numbered from pid3 onwards.
ages_with_prediction = donors.age.unique().tolist()[1:]
mapping = {
    age: f"predictions_1_over_f/homeostasisSFS_pid{pid}.csv"
    for pid, age in enumerate(ages_with_prediction, start=3)
}
mapping

#### 4. SFS from simulations
The data have been generated with the cmd `sfs.sh parameters.txt`.

In [None]:
%%time
sfs_sims = dict()
for donor in donors.itertuples():
    sfs_sims[donor.name] = realisation.load_all_sfs_by_age(
        Path(f"{VERSION}/{donor.cells}cells/sfs")
    )[donor.age]

#### Plots

In [None]:
# SFS per donor: theoretical predictions against Mitchell's data, without any
# simulation (sfs_sims_donor and idx_sim2plot are None).
# NOTE(review): `selected` is not read by any live code in this cell.
selected = ["CB002", "KX002", "KX008"]
for name in donors.name.unique().tolist():
    age = donors.loc[donors.name == name, "age"].squeeze()
    fig, ax = plt.subplots(1, 1, layout="constrained", figsize=PLOT_OPTIONS.figsize)
    sfs_fig.plot_ax_sfs_predictions_data_sims(
        ax,
        donor=donors[donors.name == name].squeeze(),
        corrected_one_over_1_squared=corrected_variants_one_over_1_squared[name],
        sfs_sims_donor=None,
        mitchell_sfs=mitchell_sfs[name][3],
        # ages without an entry in `mapping` get None
        one_over_f_csv=mapping.get(age),
        idx_sim2plot=None,
        plot_options=PLOT_OPTIONS,
    )

    # donor's age in the upper right corner, in axes coordinates
    ax.text(
        x=0.65,
        y=0.9,
        s=f"donor {age} y.o.",
        transform=ax.transAxes,
    )
    if PLOT_OPTIONS.save:
        # NOTE(review): the other SFS plotting cells save to this same file
        # name, so the cell run last overwrites the figure; PATH2SAVE is not
        # used here.
        fig.savefig(f"./sfs_age{age}_{name}{PLOT_OPTIONS.extension}")
    # same call as the other cells of the notebook to display the figure
    plt.show()

In [None]:
# Stand-alone legend for the SFS figures without simulations: the axis is
# hidden and only the legend built from proxy artists is drawn.
fig, ax = plt.subplots(1, 1, figsize=(10, 1))
legend_elements = [
    Line2D([1], [1], color="black", alpha=0.8, lw=4, label="growth scaling law"),
    Line2D(
        [0],
        [0],
        color="black",
        ls="--",
        alpha=0.8,
        lw=4,
        # same wording as the legend that also lists the simulations
        label="homeostatic scaling law",
    ),
    Line2D(
        [0],
        [0],
        marker="x",
        ls="",
        mew=4,
        color="#d95f0e",
        label="data",
        markerfacecolor="g",
        markersize=13,
    ),
]

ax.legend(
    handles=legend_elements,
    # "expand" is the only non-default legend mode; the former "extend" was
    # not recognised and therefore had no effect
    mode="expand",
    ncols=5,
    handletextpad=0.4,
)
ax.axis("off")
plt.show()

In [None]:
# SFS per donor: theoretical predictions, Mitchell's data and the simulations,
# highlighting the first available run of each donor.
# NOTE(review): `selected` is not read by any live code in this cell.
selected = ["CB002", "KX002", "KX008"]
for name in donors.name.unique().tolist():
    idx_available = [sfs_.parameters.idx for sfs_ in sfs_sims[name]]
    print(f"there are {len(idx_available)} runs for {name}")
    age = donors.loc[donors.name == name, "age"].squeeze()
    fig, ax = plt.subplots(1, 1, layout="constrained", figsize=PLOT_OPTIONS.figsize)
    sfs_fig.plot_ax_sfs_predictions_data_sims(
        ax,
        donor=donors[donors.name == name].squeeze(),
        corrected_one_over_1_squared=corrected_variants_one_over_1_squared[name],
        sfs_sims_donor=sfs_sims[name],
        mitchell_sfs=mitchell_sfs[name][3],
        # ages without an entry in `mapping` get None
        one_over_f_csv=mapping.get(age),
        # raises IndexError when no run is available for this donor
        idx_sim2plot=idx_available[0],
        plot_options=PLOT_OPTIONS,
    )

    # donor's age in the upper right corner, in axes coordinates
    ax.text(
        x=0.65,
        y=0.9,
        s=f"donor {age} y.o.",
        transform=ax.transAxes,
    )
    if PLOT_OPTIONS.save:
        # NOTE(review): the other SFS plotting cells save to this same file
        # name, so the cell run last overwrites the figure; PATH2SAVE is not
        # used here.
        fig.savefig(f"./sfs_age{age}_{name}{PLOT_OPTIONS.extension}")
    # same call as the other cells of the notebook to display the figure
    plt.show()

In [None]:
# Stand-alone legend for the SFS figures with simulations: the axis is hidden
# and only the legend built from proxy artists is drawn.
fig, ax = plt.subplots(1, 1, figsize=(20, 1))
legend_elements = [
    Line2D([1], [1], color="black", alpha=0.8, lw=4, label="growth scaling law"),
    Line2D(
        [0],
        [0],
        color="black",
        ls="--",
        alpha=0.8,
        lw=4,
        label="homeostatic scaling law",
    ),
    Line2D(
        [0],
        [0],
        marker="x",
        ls="",
        mew=4,
        color="#d95f0e",
        label="data",
        markerfacecolor="g",
        markersize=13,
    ),
    Line2D(
        [0],
        [0],
        marker="o",
        ls="",
        mew=1,
        color="grey",
        label="single simulation",
        markersize=12,
    ),
    Line2D([0], [0], color="grey", alpha=0.6, lw=4, label="simulation average"),
]

ax.legend(
    handles=legend_elements,
    # "expand" is the only non-default legend mode; the former "extend" was
    # not recognised and therefore had no effect
    mode="expand",
    ncols=5,
    handletextpad=0.4,
)
ax.axis("off")
plt.show()

In [None]:
# Cumulative SFS per donor: every simulation as a faint grey line and
# Mitchell's data in orange, with the y-axis restricted to [0.9, 1].
for donor in donors.itertuples():
    print(donor.name)
    fig, ax = plt.subplots(1, 1, layout="tight")

    # x values are divided by the donor's number of cells
    for sfs_s in sfs_sims[donor.name]:
        cdf_x_sim, cdf_y_sim = realisation.cdf_from_dict(sfs_s.sfs)
        ax.plot(cdf_x_sim / donor.cells, cdf_y_sim, color="#bdbdbd", alpha=0.1)

    target_sfs = mitchell_sfs[donor.name][3]
    cdf_x_target, cdf_y_target = realisation.cdf_from_dict(target_sfs)
    ax.plot(cdf_x_target / donor.cells, cdf_y_target, marker="o", color="#d95f0e")

    ax.text(x=0.65, y=0.1, s=f"donor {donor.age} y.o.", transform=ax.transAxes)
    ax.set(
        xscale="log",
        ylabel="Cumulative distribution",
        xlabel=r"Variant frequency $f$",
        ylim=[0.9, 1],
    )
    plt.show()

### REDO THE SAME PLOT with sims from ABC

In [None]:
# indices of the runs to load from the ABC simulations
idx2load = [93820, 577720, 456630, 531350, 589970, 268970, 69560, 262610]
# load the data from abc
PATH2SIMS = Path("/data/scratch") / "hfx923/hsc-draft/v2.2.14"

# NOTE: `extend` appends in place, so running this cell more than once adds
# the same ABC runs to `sfs_sims` again.
for r in donors[["name", "age", "cells"]].itertuples():
    path2sfs_abc = PATH2SIMS / f"{r.cells}cells/sfs/"
    print(f"\tloading sims SFS for donor {r.name} with {r.cells} cells")
    sfs_sims[r.name].extend(
        realisation.load_all_sfs_by_age(path2sfs_abc, idx2load)[r.age]
    )

In [None]:
# SFS per donor with the runs loaded from ABC: each donor is paired with one
# index of `idx2load`, which is the run highlighted in its figure.
# NOTE(review): `selected` is not read by any live code in this cell.
selected = ["CB002", "KX002", "KX008"]
# the first index is listed twice so that the first two donors share a run
# (presumably the two donors of age 0 -- confirm)
for i, (name, idx2plot) in enumerate(
    zip(donors.name.unique().tolist(), [idx2load[0]] + idx2load)
):
    age = donors.loc[donors.name == name, "age"].squeeze()
    fig, ax = plt.subplots(1, 1, layout="constrained", figsize=PLOT_OPTIONS.figsize)
    sfs_fig.plot_ax_sfs_predictions_data_sims(
        ax,
        donor=donors[donors.name == name].squeeze(),
        corrected_one_over_1_squared=corrected_variants_one_over_1_squared[name],
        sfs_sims_donor=sfs_sims[name],
        mitchell_sfs=mitchell_sfs[name][3],
        # ages without an entry in `mapping` get None
        one_over_f_csv=mapping.get(age),
        idx_sim2plot=idx2plot,
        plot_options=PLOT_OPTIONS,
    )
    # keep the y label only on every third figure
    if i % 3:
        ax.set_ylabel("")

    # donor's age in the upper right corner, in axes coordinates
    ax.text(
        x=0.65,
        y=0.9,
        s=f"donor {age} y.o.",
        transform=ax.transAxes,
    )
    if PLOT_OPTIONS.save:
        # NOTE(review): the other SFS plotting cells save to this same file
        # name, so the cell run last overwrites the figure; PATH2SAVE is not
        # used here.
        fig.savefig(f"./sfs_age{age}_{name}{PLOT_OPTIONS.extension}")
    # same call as the other cells of the notebook to display the figure
    plt.show()