# HSC
Markov process for a fixed-size population with k types, where type 0 is the wild type with growth rate `B0`.

A cell can acquire a mutation conferring a proliferative advantage upon cell division. We model this process as a Bernoulli trial with success probability `u`, in units of mutations per division. For the symmetric division case, `u` is computed as `u = MU0 / (B0 * NCELLS)`.

For now, all clones of type k > 0 have the same proliferative advantage.

## Simulation and parameters

In [None]:
import matplotlib.pyplot as plt
import json
import numpy as np
import pandas as pd
import socket
import seaborn as sns
import sys
from hscpy import burden, variant, get_idx_timepoint_from_age, sfs
from typing import Dict
from scipy import stats
from pathlib import Path
from futils import parse_version, snapshot

# Directory that must contain the compiled `hsc` binary (release build located
# under the user's home directory); the notebook stops here if it is missing.
PATH2BIN = Path("~").expanduser() / "hsc/target/release"
assert PATH2BIN.is_dir()

# simulated time span, in years, used on the known development hosts
YEARS_FAST = 80
# number of independent simulation runs (passed to `hsc` as `-r`)
RUNS = 8
# number of snapshots saved per run (passed to `hsc` as `--snapshots`)
NB_TIMEPOINTS = 21
# variant-fraction cutoff above which a clone is counted as detectable
DETECTION_THRESH = 0.01
# last argument of `variant.load_variant_fractions`
# NOTE(review): presumably the maximal number of subclones tracked -- confirm
SUBCLONES = 60
# when True, results are written under the scratch area instead of `./<version>`
USE_SCRATCH = False
# ages [years] at which the SFS and the mutational burden are loaded
# NOTE(review): presumably the donor ages of Mitchell et al. (2022) -- confirm
mitchell_ages = (0, 29, 38, 48, 63, 75, 81)

# figure options: SAVE writes every saved figure into PATH2SAVE, BIGLABELS
# selects a smaller canvas, PDF selects the file extension
SAVE = True
BIGLABELS = False
FIGSIZE = [5, 3] if BIGLABELS else [6.4, 4.8]  # default matplotlib
PDF = True
EXTENSION = ".pdf" if PDF else ".png"

In [None]:
# total number of cells in the population (passed to `hsc` as `-c`)
NCELLS = 200_000
# mean of the Bernoulli trial (prob of success) to get an asymmetric
# division upon cell division, units are [1 asymmetric division / division]
P_ASYMMETRIC = 0

## NEUTRAL RATES
# division rate for the wild-type in units of [division / (year * cell)]
# Welch, J.S. et al. (2012) ‘The Origin and Evolution of Mutations in Acute Myeloid Leukemia’,
# Cell, 150(2), pp. 264–278
B0 = 1  # TODO: double check this, should be between 2 and 20?
# Abascal, F. et al. (2021) ‘Somatic mutation landscapes at single-molecule resolution’,
# Nature, 593(7859), pp. 405–410. fig. 2b
# see also fig 1b of Mitchell, E. et al.
# (2022) ‘Clonal dynamics of haematopoiesis across the human lifespan’,
# Nature, 606(7913), pp. 343–350
NEUTRAL_RATE = 20  # [mut/(year * cell)]

## FIT CLONES
# avg fit mutations arising in 1 year, units are [mutations/year]
# from ABC's inference
MU0 = 2
# proliferative advantage conferred by fit mutations, all clones
# have the same proliferative advantage for now. Units are
# [mutation / division]
# NOTE(review): the units above look copied from `u`; a proliferative
# advantage is presumably dimensionless -- confirm
S = 0.11
# mean of the Bernoulli trial (prob of success) to get a fit variant upon
# cell division, units are [1 mutation/division]
u = MU0 / (B0 * NCELLS)
# should be 2.0 × 10−3 per HSC per year according to Mitchell, E. et al.
# (2022) ‘Clonal dynamics of haematopoiesis across the human lifespan’,
# Nature, 606(7913), pp. 343–350
# driver mutations enter the HSC compartment at 2.0 × 10−3 per HSC per year
# NOTE(review): with the values above MU0 / NCELLS = 1e-5 per cell per year,
# which differs from the quoted 2.0e-3 -- confirm which figure is intended
print(f"average sucess rate of occurence of 1 fit mutation upon cell division u={u}")

In [None]:
# Directory holding the reference simulations on each known machine. Any other
# host falls back to the home directory and simulates 100 years instead of
# YEARS_FAST.
_SIMS_DIR_BY_HOST = {
    "5X9ZYD3": "/mnt/c/Users/terenz01/Documents/SwitchDrive/PhD/",
    "LAPTOP-CEKCHJ4C": "/mnt/c/Users/fra_t/Documents/PhD/",
}
_sims_dir = _SIMS_DIR_BY_HOST.get(socket.gethostname())
_on_known_host = _sims_dir is not None

YEARS = YEARS_FAST if _on_known_host else 100
_sims_root = Path(_sims_dir) if _on_known_host else Path("~").expanduser()

# CSV with the reference ("other") simulations; it must exist.
PATH2SIMS = _sims_root / Path("totalVariantFracTime.csv")
assert PATH2SIMS.is_file()

In [None]:
%%bash -s "$PATH2BIN" --out version
# $1 is PATH2BIN; the stdout of this script is captured by `--out` into the
# Python variable `version`, which the next cell parses.
$1/hsc --version

In [None]:
# `version` is the raw text captured from `hsc --version` by the bash cell.
VERSION = parse_version(version)
# One output directory per binary version: on the scratch area when
# USE_SCRATCH is set, otherwise relative to the current working directory.
PATH2SAVE = Path(
    f"/data/scratch/hfx923/hsc-draft/{VERSION}" if USE_SCRATCH else f"./{VERSION}"
)
print("Running hsc with version:", VERSION)

In [None]:
%%bash -s "$PATH2BIN" "$PATH2SAVE" "$B0" "$MU0" "$NEUTRAL_RATE" "$S" "$P_ASYMMETRIC" "$RUNS" "$NCELLS" "$YEARS" "$NB_TIMEPOINTS"
# Positional arguments:
#   $1=PATH2BIN  $2=PATH2SAVE  $3=B0  $4=MU0  $5=NEUTRAL_RATE  $6=S
#   $7=P_ASYMMETRIC  $8=RUNS  $9=NCELLS  ${10}=YEARS  ${11}=NB_TIMEPOINTS
# Remove any previous output directory, then run the simulation into it.
# Every expansion is quoted so that a path containing spaces or glob
# characters is deleted/passed as ONE argument instead of being word-split;
# `--` stops option parsing in case the path starts with a dash.
rm -rf -- "$2"
"$1/hsc" -c "$9" -y "${10}" -r "$8" --b0 "$3" --mu0 "$4" --neutral-rate "$5" -s "$6" --p-asymmetric "$7" --snapshots "${11}" "$2"

## SFS

### SFS last timepoint

In [None]:
# Load the site frequency spectrum (SFS) of all RUNS runs at one timepoint;
# the result is indexed by run id (e.g. "3") in the next cell.
sfs_all = sfs.load_sfs(PATH2SAVE, runs=RUNS, timepoint=1)  # 1 is the last timepoint

In [None]:
# SFS of the run with id "3": tally how many entries share each value j and
# draw the tallies on log-log axes.
sfs_three = sfs_all["3"]
fig, ax = plt.subplots(1, 1, figsize=FIGSIZE)
sfs2plot = pd.Series(sfs_three, dtype=int).value_counts()
ax.plot(sfs2plot.index, sfs2plot, linestyle="", marker="x")
ax.set(
    xscale="log",
    yscale="log",
    xlabel="j cells",
    ylabel="# of muts in j cells",
)
fig.show()

In [None]:
# Convert the per-run SFS mapping into the tabular form plotted below (columns
# "# of j cells", "# of muts in j cells" and "id" are read by the next cell).
# NOTE: this rebinds `sfs_all`, so cells indexing it by run id must be
# executed before this one.
sfs_all = sfs.pandafy_sfs_dict(sfs_all)

In [None]:
fig, ax = plt.subplots(1, 1, figsize=FIGSIZE)

# dashed reference curve NCELLS / j for j = 1 .. NCELLS - 1
x = np.arange(1, NCELLS, dtype=int)
y = 1.0 / x * NCELLS
ax.plot(x, y, linestyle="--", color="black", alpha=0.5, label="1/f")

# simulated SFS of every run, one colour per run id
_marker_style = dict(marker="x", alpha=0.5)
sns.scatterplot(
    sfs_all,
    x="# of j cells",
    y="# of muts in j cells",
    hue="id",
    ax=ax,
    **_marker_style,
)

ax.set(
    xscale="log",
    yscale="log",
    xlabel="j cells",
    ylabel="# of muts in j cells",
)
if SAVE:
    plt.savefig(PATH2SAVE / f"sfs{EXTENSION}")
fig.show()

## Entropy

In [None]:
# For every reference age, find the closest saved timepoint and load the SFS
# of each run at that timepoint.
#   closest_age: requested age -> closest simulated age
#   simulated:   closest simulated age -> {run id -> SFS}
closest_age = dict.fromkeys(mitchell_ages)
simulated = dict()
for age in mitchell_ages:
    print(f"\nloading sfs for age {age}")
    # The simulation is run for YEARS years (not necessarily YEARS_FAST), so
    # the age -> timepoint mapping must use the same time span.
    # NOTE(review): assumes the second argument is the total simulated time.
    idx_timepoint, closest_age_ = get_idx_timepoint_from_age(
        age, YEARS, NB_TIMEPOINTS
    )
    closest_age[age] = closest_age_
    simulated[closest_age_] = dict()
    try:
        for idx_sim, simulation in sfs.load_sfs(
            PATH2SAVE, runs=RUNS, timepoint=idx_timepoint
        ).items():
            simulated[closest_age_][idx_sim] = simulation
    except AssertionError:
        # best effort: the loader asserts on empty data; keep the (empty)
        # entry for this age and carry on with the next one
        print(
            f"skipping timepoint {idx_timepoint} with age {closest_age_} because empty sfs"
        )

In [None]:
fig, ax = plt.subplots(1, 1, figsize=FIGSIZE)

# Entropy of the SFS of every run, grouped by simulated age. Falsy ages
# (age 0) are left out. Only the per-run values are needed, so the run id is
# not bound (the previous loop variable `id` shadowed the builtin for the rest
# of the notebook).
entropies = {
    age: [stats.entropy(sfs_run) for sfs_run in sfs_.values()]
    for age, sfs_ in simulated.items()
    if age
}

# one point per age: mean entropy over the runs, standard deviation as bar
for age, entropy in entropies.items():
    if not entropy:
        # no SFS could be loaded for this age; mean/std of an empty list
        # would only yield NaN and a RuntimeWarning
        continue
    ax.errorbar(
        age,
        np.mean(entropy),
        yerr=np.std(entropy),
        label=age,
        color="yellowgreen",
        marker="x",
    )
ax.set_xlabel("age [years]")
ax.set_ylabel("entropy")
ax.set_title(f"variant entropy averaged over {RUNS} simulations")
if SAVE:
    plt.savefig(PATH2SAVE / f"entropy{EXTENSION}")
fig.show()

## Single-cell mutational burden

In [None]:
# For every reference age, find the closest saved timepoint and load the
# single-cell mutational burden of each run at that timepoint.
#   closest_age: requested age -> closest simulated age
#   simulated:   closest simulated age -> {run id -> burden}
closest_age = dict.fromkeys(mitchell_ages)
simulated = dict()
for age in mitchell_ages:
    print(f"\nloading mutational burden for age {age}")
    # The simulation is run for YEARS years (not necessarily YEARS_FAST), so
    # the age -> timepoint mapping must use the same time span.
    # NOTE(review): assumes the second argument is the total simulated time.
    idx_timepoint, closest_age_ = get_idx_timepoint_from_age(
        age, YEARS, NB_TIMEPOINTS
    )
    closest_age[age] = closest_age_
    simulated[closest_age_] = dict()
    try:
        for idx_sim, simulation in burden.load_burden(
            PATH2SAVE, runs=RUNS, timepoint=idx_timepoint
        ).items():
            simulated[closest_age_][idx_sim] = simulation
    except AssertionError:
        # best effort: the loader asserts on empty data; keep the (empty)
        # entry for this age and carry on with the next one
        print(
            f"skipping timepoint {idx_timepoint} with age {closest_age_} because empty mutational burden"
        )

In [None]:
# Single-cell mutational burden of one run (id2plot), one histogram per age.
id2plot = "3"
fig, ax = plt.subplots(1, 1, figsize=FIGSIZE)

# The first six colours are the historical ones. The palette is indexed
# modulo its length so that no age is silently dropped when there are more
# ages than colours (zip() used to truncate to the shorter sequence).
colors = ("red", "grey", "black", "cyan", "yellowgreen", "blue", "orange", "purple")
for i, (age, burden_by_run) in enumerate(simulated.items()):
    if id2plot not in burden_by_run:
        # the loading step keeps an empty entry for timepoints without data
        print(f"skipping age {age}: no mutational burden for run {id2plot}")
        continue
    burden.plot_burden(
        burden_by_run[id2plot],
        ax,
        label=age,
        color=colors[i % len(colors)],
        alpha=0.8,
    )
ax.legend(title="age")
ax.set_xlabel("single nucleotide variant")
ax.set_ylabel("cell count")
ax.set_title("single cell mutational burden")
if SAVE:
    plt.savefig(PATH2SAVE / f"burden{EXTENSION}")
plt.show()

## Total variant
The total variant fraction is the fraction of all selected clones averaged over all patients, that is anything except the wild type.

In [None]:
# --- data -------------------------------------------------------------------
# time axis of the saved snapshots
x = np.linspace(0, YEARS, NB_TIMEPOINTS)
# reference results, rows up to index label YEARS (label-based, inclusive)
other_sims = pd.read_csv(PATH2SIMS).loc[:YEARS, :]
# indexed below as [timepoint, run, clone]
variant_fraction = variant.load_variant_fractions(
    PATH2SAVE, NB_TIMEPOINTS, RUNS, SUBCLONES
)
# sum over the clones -> one total fraction per (timepoint, run)
total_fraction = variant_fraction.sum(axis=-1)

# --- figure -----------------------------------------------------------------
fig, ax = plt.subplots(1, 1, tight_layout=True, figsize=FIGSIZE)
ax.set(xlabel="time [years]", ylabel="avg total variant fraction")

# agent-based model: mean and standard deviation over the runs
ax.errorbar(
    x,
    total_fraction.mean(axis=-1),
    yerr=total_fraction.std(axis=-1),
    fmt="o",
    alpha=0.8,
    label=f"ABM, avg of {RUNS} runs",
)

# reference curves read from the CSV
_theory = other_sims["Expected total variant fraction"]
_sims_avg = other_sims["Average total variant fraction"]
ax.plot(other_sims.t, _theory, label="theory")
ax.plot(other_sims.t, _sims_avg, linestyle="--", label="sims, avg of ?? runs")

ax.legend(loc="upper left")
if SAVE:
    plt.savefig(PATH2SAVE / f"total_variant{EXTENSION}")
plt.show()

In [None]:
fig, ax = plt.subplots(1, 1, tight_layout=True, figsize=FIGSIZE)

# count the entries with a non-zero fraction over the last two axes (runs and
# clones), then divide by the number of runs -> average clone count per timepoint
_is_present = variant_fraction > 0.0
clones_abm = _is_present.sum(axis=(-2, -1)) / RUNS

ax.scatter(x, clones_abm, label=f"ABM, avg {RUNS} runs")
ax.plot(other_sims.t, other_sims["Average number of existing clones"], label="sims")
# ax.set_yscale("log")
ax.set(xlabel="time [years]", ylabel="clones")
ax.legend()
ax.set_title("avg # of clones")
fig.show()

In [None]:
fig, ax = plt.subplots(1, 1, tight_layout=True, figsize=FIGSIZE)

# same count as for the existing clones, restricted to entries whose fraction
# exceeds the detection threshold
_is_detectable = variant_fraction > DETECTION_THRESH
clones_abm = _is_detectable.sum(axis=(-2, -1)) / RUNS

ax.scatter(x, clones_abm, label=f"ABM, avg {RUNS} runs")
# NOTE(review): the reference column name hard-codes the threshold 0.01; it
# matches DETECTION_THRESH only while that constant stays at 0.01
_reference_column = "Average number of clones above threshold 0.01"
ax.plot(other_sims.t, other_sims[_reference_column], label="sims")
# ax.set_yscale("log")
ax.set(xlabel="time [years]", ylabel="clones")
ax.legend()
ax.set_title(f"avg # of clones above frequency threshold of {DETECTION_THRESH}")
fig.show()

In [None]:
# record-format: one row per (timepoint, run, clone) holding the variant
# fraction of that clone, ordered with the clone index varying fastest
_n_clones = variant_fraction.shape[-1]
_records = [
    (x[t], r, c, variant_fraction[t, r, c])
    for t in range(NB_TIMEPOINTS)
    for r in range(RUNS)
    for c in range(_n_clones)
]
df = pd.DataFrame(
    _records,
    columns=["time [years]", "run", "clone_id", "avg tot variant fraction"],
)
df

In [None]:
# keep only the (time, fraction) records above the detection threshold
_above_threshold = df["avg tot variant fraction"] > DETECTION_THRESH
_detectable = df.loc[
    _above_threshold, ["time [years]", "avg tot variant fraction"]
]

# line = mean over the kept records at each time, band = standard deviation
rl = sns.relplot(
    data=_detectable,
    x="time [years]",
    y="avg tot variant fraction",
    kind="line",
    errorbar="sd",
    aspect=2,
    height=3,
)
rl.fig.suptitle("tot avg fraction for detectable clones")
rl.fig.show()

In [None]:
# total variant fraction per (run, time): sum of the per-clone fractions
grouped = (
    df.groupby(["run", "time [years]"])[["avg tot variant fraction"]]
    .sum()
    .reset_index()
    .rename(columns={"avg tot variant fraction": "tot variant fraction"})
)
grouped

In [None]:
# attach to every record the total fraction of its (run, time) group
df = df.merge(grouped, on=["run", "time [years]"], how="left", validate="many_to_one")

# effective fitness = S weighted by the clone's share of the total variant
# fraction; undefined ratios (NaN, e.g. 0 / 0) are set to 0
_weighted_fraction = S * df["avg tot variant fraction"]
df["effective fitness"] = (_weighted_fraction / df["tot variant fraction"]).fillna(0)
df

In [None]:
# For every run draw three figures restricted to the clones that exceed the
# detection threshold at least once: a bar plot and a line plot of the variant
# fractions, and a line plot of the effective fitness.
# NOTE: `int_x` and `clones` are not used in this cell; they are kept in case
# other cells read them.
int_x = [round(x_) for x_ in x]
clones = []
for run in range(RUNS):
    detected = df.loc[
        (df["avg tot variant fraction"] > DETECTION_THRESH) & (df.run == run), :
    ]
    if detected.empty:  # no detectable clone for this run
        continue
    detected_clones = set(detected.clone_id.tolist())

    fig, ax = plt.subplots(1, 1, tight_layout=True, figsize=FIGSIZE)
    try:
        sns.barplot(
            detected,
            x="time [years]",
            y="avg tot variant fraction",
            hue="clone_id",
            ax=ax,
            palette="Dark2",
        )
    except ValueError:
        # best effort: skip the run when seaborn cannot draw it, but release
        # the figure created above so it does not accumulate or get shown
        # empty by a later plt.show()
        plt.close(fig)
        continue
    ax.set_ylabel("variant fraction")
    ax.legend(loc="center left", title="clone id")
    ax.set_title(
        f"variant fraction of clones above frequency threshold of {DETECTION_THRESH}"
    )
    plt.show()

    # wide format: one column per detected clone, one row per time; times at
    # which a clone is below the threshold are filled with 0
    pivoted = detected.pivot(
        columns="clone_id", index="time [years]", values="avg tot variant fraction"
    ).fillna(0)

    fig, ax = plt.subplots(1, 1, tight_layout=True, figsize=FIGSIZE)
    pivoted.plot(ax=ax, color=sns.color_palette("Dark2"))
    ax.set_ylabel("variant fraction")
    ax.legend(loc="center left", title="clone id")
    ax.set_title(
        f"variant fraction of clones above frequency threshold of {DETECTION_THRESH}"
    )
    ax.set_xlim([0, YEARS])

    # effective fitness over the whole time span for the detected clones
    fig, ax = plt.subplots(1, 1, tight_layout=True, figsize=FIGSIZE)
    pivoted = (
        df.loc[
            (df.run == run) & (df.clone_id.isin(detected_clones)),
            ["clone_id", "time [years]", "effective fitness"],
        ]
        .pivot(columns="clone_id", index="time [years]", values="effective fitness")
        .fillna(0)
    )
    pivoted.plot(ax=ax, color=sns.color_palette("Dark2"))
    ax.set_ylabel("effective fitness")
    ax.legend(loc="center left", title="clone id")
    ax.set_title(
        f"effective fitness of clones above frequency threshold of {DETECTION_THRESH}"
    )

    plt.show()