# HSC
Markov process for a fixed-size population with k types, where type 0 is the wild-type with growth rate `B0`.

A cell can acquire a mutation conferring a proliferative advantage upon cell division. We model this process as a Bernoulli trial with success probability `u`, in units of 1 mutation/division. For the symmetric division case, `u` is computed as `u = MU0 / (B0 * NCELLS)`.

For now, all clones of type k, with k greater than 0, have the same proliferative advantage.

**Entropy:** based on the code they [developed](https://github.com/emily-mitchell/normal_haematopoiesis/blob/23d221e8d125d78c1e8bcbe05d41d0f3594b0cfb/4_phylogeny_analysis/scripts/shannon_diversity.Rmd#L147), I think they define entropy as in [here](http://math.bu.edu/people/mkon/J6A.pdf) using the phylogenetic tree.
We just compute the entropy from the number of cells: a class is the set of cells with the same number of mutations, and we compute the abundance of those classes, that is, the abundance of cells with the same number of mutations.

## How to use it
Install a version of Python greater than or equal to 3.11 and then install `seaborn`, `scipy`, `pandas`, `ipykernel` with pip.
Then, install `futils` and `hscpy` in editable mode.
Finally, on the cluster, make this environment available as an IPython kernel.

In [None]:
%%bash
# Update the local checkout of the simulator and rebuild the release binary.
# Stop if the checkout directory is missing: otherwise `git pull` and `cargo`
# would run in the notebook's own directory instead of the simulator's.
cd ../hsc/ || exit 1
git pull
cargo b --release

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import socket
import seaborn as sns
import sys

from pathlib import Path

from hscpy.figures import burden as burden_figures
from hscpy.figures import sfs as sfs_figures
from hscpy.figures import variant as variant_figures
from hscpy.figures import options
from hscpy.sfs import compute_variants, Correction

from futils import parse_version

# Directory expected to contain the compiled `hsc` binary.
PATH2BIN = Path("~").expanduser().joinpath("hsc/target/release")
assert PATH2BIN.is_dir()

# --- simulation settings shared by the cells below ---
YEARS = 80
YEARS_ENTROPY = 1
RUNS = 32
NB_TIMEPOINTS = 19
DETECTION_THRESH = 0.01
SUBCLONES = 60
# when True the results are written to the scratch filesystem
USE_SCRATCH = True
mitchell_ages = (0, 29, 38, 48, 63, 75, 81)

# --- figure settings ---
SAVE = True
BIGLABELS = False
if BIGLABELS:
    FIGSIZE = [5, 3]
else:
    FIGSIZE = [6.4, 4.8]  # default matplotlib
PDF = True
if PDF:
    EXTENSION = ".pdf"
else:
    EXTENSION = ".png"

PLOT_OPTIONS = options.PlotOptions(
    figsize=FIGSIZE,
    extension=EXTENSION,
    save=SAVE,
)

In [None]:
# Root directory holding the data, selected from the machine the notebook runs
# on; unknown hosts fall back to the user's home directory.
_DATA_ROOT_BY_HOST = {
    "5X9ZYD3": Path("/mnt/c/Users/terenz01/Documents/SwitchDrive/PhD/hsc"),
    "LAPTOP-CEKCHJ4C": Path("/mnt/c/Users/fra_t/Documents/PhD/hsc"),
}
PATH2SIMS = _DATA_ROOT_BY_HOST.get(socket.gethostname())
if PATH2SIMS is None:
    PATH2SIMS = Path("~").expanduser()

PATH2SIMS /= Path("variantFractionTime_s0.15_sigma0.03_mu2.csv")
# Report the file that is actually looked up (the previous message mentioned a
# different file name, which made a failure here misleading).
assert PATH2SIMS.is_file(), f"cannot find {PATH2SIMS.name} in {PATH2SIMS.parent}"

In [None]:
%%bash -s "$PATH2BIN" --out version
# Capture the output of `hsc --version` in the notebook variable `version`.
# $1 is quoted so that a binary directory containing whitespace still works.
"$1"/hsc --version

In [None]:
# Version of the simulator, parsed from the captured `hsc --version` output.
VERSION = parse_version(version)
# Results are stored in a directory named after the version, either on the
# scratch filesystem or next to the notebook.
PATH2SAVE = (
    Path(f"/data/scratch/hfx923/hsc-draft/{VERSION}")
    if USE_SCRATCH
    else Path(f"./{VERSION}")
)

print("Running hsc with version:", VERSION)

## Mitchell's data

In [None]:
# Load the summary table stored next to the simulation csv, ordered by age.
summary = pd.read_csv(PATH2SIMS.parent / "Summary_cut.csv", index_col=0)
for _categorical in ("cell_type", "sample_type"):
    summary[_categorical] = summary[_categorical].astype("category")
summary = summary.sort_values(by="age").reset_index()

ages = summary["age"].unique()
closest_age = dict.fromkeys(ages)

# Number of rows per donor, attached to every row as the column `cells`.
# neglect some duplicated colonies e.g. summary.colony_ID == "11_E07"
_cells_per_donor = (
    summary[["donor_id", "age"]]
    .groupby("donor_id")
    .count()
    .reset_index()
    .rename(columns={"age": "cells"})
)
summary = summary.merge(
    _cells_per_donor,
    on="donor_id",
    validate="many_to_one",
    how="left",
)
summary.dtypes

In [None]:
# Textual overview of the summary table: descriptive statistics first, then
# counts per category and per donor.
print(summary.describe())

_cell_type_counts = summary["cell_type"].value_counts()
print(f"\n\ncell types: \n{_cell_type_counts}")

_sample_type_counts = summary["sample_type"].value_counts()
print(f"\n\nsample types: \n{_sample_type_counts}")

_timepoint_counts = summary["timepoint"].value_counts()
print(f"\n\ntimepoints: \n{_timepoint_counts}")

_ages_and_cells = summary[["donor_id", "cells", "age"]].drop_duplicates()
print(f"\n\nages and cells: \n{_ages_and_cells}")

_mutations_per_donor = (
    summary[["donor_id", "number_mutations"]].groupby("donor_id").sum()
)
print(f"\n\nmutations per donor: \n{_mutations_per_donor}")

In [None]:
# Average number of mutations per donor, paired with the donor's age.
_burden_per_donor = (
    summary[["donor_id", "number_mutations"]]
    .groupby("donor_id")
    .mean()
    .reset_index()
)
_age_per_donor = summary[["donor_id", "age"]].drop_duplicates()
mean_mutations = _burden_per_donor.merge(
    _age_per_donor,
    on="donor_id",
    how="inner",
    validate="one_to_one",
).sort_values(by="age")

# Least-squares fit of the line y = m * x + c through the per-donor averages.
x = mean_mutations.age.to_numpy()
y = mean_mutations.number_mutations.to_numpy()
A = np.column_stack((x, np.ones(len(x))))
m, c = np.linalg.lstsq(A, y, rcond=None)[0]

# Hard-coded points plotted under the label "simulation".
_simulated_ages = [7.8, 15.6, 23.3, 31.1, 38.9, 46.7, 54.4, 62.2, 70]
_simulated_burden = [
    137.80, 264.58, 391.92, 518.37, 645.25, 771.70, 898.47, 1024.67, 1152.31,
]

fig, ax = plt.subplots(1, 1, figsize=FIGSIZE)
ax.plot(
    summary["age"],
    summary["number_mutations"],
    linestyle="",
    marker="o",
    alpha=0.4,
    label="Mitchell's data",
    mew=2,
)
ax.plot(x, y, "x", c="orange", label="avg Mitchell's data", mew=2)
ax.plot(x, m * x + c, linestyle="--", c="orange", label="LS regression")
ax.plot(
    _simulated_ages,
    _simulated_burden,
    "x",
    c="yellowgreen",
    label="simulation",
    mew=2,
)
ax.set(
    xlabel="age [years]",
    ylabel="number of SNVs",
    title=f"y=mx+c with m={m:.2f}, c={c:.2f}",
)
ax.legend()
plt.show()

# Distribution of the number of mutations, one colour per donor.
fig, ax = plt.subplots(1, 1, tight_layout=True, figsize=FIGSIZE)
sns.histplot(
    data=summary,
    x="number_mutations",
    hue="donor_id",
    kde=True,
    binwidth=10,
    ax=ax,
    stat="count",
)
sns.move_legend(ax, bbox_to_anchor=(1.01, 1), loc="upper left", frameon=False)
if SAVE:
    fig.savefig(f"./mitchell_burden{EXTENSION}")
plt.show()

In [None]:
# One histogram of the number of mutations per donor, each in its own figure.
for donor_id in summary["donor_id"].unique():
    donor_rows = summary[summary.donor_id == donor_id]
    fig, ax = plt.subplots(1, 1, tight_layout=True, figsize=(6, 4))
    sns.histplot(
        data=donor_rows,
        x="number_mutations",
        hue="donor_id",
        kde=True,
        bins=50,
        ax=ax,
        stat="count",
    )
    if SAVE:
        fig.savefig(f"./{donor_id}_burden{EXTENSION}")
    plt.show()

## Simulations

In [None]:
# Sample sizes for the subsampled simulations: the strong subsampling uses the
# mean of the `cells` column of the summary table, the weak one is 10x larger.
SAMPLE_STRONG = int(summary["cells"].mean())
SAMPLE_WEAK = 10 * SAMPLE_STRONG  # TODO * 100
# total number of cells in the simulated population
NCELLS = 200_000
# success probability of the Bernoulli trial deciding whether a cell division
# is asymmetric, units are [1 asymmetric division / division]
P_ASYMMETRIC = 0

## NEUTRAL RATES
# division rate for the wild-type in units of [division / (year * cell)]
# Welch, J.S. et al. (2012) ‘The Origin and Evolution of Mutations in Acute Myeloid Leukemia’,
# Cell, 150(2), pp. 264–278
B0 = 1  # TODO: double check this, should be between 2 and 20?
# Abascal, F. et al. (2021) ‘Somatic mutation landscapes at single-molecule resolution’,
# Nature, 593(7859), pp. 405–410. fig. 2b
# see also fig 1b of Mitchell, E. et al.
# (2022) ‘Clonal dynamics of haematopoiesis across the human lifespan’,
# Nature, 606(7913), pp. 343–350
NEUTRAL_RATE = 20  # [mut/(year * cell)]

## FIT CLONES
# average number of fit mutations arising in 1 year, units are
# [mutations/year]; value taken from the ABC inference
MU0 = 2
# proliferative advantage conferred by fit mutations, all clones
# have the same proliferative advantage for now.
# NOTE(review): the units were documented as [mutation / division], which
# looks like a copy of the units of `u` below -- confirm the units of S.
S = 0.11
# success probability of the Bernoulli trial deciding whether a fit variant
# arises upon cell division, units are [1 mutation/division]
u = MU0 / (B0 * NCELLS)
# should be 2.0 × 10−3 per HSC per year according to Mitchell, E. et al.
# (2022) ‘Clonal dynamics of haematopoiesis across the human lifespan’,
# Nature, 606(7913), pp. 343–350
# driver mutations enter the HSC compartment at 2.0 × 10−3 per HSC per year
print(f"average sucess rate of occurence of 1 fit mutation upon cell division u={u}")

### Positive selection

#### Rust simulations

We run the simulations with and without subsampling at the same time.

In [None]:
# The three option sets describe the same simulations and differ only in the
# number of sampled cells, so every other setting is defined once.
_shared_sim_settings = dict(
    runs=RUNS,
    cells=NCELLS,
    path2save=PATH2SAVE / "competition",
    neutral_rate=NEUTRAL_RATE,
    nb_timepoints=NB_TIMEPOINTS,
    last_timepoint_years=YEARS + 1,
    nb_subclones=SUBCLONES,
    s=S,
)

# whole population: the sample size equals the population size
sim_options_population = options.SimulationOptions(
    sample=NCELLS, **_shared_sim_settings
)

# strong subsampling: few cells are sampled
sim_options_subsampling_strong = options.SimulationOptions(
    sample=SAMPLE_STRONG, **_shared_sim_settings
)

# weak subsampling: more cells are sampled
sim_options_subsampling_weak = options.SimulationOptions(
    sample=SAMPLE_WEAK, **_shared_sim_settings
)

In [None]:
%%bash -s "$PATH2BIN" "$sim_options_population.path2save" "$B0" "$MU0" "$sim_options_population.neutral_rate" "$sim_options_population.s" "$P_ASYMMETRIC" "$sim_options_population.runs" "$sim_options_population.cells" "$YEARS" "$sim_options_population.nb_timepoints" "$sim_options_subsampling_strong.sample" "$sim_options_subsampling_weak.sample" "$YEARS_ENTROPY"
# Run the simulations with fit clones, after removing any previous results.
# Positional arguments:
#   $1 directory of the hsc binary     $2 output directory
#   $3 B0                              $4 MU0
#   $5 neutral rate                    $6 s
#   $7 P_ASYMMETRIC                    $8 runs
#   $9 cells                           ${10} YEARS
#   ${11} number of timepoints         ${12} strong sample size
#   ${13} weak sample size             ${14} YEARS_ENTROPY
# NOTE(review): $6 is received but never forwarded to hsc; the values given to
# --mean-std are hard-coded (0.1 0.03) -- confirm this is intended.
#
# All expansions are quoted, and the output directory must be non-empty, so
# that `rm -rf` can only ever remove the intended directory.
rm -rf -- "${2:?missing output directory}"
"$1"/hsc -c "$9" -y "${10}" -r "$8" --b0 "$3" --mu0 "$4" --neutral-rate "$5" --p-asymmetric "$7" --snapshot-entropy "${14}" --subsample "${12}" "${13}" --snapshots "${11}" --mean-std 0.1 0.03 --exponential "$2"

In [None]:
# One histogram per file found in `competition/rates/`.
# `Path.iterdir` yields entries in arbitrary order, so the files are sorted to
# make the index shown in the title refer to the same file on every execution.
rate_files = sorted((PATH2SAVE / "competition/rates/").iterdir())
for i, f in enumerate(rate_files):
    fig, ax = plt.subplots(1, 1)
    pd.read_csv(f, header=None).squeeze().plot(kind="hist", ax=ax, bins=15)
    ax.set_xlim(0.95, 1.25)  # TODO?
    ax.set_title(f"simulation id: {i}")
    plt.show()

In [None]:
# Build the donors from the summary table and the simulation options; the
# cells below read `name`, `age`, `closest_age` and `cells` from each donor.
donors = sfs_figures.donors_from_mitchell(summary, sim_options_population)

In [None]:
%%time
# compute the correction for the SFS with sampled distributions from
# https://www.biorxiv.org/content/10.1101/2022.11.07.515470v2
corrected_variants_one_over_1_squared = dict()
for donor in donors:
    print(
        f"apply sampling correction to SFS of donor {donor.name} with age {donor.age} mapped to closest_age {donor.closest_age}"
    )
    corrected_variants_one_over_1_squared[donor.name] = compute_variants(
        Correction.ONE_OVER_F_SQUARED,
        pop_size=sim_options_subsampling_strong.cells,
        sample_size=donor.cells,
    )

#### no subsampling

In [None]:
%%time
# load simulated sfs for all the ages of the donors present in the data,
# using the options of the whole population (no subsampling)
sfs_age_simulations = sfs_figures.load_sfs_simulations(donors, sim_options_population)

In [None]:
# Plot the simulated SFS loaded above together with the sampling-corrected
# variants, for the whole population (sample size equal to the population size).
sfs_figures.plot_sfs_simulations_data(
    sfs_age_simulations,
    corrected_variants_one_over_1_squared,
    sim_options_population.cells,
    sim_options_population.sample,
    donors,
    PLOT_OPTIONS,
    PATH2SIMS.parent,
)

In [None]:
# Same plot as for the whole population, restricted with id2plot="2"
# (presumably the id of a single simulation -- confirm against hscpy).
sfs_figures.plot_sfs_simulations_data(
    sfs_age_simulations,
    corrected_variants_one_over_1_squared,
    sim_options_population.cells,
    sim_options_population.sample,
    donors,
    PLOT_OPTIONS,
    PATH2SIMS.parent,
    id2plot="2",
)

In [None]:
# Entropy plots for the whole population with early_variants_only=True.
sfs_figures.show_entropy_plots(
    sim_options_population, PLOT_OPTIONS, mitchell_ages, early_variants_only=True
)

In [None]:
# Entropy plots for the whole population with early_variants_only=False.
sfs_figures.show_entropy_plots(
    sim_options_population, PLOT_OPTIONS, mitchell_ages, early_variants_only=False
)

In [None]:
# Burden plots for the whole population.
burden_figures.show_burden_plots(sim_options_population, PLOT_OPTIONS, mitchell_ages)

In [None]:
# Variant plots for the whole population; PATH2SIMS is the csv file checked at
# the top of the notebook and DETECTION_THRESH is 0.01.
variant_figures.show_variant_plots(
    sim_options_population, PLOT_OPTIONS, PATH2SIMS, DETECTION_THRESH
)

#### weak subsampling

In [None]:
%%time
# load simulated sfs for all the ages of the donors present in the data,
# using the options of the weak subsampling; this rebinds `sfs_age_simulations`
sfs_age_simulations = sfs_figures.load_sfs_simulations(
    donors, sim_options_subsampling_weak
)

In [None]:
# Plot the simulated SFS loaded above together with the sampling-corrected
# variants, for the weak subsampling.
sfs_figures.plot_sfs_simulations_data(
    sfs_age_simulations,
    corrected_variants_one_over_1_squared,
    sim_options_subsampling_weak.cells,
    sim_options_subsampling_weak.sample,
    donors,
    PLOT_OPTIONS,
    PATH2SIMS.parent,
)

In [None]:
# Same plot as for the weak subsampling, restricted with id2plot="2"
# (presumably the id of a single simulation -- confirm against hscpy).
sfs_figures.plot_sfs_simulations_data(
    sfs_age_simulations,
    corrected_variants_one_over_1_squared,
    sim_options_subsampling_weak.cells,
    sim_options_subsampling_weak.sample,
    donors,
    PLOT_OPTIONS,
    PATH2SIMS.parent,
    id2plot="2",
)

In [None]:
# Entropy plots for the weak subsampling with early_variants_only=True.
sfs_figures.show_entropy_plots(
    sim_options_subsampling_weak, PLOT_OPTIONS, mitchell_ages, early_variants_only=True
)

In [None]:
# Entropy plots for the weak subsampling with early_variants_only=False.
sfs_figures.show_entropy_plots(
    sim_options_subsampling_weak, PLOT_OPTIONS, mitchell_ages, early_variants_only=False
)

In [None]:
# Burden plots for the weak subsampling.
burden_figures.show_burden_plots(
    sim_options_subsampling_weak, PLOT_OPTIONS, mitchell_ages
)

#### strong subsampling

In [None]:
%%time
# load simulated sfs for all the ages of the donors present in the data,
# using the options of the strong subsampling; this rebinds `sfs_age_simulations`
sfs_age_simulations = sfs_figures.load_sfs_simulations(
    donors, sim_options_subsampling_strong
)

In [None]:
# Plot the simulated SFS loaded above together with the sampling-corrected
# variants, for the strong subsampling.
sfs_figures.plot_sfs_simulations_data(
    sfs_age_simulations,
    corrected_variants_one_over_1_squared,
    sim_options_subsampling_strong.cells,
    sim_options_subsampling_strong.sample,
    donors,
    PLOT_OPTIONS,
    PATH2SIMS.parent,
)

In [None]:
# Same plot as for the strong subsampling, restricted with id2plot="2"
# (presumably the id of a single simulation -- confirm against hscpy).
sfs_figures.plot_sfs_simulations_data(
    sfs_age_simulations,
    corrected_variants_one_over_1_squared,
    sim_options_subsampling_strong.cells,
    sim_options_subsampling_strong.sample,
    donors,
    PLOT_OPTIONS,
    PATH2SIMS.parent,
    id2plot="2",
)

In [None]:
# Entropy plots for the strong subsampling with early_variants_only=True.
sfs_figures.show_entropy_plots(
    sim_options_subsampling_strong,
    PLOT_OPTIONS,
    mitchell_ages,
    early_variants_only=True,
)

In [None]:
# Entropy plots for the strong subsampling with early_variants_only=False.
sfs_figures.show_entropy_plots(
    sim_options_subsampling_strong,
    PLOT_OPTIONS,
    mitchell_ages,
    early_variants_only=False,
)

In [None]:
# Burden plots for the strong subsampling.
burden_figures.show_burden_plots(
    sim_options_subsampling_strong, PLOT_OPTIONS, mitchell_ages
)

### Competition vs neutral vs 1 clone (logistic fn)

In [None]:
# TODO 1 clone logistic fn

### Neutral scenario

In [None]:
%%bash -s "$PATH2BIN" "$sim_options_subsampling_strong.path2save" "$B0" "$MU0" "$sim_options_population.neutral_rate" "$sim_options_population.s" "$P_ASYMMETRIC" "$sim_options_population.runs" "$sim_options_population.cells" "$YEARS" "$sim_options_population.nb_timepoints" "$sim_options_subsampling_strong.sample" "$sim_options_subsampling_weak.sample" "$YEARS_ENTROPY"
# Run the simulations with the --neutral flag, after removing any previous
# results. Positional arguments:
#   $1 directory of the hsc binary     $2 output directory
#   $3 B0                              $4 MU0
#   $5 neutral rate                    $6 s (not forwarded to hsc)
#   $7 P_ASYMMETRIC                    $8 runs
#   $9 cells                           ${10} YEARS
#   ${11} number of timepoints         ${12} strong sample size
#   ${13} weak sample size             ${14} YEARS_ENTROPY
# NOTE(review): $2 is the path2save of the simulation options, i.e. the
# "competition" directory: the results of the simulations with fit clones are
# deleted here and replaced by the neutral ones -- confirm this is intended.
#
# All expansions are quoted, and the output directory must be non-empty, so
# that `rm -rf` can only ever remove the intended directory.
rm -rf -- "${2:?missing output directory}"
"$1"/hsc -c "$9" -y "${10}" -r "$8" --b0 "$3" --mu0 "$4" --neutral-rate "$5" --p-asymmetric "$7" --snapshot-entropy "${14}" --subsample "${12}" "${13}" --snapshots "${11}" --neutral --exponential "$2"

In [None]:
# Entropy plots for the whole population, first with early_variants_only=True
# and then with early_variants_only=False.
for _early_only in (True, False):
    sfs_figures.show_entropy_plots(
        sim_options_population,
        PLOT_OPTIONS,
        mitchell_ages,
        early_variants_only=_early_only,
    )

In [None]:
# Entropy plots for the weak subsampling, first with early_variants_only=True
# and then with early_variants_only=False.
for _early_only in (True, False):
    sfs_figures.show_entropy_plots(
        sim_options_subsampling_weak,
        PLOT_OPTIONS,
        mitchell_ages,
        early_variants_only=_early_only,
    )

In [None]:
# Entropy plots for the strong subsampling, first with early_variants_only=True
# and then with early_variants_only=False.
for _early_only in (True, False):
    sfs_figures.show_entropy_plots(
        sim_options_subsampling_strong,
        PLOT_OPTIONS,
        mitchell_ages,
        early_variants_only=_early_only,
    )