# HSC
Markov process with a fixed-size population of k types, where type 0 is the wild-type with growth rate `B0`.

A cell can acquire a mutation conferring a proliferative advantage upon cell division. We model this process as a Bernoulli trial with success probability `u`, in units of 1 mutation/division. For the symmetric division case, `u` is computed as `u = MU0 / (B0 * NCELLS)`.

For now, all clones k with k > 0 have the same proliferative advantage.

**Entropy:** based on the code they [developed](https://github.com/emily-mitchell/normal_haematopoiesis/blob/23d221e8d125d78c1e8bcbe05d41d0f3594b0cfb/4_phylogeny_analysis/scripts/shannon_diversity.Rmd#L147), I think they define entropy as in [here](http://math.bu.edu/people/mkon/J6A.pdf) using the phylogenetic tree.
We just compute the entropy from the number of cells: a class is the set of cells with the same number of mutations, and we compute the abundance of those classes, that is, the abundance of cells with the same number of mutations.

In [None]:
import matplotlib.pyplot as plt
import json
import numpy as np
import pandas as pd
import socket
import seaborn as sns
import sys

from hscpy import get_idx_timepoint_from_age, sfs
from hscpy.figures import burden as burden_figures
from hscpy.figures import sfs as sfs_figures
from hscpy.figures import variant as variant_figures
from hscpy.figures import options, mitchell

from typing import Dict
from scipy import stats
from pathlib import Path
from futils import parse_version, snapshot

# Folder holding the compiled `hsc` binary; stop right away if it is missing.
PATH2BIN = Path("~").expanduser().joinpath("hsc", "target", "release")
assert PATH2BIN.is_dir()

# --- simulation settings ---
YEARS_FAST = 80  # value given to YEARS on the two known hosts
YEARS_ENTROPY = 1  # last positional argument of the simulation bash cell
RUNS = 12  # `runs=` of the simulation options
NB_TIMEPOINTS = 21  # `nb_timepoints=` of the simulation options
DETECTION_THRESH = 0.01  # handed to the variant plots
SUBCLONES = 60  # `nb_subclones=` of the simulation options
USE_SCRATCH = True  # save under the scratch disk instead of the working dir
mitchell_ages = (0, 29, 38, 48, 63, 75, 81)  # handed to entropy/burden plots

# --- figure settings ---
SAVE = True
BIGLABELS = False
if BIGLABELS:
    FIGSIZE = [5, 3]
else:
    FIGSIZE = [6.4, 4.8]  # default matplotlib
PDF = True
if PDF:
    EXTENSION = ".pdf"
else:
    EXTENSION = ".png"

PLOT_OPTIONS = options.PlotOptions(save=SAVE, extension=EXTENSION, figsize=FIGSIZE)

In [None]:
def get_ymin(min1: float, min2: float) -> float:
    """Return the smaller of the two minima, lowered by 20% of its value.

    Meant as a lower axis limit that leaves some room below the lowest point.
    """
    # `min` keeps its first argument unless a later one is strictly smaller,
    # which is the same choice as a strict `min1 < min2` comparison.
    lowest = min(min2, min1)
    return lowest - 0.2 * lowest


def get_xmax(max1: float, max2: float) -> float:
    """Return the larger of the two maxima, raised by 20% of its value.

    Meant as an upper axis limit that leaves some room past the largest point.
    The result is always a float (because of the `* 0.2`), even for integer
    inputs, hence the `float` return annotation.
    """
    # `max` keeps its first argument unless a later one is strictly larger,
    # which is the same choice as a strict `max1 > max2` comparison.
    largest = max(max2, max1)
    return largest + largest * 0.2


def plot_sfs_simulations_data(simulated):
    """Draw one SFS figure per donor: donor data, simulation and sampled theory.

    Relies on notebook-level state: `summary` (donor table), `path2data`
    (folder with the `mutMatrix*`/`mutType*` csv files) and `closest_age`
    (donor age -> simulated age, filled in by `load_sfs_simulations`).

    Args:
        simulated: nested mapping indexed as
            `simulated[closest_age[age]][run_id]`; the selected entry is
            turned into an integer `pd.Series`.

    Donor "KX003" is skipped. When `SAVE` is true each figure is written to
    `./{donor}_sfs{EXTENSION}`.
    """
    # pop size
    population_size = 200_000
    frequencies = sfs.compute_frequencies(population_size)
    run_id = "0"  # only this simulation run is plotted

    for donor in summary.donor_id.unique():
        is_donor = summary.donor_id == donor
        age = summary.loc[is_donor, "age"].iloc[0]
        # TODO: rust starts saving at 1
        if age == 0:
            age = 2
        if donor == "KX003":
            continue
        print(f"donor {donor}")

        patient = mitchell.load_patient(
            donor,
            path2data / f"mutMatrix{donor}.csv",
            path2data / f"mutType{donor}.csv",
        )
        filtered_matrix = mitchell.filter_mutations(*patient)

        # row sums give the x values ("j cells"); their normalised counts are
        # the y values. Rows summing to zero are left out.
        sfs_donor = filtered_matrix.sum(axis=1).value_counts(normalize=True)
        sfs_donor = sfs_donor[sfs_donor.index != 0]
        x_sfs = sfs_donor.index.to_numpy(dtype=int)
        y_sfs = sfs_donor.to_numpy()

        # sample size
        cells = summary.loc[is_donor, "cells"].unique()[0]
        correction = sfs.SamplingCorrection(population_size, cells)
        assert cells <= 1000

        # only the first element of each returned pair is plotted
        sampled_f, _ = sfs.compute_variants(
            correction, sfs.Correction.ONE_OVER_F, cells
        )
        sampled_f_squared, _ = sfs.compute_variants(
            correction, sfs.Correction.ONE_OVER_F_SQUARED, cells
        )

        # simulated SFS, scaled so that its largest value is 1
        simulated_counts = pd.Series(
            simulated[closest_age[age]][run_id], dtype=int
        ).value_counts()
        sfs_simulations = simulated_counts / simulated_counts.max()

        fig, ax = plt.subplots(1, 1, figsize=FIGSIZE)
        theory_x = frequencies[:cells]
        ax.plot(
            theory_x,
            sampled_f,
            label="$1/f$ sampled",
            alpha=0.4,
            linestyle="--",
            c="black",
        )
        ax.plot(
            theory_x,
            sampled_f_squared / sampled_f_squared.max(),
            label="$1/f^2$ sampled",
            alpha=0.4,
            c="black",
        )
        ax.plot(
            x_sfs, y_sfs, label=f"{donor}", linestyle="", marker="x", c="blue", alpha=0.7
        )
        ax.plot(
            sfs_simulations.index,
            sfs_simulations,
            linestyle="",
            marker="o",
            label="simulation",
            c="purple",
            alpha=0.7,
        )
        ax.set_yscale("log")
        ax.set_xscale("log")
        ax.set_xlabel("j cells")
        ax.set_ylabel("normalised nb of muts in j cells")

        # pad the limits so the extreme points do not sit on the frame
        y_low = get_ymin(y_sfs.min(), sfs_simulations.min())
        x_high = get_xmax(x_sfs.max(), sfs_simulations.index.to_numpy().max())
        ax.set_ylim([y_low, 2])
        ax.set_xlim([0.8, x_high])
        ax.legend()
        ax.set_title(f"age {age}")
        if SAVE:
            plt.savefig(f"./{donor}_sfs{EXTENSION}")
        plt.show()


def load_sfs_simulations(sim_options):
    """Load the simulated SFS at the timepoints closest to each age in `ages`.

    For every age in the notebook-level `ages` (age 0 is replaced by 2), the
    matching timepoint index and closest simulated age are looked up with
    `get_idx_timepoint_from_age`, and the SFS of all runs at that timepoint is
    loaded with `sfs.load_sfs`.

    Side effect: the notebook-level `closest_age` dict is updated with
    `age -> closest simulated age`, which `plot_sfs_simulations_data` reads.

    Args:
        sim_options: object providing `last_timepoint_years`, `nb_timepoints`,
            `path2save`, `runs` and `sample`.

    Returns:
        dict mapping each closest simulated age to a dict
        `{simulation id: SFS}` as yielded by `sfs.load_sfs(...).items()`.
    """
    simulated = {}
    for age in ages:
        # TODO: rust starts saving at 1
        if age == 0:
            age = 2
        print(f"\nloading sfs for age {age}")
        idx_timepoint, closest = get_idx_timepoint_from_age(
            age,
            sim_options.last_timepoint_years,
            nb_timepoints=sim_options.nb_timepoints,
        )
        closest_age[age] = closest
        loaded = sfs.load_sfs(
            sim_options.path2save,
            runs=sim_options.runs,
            cells=sim_options.sample,
            timepoint=idx_timepoint,
        )
        simulated[closest] = dict(loaded.items())
    # the result must be returned: callers pass it straight to
    # `plot_sfs_simulations_data`
    return simulated

In [None]:
# Choose the data folder and the simulated time span from the host we run on.
_known_hosts = {
    "5X9ZYD3": Path("/mnt/c/Users/terenz01/Documents/SwitchDrive/PhD/"),
    "LAPTOP-CEKCHJ4C": Path("/mnt/c/Users/fra_t/Documents/PhD/"),
}
_hostname = socket.gethostname()
if _hostname in _known_hosts:
    PATH2SIMS = _known_hosts[_hostname]
    # NOTE(review): a comment on the LAPTOP-CEKCHJ4C branch said
    # "need + 1 to save the last timepoint", but no + 1 is applied here.
    YEARS = YEARS_FAST
else:
    PATH2SIMS = Path("~").expanduser()
    YEARS = 100

# the variant-fraction table must exist in the selected folder
PATH2SIMS = PATH2SIMS / Path("totalVariantFracTime.csv")
assert PATH2SIMS.is_file()

In [None]:
%%bash -s "$PATH2BIN" --out version
# Ask the hsc binary in the folder passed as $1 for its version; `--out version`
# stores the cell's stdout in the Python variable `version`.
# The path is quoted so that folders containing spaces still work.
"$1/hsc" --version

In [None]:
# `version` is the stdout captured from `hsc --version`.
VERSION = parse_version(version)
# Results are stored in a folder named after the hsc version, either on the
# scratch disk or in the current working directory.
_save_root = "/data/scratch/hfx923/hsc-draft" if USE_SCRATCH else "."
PATH2SAVE = Path(f"{_save_root}/{VERSION}")
print("Running hsc with version:", VERSION)

## Mitchell's data

In [None]:
# Load the donor summary table: look in the home directory first, then fall
# back to the Windows-mounted PhD folder.
try:
    path2data = Path.home()
    summary = pd.read_csv(path2data / "Summary_cut.csv", index_col=0)
except FileNotFoundError:
    # `path2data` has to be a Path, not a str: it is joined with `/` right
    # below and again when the per-donor mutation matrices are loaded.
    path2data = Path("/mnt/c/Users/fra_t/Documents/PhD/")
    summary = pd.read_csv(path2data / "Summary_cut.csv", index_col=0)
summary.cell_type = summary.cell_type.astype("category")
summary.sample_type = summary.sample_type.astype("category")
summary.sort_values(by="age", inplace=True)
summary.reset_index(inplace=True)
# distinct donor ages, and a dict keyed by them (values start as None) that is
# filled in later with the closest simulated age
ages = summary.age.unique()
closest_age = dict.fromkeys(ages)
# add a `cells` column with the number of rows per donor
# neglect some duplicated colonies e.g. summary.colony_ID == "11_E07"
summary = summary.merge(
    summary[["donor_id", "age"]]
    .groupby("donor_id")
    .count()
    .reset_index()
    .rename(columns={"age": "cells"}),
    on="donor_id",
    validate="many_to_one",
    how="left",
)
summary.dtypes

In [None]:
# Quick textual overview of the donor table; each quantity is computed right
# before it is printed.
_description = summary.describe()
print(_description)

_cell_type_counts = summary.cell_type.value_counts()
print(f"\n\ncell types: \n{_cell_type_counts}")

_sample_type_counts = summary.sample_type.value_counts()
print(f"\n\nsample types: \n{_sample_type_counts}")

_timepoint_counts = summary.timepoint.value_counts()
print(f"\n\ntimepoints: \n{_timepoint_counts}")

_ages_and_cells = summary[["donor_id", "cells", "age"]].drop_duplicates()
print(f'\n\nages and cells: \n{_ages_and_cells}')

_mutations_per_donor = summary[["donor_id", "number_mutations"]].groupby("donor_id").sum()
print(f'\n\nmutations per donor: \n{_mutations_per_donor}')

In [None]:
# Mean number of mutations per donor, paired with the donor's age.
_burden_by_donor = (
    summary[["donor_id", "number_mutations"]].groupby("donor_id").mean().reset_index()
)
_age_by_donor = summary[["donor_id", "age"]].drop_duplicates()
mean_mutations = _burden_by_donor.merge(
    _age_by_donor,
    on="donor_id",
    how="inner",
    validate="one_to_one",
).sort_values(by="age")

# Least-squares fit of the line y = m * x + c through the per-donor means.
x = mean_mutations.age.to_numpy()
y = mean_mutations.number_mutations.to_numpy()
A = np.vstack((x, np.ones(len(x)))).T
m, c = np.linalg.lstsq(A, y, rcond=None)[0]

# All rows of the summary (dots), per-donor means (crosses) and the fit.
fig, ax = plt.subplots(1, 1, figsize=FIGSIZE)
ax.plot(
    summary["age"],
    summary["number_mutations"],
    linestyle="",
    marker="o",
    alpha=0.4,
)
ax.plot(x, y, "x", c="orange")
ax.plot(x, m * x + c, linestyle="--", c="orange")
ax.set(
    xlabel="age [years]",
    ylabel="number of SNVs",
    title=f"y=mx+c with m={m:.2f}, c={c:.2f}",
)
plt.show()

# Distribution of the number of mutations, one colour per donor.
fig, ax = plt.subplots(1, 1, tight_layout=True, figsize=FIGSIZE)
sns.histplot(
    data=summary,
    x="number_mutations",
    hue="donor_id",
    stat="count",
    binwidth=10,
    kde=True,
    ax=ax,
)
sns.move_legend(ax, bbox_to_anchor=(1.01, 1), loc="upper left", frameon=False)
if SAVE:
    plt.savefig(f"./mitchell_burden{EXTENSION}")
plt.show()

In [None]:
# One histogram of the number of mutations per donor, saved as
# `./{donor}_burden{EXTENSION}` when SAVE is set.
for donor in summary.donor_id.unique():
    donor_rows = summary[summary.donor_id == donor]
    fig, ax = plt.subplots(1, 1, tight_layout=True, figsize=(6, 4))
    sns.histplot(
        data=donor_rows,
        x="number_mutations",
        hue="donor_id",
        stat="count",
        bins=50,
        kde=True,
        ax=ax,
    )
    if SAVE:
        plt.savefig(f"./{donor}_burden{EXTENSION}")
    plt.show()

## Simulations

In [None]:
# Parameters of the simulations configured in the cells below.
# number of cells kept when subsampling, passed as `sample=` to the strong and
# weak subsampling options respectively
SAMPLE_STRONG = 600
SAMPLE_WEAK = 6_000
# number of cells, passed as `cells=` to every simulation option
NCELLS = 200_000
# mean of the Bernoulli trial (prob of success) to get an asymmetric
# division upon cell division, units are [1 asymmetric division / division]
P_ASYMMETRIC = 0

## NEUTRAL RATES
# division rate for the wild-type in units of [division / (year * cell)]
# Welch, J.S. et al. (2012) ‘The Origin and Evolution of Mutations in Acute Myeloid Leukemia’,
# Cell, 150(2), pp. 264–278
B0 = 1  # TODO: double check this, should be between 2 and 20?
# Abascal, F. et al. (2021) ‘Somatic mutation landscapes at single-molecule resolution’,
# Nature, 593(7859), pp. 405–410. fig. 2b
# see also fig 1b of Mitchell, E. et al.
# (2022) ‘Clonal dynamics of haematopoiesis across the human lifespan’,
# Nature, 606(7913), pp. 343–350
NEUTRAL_RATE = 20  # [mut/(year * cell)]

## FIT CLONES
# avg fit mutations arising in 1 year, units are [mutations/year]
# from ABC's inference
MU0 = 2
# proliferative advantage conferred by fit mutations, all clones
# have the same proliferative advantage for now. Units are
# [mutation / division]
# NOTE(review): these units look copied from `u` below — confirm the units of S
S = 0.11
# mean of the Bernoulli trial (prob of success) to get a fit variant upon
# cell division, units are [1 mutation/division]
u = MU0 / (B0 * NCELLS)
# should be 2.0 × 10−3 per HSC per year according to Mitchell, E. et al.
# (2022) ‘Clonal dynamics of haematopoiesis across the human lifespan’,
# Nature, 606(7913), pp. 343–350
# driver mutations enter the HSC compartment at 2.0 × 10−3 per HSC per year
# NOTE(review): with the values above u = 2 / (1 * 200_000) = 1e-5, which does
# not match the 2.0 × 10−3 quoted here — confirm which one is intended
print(f"average sucess rate of occurence of 1 fit mutation upon cell division u={u}")

We run the simulations with and without subsampling at the same time.

In [None]:
# Settings common to the three configurations; they differ only in `sample`,
# the number of cells kept when subsampling.
# NOTE(review): `last_timepoint_years` is hard-coded to 80 while the
# host-dependent `YEARS` is never used — confirm this is intended.
_shared_sim_options = dict(
    runs=RUNS,
    cells=NCELLS,
    path2save=PATH2SAVE / "competition",
    neutral_rate=NEUTRAL_RATE,
    nb_timepoints=NB_TIMEPOINTS,
    last_timepoint_years=80,
    nb_subclones=SUBCLONES,
    s=S,
)

# whole population: the sample is as large as the population
sim_options_population = options.SimulationOptions(
    sample=NCELLS, **_shared_sim_options
)

# strong subsampling: SAMPLE_STRONG cells
sim_options_subsampling_strong = options.SimulationOptions(
    sample=SAMPLE_STRONG, **_shared_sim_options
)

# weak subsampling: SAMPLE_WEAK cells
sim_options_subsampling_weak = options.SimulationOptions(
    sample=SAMPLE_WEAK, **_shared_sim_options
)

In [None]:
%%bash -s "$PATH2BIN" "$sim_options_population.path2save" "$B0" "$MU0" "$sim_options_population.neutral_rate" "$sim_options_population.s" "$P_ASYMMETRIC" "$sim_options_population.runs" "$sim_options_population.cells" "$sim_options_population.last_timepoint_years" "$sim_options_population.nb_timepoints" "$sim_options_subsampling_strong.sample" "$sim_options_subsampling_weak.sample" "$YEARS_ENTROPY"
# Positional arguments handed over by the magic line above:
#   $1 PATH2BIN        $2 path2save      $3 B0             $4 MU0
#   $5 neutral_rate    $6 s              $7 P_ASYMMETRIC   $8 runs
#   $9 cells           ${10} last_timepoint_years          ${11} nb_timepoints
#   ${12} strong sample size             ${13} weak sample size
#   ${14} YEARS_ENTROPY
# Both commands below are commented out, so running this cell does nothing;
# uncomment them to wipe the output folder and rerun the simulations.
# rm -rf $2
# $1/hsc -c $9 -y ${10} -r $8 --b0 $3 --mu0 $4 --neutral-rate $5 -s $6 --p-asymmetric $7 --snapshot-entropy ${14} --subsample ${12} ${13} --snapshots ${11} $2

### no subsampling

In [None]:
# Last-timepoint SFS plots for the whole population (sample=NCELLS).
sfs_figures.show_sfs_last_timepoint_plots(sim_options_population, PLOT_OPTIONS)

In [None]:
# Entropy plots for the whole population, with early_variants_only=True.
sfs_figures.show_entropy_plots(
    sim_options_population, PLOT_OPTIONS, mitchell_ages, early_variants_only=True
)

In [None]:
%%time
# Entropy plots for the whole population, with early_variants_only=False;
# %%time reports how long the cell takes to run.
sfs_figures.show_entropy_plots(
    sim_options_population, PLOT_OPTIONS, mitchell_ages, early_variants_only=False
)

In [None]:
# Burden plots for the whole population; mitchell_ages is passed along.
burden_figures.show_burden_plots(sim_options_population, PLOT_OPTIONS, mitchell_ages)

In [None]:
# Variant plots for the whole population; PATH2SIMS points to
# totalVariantFracTime.csv and DETECTION_THRESH is passed as the last argument.
variant_figures.show_variant_plots(
    sim_options_population, PLOT_OPTIONS, PATH2SIMS, DETECTION_THRESH
)

In [None]:
%%time
# Load the simulated SFS and plot it against every donor's SFS.
# NOTE(review): this cell sits under "no subsampling" but loads the simulations
# of sim_options_subsampling_weak, exactly like the cell in the
# "weak subsampling" section — confirm this is intended.
plot_sfs_simulations_data(load_sfs_simulations(sim_options_subsampling_weak))

### weak subsampling

In [None]:
# Last-timepoint SFS plots for the weak subsampling (sample=SAMPLE_WEAK).
sfs_figures.show_sfs_last_timepoint_plots(sim_options_subsampling_weak, PLOT_OPTIONS)

In [None]:
# Entropy plots for the weak subsampling, with early_variants_only=True.
sfs_figures.show_entropy_plots(
    sim_options_subsampling_weak, PLOT_OPTIONS, mitchell_ages, early_variants_only=True
)

In [None]:
%%time
# Entropy plots for the weak subsampling, with early_variants_only=False;
# %%time reports how long the cell takes to run.
sfs_figures.show_entropy_plots(
    sim_options_subsampling_weak, PLOT_OPTIONS, mitchell_ages, early_variants_only=False
)

In [None]:
# Burden plots for the weak subsampling; mitchell_ages is passed along.
burden_figures.show_burden_plots(sim_options_subsampling_weak, PLOT_OPTIONS, mitchell_ages)

In [None]:
%%time
# Load the SFS simulated with the weak subsampling and plot it against every
# donor's SFS.
plot_sfs_simulations_data(load_sfs_simulations(sim_options_subsampling_weak))

### strong subsampling

In [None]:
# Last-timepoint SFS plots for the strong subsampling (sample=SAMPLE_STRONG).
sfs_figures.show_sfs_last_timepoint_plots(sim_options_subsampling_strong, PLOT_OPTIONS)

In [None]:
# Entropy plots for the strong subsampling, with early_variants_only=True.
sfs_figures.show_entropy_plots(
    sim_options_subsampling_strong, PLOT_OPTIONS, mitchell_ages, early_variants_only=True
)

In [None]:
%%time
# Entropy plots for the strong subsampling, with early_variants_only=False;
# %%time reports how long the cell takes to run.
sfs_figures.show_entropy_plots(
    sim_options_subsampling_strong, PLOT_OPTIONS, mitchell_ages, early_variants_only=False
)

In [None]:
# Burden plots for the strong subsampling; mitchell_ages is passed along.
burden_figures.show_burden_plots(sim_options_subsampling_strong, PLOT_OPTIONS, mitchell_ages)

In [None]:
%%time
# Load the SFS simulated with the strong subsampling and plot it against every
# donor's SFS.
plot_sfs_simulations_data(load_sfs_simulations(sim_options_subsampling_strong))

## Competition vs neutral vs 1 clone (logistic fn)

In [None]:
# TODO 1 clone logistic fn