# HSC
Markov process on a fixed-size population with k types, where type 0 is the wild-type with growth rate `B0`.

A cell can acquire a mutation conferring a proliferative advantage upon cell division. We model this process as a Bernoulli trial with success probability `u`, in units of mutations per division. For the symmetric division case, `u` is computed as `u = MU0 / (B0 * NCELLS)`.

For now, all clones with k greater than 0 have the same proliferative advantage.

In [None]:
import matplotlib.pyplot as plt
import json
import numpy as np
import pandas as pd
import socket
import seaborn as sns
import sys

from hscpy.figures import burden as burden_figures
from hscpy.figures import sfs as sfs_figures
from hscpy.figures import variant as variant_figures
from hscpy.figures import options

from typing import Dict
from scipy import stats
from pathlib import Path
from futils import parse_version, snapshot

# Folder expected to contain the compiled `hsc` binary (invoked from the
# %%bash cells); stop immediately if it is missing.
PATH2BIN = Path("~").expanduser() / "hsc" / "target" / "release"
assert PATH2BIN.is_dir()

# --- simulation settings -------------------------------------------------
YEARS_FAST = 80
# value forwarded to the binary's --snapshot-entropy option
YEARS_ENTROPY = 1
RUNS = 12
NB_TIMEPOINTS = 21
DETECTION_THRESH = 0.01
SUBCLONES = 60
# when True, results are written under the scratch filesystem
USE_SCRATCH = False
# presumably the donor ages of Mitchell et al. (2022) -- TODO confirm
mitchell_ages = (0, 29, 38, 48, 63, 75, 81)

# --- figure settings -----------------------------------------------------
SAVE = True
BIGLABELS = False
PDF = True

if BIGLABELS:
    FIGSIZE = [5, 3]
else:
    FIGSIZE = [6.4, 4.8]  # default matplotlib

if PDF:
    EXTENSION = ".pdf"
else:
    EXTENSION = ".png"

PLOT_OPTIONS = options.PlotOptions(figsize=FIGSIZE, extension=EXTENSION, save=SAVE)

In [None]:
# Data folder for each known workstation, keyed by hostname.
_SIMS_DIR_BY_HOST = {
    "5X9ZYD3": Path("/mnt/c/Users/terenz01/Documents/SwitchDrive/PhD/"),
    "LAPTOP-CEKCHJ4C": Path("/mnt/c/Users/fra_t/Documents/PhD/"),
}

_hostname = socket.gethostname()
if _hostname in _SIMS_DIR_BY_HOST:
    PATH2SIMS = _SIMS_DIR_BY_HOST[_hostname]
    # NOTE(review): an earlier comment here said "+ 1" is needed to save the
    # last timepoint, but no + 1 is applied -- confirm which is intended.
    YEARS = YEARS_FAST
else:
    # Unknown machine: look in the home directory and use a longer horizon.
    PATH2SIMS = Path("~").expanduser()
    YEARS = 100

# The CSV passed to the variant plots must exist before going any further.
PATH2SIMS = PATH2SIMS / "totalVariantFracTime.csv"
assert PATH2SIMS.is_file()

In [None]:
%%bash -s "$PATH2BIN" --out version
# $1 is PATH2BIN; the script's stdout is captured in the Python variable
# `version` (--out). The path is quoted so that a directory name containing
# spaces is not split into several words.
"$1/hsc" --version

In [None]:
# `version` holds the stdout of `hsc --version`, captured by the bash cell.
VERSION = parse_version(version)
# Results are stored per version, either on scratch or next to the notebook.
PATH2SAVE = (
    Path(f"/data/scratch/hfx923/hsc-draft/{VERSION}")
    if USE_SCRATCH
    else Path(f"./{VERSION}")
)
print("Running hsc with version:", VERSION)

## Competition

In [None]:
# Number of cells kept when subsampling: "strong" subsampling keeps fewer
# cells than "weak" subsampling.
SAMPLE_STRONG = 600
SAMPLE_WEAK = 6_000
# Total population size.
NCELLS = 200_000
# mean of the Bernoulli trial (prob of success) to get an asymmetric
# division upon cell division, units are [1 asymmetric division / division]
P_ASYMMETRIC = 0

## NEUTRAL RATES
# division rate for the wild-type in units of [division / (year * cell)]
# Welch, J.S. et al. (2012) ‘The Origin and Evolution of Mutations in Acute Myeloid Leukemia’,
# Cell, 150(2), pp. 264–278
B0 = 1  # TODO: double check this, should be between 2 and 20?
# Abascal, F. et al. (2021) ‘Somatic mutation landscapes at single-molecule resolution’,
# Nature, 593(7859), pp. 405–410. fig. 2b
# see also fig 1b of Mitchell, E. et al.
# (2022) ‘Clonal dynamics of haematopoiesis across the human lifespan’,
# Nature, 606(7913), pp. 343–350
NEUTRAL_RATE = 20  # [mut/(year * cell)]

## FIT CLONES
# avg fit mutations arising in 1 year, units are [mutations/year]
# from ABC's inference
MU0 = 2
# proliferative advantage conferred by fit mutations, all clones
# have the same proliferative advantage for now. Units are
# [mutation / division]
# NOTE(review): this unit looks copied from `u` below -- confirm the unit
# of the proliferative advantage.
S = 0.11
# mean of the Bernoulli trial (prob of success) to get a fit variant upon
# cell division, units are [1 mutation/division]
u = MU0 / (B0 * NCELLS)
# should be 2.0 × 10−3 per HSC per year according to Mitchell, E. et al.
# (2022) ‘Clonal dynamics of haematopoiesis across the human lifespan’,
# Nature, 606(7913), pp. 343–350
# driver mutations enter the HSC compartment at 2.0 × 10−3 per HSC per year
print(f"average sucess rate of occurence of 1 fit mutation upon cell division u={u}")

We run the simulations with and without subsampling at the same time.

In [None]:
# Keyword arguments shared by the three configurations below. They all
# describe the same set of simulations (same output folder, same model
# parameters); only the number of sampled cells differs.
_common_sim_kwargs = dict(
    runs=RUNS,
    cells=NCELLS,
    path2save=PATH2SAVE / "competition",
    neutral_rate=NEUTRAL_RATE,
    nb_timepoints=NB_TIMEPOINTS,
    last_timepoint_years=80,
    nb_subclones=SUBCLONES,
    s=S,
)

# Whole population: the sample is as large as the population itself.
sim_options_population = options.SimulationOptions(
    sample=NCELLS, **_common_sim_kwargs
)

# Strong subsampling: only SAMPLE_STRONG cells are kept.
sim_options_subsampling_strong = options.SimulationOptions(
    sample=SAMPLE_STRONG, **_common_sim_kwargs
)

# Weak subsampling: SAMPLE_WEAK cells are kept. This used to be built with
# SAMPLE_STRONG, which made it a duplicate of the strong configuration and
# left SAMPLE_WEAK unused.
sim_options_subsampling_weak = options.SimulationOptions(
    sample=SAMPLE_WEAK, **_common_sim_kwargs
)

In [None]:
%%bash -s "$PATH2BIN" "$sim_options_population.path2save" "$B0" "$MU0" "$sim_options_population.neutral_rate" "$sim_options_population.s" "$P_ASYMMETRIC" "$sim_options_population.runs" "$sim_options_population.cells" "$sim_options_population.last_timepoint_years" "$sim_options_population.nb_timepoints" "$sim_options_subsampling_strong.sample" "$sim_options_subsampling_weak.sample" "$YEARS_ENTROPY"
# Positional arguments, in the order given on the magic line above:
#   $1 PATH2BIN        $2 path2save       $3 B0            $4 MU0
#   $5 neutral_rate    $6 s               $7 P_ASYMMETRIC  $8 runs
#   $9 cells           ${10} last_timepoint_years          ${11} nb_timepoints
#   ${12} strong sample size   ${13} weak sample size      ${14} YEARS_ENTROPY

# Wipe the results of a previous run. Every expansion is quoted so paths with
# spaces stay in one piece, and an empty output path aborts the cell instead
# of being handed to `rm -rf`.
rm -rf -- "${2:?path2save is empty, refusing to run rm -rf}"

"$1/hsc" -c "$9" -y "${10}" -r "$8" \
    --b0 "$3" --mu0 "$4" --neutral-rate "$5" -s "$6" --p-asymmetric "$7" \
    --snapshot-entropy "${14}" --subsample "${12}" "${13}" \
    --snapshots "${11}" "$2"

In [None]:
# Call show_entropy_plots with the strong-subsampling options and the
# mitchell_ages tuple, with early_variants_only=True.
sfs_figures.show_entropy_plots(
    sim_options_subsampling_strong, PLOT_OPTIONS, mitchell_ages, early_variants_only=True
)

In [None]:
# Same call as for the strong-subsampling options, this time with the
# whole-population options (early_variants_only=True).
sfs_figures.show_entropy_plots(
    sim_options_population, PLOT_OPTIONS, mitchell_ages, early_variants_only=True
)

In [None]:
# Display the `cells` attribute of the strong-subsampling options (the
# options were constructed with cells=NCELLS).
sim_options_subsampling_strong.cells

In [None]:
from hscpy import burden

In [None]:
# Load the burden data saved for the strong-subsampling configuration and
# display it as the cell output.
burden_strong_sub = burden.load_burden(
    sim_options_subsampling_strong.path2save,
    sim_options_subsampling_strong.runs,
    sim_options_subsampling_strong.cells,
)
burden_strong_sub

In [None]:
# Sum of the values stored under key '1' of the loaded burden mapping.
sum(burden_strong_sub['1'].values())

In [None]:
# Print, for every entry of the loaded burden mapping, the sum of its values.
for sim in burden_strong_sub.values():
    run_total = sum(sim.values())
    print(run_total)

In [None]:
# NOTE(review): in the visible notebook `count_values` is first assigned in a
# later cell, so this cell raises NameError on a fresh top-to-bottom run.
count_values

In [None]:
# NOTE(review): in the visible notebook `count_values_entropy` is first
# assigned in a later cell, so this cell raises NameError on a fresh
# top-to-bottom run.
count_values_entropy

In [None]:
# `sfs` was used here without ever being imported (only `sfs_figures` is),
# which raises NameError. Assumes the loader lives in `hscpy.sfs`, by analogy
# with `hscpy.burden` -- TODO confirm the module path.
from hscpy import sfs

# For every entry returned by load_sfs, count how many times each value
# occurs; the entry's key and the series shape are printed along the way.
count_values = dict()
for idx, sfs_ in sfs.load_sfs(
    sim_options_subsampling_strong.path2save,
    sim_options_subsampling_strong.runs,
    sim_options_subsampling_strong.cells,
).items():
    sfs_series = pd.Series(sfs_)
    print(f"{idx} {sfs_series.shape}")
    count_values[idx] = sfs_series.value_counts()

In [None]:
# `sfs` was used here without ever being imported (only `sfs_figures` is),
# which raises NameError. Assumes the loader lives in `hscpy.sfs`, by analogy
# with `hscpy.burden` -- TODO confirm the module path.
from hscpy import sfs

# For every entry returned by load_sfs_entropy, store its entropy
# (scipy.stats.entropy) and the number of occurrences of each value; the
# entry's key and the series shape are printed along the way.
count_values_entropy = dict()
entr = dict()
for idx, sfs_ in sfs.load_sfs_entropy(
    sim_options_subsampling_strong.path2save,
    sim_options_subsampling_strong.runs,
    sim_options_subsampling_strong.cells,
).items():
    entr[idx] = stats.entropy(sfs_)
    sfs_series = pd.Series(sfs_)
    print(f"{idx} {sfs_series.shape}")
    count_values_entropy[idx] = sfs_series.value_counts()

In [None]:
# Display the entropies computed with scipy.stats.entropy, keyed like the
# mapping returned by load_sfs_entropy.
entr

In [None]:
# Call show_entropy_plots with the strong-subsampling options and
# early_variants_only=True (the same call also appears in other cells).
sfs_figures.show_entropy_plots(
    sim_options_subsampling_strong, PLOT_OPTIONS, mitchell_ages, early_variants_only=True
)

### no subsampling

In [None]:
# Call show_sfs_last_timepoint_plots with the whole-population options.
sfs_figures.show_sfs_last_timepoint_plots(sim_options_population, PLOT_OPTIONS)

In [None]:
# Call show_entropy_plots with the whole-population options and
# early_variants_only=True.
sfs_figures.show_entropy_plots(
    sim_options_population, PLOT_OPTIONS, mitchell_ages, early_variants_only=True
)

In [None]:
%%time
# Same call with early_variants_only=False; the %%time magic reports how long
# the cell takes to run.
sfs_figures.show_entropy_plots(
    sim_options_population, PLOT_OPTIONS, mitchell_ages, early_variants_only=False
)

In [None]:
# Call show_burden_plots with the whole-population options and the
# mitchell_ages tuple.
burden_figures.show_burden_plots(sim_options_population, PLOT_OPTIONS, mitchell_ages)

In [None]:
# Call show_variant_plots with the whole-population options, the
# totalVariantFracTime.csv path (PATH2SIMS) and DETECTION_THRESH.
variant_figures.show_variant_plots(
    sim_options_population, PLOT_OPTIONS, PATH2SIMS, DETECTION_THRESH
)

### weak subsampling

In [None]:
# Call show_sfs_last_timepoint_plots with sim_options_subsampling_weak.
sfs_figures.show_sfs_last_timepoint_plots(sim_options_subsampling_weak, PLOT_OPTIONS)

In [None]:
# Call show_entropy_plots with sim_options_subsampling_weak and
# early_variants_only=True.
sfs_figures.show_entropy_plots(
    sim_options_subsampling_weak, PLOT_OPTIONS, mitchell_ages, early_variants_only=True
)

In [None]:
%%time
# Same call with early_variants_only=False; the %%time magic reports how long
# the cell takes to run.
sfs_figures.show_entropy_plots(
    sim_options_subsampling_weak, PLOT_OPTIONS, mitchell_ages, early_variants_only=False
)

In [None]:
# Call show_burden_plots with sim_options_subsampling_weak and the
# mitchell_ages tuple.
burden_figures.show_burden_plots(sim_options_subsampling_weak, PLOT_OPTIONS, mitchell_ages)

In [None]:
# Call show_variant_plots with sim_options_subsampling_weak, the
# totalVariantFracTime.csv path (PATH2SIMS) and DETECTION_THRESH.
variant_figures.show_variant_plots(
    sim_options_subsampling_weak, PLOT_OPTIONS, PATH2SIMS, DETECTION_THRESH
)

### strong subsampling

In [None]:
# Call show_sfs_last_timepoint_plots with the strong-subsampling options.
sfs_figures.show_sfs_last_timepoint_plots(sim_options_subsampling_strong, PLOT_OPTIONS)

In [None]:
# Call show_entropy_plots with the strong-subsampling options and
# early_variants_only=True.
sfs_figures.show_entropy_plots(
    sim_options_subsampling_strong, PLOT_OPTIONS, mitchell_ages, early_variants_only=True
)

In [None]:
%%time
# Same call with early_variants_only=False; the %%time magic reports how long
# the cell takes to run.
sfs_figures.show_entropy_plots(
    sim_options_subsampling_strong, PLOT_OPTIONS, mitchell_ages, early_variants_only=False
)

In [None]:
# Call show_burden_plots with the strong-subsampling options and the
# mitchell_ages tuple.
burden_figures.show_burden_plots(sim_options_subsampling_strong, PLOT_OPTIONS, mitchell_ages)

In [None]:
# Call show_variant_plots with the strong-subsampling options, the
# totalVariantFracTime.csv path (PATH2SIMS) and DETECTION_THRESH.
variant_figures.show_variant_plots(
    sim_options_subsampling_strong, PLOT_OPTIONS, PATH2SIMS, DETECTION_THRESH
)

## Competition vs neutral vs 1 clone (logistic fn)

In [None]:
# TODO 1 clone logistic fn