# HSC
A Markov process with a fixed-size population of k types, where type 0 is the wild type with growth rate `B0`.

A cell can acquire a mutation conferring a proliferative advantage upon cell division. We model this process as a Bernoulli trial with success probability `u`, in units of mutations per division. For the symmetric division case, `u` can be computed as `u = MU0 / (B0 * NCELLS)`.

## How to use it
Install a version of Python greater than or equal to 3.11, and then install `seaborn`, `scipy`, `pandas`, and `ipykernel` with pip.
Then, install `futils` and `hscpy` in editable mode.
Finally, on the cluster, make this environment available as an IPython kernel.

In [None]:
# Load IPython's autoreload extension and set it to mode 2, so that imported
# modules are reloaded before each cell execution.
%load_ext autoreload
%autoreload 2

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import socket
import seaborn as sns
import sys
from pathlib import Path
from random import choices

from hscpy import sfs, mitchell
from hscpy.figures import sfs as sfs_figures
from hscpy.figures import PlotOptions, simulations

from futils import parse_version

# Directory expected to contain the `hsc` executable (see the `%%bash` cell
# that runs `$1/hsc --version`).
PATH2BIN = Path("~").expanduser() / "hsc/target/release"
if not PATH2BIN.is_dir():
    # Fail with the offending path in the message; a bare `assert` carries no
    # hint about what is missing and is skipped entirely under `python -O`.
    raise NotADirectoryError(f"hsc release directory not found: {PATH2BIN}")

# Population size; passed as `pop_size` when computing the sampling correction.
NCELLS = 200_000
# Selects the root under which the simulations are looked up
# (`/data/scratch/` when True, `/data/home/` otherwise).
USE_SCRATCH = True
# Used to build the name of the simulation folder (`{SAMPLE}cells`).
SAMPLE = 368

# Figure options shared by the plotting cells.
SAVE = False  # write figures to disk when True
BIGLABELS = False  # pick the smaller figure size when True
FIGSIZE = [5, 3] if BIGLABELS else [6.4, 4.8]  # default matplotlib
PDF = True
EXTENSION = ".pdf" if PDF else ".png"
PLOT_OPTIONS = PlotOptions(figsize=FIGSIZE, extension=EXTENSION, save=SAVE)

In [None]:
%%bash -s "$PATH2BIN" --out version
# PATH2BIN is passed to the script as $1; the script's stdout is captured into
# the Python variable `version` (because of `--out version`).
$1/hsc --version

In [None]:
# `version` holds the captured stdout of `hsc --version` from the bash cell.
VERSION = parse_version(version)
print("Running hsc with version:", VERSION)

# Version-specific directory, relative to the notebook's working directory.
# (Previously assigned twice with the same value; once is enough.)
PATH2SAVE = Path(f"./{VERSION}")

# Version-specific root of the simulation results on the cluster.
PATH2SIMS = (
    Path("/data/scratch/" if USE_SCRATCH else "/data/home/")
    / f"hfx923/hsc-draft/{VERSION}"
)

# Location of Mitchell's data on the known machines, keyed by hostname.
# Any other host falls back to the user's home directory.
_MITCHELL_DIR_BY_HOST = {
    "5X9ZYD3": Path("/mnt/c/Users/terenz01/Documents/SwitchDrive/PhD/hsc"),
    "LAPTOP-CEKCHJ4C": Path("/mnt/c/Users/fra_t/Documents/PhD/hsc"),
}
PATH2MITCHELL = _MITCHELL_DIR_BY_HOST.get(
    socket.gethostname(), Path("~").expanduser()
)

## Mitchell's data

In [None]:
# Load and pre-process Mitchell's summary table (with `drop_donor_KX007=True`).
# The original line assigned `summary = summary = ...`; the duplicated target
# was redundant and has been removed.
summary = mitchell.load_and_process_mitchell(
    PATH2MITCHELL / "Summary_cut.csv", drop_donor_KX007=True
)
# Last expression of the cell: displays the column dtypes.
summary.dtypes

In [None]:
# Text overview of the summary table: numeric description, value counts of
# the categorical columns, and per-donor facts.
print(summary.describe())

cell_type_counts = summary.cell_type.value_counts()
print(f"\n\ncell types: \n{cell_type_counts}")

sample_type_counts = summary.sample_type.value_counts()
print(f"\n\nsample types: \n{sample_type_counts}")

timepoint_counts = summary.timepoint.value_counts()
print(f"\n\ntimepoints: \n{timepoint_counts}")

# One row per distinct (donor, cells, age) combination.
ages_and_cells = summary[["donor_id", "cells", "age"]].drop_duplicates()
print(f"\n\nages and cells: \n{ages_and_cells}")

# Sum of `number_mutations` over all rows of each donor.
mutations_per_donor = (
    summary[["donor_id", "number_mutations"]].groupby("donor_id").sum()
)
print(f"\n\nmutations per donor: \n{mutations_per_donor}")

In [None]:
# One histogram of `number_mutations` per donor, each in its own figure.
for donor_id in summary.donor_id.unique():
    donor_rows = summary[summary.donor_id == donor_id]
    fig, ax = plt.subplots(1, 1, tight_layout=True, figsize=(6, 4))
    sns.histplot(
        data=donor_rows,
        x="number_mutations",
        hue="donor_id",
        stat="count",
        bins=50,
        kde=True,
        ax=ax,
    )
    if SAVE:
        # `fig` is the current figure, so this matches `plt.savefig`.
        fig.savefig(f"./{donor_id}_burden{EXTENSION}")
    plt.show()

In [None]:
# Descriptive statistics of `number_mutations` for the donors with age 0.
# Both columns are selected before grouping so that `describe()` returns
# two-level columns such as ("number_mutations", "mean").
age_zero_burden = summary.loc[summary.age.eq(0), ["donor_id", "number_mutations"]]
descr = age_zero_burden.groupby("donor_id").describe()
descr

In [None]:
# Histogram of `number_mutations` for all donors in one figure, coloured by donor.
fig, ax = plt.subplots(1, 1, tight_layout=True, figsize=FIGSIZE)
burden_hist_options = {
    "x": "number_mutations",
    "hue": "donor_id",
    "stat": "percent",
    "binwidth": 10,
    "kde": True,
}
sns.histplot(data=summary, ax=ax, **burden_hist_options)
# Place the legend outside the axes, to the right of the plot.
sns.move_legend(ax, bbox_to_anchor=(1.01, 1), loc="upper left", frameon=False)
if SAVE:
    # `fig` is the current figure, so this matches `plt.savefig`.
    fig.savefig(f"./mitchell_burden{EXTENSION}")
plt.show()

In [None]:
# Average (over the age-0 donors) of the mean burden, divided by 2 * ln(N - 2).
# N is taken from NCELLS instead of repeating the literal 200_000, so the two
# cannot drift apart.
descr[("number_mutations", "mean")].mean() / (2 * np.log(NCELLS - 2))

In [None]:
# Variance of the burden per age-0 donor (square of the standard deviation).
descr[("number_mutations", "std")].pow(2)

In [None]:
%%time
names_mitchell = summary.donor_id.unique()
ages_mitchell = summary.age.unique().tolist()
# there are two donors with the same age 0
assert len(ages_mitchell) + 1 == len(names_mitchell)
target_sfs = {
    donor: mitchell.sfs_donor_mitchell(
        donor, PATH2MITCHELL, remove_indels=False
    )
    for age, donor in zip([0] + ages_mitchell, names_mitchell)
}

## Compare simulations against the data

In [None]:
%%time
# compute the correction for the SFS with sampled distributions from
# https://www.biorxiv.org/content/10.1101/2022.11.07.515470v2
corrected_variants_one_over_1_squared = dict()
for donor in summary[["donor_id", "age", "cells"]].drop_duplicates().sort_values(by="age", ascending=False).itertuples():
    print(
        f"apply sampling correction to SFS of donor {donor.donor_id} with age {donor.age}"
    )
    corrected_variants_one_over_1_squared[donor.donor_id] = sfs.compute_variants(
        sfs.Correction.ONE_OVER_F_SQUARED,
        pop_size=NCELLS,
        sample_size=donor.cells,
    )

In [None]:
%%time
# load all sfs
path2sfs = Path(PATH2SIMS / f"{SAMPLE}cells/sfs/")
ages_sims = sorted([mitchell.parse_path2folder_xdoty_years(path) for path in path2sfs.iterdir()])
assert ages_sims == ages_mitchell
# load data
sfs_sims = mitchell.load_all_sfs_by_age(path2sfs)

In [None]:
# pick some runs to plot
sims2plot = list()
records = list()
for sfs_ in sfs_sims[0]:
    my_dict = sfs_.parameters.into_dict()
    if abs(my_dict["s"] - 0.11) <= 0.01 and abs(my_dict["mu"] - 2.1) <= 0.5:
        sims2plot.append(sfs_.parameters.idx)
        records.append(my_dict)
sns.pairplot(pd.DataFrame.from_records(records)[["mu", "s", "std"]])

In [None]:
# Local import: only `choices` is imported from `random` at the top of the notebook.
from random import sample

# A tuple rather than a set, so that the pairing between runs and markers is
# the same on every execution (set iteration order of strings is not stable).
markers = ("D", "o")
# Highlight distinct runs: `choices` samples with replacement and could return
# the same run twice, leaving fewer highlighted runs than markers.
subsample = sample(sims2plot, k=min(len(markers), len(sims2plot)))
# Set for fast membership tests in the loop below.
sims2plot_ids = set(sims2plot)
# Age of each donor taken from the table itself, instead of pairing donors and
# ages by position with `[0] + ages_mitchell`.
# Assumes one age per donor; with several, the last one listed wins.
age_by_donor = dict(
    summary[["donor_id", "age"]].drop_duplicates().itertuples(index=False, name=None)
)

for name in names_mitchell:
    age = age_by_donor[name]
    runs_at_age = sfs_sims[age]
    fig, ax = plt.subplots(1, 1)  # SFS comparison
    fig1, ax1 = plt.subplots(1, 1)  # rates of the highlighted runs

    # Plot the highlighted runs one by one, each with its own marker.
    highlighted = (run for run in runs_at_age if run.parameters.idx in subsample)
    for sfs_, marker in zip(highlighted, markers):
        simulations.plot_rates(ax1, PATH2SIMS, sfs_.parameters.idx, xlims=[0.95, 1.5])
        # NOTE(review): `Axes.set_label` labels the Axes artist itself; it does
        # not add a legend entry or a title. Confirm whether `set_title` (or a
        # label on the plotted lines) was intended.
        ax1.set_label(f"run {sfs_.parameters.idx} with mu {sfs_.parameters.mu}")
        my_dict = sfs_.parameters.into_dict()
        print(f"idx={my_dict['idx']}, s={my_dict['s']}, std={my_dict['std']}, mu={my_dict['mu']}")
        sfs_figures.plot_sfs(
            ax,
            sfs_.sfs,
            True,
            PLOT_OPTIONS,
            marker=marker,
            mew=2,
            linestyle="",
            color="yellowgreen",
            alpha=0.5,
            label=f"run with id {sfs_.parameters.idx}",
        )

    # Skip the correction only when this donor has none; errors raised inside
    # the plotting call are no longer swallowed by a broad `except KeyError`.
    correction = corrected_variants_one_over_1_squared.get(name)
    if correction is not None:
        sfs_figures.plot_sfs_correction(
            ax,
            correction,
            True,
            PLOT_OPTIONS,
            linestyle="-",
            color="grey",
            label=r"$1/f^2$ sampled",
            linewidth=2,
        )

    # SFS of the donor's data.
    sfs_figures.plot_sfs(
        ax,
        target_sfs[name],
        normalise=True,
        options=PLOT_OPTIONS,
        color="purple",
        mew=2,
        linestyle="",
        marker="x",
        label=f"{name}, {age} years",
    )
    # Average over all selected runs available at this age.
    sfs_figures.plot_sfs_avg(
        ax,
        [run.sfs for run in runs_at_age if run.parameters.idx in sims2plot_ids],
        PLOT_OPTIONS,
        color="blue",
        alpha=0.6,
        label=f"avg of {len(sims2plot)} runs",
    )
    ax.legend()
    ax1.legend()
    # Same call as the other plotting cells; displays both figures.
    plt.show()