# Descriptive analysis of Mitchell's paper
We plot the burden and the SFS for the data published in [Mitchell et al., Nature 2022](https://www.nature.com/articles/s41586-022-04786-y).

For the SFS we also show some simulations run with the parameters inferred from the ABC.

In [None]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules
# before each cell is executed, so edits to local packages are picked up.
%load_ext autoreload
%autoreload 2

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import socket
import seaborn as sns
import subprocess
from pathlib import Path

from hscpy import mitchell, realisation, parameters
from hscpy.figures import sfs as sfs_figures
from hscpy.figures import PlotOptions, simulations

from futils import parse_version, snapshot

# Cell count passed as `cells=` to the simulations and as `pop_size=` to the
# 1/f^2 sampling correction further down.
NCELLS = 100_000
# Seed handed to the simulations unless a donor overrides it.
SEED = 10
# When True, the simulation commands are executed through `subprocess`.
RUN_SIMS = False
# Forwarded to PlotOptions(save=...); the plotting cells save only when set.
SAVEFIG = True
BIGLABELS = False
# Smaller canvas when BIGLABELS is set, otherwise matplotlib's default size.
FIGSIZE = [5, 3] if BIGLABELS else [6.4, 4.8]
# File extension appended to every saved figure.
EXTENSION = ".svg"
PLOT_OPTIONS = PlotOptions(figsize=FIGSIZE, extension=EXTENSION, save=SAVEFIG)
# Directory expected to contain the hsc binary under target/release; the
# notebook stops here if it does not exist.
PATH2HSC = Path("~").expanduser() / "hsc"
assert PATH2HSC.is_dir()

In [None]:
# Location of Mitchell's data: two known machines keep it under a dedicated
# directory, every other host falls back to the user's home directory.
_DATA_DIR_BY_HOST = {
    "5X9ZYD3": "/mnt/c/Users/terenz01/Documents/SwitchDrive/PhD/hsc",
    "LAPTOP-CEKCHJ4C": "/mnt/c/Users/fra_t/Documents/PhD/hsc",
}
_host_data_dir = _DATA_DIR_BY_HOST.get(socket.gethostname())
PATH2MITCHELL = (
    Path("~").expanduser() if _host_data_dir is None else Path(_host_data_dir)
)

In [None]:
%%bash -s "$PATH2HSC" --out version
# $1 is PATH2HSC; the command's stdout is captured into the Python variable `version`.
$1/target/release/hsc  --version

In [None]:
# `version` holds the captured stdout of `hsc --version` from the bash cell.
VERSION = parse_version(version)
print("Running hsc with version:", VERSION)

## Mitchell's data

In [None]:
# Load the summary table (Summary_cut.csv) from the data directory.
# NOTE(review): the flag name suggests donor KX007 is removed from the table —
# confirm in hscpy.mitchell.
summary = mitchell.load_and_process_mitchell(
    PATH2MITCHELL / "Summary_cut.csv", drop_donor_KX007=True
)
# Shown as the cell output: the dtype of every column.
summary.dtypes

In [None]:
# Textual overview of the summary table: global statistics first, then the
# value counts of the categorical columns, the donors' ages and cell counts
# and the total number of mutations per donor.
print(summary.describe())
print("\n\ncell types: ", summary["cell_type"].value_counts(), sep="\n")
print("\n\nsample types: ", summary["sample_type"].value_counts(), sep="\n")
print("\n\ntimepoints: ", summary["timepoint"].value_counts(), sep="\n")
print(
    "\n\nages and cells: ",
    summary[["donor_id", "cells", "age"]].drop_duplicates(),
    sep="\n",
)
print(
    "\n\nmutations per donor: ",
    summary[["donor_id", "number_mutations"]].groupby("donor_id").sum(),
    sep="\n",
)

In [None]:
# One histogram of the number of mutations (50 bins, KDE overlay) per donor;
# each figure is written to the working directory when saving is enabled.
for donor in summary["donor_id"].unique():
    donor_rows = summary.loc[summary["donor_id"] == donor]
    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(6, 4), tight_layout=True)
    sns.histplot(
        data=donor_rows,
        x="number_mutations",
        hue="donor_id",
        stat="count",
        bins=50,
        kde=True,
        ax=ax,
    )
    if PLOT_OPTIONS.save:
        fig.savefig(f"./{donor}_burden{EXTENSION}")
    plt.show()

In [None]:
# Summary statistics of the number of mutations for the donors with age 0,
# one row per donor.
age_zero_rows = summary[summary["age"] == 0]
descr = age_zero_rows.groupby("donor_id")[["number_mutations"]].describe()
descr

In [None]:
# Burden of all donors on a single set of axes, as percentages in bins of
# width 10, with one colour per donor.
fig, ax = plt.subplots(figsize=FIGSIZE, tight_layout=True)
histogram_options = dict(
    x="number_mutations",
    hue="donor_id",
    stat="percent",
    binwidth=10,
    kde=True,
)
sns.histplot(data=summary, ax=ax, **histogram_options)
# Two-column, frameless legend in the upper right corner
# (bbox_to_anchor=(1.01, 1) is the alternative placement outside the axes).
sns.move_legend(
    ax,
    loc="upper right",
    ncol=2,
    fontsize="small",
    frameon=False,
)
if PLOT_OPTIONS.save:
    fig.savefig(f"./mitchell_burden{EXTENSION}")
plt.show()

In [None]:
# Average over the age-0 donors in `descr` of the per-donor mean number of
# mutations, divided by 2 * ln(200_000 - 2).
# NOTE(review): 200_000 is hard-coded here while NCELLS is 100_000 — confirm
# which population size is intended.
descr[("number_mutations", "mean")].mean() / (2 * np.log(200_000 - 2))

In [None]:
# Per-donor variance of the number of mutations (the squared standard
# deviation reported in `descr`).
descr[("number_mutations", "std")] ** 2

## Plot the SFS 
Combine different data for this plot:
1. 1/f^2 sampled prediction (computed here in python)
2. Mitchell's SFS (loaded and computed here in python)
3. 1/f sampled prediction from Nate's simulations (loaded from external file)
4. SFS from simulations (need to generate them)

### Generate/load/compute the data

#### 1. 1/f^2 predictions

In [None]:
%%time
# compute the correction for the sims' SFS with sampled distributions from
# https://www.biorxiv.org/content/10.1101/2022.11.07.515470v2
corrected_variants_one_over_1_squared = dict()
for donor in (
    summary[["donor_id", "age", "cells"]]
    .drop_duplicates()
    .sort_values(by="age", ascending=False)
    .itertuples()
):
    print(
        f"apply sampling correction to SFS of donor {donor.donor_id} with age {donor.age}"
    )
    corrected_variants_one_over_1_squared[
        donor.donor_id
    ] = realisation.compute_variants(
        realisation.Correction.ONE_OVER_F_SQUARED,
        pop_size=NCELLS,
        sample_size=donor.cells,
    )

#### 2. Mitchell's SFS

In [None]:
%%time
# compute SFS from Mitchell's data
names_mitchell = summary.donor_id.unique()
ages_mitchell = summary.age.unique().tolist()
# there are two donors with the same age 0
assert len(ages_mitchell) + 1 == len(names_mitchell)
mitchell_sfs = {
    donor: mitchell.sfs_donor_mitchell(donor, age, PATH2MITCHELL, remove_indels=False)
    for age, donor in zip([0] + ages_mitchell, names_mitchell)
}

#### 3. 1/f sampled predictions

In [None]:
# Theoretical homeostatic neutral SFS data, from Nate's paper in Elife: for
# each patient (skipping the neonates) the population was evolved until the
# patient's age and then sampled to the same size as in the data.
# The first unique age is skipped and the file ids start at 3.
adult_ages = summary["age"].unique().tolist()[1:]
mapping = {
    age: f"homeostasisSFS_pid{pid}.csv"
    for pid, age in enumerate(adult_ages, start=3)
}
mapping

#### 4. SFS from simulations

In [None]:
# simulations, params are from ABC
# One row per donor: (name, age, eta, sigma, mu, tau, seed). Keeping the
# parameters in a table avoids repeating the same constructor call for every
# donor and makes per-donor deviations (such as a different seed) visible.
ABC_PARAMETERS = [
    ("CB001", 0, 0.03, 0.01, 1, 1, SEED),
    # assume same params for both 0 age donors
    ("CB002", 0, 0.03, 0.01, 1, 1, SEED),
    ("KX001", 29, 0.03, 0.01, 1, 1.8, SEED),
    ("KX002", 38, 0.02, 0.01, 1, 2.6, SEED),
    ("SX001", 48, 0.03, 0.01, 1, 2.6, SEED),
    ("AX001", 63, 0.02, 0.01, 1, 3, SEED),
    # this does not work, try changing the seed?
    ("KX008", 76, 0.06, 0.04, 16, 5, SEED),
    ("KX004", 77, 0.12, 0.04, 16, 1.4, SEED),
    # the only donor simulated with a seed different from SEED
    ("KX003", 81, 0.08, 0.04, 23, 3, 26),
]

sims = dict()
for name, age, eta, sigma, mu, tau, seed in ABC_PARAMETERS:
    sims[name] = realisation.SimulationCMD(
        cells=NCELLS,
        # the donor's number of cells, as listed in the summary table
        sample=summary.loc[summary["donor_id"] == name, "cells"].drop_duplicates().squeeze(),
        eta=eta,
        sigma=sigma,
        mu=mu,
        tau=tau,
        age=age,
        name=name,
        seed=seed,
    )

In [None]:
# Run one simulation per donor, only when explicitly requested via RUN_SIMS.
if RUN_SIMS:
    for name, cmd in sims.items():
        print("running sims for donor: ", name)
        # NOTE(review): `cmd.cmd(...)` presumably returns a single shell
        # command string built from the binary path and the output directory
        # (it is executed with shell=True) — confirm in hscpy.realisation.
        # check=True raises CalledProcessError if the command exits non-zero.
        _ = subprocess.run(
            cmd.cmd(f"{PATH2HSC}/target/release/hsc", f"./mitchell/{name}"),
            shell=True, 
            check=True
        )

### Plot

In [None]:
# One SFS figure per donor, overlaying the 1/f^2 sampled prediction, the 1/f
# sampled prediction (when a file is mapped to the donor's age), the simulated
# SFS and Mitchell's data.
# Donors are paired with their own age row-wise, so the pairing does not
# depend on the order of the rows in the summary table.
donor_ages = summary[["donor_id", "age"]].drop_duplicates()
for name, age in zip(donor_ages["donor_id"].tolist(), donor_ages["age"].tolist()):
    fig, ax = plt.subplots(1, 1, layout="constrained", figsize=PLOT_OPTIONS.figsize)

    # 1/f^2 sampled predictions
    sample_size = (
        summary.loc[summary["donor_id"] == name, "cells"].drop_duplicates().squeeze()
    )
    normalisation_x = sfs_figures.ToCellFrequency(
        sample_size=sample_size,
        to_one=False,
    )
    sfs_figures.plot_sfs_correction(
        ax,
        corrected_variants_one_over_1_squared[name],
        normalise=True,
        options=PLOT_OPTIONS,
        normalise_x=normalisation_x,
        linestyle="-",
        color="grey",
        label=r"$1/f^2$ sampled",
        linewidth=2,
    )

    # 1/f sampled predictions from Nate's simulations
    one_over_f_csv = mapping.get(age)
    if one_over_f_csv:
        one_over_f = pd.read_csv(one_over_f_csv)
        # discard the entries at frequency zero
        one_over_f = one_over_f[one_over_f["_f"] != 0.0]
        # frequencies are converted to a number of cells to key the SFS
        sfs_one_over_f = dict(
            zip(
                (one_over_f["_f"] * normalisation_x.nb_cells).tolist(),
                one_over_f["n_f"].tolist(),
            )
        )
        sfs_figures.plot_sfs(
            ax,
            sfs_one_over_f,
            normalise=True,
            normalise_x=normalisation_x,
            options=PLOT_OPTIONS,
            color="grey",
            lw=2,
            linestyle="--",
            label=r"$1/f$ sampled",
        )

    # simulations: take the last SFS stored for the simulated age
    sim = sims[name]
    sfs_dir = Path(f"mitchell/{name}/{sim.sample}cells/sfs")
    sfs_sims = realisation.load_all_sfs_by_age(sfs_dir)[sim.age][-1].sfs
    sfs_figures.plot_sfs(
        ax,
        sfs_sims,
        normalise=True,
        normalise_x=normalisation_x,
        options=PLOT_OPTIONS,
        color="cyan",
        mew=2,
        linestyle="",
        marker=".",
        label="simulation",
    )

    # mitchell's data
    # NOTE(review): element 3 of the value returned by
    # `mitchell.sfs_donor_mitchell` is assumed to be the SFS — confirm.
    sfs_figures.plot_sfs(
        ax,
        mitchell_sfs[name][3],
        normalise=True,
        normalise_x=normalisation_x,
        options=PLOT_OPTIONS,
        color="purple",
        mew=2,
        linestyle="",
        marker="x",
        label=f"{name}, {age} years",
    )

    ax.legend(
        fontsize="small",
        loc="upper right",
        frameon=False,
    )

    if PLOT_OPTIONS.save:
        fig.savefig(f"./sfs_age{age}_{name}{PLOT_OPTIONS.extension}")
    # plt.show() matches the other plotting cells; Figure.show() warns on
    # non-GUI (notebook) backends.
    plt.show()