# SFS
We plot the burden and the SFS for the data published in [Mitchell et al., Nature 2022](https://www.nature.com/articles/s41586-022-04786-y).

For the SFS we also show some simulations run with the parameters inferred from the ABC.

In [None]:
# Load IPython's autoreload extension and switch it to mode 2, so that edits to
# imported modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import socket
import seaborn as sns
import subprocess
from pathlib import Path

from hscpy import mitchell, realisation, parameters
from hscpy.figures import sfs as sfs_figures
from hscpy.figures import PlotOptions, simulations

from futils import parse_version, snapshot

# --- simulation settings ---
# NCELLS is passed as `pop_size` when computing the sampling corrections below.
NCELLS = 100_000
SEED = 10
RUN_SIMS = False

# --- figure settings ---
SAVEFIG = True
BIGLABELS = False
if BIGLABELS:
    FIGSIZE = [5, 3]
else:
    FIGSIZE = [6.4, 4.8]  # default matplotlib
EXTENSION = ".svg"
PLOT_OPTIONS = PlotOptions(save=SAVEFIG, extension=EXTENSION, figsize=FIGSIZE)

# --- paths ---
# Directory holding the hsc build; `target/release/hsc` is run from it later on.
PATH2HSC = Path.home() / "hsc"
assert PATH2HSC.is_dir()

In [None]:
# Location of Mitchell's data on the machines known to this notebook; any other
# host falls back to the user's home directory.
_mitchell_dirs = {
    "5X9ZYD3": "/mnt/c/Users/terenz01/Documents/SwitchDrive/PhD/hsc",
    "LAPTOP-CEKCHJ4C": "/mnt/c/Users/fra_t/Documents/PhD/hsc",
}
_hostname = socket.gethostname()
if _hostname in _mitchell_dirs:
    PATH2MITCHELL = Path(_mitchell_dirs[_hostname])
else:
    PATH2MITCHELL = Path("~").expanduser()

In [None]:
%%bash -s "$PATH2HSC" --out version
# Ask the release build of hsc for its version. $1 is the PATH2HSC directory
# passed through `-s`; the output is captured in the Python variable `version`.
# $1 is quoted so that a path containing spaces is not word-split by the shell.
"$1"/target/release/hsc --version

In [None]:
# Parse the raw `hsc --version` output captured in `version` by the bash cell.
VERSION = parse_version(version)
print(f"Running hsc with version: {VERSION!s}")

In [None]:
# Load the donors table; the cells below read its `name`, `age` and `cells`
# fields. The bare expression displays the table as the cell output.
donors = mitchell.donors()
donors

## Plot the SFS 
Combine different data for this plot:
1. 1/f^2 sampled prediction (computed here in python)
2. Mitchell's SFS (loaded and computed here in python)
3. 1/f sampled prediction from Nate's simulations (loaded from external files)
4. SFS from simulations (need to generate them)

### Generate/load/compute the data

#### 1. 1/f^2 predictions

In [None]:
%%time
# compute the correction for the sims' SFS with sampled distributions from
# https://www.biorxiv.org/content/10.1101/2022.11.07.515470v2
corrected_variants_one_over_1_squared = dict()
for donor in donors.itertuples():
    print(
        f"apply sampling correction to SFS of donor {donor.name} with age {donor.age} with sample size {donor.cells}"
    )
    corrected_variants_one_over_1_squared[
        donor.name
    ] = realisation.compute_variants(
        realisation.Correction.ONE_OVER_F_SQUARED,
        pop_size=NCELLS,
        sample_size=donor.cells,
    )

#### 2. Mitchell's SFS

In [None]:
%%time
# there are two donors with the same age 0
mitchell_sfs = {
    donor.name: mitchell.sfs_donor_mitchell(donor.name, donor.age, PATH2MITCHELL, remove_indels=False)
    for donor in donors.itertuples()
}

#### 3. 1/f sampled predictions

In [None]:
# Theoretical homeostatic neutral SFS data, from Nate's paper in eLife: for each
# patient (skipping the neonates) the population was evolved until their specific
# age and then sampled to the same size as in the data.
# NOTE(review): `[1:]` drops the first unique age, which assumes it is the
# neonates' one; the file ids start at 3. Confirm against the donors' ordering.
_ages_with_prediction = donors.age.unique().tolist()[1:]
mapping = {
    age: f"homeostasisSFS_pid{pid}.csv"
    for pid, age in enumerate(_ages_with_prediction, start=3)
}
mapping

#### 4. SFS from simulations
The data have been generated with the command `sfs.sh parameters.txt`.

### Plot

In [None]:
for donor in donors.itertuples():
    # One figure per donor, overlaying predictions, simulations and the data.
    fig, ax = plt.subplots(1, 1, layout="constrained", figsize=PLOT_OPTIONS.figsize)
    normalisation_x = sfs_figures.ToCellFrequency(sample_size=donor.cells)
    # Keyword arguments common to the normalised SFS plots of this figure.
    shared_kwargs = {
        "normalise": True,
        "normalise_x": normalisation_x,
        "options": PLOT_OPTIONS,
    }

    # 1/f^2 sampled predictions (solid grey line)
    sfs_figures.plot_sfs_correction(
        ax,
        corrected_variants_one_over_1_squared[donor.name],
        linestyle="-",
        color="grey",
        label=r"$1/f^2$ sampled",
        linewidth=2,
        **shared_kwargs,
    )

    # 1/f sampled predictions from Nate's simulations (dashed grey line),
    # only for the ages that have an entry in `mapping`
    one_over_f_csv = mapping.get(donor.age)
    if one_over_f_csv:
        one_over_f = pd.read_csv(one_over_f_csv)
        # discard the rows at frequency zero
        one_over_f = one_over_f[one_over_f["_f"] != 0.0]
        # frequencies are rescaled by `nb_cells` and used as the SFS keys
        sfs_one_over_f = dict(
            zip(
                (one_over_f["_f"] * normalisation_x.nb_cells).tolist(),
                one_over_f["n_f"].tolist(),
            )
        )
        sfs_figures.plot_sfs(
            ax,
            sfs_one_over_f,
            color="grey",
            lw=2,
            linestyle="--",
            label=r"$1/f$ sampled",
            **shared_kwargs,
        )

    # simulations: the last loaded realisation as points, then `plot_sfs_avg`
    # over all the loaded realisations
    # TODO show avg instead of one realisation only
    sfs_by_age = realisation.load_all_sfs_by_age(
        Path(f"{VERSION}/{donor.cells}cells/sfs")
    )
    sfs_sims = sfs_by_age[donor.age]
    sfs_figures.plot_sfs(
        ax,
        sfs_sims[-1].sfs,
        color="cyan",
        mew=2,
        linestyle="",
        marker=".",
        label="simulation",
        **shared_kwargs,
    )
    sfs_figures.plot_sfs_avg(
        ax,
        [run.sfs for run in sfs_sims],
        options_plot=PLOT_OPTIONS,
        normalise_x=normalisation_x,
        lw=3,
        color="cyan",
        alpha=0.3,
        label="avg",
    )

    # mitchell's data (purple crosses)
    sfs_figures.plot_sfs(
        ax,
        mitchell_sfs[donor.name][3],
        color="purple",
        mew=2,
        linestyle="",
        marker="x",
        label=f"{donor.age} years",
        **shared_kwargs,
    )

    ax.legend(fontsize="small", loc="upper right", frameon=False)

    if PLOT_OPTIONS.save:
        fig.savefig(f"./sfs_age{donor.age}_{donor.name}{PLOT_OPTIONS.extension}")
    fig.show()