In [None]:
# Reload edited modules automatically so that changes to imported project code
# are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import socket
import seaborn as sns
from pathlib import Path
from random import choices

from hscpy import mitchell, parse_path2folder_xdoty_years, realisation, parameters
from hscpy.figures import sfs as sfs_figures
from hscpy.figures import PlotOptions, simulations

from futils import parse_version, snapshot

# Folder expected to contain the `hsc` executable (it is invoked from there in
# the %%bash cell); stop right away when the folder does not exist.
PATH2BIN = Path("~").expanduser() / "hsc/target/release"
assert PATH2BIN.is_dir()

# Sizes used below as `pop_size=` (NCELLS) and in the names of the simulation
# folders (`{SAMPLE}cells`, `{NCELLS}cells`).
NCELLS = 100_000
SAMPLE = 368
# Selects which root folder the simulations are read from.
USE_SCRATCH = True

# Settings shared by the figures of this notebook.
SAVEFIG = True
EXTENSION = ".svg"
BIGLABELS = False
if BIGLABELS:
    FIGSIZE = [5, 3]
else:
    FIGSIZE = [6.4, 4.8]  # default matplotlib
PLOT_OPTIONS = PlotOptions(figsize=FIGSIZE, extension=EXTENSION, save=SAVEFIG)

In [None]:
%%bash -s "$PATH2BIN" --out version
# $1 is the PATH2BIN folder handed over through -s; the standard output of the
# command is stored by --out in the Python variable named version.
$1/hsc --version

In [None]:
# `version` is the output of `hsc --version` captured by the %%bash cell.
VERSION = parse_version(version)
# Version-specific folder, relative to the working directory (assigned once;
# the second, identical assignment at the end of the cell was redundant).
PATH2SAVE = Path(f"./{VERSION}")

print("Running hsc with version:", VERSION)

# Root of the simulations generated with this version of hsc.
if USE_SCRATCH:
    PATH2SIMS = Path("/data/scratch/")
else:
    PATH2SIMS = Path("/data/home/")
PATH2SIMS /= f"hfx923/hsc-draft/{VERSION}"

# The folder with Mitchell's data depends on the machine: look the hostname up
# once and fall back to the home directory on any other host.
hostname = socket.gethostname()
if hostname == "5X9ZYD3":
    PATH2MITCHELL = Path("/mnt/c/Users/terenz01/Documents/SwitchDrive/PhD/hsc")
elif hostname == "LAPTOP-CEKCHJ4C":
    PATH2MITCHELL = Path("/mnt/c/Users/fra_t/Documents/PhD/hsc")
else:
    PATH2MITCHELL = Path("~").expanduser()

## Mitchell's data

In [None]:
# Read and pre-process the summary table found in the PATH2MITCHELL folder,
# with the loader's `drop_donor_KX007` flag switched on.
summary = mitchell.load_and_process_mitchell(
    PATH2MITCHELL.joinpath("Summary_cut.csv"),
    drop_donor_KX007=True,
)
# Show the column types of the loaded table.
summary.dtypes

In [None]:
# Quick textual overview of the summary table: global statistics first, then
# the number of rows per category for three of its columns.
print(summary.describe())

cell_type_counts = summary.cell_type.value_counts()
print(f"\n\ncell types: \n{cell_type_counts}")

sample_type_counts = summary.sample_type.value_counts()
print(f"\n\nsample types: \n{sample_type_counts}")

timepoint_counts = summary.timepoint.value_counts()
print(f"\n\ntimepoints: \n{timepoint_counts}")

# Distinct (donor, cells, age) combinations.
ages_and_cells = summary[["donor_id", "cells", "age"]].drop_duplicates()
print(f"\n\nages and cells: \n{ages_and_cells}")

# Sum of `number_mutations` over all the rows of each donor.
mutations_per_donor = summary[["donor_id", "number_mutations"]].groupby("donor_id").sum()
print(f"\n\nmutations per donor: \n{mutations_per_donor}")

In [None]:
# One histogram of `number_mutations` per donor, each on its own figure,
# saved to the working directory when saving is enabled.
for donor_id in summary.donor_id.unique():
    donor_rows = summary[summary.donor_id == donor_id]
    fig, ax = plt.subplots(1, 1, figsize=(6, 4), tight_layout=True)
    sns.histplot(
        data=donor_rows,
        x="number_mutations",
        hue="donor_id",
        bins=50,
        stat="count",
        kde=True,
        ax=ax,
    )
    if PLOT_OPTIONS.save:
        plt.savefig(f"./{donor_id}_burden{EXTENSION}")
    plt.show()

In [None]:
# Descriptive statistics of `number_mutations`, one row per donor, computed on
# the rows with age 0 only.
age_zero = summary[summary.age == 0]
descr = age_zero[["donor_id", "number_mutations"]].groupby("donor_id").describe()
descr

In [None]:
# Histogram of `number_mutations` for all the donors on the same axes, one
# colour per donor.
fig, ax = plt.subplots(1, 1, figsize=FIGSIZE, tight_layout=True)
sns.histplot(
    data=summary,
    x="number_mutations",
    hue="donor_id",
    stat="percent",
    binwidth=10,
    kde=True,
    ax=ax,
)
# Anchor the legend's upper-left corner just beyond the right edge of the axes.
sns.move_legend(ax, loc="upper left", bbox_to_anchor=(1.01, 1), frameon=False)
if PLOT_OPTIONS.save:
    plt.savefig(f"./mitchell_burden{EXTENSION}")
plt.show()

In [None]:
# Average, over the donors with age 0, of their mean `number_mutations`,
# divided by 2 * ln(200_000 - 2).
# NOTE(review): 200_000 differs from NCELLS (100_000) -- confirm it is intended.
mean_burden_age_zero = descr[("number_mutations", "mean")].mean()
mean_burden_age_zero / (2 * np.log(200_000 - 2))

In [None]:
# Variance of `number_mutations` for each donor with age 0 (squared std).
descr[("number_mutations", "std")].pow(2)

## Compare the SFS of the simulations against the data

In [None]:
%%time
# Donors and ages in their order of appearance in the summary table; both
# names are reused by the cells below.
names_mitchell = summary.donor_id.unique()
ages_mitchell = summary.age.unique().tolist()
# there are two donors with the same age 0
assert len(ages_mitchell) + 1 == len(names_mitchell), (
    f"expected one more donor than distinct ages, got {len(names_mitchell)} "
    f"donors and {len(ages_mitchell)} ages"
)
# The SFS is loaded per donor and the age is not needed here, so iterate over
# the donors directly instead of zipping them with an unused list of ages.
target_sfs = {
    donor: mitchell.sfs_donor_mitchell(donor, PATH2MITCHELL, remove_indels=False)
    for donor in names_mitchell
}

In [None]:
%%time
# compute the correction for the SFS with sampled distributions from
# https://www.biorxiv.org/content/10.1101/2022.11.07.515470v2
# One row per distinct (donor, age, cells) combination, oldest donors first.
donors_oldest_first = (
    summary[["donor_id", "age", "cells"]]
    .drop_duplicates()
    .sort_values(by="age", ascending=False)
)

corrected_variants_one_over_1_squared = {}
for donor in donors_oldest_first.itertuples():
    print(
        f"apply sampling correction to SFS of donor {donor.donor_id} with age {donor.age}"
    )
    # The sample size is the donor's own `cells` value; the population size is
    # the same NCELLS for every donor.
    variants = realisation.compute_variants(
        realisation.Correction.ONE_OVER_F_SQUARED,
        pop_size=NCELLS,
        sample_size=donor.cells,
    )
    corrected_variants_one_over_1_squared[donor.donor_id] = variants

In [None]:
%%time
# Folder with the simulated SFS; the ages parsed from its entries must match
# the ages found in the data.
path2sfs = PATH2SIMS / f"{SAMPLE}cells/sfs/"
ages_sims = sorted(parse_path2folder_xdoty_years(path) for path in path2sfs.iterdir())
assert ages_sims == ages_mitchell

# load some runs with specific parameters
filtered_by_abc = parameters.filter_simulations(
    path2sfs,
    mu=1,
    mean=0.09,
    std=0.01,
    b0=1,
)
filtered_by_abc.sort_values(by="s", inplace=True)

# Keep the runs whose `s` lies strictly between the 0.01 and 0.99 quantiles.
s_low = filtered_by_abc.s.quantile(0.01)
s_high = filtered_by_abc.s.quantile(0.99)
view = filtered_by_abc[(filtered_by_abc.s > s_low) & (filtered_by_abc.s < s_high)]

filtered_idx_by_abc = set(view.idx)
print(f"{len(filtered_idx_by_abc)} runs")
sfs_sims = realisation.load_all_sfs_by_age(path2sfs, filtered_idx_by_abc)

# The pair plot uses every filtered run, not only the ones kept in `view`.
sns.pairplot(filtered_by_abc[["mu", "s", "std"]].drop_duplicates())
view

In [None]:
# Plot, for every donor, the SFS of the data against the sampled 1/f^2
# prediction. When SHOW_SAMPLE_ONLY is False, a few simulated runs and the
# average over all the filtered runs are drawn as well.
SHOW_SAMPLE_ONLY = True  # do not show sims

if not SHOW_SAMPLE_ONLY:
    # TODO this
    # The markers must be defined before the sub-sample is drawn, because one
    # run is drawn per marker (they were previously used before assignment,
    # raising a NameError). A tuple, unlike a set, iterates in a fixed order,
    # so the pairing between runs and markers is reproducible.
    markers = ("D", "o")
    # NOTE: `choices` samples with replacement, so the same run may be drawn
    # more than once, in which case fewer distinct runs are plotted.
    subsample = choices(list(filtered_idx_by_abc), k=len(markers))
    for age, name in zip([0] + ages_mitchell, names_mitchell):
        # Left axes: the SFS; right axes: only used to host the legend.
        fig, axes = plt.subplots(
            1, 2, width_ratios=[4, 1], layout="constrained", figsize=(9.5, 5)
        )
        # Separate figure for the rates of the sub-sampled runs.
        fig1, ax1 = plt.subplots(1, 1)
        for sfs_, marker in zip(
            filter(lambda sim: sim.parameters.idx in subsample, sfs_sims[age]), markers
        ):
            simulations.plot_rates(ax1, PATH2SIMS, sfs_.parameters.idx, xlims=[0.95, 1.5])
            ax1.set_label(f"run {sfs_.parameters.idx} with mu {sfs_.parameters.mu}")
            my_dict = sfs_.parameters.into_dict()
            print(
                f"idx={my_dict['idx']}, s={my_dict['s']}, std={my_dict['std']}, mu={my_dict['mu']}"
            )
            sfs_figures.plot_sfs(
                axes[0],
                sfs_.sfs,
                True,
                PLOT_OPTIONS,
                marker=marker,
                mew=2,
                linestyle="",
                color="yellowgreen",
                alpha=0.5,
                # label=f"run with id {sfs_.parameters.idx}",
                label="simulation",
            )
        # Average SFS over all the runs selected by the filter at this age.
        sfs_figures.plot_sfs_avg(
            axes[0],
            [
                sfs_.sfs
                for sfs_ in filter(
                    lambda sim: sim.parameters.idx in filtered_idx_by_abc, sfs_sims[age]
                )
            ],
            PLOT_OPTIONS,
            color="blue",
            alpha=0.6,
            label=f"avg of {len(filtered_idx_by_abc)} runs",
        )
        ax1.legend()

        sfs_figures.plot_sfs_correction(
            axes[0],
            corrected_variants_one_over_1_squared[name],
            True,
            PLOT_OPTIONS,
            linestyle="-",
            color="grey",
            label=r"$1/f^2$ sampled",
            linewidth=2,
        )

        sfs_figures.plot_sfs(
            axes[0],
            target_sfs[name],
            normalise=True,
            options=PLOT_OPTIONS,
            color="purple",
            mew=2,
            linestyle="",
            marker="x",
            label=f"{name}, {age} years",
        )

        # Draw the legend of the left axes inside the right axes, then hide
        # the ticks and the spines of the right axes.
        axes[1].legend(
            *axes[0].get_legend_handles_labels(),
            fontsize="small",
            loc=6,
            frameon=False,
        )
        axes[1].set_xticks([])
        axes[1].set_yticks([])
        axes[1].spines.right.set_visible(False)
        axes[1].spines.left.set_visible(False)
        axes[1].spines.top.set_visible(False)
        axes[1].spines.bottom.set_visible(False)

        if PLOT_OPTIONS.save:
            fig.savefig(f"./sfs_age{age}{PLOT_OPTIONS.extension}")
        fig.show()
else:
    for age, name in zip([0] + ages_mitchell, names_mitchell):
        # x-axis normalisation built from the donor's own `cells` value.
        normalisation_x = sfs_figures.ToCellFrequency(
            sample_size=summary.loc[summary["donor_id"] == name, "cells"].drop_duplicates().squeeze(),
            to_one=True
        )
        fig, ax = plt.subplots(1, 1, layout="constrained", figsize=PLOT_OPTIONS.figsize)
        sfs_figures.plot_sfs_correction(
            ax,
            corrected_variants_one_over_1_squared[name],
            normalise=True,
            options=PLOT_OPTIONS,
            normalise_x=normalisation_x,
            linestyle="-",
            color="grey",
            label=r"$1/f^2$ sampled",
            linewidth=2,
        )

        sfs_figures.plot_sfs(
            ax,
            target_sfs[name],
            normalise=True,
            normalise_x=normalisation_x,
            options=PLOT_OPTIONS,
            color="purple",
            mew=2,
            linestyle="",
            marker="x",
            label=f"{name}, {age} years",
        )

        ax.legend(
            fontsize="medium",
            loc='upper right',
            frameon=False,
        )

        if PLOT_OPTIONS.save:
            fig.savefig(f"./sfs_age{age}{PLOT_OPTIONS.extension}")
        fig.show()

## Compare the variant fraction of the simulations against the data

In [None]:
from hscpy import variant, mitchell

In [None]:
# Variant-fraction folder of the runs stored under `{NCELLS}cells`.
# NOTE(review): not referenced by the cells visible after this one.
path2variants = PATH2SIMS.joinpath(f"{NCELLS}cells", "variant_fraction")

In [None]:
%%time
# Load the variant counts stored under the `{SAMPLE}cells` folder and convert
# them with `variant_counts_df` before plotting them against the age.
counts = variant.variant_counts_df(
    variant.load_all_var_counts_by_age(PATH2SIMS / f"{SAMPLE}cells/variant_fraction")
)

fig, ax = plt.subplots(1, 1)
sns.lineplot(data=counts, x="age", y="variant counts", ax=ax)
plt.show()

In [None]:
%%time
# Same plot as for the `{SAMPLE}cells` folder, this time reading the variant
# counts stored under the `200000cells` folder.
counts = variant.variant_counts_df(
    variant.load_all_var_counts_by_age(PATH2SIMS / f"{200_000}cells/variant_fraction")
)

fig, ax = plt.subplots(1, 1)
sns.lineplot(data=counts, x="age", y="variant counts", ax=ax)
plt.show()