# Descriptive analysis of Mitchell's paper
The data come from the file `Summary_cut.csv`, which contains some inconsistencies with the mutMatrix files.

In [None]:
# IPython magics: enable the autoreload extension in mode 2, so that imported
# modules are reloaded before each cell is executed.
%load_ext autoreload
%autoreload 2

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import socket
import seaborn as sns
import subprocess
from pathlib import Path

from hscpy import mitchell, realisation, parameters
from hscpy.figures import sfs as sfs_figures
from hscpy.figures import PlotOptions, simulations

from futils import parse_version, snapshot


# Notebook-wide plotting configuration.
SAVEFIG = True  # figures are written to disk when this is set
BIGLABELS = False
# Smaller canvas when big labels are requested, matplotlib's default otherwise.
FIGSIZE = [6.4, 4.8] if not BIGLABELS else [5, 3]
EXTENSION = ".svg"
PLOT_OPTIONS = PlotOptions(
    save=SAVEFIG,
    extension=EXTENSION,
    figsize=FIGSIZE,
)

In [None]:
# Location of the Mitchell data, keyed by the hostname of the known machines.
_KNOWN_HOSTS = {
    "5X9ZYD3": Path("/mnt/c/Users/terenz01/Documents/SwitchDrive/PhD/hsc"),
    "LAPTOP-CEKCHJ4C": Path("/mnt/c/Users/fra_t/Documents/PhD/hsc"),
}
_hostname = socket.gethostname()
if _hostname in _KNOWN_HOSTS:
    PATH2MITCHELL = _KNOWN_HOSTS[_hostname]
else:
    # Any other machine: fall back to the user's home directory.
    PATH2MITCHELL = Path("~").expanduser()

In [None]:
# Load the summary table through the project loader, dropping donor KX007,
# then show the column dtypes of the result.
summary_csv = PATH2MITCHELL / "Summary_cut.csv"
summary = mitchell.load_and_process_mitchell(summary_csv, drop_donor_KX007=True)
summary.dtypes

In [None]:
# Textual overview of the processed table.
print(summary.describe())

# Number of rows per category for the categorical-looking columns.
cell_type_counts = summary.cell_type.value_counts()
print(f"\n\ncell types: \n{cell_type_counts}")
sample_type_counts = summary.sample_type.value_counts()
print(f"\n\nsample types: \n{sample_type_counts}")
timepoint_counts = summary.timepoint.value_counts()
print(f"\n\ntimepoints: \n{timepoint_counts}")

# Distinct (donor, cells, age) combinations.
ages_and_cells = summary[["donor_id", "cells", "age"]].drop_duplicates()
print(f"\n\nages and cells: \n{ages_and_cells}")

# Sum of `number_mutations` over all rows of each donor.
mutations_per_donor = (
    summary[["donor_id", "number_mutations"]].groupby("donor_id").sum()
)
print(f"\n\nmutations per donor: \n{mutations_per_donor}")

In [None]:
# One histogram (with KDE overlay) of `number_mutations` per donor.
for donor in summary.donor_id.unique():
    donor_rows = summary[summary.donor_id == donor]
    fig, ax = plt.subplots(1, 1, figsize=(6, 4), tight_layout=True)
    sns.histplot(
        data=donor_rows,
        x="number_mutations",
        hue="donor_id",
        stat="count",
        bins=50,
        kde=True,
        ax=ax,
    )
    if PLOT_OPTIONS.save:
        # Written to the current working directory, one file per donor.
        plt.savefig(f"./{donor}_burden{EXTENSION}")
    plt.show()

In [None]:
# Summary statistics of `number_mutations` per donor, restricted to the rows
# with age == 0.
age_zero_rows = summary.loc[summary.age == 0, ["donor_id", "number_mutations"]]
descr = age_zero_rows.groupby("donor_id").describe()
descr

In [None]:
# Histogram of `number_mutations` for all donors on one axis, coloured by donor.
fig, ax = plt.subplots(1, 1, figsize=FIGSIZE, tight_layout=True)
sns.histplot(
    data=summary,
    x="number_mutations",
    hue="donor_id",
    stat="percent",
    binwidth=10,
    kde=True,
    ax=ax,
)

# Compact two-column legend without a frame.
legend_style = {
    "ncol": 2,
    # "bbox_to_anchor": (1.01, 1),
    "loc": "upper right",
    "frameon": False,
    "fontsize": "small",
}
sns.move_legend(ax, **legend_style)

if PLOT_OPTIONS.save:
    plt.savefig(f"./mitchell_burden{EXTENSION}")
plt.show()

In [None]:
# Average of the per-donor mean burdens in `descr`, divided by 2 * ln(N - 2).
# NOTE(review): N = 200_000 is presumably an assumed cell number — confirm.
mean_of_donor_means = descr[("number_mutations", "mean")].mean()
log_scaling = 2 * np.log(200_000 - 2)
mean_of_donor_means / log_scaling

In [None]:
# Per-donor variance of `number_mutations`, obtained by squaring the standard
# deviation column of `descr`.
descr[("number_mutations", "std")].pow(2)

In [None]:
# Display the processed summary table.
summary

In [None]:
# Donor ids listed in order of increasing age.
donor_age_pairs = summary[["age", "donor_id"]].drop_duplicates()
donor_age_pairs.sort_values("age").donor_id.to_list()

In [None]:
# Variance of `number_mutations` per donor, plotted against the donor's age.
fig, ax = plt.subplots(1, 1)

# One row per distinct (age, donor) pair, ordered by age. Both the x and the y
# values below are taken in this order so that each variance is drawn at the
# age of the donor it belongs to.
donor_ages = summary[["age", "donor_id"]].drop_duplicates().sort_values("age")

# Per-donor variance, reordered to follow `donor_ages`. Selecting the column
# before aggregating always yields a Series, even with a single donor.
y = (
    summary.groupby("donor_id")["number_mutations"]
    .var()
    .reindex(donor_ages.donor_id.to_list())
)

# `summary.age.unique()` is not used for x: it is in order of appearance and
# has fewer entries than there are donors whenever two donors share an age.
ax.plot(donor_ages.age.to_numpy(), y.to_numpy(), marker=".")
ax.set_ylabel("Variance")
ax.set_xlabel("time [years]")
# ax.set_yscale("log")
plt.show()

In [None]:
# Table of the per-donor variance of `number_mutations`.
summary.groupby("donor_id")[["number_mutations"]].var()