# Quantifying the number of missing clones from simulations

## Context and motivation

Mitchell et al. report many more clones than Fabre et al., and more than targeted sequencing studies in general. There are at least three possible explanations for this:
1. biased target seq
2. different resolutions
3. biased WGS single-colony assay

### Biased target seq
One explanation is that the targeted sequencing data are biased. Samples are collected from blood, so it is likely that not all stem cells are represented in the data, because not all stem cells contribute equally to blood at a given timepoint.

### Different resolutions
The VAF resolution of Mitchell et al. is 0.5% (i.e. a clone size of 1%), whereas that of targeted sequencing is 2%.

### Biased WGS single-colony assay
Is this because clonogenic assays (a.k.a. WGS single-colony assays) can be biased by chance when we reduce the pool of 10^5 stem cells to a subsample of approx. 300 cells?

To test this:
1. from the whole stem cell pool get all clone frequencies
2. subsample to the accurate number of cells per donor
3. count clones with freq >= 2% ?
4. compare this to Mitchell et al. data

In [None]:
from futils import timeserie
from pathlib import Path
import seaborn as sns
import socket
from pathlib import Path
from typing import List, NewType
from tqdm import tqdm
import matplotlib.pyplot as plt
from matplotlib import font_manager
import matplotlib as mpl
import glob
import re
import numpy as np
import pandas as pd

# Register the TeX Gyre Heros font files with matplotlib; the directory
# holding the font files depends on the machine the notebook runs on.
if socket.gethostname() == "LAPTOP-CEKCHJ4C":
    font_dirs = [Path("~").expanduser() / ".local/share/fonts/otf/TexGyreHeros/"]
else:
    font_dirs = [Path("~").expanduser() / ".local/fonts/TeX-Gyre-Heros/"]
font_files = font_manager.findSystemFonts(fontpaths=font_dirs)
for font_file in font_files:
    font_manager.fontManager.addfont(font_file)
# fall back to the generic sans-serif family if the font is not available
mpl.rcParams["font.family"] = ["TeX Gyre Heros", "sans-serif"]
# use sims from ABC instead of the ones specifically for this nb
# NOTE(review): this flag is not read by any of the visible cells
ABC_SIMS = True
# root directory walked by the loading cell; its sub-directories hold the csv files
PATH2SIMS = Path("/data/scratch/hfx923/hsc-draft/v4.3.13/100000cells/variant_fraction")
# cap used by the loading cell on the number of csv files read per directory
N_SIMS = 10_000
# numbers of cells drawn when subsampling, in ascending order
SAMPL_SIZ = sorted([300, 10_000])
TOT_VAR = 3_000
# the plotting cell uses SAMPL_SIZ[0] and SAMPL_SIZ[1] in the file names
assert len(SAMPL_SIZ) <= 2
# assumption is 3_000 possible variants which means
# that the size of each vector in csv file is 3_000
VAR_IDX = np.arange(0, TOT_VAR, dtype=np.uint64)
# clone sizes at which we subsample: each value is multiplied by the sample
# size to get the minimal number of cells a clone needs to be counted
PCT = [0.005, 0.01, 0.02, 0.04]
FIGSIZ = (5.5, 3)

# no seed is given, so the subsamples differ from one run to the next
rng = np.random.default_rng()
# simulation id taken from file names of the form "<digits>idx.csv"
# NOTE(review): the dot is not escaped, so it matches any character
re_idx = re.compile(r"(?P<idx>\d+)idx.csv")
# age taken from directory names of the form "<int>dot<frac>years"; only the
# integer part is captured by the named group
re_age = re.compile(r"(?P<age>\d+)dot(\d+)years")
# distinct names for the integer/float quantities used in the annotations below
CloneId = NewType("CloneId", int)
CloneFreq = NewType("CloneFreq", float)
CloneCounts = NewType("CloneCounts", int)

## Analyse the simulations to quantify missing clones

In [None]:
def create_sample_from_clones_freq(
    size: int, clones_freq: np.typing.NDArray[CloneFreq], rng: np.random.Generator
) -> np.typing.NDArray[CloneId]:
    """Draw `size` clone ids from `VAR_IDX`, weighted by `clones_freq`.

    Args:
        size: number of ids to draw.
        clones_freq: probability attached to each id of `VAR_IDX`; it is
            forwarded as `p` to `Generator.choice`.
        rng: random generator used for the draw.

    Returns:
        The drawn clone ids (one entry per draw, ids can repeat).
    """
    sampled_ids = rng.choice(VAR_IDX, p=clones_freq, size=size)
    return sampled_ids


def variant_counts(my_sample: List[CloneId]) -> np.typing.NDArray[CloneCounts]:
    return np.unique_counts(my_sample).counts


def variants_above_threshold(
    variant_counts: np.typing.NDArray[CloneCounts], threshold: CloneCounts
) -> int:
    """Return the number of variants whose count is at least `threshold`.

    Args:
        variant_counts: number of cells carrying each variant.
        threshold: minimal count (inclusive) for a variant to be kept.

    Returns:
        How many entries of `variant_counts` are `>= threshold`.
    """
    reaches_threshold = variant_counts >= threshold
    return reaches_threshold.sum()

In [None]:
# Build one row per (simulation file, sample size, clone-size threshold):
# 1. read variant csv file: vector of size N_VARIANTS (3_000) with
#    proportion of cells in each entry representing the variant freq
# 2. tot nb of clones is nb of entries in 1. with value > 0, i.e. at
#    least one cell for that variant
# 3. subsample to two different sample sizes: SAMPL_SIZ. To subsample,
#    pick at random `size` clone idx from VAR_IDX with prob given by
#    their frequencies in the original data
# 4. from this subsample, drop neutral clone which has id 0
# 5. compute variants counts (groupby and count) from variants idx
# 6. store the number of variants that are present in at least `thresh`
#    cells, that is keep variants with a count of at least `thresh`
clones_sims = []
for bp, my_dir_names, _filenames in PATH2SIMS.walk():
    for my_dir_name in my_dir_names:
        my_dir = bp / my_dir_name
        # the directory name carries the age; only its integer part is kept
        age = int(re_age.search(my_dir.name).group("age"))
        assert age >= 0, f"wrong age {age}"
        for i, file in enumerate(tqdm(my_dir.glob("*.csv"), mininterval=2)):
            # test the cap before doing any work, so that at most N_SIMS
            # files are read per directory (testing it after the work would
            # let one extra file through)
            if i >= N_SIMS:
                break
            idx = int(re_idx.search(file.name).group("idx"))
            # 1. vector of size N_VARIANTS variant freqs in each entry
            variants_f = np.array(timeserie.load_timeserie(file))
            # 2. number of entries with a non-zero frequency
            # NOTE(review): this count includes index 0, whereas index 0 is
            # removed from the subsample in step 4 -- confirm this is the
            # intended definition for the `Diff` column
            tot_clones = (variants_f > 0).sum()
            for sample_size in SAMPL_SIZ:
                # 3. subsample `sample_size` variant idx from VAR_IDX with
                # freq given by `variants_f`
                subsample = create_sample_from_clones_freq(sample_size, variants_f, rng)
                # 4. drop neutral clone which has id 0
                # 5. compute variant counts from variant idx
                counts = variant_counts(subsample[subsample > 0])
                for pct in PCT:
                    # clone size expressed as a number of sampled cells
                    thresh = int(round(sample_size * pct))
                    # 6. store the variants that are present in at least `thresh` cells
                    clones_sims.append(
                        (
                            age,
                            idx,
                            variants_above_threshold(counts, thresh),
                            sample_size,
                            pct * 100,
                            tot_clones,
                        )
                    )

print(f"Loaded {len(clones_sims)} entries")
columns = ["Age", "Id", "Sampled clones", "Sample", "Clone size", "Clones"]
df = pd.DataFrame(clones_sims, columns=columns)
# "Clone size" is a percentage (float); every other column is a non-negative integer
df = df.astype({col: float if col == "Clone size" else np.uint64 for col in columns})
# clones present in the whole pool but not detected in the subsample;
# cast to a signed type so that later subtractions of `Diff` can go negative
df["Diff"] = (df["Clones"] - df["Sampled clones"]).astype(np.int64)
# one row per (Age, Id), one column per (value, Sample, Clone size)
df = df.pivot(
    index=["Age", "Id"],
    columns=["Sample", "Clone size"],
    values=["Sampled clones", "Diff"],
)
df

In [None]:
# For every age draw and save two figures:
# 1. the distribution of `Diff` (clones of the whole pool that are not
#    detected in the subsample), one panel per sample size;
# 2. the difference in detected clones between the two sample sizes, as a
#    histogram and as a boxen plot per clone size.
x = sorted(df.columns.get_level_values("Sample").unique())
# the second figure and the file names below compare exactly two sample sizes
if len(x) != 2:
    raise ValueError(f"expected exactly 2 sample sizes, found {len(x)}: {x}")

for age in df.index.get_level_values(0).unique():
    view_age = df[df.index.get_level_values("Age") == age]
    sample_level = view_age.columns.get_level_values("Sample")

    fig, axes = plt.subplots(
        1, 2, figsize=FIGSIZ, sharey=True, sharex=True, layout="tight"
    )
    for ax, xx in zip(axes, x):
        hist_kwargs = {"stat": "density", "ax": ax, "legend": False}
        if not age:
            # at age 0 each integer value gets its own bin
            hist_kwargs["discrete"] = True
        sns.histplot(view_age.loc[:, sample_level == xx]["Diff"], **hist_kwargs)
        if age:
            ax.set_xlim([0, 600])
        ax.set_xlabel(rf"$C_{{tot}} - C_{{{xx}}}$")
    fig.suptitle(f"Number of missing clones for age {age} years", y=0.95)
    fig.savefig(f"clones_missing_{SAMPL_SIZ[0]}_{SAMPL_SIZ[1]}_{age}years.svg")
    plt.show()

    fig, axes = plt.subplots(1, 2, figsize=FIGSIZ, layout="tight")
    # keep only the "Clone size" level so that both frames align column-wise
    diff_large = view_age.loc[:, sample_level == x[-1]]["Diff"].droplevel(
        axis=1, level=0
    )
    diff_small = view_age.loc[:, sample_level == x[0]]["Diff"].droplevel(
        axis=1, level=0
    )
    # `Diff` is C_tot - C_sample, therefore
    #   Diff_small - Diff_large = C_large - C_small,
    # which is the quantity announced by the axis labels below (subtracting
    # the other way round would plot its negative)
    c_diff = (diff_small - diff_large).astype(int).melt()
    sns.histplot(
        c_diff, x="value", hue="Clone size", ax=axes[0], element="poly", discrete=True
    )
    axes[0].set_xlabel(rf"$C_{{{x[-1]}}} - C_{{{x[0]}}}$")
    sns.boxenplot(
        c_diff,
        y="value",
        x="Clone size",
        ax=axes[1],
    )
    # overlay the mean of each clone size on top of its box
    axes[1].plot(
        axes[1].get_xticks(),
        c_diff.groupby("Clone size").mean(),
        c="#88419d",
        marker="x",
        ls="",
        mew=2,
        ms=8,
    )
    axes[1].set_xticks(ticks=axes[1].get_xticks(), labels=[ele * 100 for ele in PCT])
    axes[1].set_xlabel("Clone size [%]")
    axes[1].set_ylabel(rf"$C_{{{x[-1]}}} - C_{{{x[0]}}}$")
    fig.suptitle(f"Difference in subsampled clones for age {age} years", y=0.95)
    axes[0].set_xlim([-20, 20])
    axes[1].set_ylim([-20, 20])
    fig.savefig(f"clones_difference_{SAMPL_SIZ[0]}_{SAMPL_SIZ[1]}_{age}years.svg")
    plt.show()