Mitchell et al. report many more clones than Fabre et al., and more than targeted sequencing (target seq) studies in general.

There are at least 3 explanations for this:
1. biased target seq
2. different resolutions
3. biased WGS single-colony assay

## Biased target seq
One explanation is that the target seq data are biased. Samples are collected from blood, so it is likely that not all stem cells are represented in the data, because stem cells do not contribute equally to blood at a given timepoint.

## Different resolutions
The VAF detection threshold of Mitchell et al. is 0.5% (clone size of 1%), whereas the threshold of target seq is 2%.

## Biased WGS single-colony assay
Is this because clonogenic assays (aka WGS single-colony) can be biased by chance when we reduce the 10^5 stem cell pool to a subsample of approx. 300 cells?

To test this:
1. from the whole stem cell pool get all clone frequencies
2. subsample to the accurate number of cells per donor
3. count clones with freq >= 2% ?
4. compare this to Mitchell et al. data

In [None]:
from futils import timeserie
from hscpy import mitchell
from pathlib import Path
from typing import List, NewType
from tqdm import tqdm
import matplotlib.pyplot as plt
import glob
import re
import numpy as np
import pandas as pd

In [None]:
# Distinct type names for the quantities handled by the helpers below, so that
# their signatures show which kind of number is expected.
# Identifier of a clone: the position of the clone in the list of frequencies.
CloneId = NewType("CloneId", int)
# Frequency of a clone, used as a sampling probability.
CloneFreq = NewType("CloneFreq", float)
# Number of sampled cells that belong to one clone.
CloneCounts = NewType("CloneCounts", int)


def create_sample_from_clones_freq(
    size: int, clones_freq: List[CloneFreq], rng: np.random.Generator
) -> List[CloneId]:
    """Draw a random sample of cells, each labelled by the clone it belongs to.

    Args:
        size: number of cells to draw.
        clones_freq: frequency of every clone; the position in the list is the
            clone id. Forwarded as the probabilities `p` of `rng.choice`.
        rng: random generator used for the draw.

    Returns:
        A list of `size` clone ids, one per sampled cell.
    """
    clone_ids = np.arange(len(clones_freq))
    sampled_cells = rng.choice(clone_ids, size=size, p=clones_freq)
    return list(sampled_cells)


def variant_counts(my_sample: List[CloneId]) -> List[CloneCounts]:
    """Count how many cells of the sample belong to each clone.

    Args:
        my_sample: clone id of every sampled cell.

    Returns:
        The number of cells per clone present in the sample, as ordered by
        `pandas.Series.value_counts` (largest count first). The clone ids
        themselves are dropped.
    """
    cells_per_clone = pd.Series(my_sample, dtype=int).value_counts()
    return list(cells_per_clone.to_numpy())


def variants_above_threshold(
    variant_counts: List[CloneCounts], threshold: CloneCounts
) -> int:
    """Return the number of clones whose cell count reaches the threshold.

    Args:
        variant_counts: number of cells per clone.
        threshold: minimal number of cells (inclusive) for a clone to be
            counted.

    Returns:
        How many entries of `variant_counts` are greater than or equal to
        `threshold`.
    """
    return sum(1 for n_cells in variant_counts if n_cells >= threshold)


# Directory with the simulated clone frequencies of the whole stem cell pool
# (10^5 cells) saved at 82 years, one csv file per simulation.
path2clones_82yo = Path(
    "/data/scratch/hfx923/hsc-draft/v4.3.12/100000cells/variant_fraction/82dot0years/"
)
assert path2clones_82yo.is_dir()
donors = mitchell.donors()
rng = np.random.default_rng()
# The simulation index is read from file names ending in "<idx>idx.csv"; the
# dot is escaped so that it only matches the literal "." of the extension.
idx_re = re.compile(r"(?P<idx>\d+)idx\.csv")
# `cells` entry of the donor aged 81, used as the size of the subsample
sample_size = donors.loc[donors.age == 81, "cells"].squeeze()
# threshold_VAF = 0.02  # as in target seq data
threshold_VAF = 0.005  # as in Mitchell et al. (WGS single-colony) data
# the clone size is twice the VAF (0.5% VAF corresponds to a clone size of 1%)
threshold_SIZ = threshold_VAF * 2
# minimal number of cells of the subsample a clone needs to be detected
threshold = round(sample_size * threshold_SIZ)
print(threshold)
donors

In [None]:
# Number of detected clones per simulation, keyed by the simulation index:
# in a subsample of the stem cell pool and in the whole stem cell pool.
counts_detected = {}
counts_detected_full_pop = {}

# `glob.glob` expects a string pattern: passing the `Path` directly raises a
# TypeError, hence the explicit conversion. The matched files are strings too,
# which is what the regex search below needs.
for file in tqdm(glob.glob(str(path2clones_82yo / "*.csv"))):
    idx = int(idx_re.search(file).group("idx"))
    variants_f = timeserie.load_timeserie(file)
    # nb of clones from the whole stem cell pool
    counts_detected_full_pop[idx] = (np.array(variants_f) >= threshold_SIZ).sum()
    # nb of clones from a subsample of it
    # I could have also just loaded the sims from {sample_size}cells/variant_fraction
    data = create_sample_from_clones_freq(sample_size, variants_f, rng)
    counts = variant_counts(data)
    counts_detected[idx] = variants_above_threshold(counts, threshold)

print(f"{len(counts_detected)} simulations stored")
counts_detected

In [None]:
# Number of clones at or above the clone-size threshold in the subsamples
# saved by the simulations themselves (81 years), keyed by simulation index.
counts_detected_original = dict()
files_original = glob.glob(
    f"/data/scratch/hfx923/hsc-draft/v4.3.12/{sample_size}cells/variant_fraction/81dot0years/*.csv"
)

for file in tqdm(files_original):
    idx = int(idx_re.search(file).group("idx"))
    clone_freqs = np.array(timeserie.load_timeserie(file))
    counts_detected_original[idx] = (clone_freqs >= threshold_SIZ).sum()

In [None]:
# Histograms of the number of detected clones per simulation for the three
# ways of counting them.
colors = ["#e41a1c", "#377eb8", "#4daf4a"]
max_bin = 16
histograms = [
    ("Sampling", counts_detected.values()),
    ("Sampling original", counts_detected_original.values()),
    ("Whole population", counts_detected_full_pop.values()),
]
fig, ax = plt.subplots(1, 1, figsize=(4, 3))
for (label, nb_clones), color in zip(histograms, colors):
    ax.hist(
        nb_clones,
        bins=range(0, max_bin),
        align="left",
        color=color,
        label=label,
        histtype="step",
    )
ax.set_title(f"Clones at age 82, {threshold_VAF:.2%}VAF")
ax.set_xlabel("Number of clones")
ax.set_ylabel("Counts")
ax.set_xlim([0, max_bin])
# ax.set_yscale("log")
ax.legend()
plt.show()

In [None]:
# One row per simulation index: column 0 holds the clones detected in the
# subsample drawn in Python, "Sample" those of the subsample saved by the
# simulations and "Population" those of the whole stem cell pool.
python_sample = pd.DataFrame.from_dict(counts_detected, orient="index")
original_sample = pd.Series(counts_detected_original, name="Sample")
whole_pool = pd.Series(counts_detected_full_pop, name="Population")
counts = python_sample.join(original_sample).join(whole_pool)
counts["Diff"] = counts["Sample"] - counts["Population"]
counts["Diff_Python"] = counts[0] - counts["Population"]
# without any clone in the pool, the difference cannot be negative
pool_without_clones = counts["Population"] == 0
assert (counts.loc[pool_without_clones, "Diff"] >= 0).all()
counts

In [None]:
# Diff_Python (assuming the Python code is OK) is the most reliable
# because the samples from the simulations in Rust are saved at 81 yo
# whereas the number of clones from the total population are saved at 82 yo
# so the bias might be due to the 1 year difference, especially as we
# see this for a high number of clones?

# First figure: mean difference between the clones detected in the sample and
# in the pool, grouped by the number of clones in the pool, for both samples.
fig, ax = plt.subplots(1, 1, figsize=(4, 3))
for ls, c, col in zip(
    ["--", "-"], ["#1b9e77", "#d95f02", "#7570b3"], ["Diff", "Diff_Python"]
):
    grouped = counts[["Population", col]].groupby("Population")
    mean_diff = grouped.mean()
    pops = mean_diff.index
    # NOTE(review): the band is mean +/- variance here, whereas the second
    # figure uses mean +/- standard deviation -- confirm this is intended
    var_diff = grouped.var()
    ax.plot(pops, mean_diff, c=c, label=col, ls=ls)
    ax.fill_between(
        pops,
        y1=(mean_diff - var_diff).values.ravel(),
        y2=(mean_diff + var_diff).values.ravel(),
        alpha=0.4,
        color=c,
    )
# one decimal is required: a VAF of 0.5% is displayed as "0%" with `.0%`
ax.set_title(f"Number of clones $C$ at {threshold_VAF:.1%}VAF, 82 yo")
ax.set_xlabel(r"Clones within stem cell pool $C_\text{pool}$")
ax.set_ylabel(r"$C_\text{sample} - C_\text{pool}$")
ax.set_ylim([-4, 4])
ax.set_xlim([0, 12])
ax.legend()
plt.show()


# Second figure: same as above for Diff_Python only, saved to disk.
# The style is set explicitly (same line style and color as Diff_Python in
# the first figure) instead of reusing the variables left over by the loop.
ls_python, color_python = "-", "#d95f02"
fig, ax = plt.subplots(1, 1, figsize=(4, 3), layout="tight")
grouped = counts[["Population", "Diff_Python"]].groupby("Population")
mean_diff = grouped.mean()
pops = mean_diff.index
std_diff = grouped.std()
ax.plot(pops, mean_diff, c=color_python, ls=ls_python)
ax.fill_between(
    pops,
    y1=(mean_diff - std_diff).values.ravel(),
    y2=(mean_diff + std_diff).values.ravel(),
    alpha=0.4,
    color=color_python,
)
ax.set_title(f"Number of clones $C$ at {threshold_VAF:.1%}VAF, 82 yo")
ax.set_xlabel(r"Clones within stem cell pool $C_\text{pool}$")
ax.set_ylabel(r"$C_\text{sample} - C_\text{pool}$")
ax.set_ylim([-10, 10])
ax.set_xlim([0, 15])
plt.savefig(
    f"diff_clones_sample_pop_{threshold_VAF}VAF.png", dpi=800, facecolor="white"
)
plt.show()

In [None]:
# don't compare sample vs pool here because sims are not paired!
# use the diff plot instead
colors = ["#e41a1c", "#377eb8", "#4daf4a"]
max_bin = 16
detected = {
    r"$C_\text{sample}$": counts_detected.values(),
    r"$C_\text{pool}$": counts_detected_full_pop.values(),
}
fig, ax = plt.subplots(1, 1, figsize=(4, 3), layout="tight")
# only the first two colors are used, one per histogram
for color, (label, nb_clones) in zip(colors, detected.items()):
    ax.hist(
        nb_clones,
        bins=range(0, max_bin),
        align="left",
        color=color,
        label=label,
        histtype="step",
    )
ax.set_title(f"Clones at age 82, {threshold_VAF:.1%}VAF")
ax.set_xlabel("Number of clones")
ax.set_ylabel("Counts")
ax.set_xlim([0, max_bin])
# ax.set_yscale("log")
ax.legend()
# the dot of the VAF is removed from the file name
plt.savefig(
    f"diff_clones_sample_pop_hist_{str(threshold_VAF).replace('.', '')}VAF.png",
    dpi=800,
    facecolor="white",
)
plt.show()