The data set contains distinct cells with exactly the same genotype (polytomies); could this be why the sampled 1/f^2 theoretical expectation is off compared to the data?
Note: I think some of the mutations could have been acquired in vitro.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from typing import List, Set
from bimuma import donor, summary

# Cell-type annotation table: only the first two columns of the first sheet
# of the supplementary workbook are read.
CELL_TYPES = pd.read_excel(
    "/mnt/c/Users/fra_t/Documents/PhD/hsc/chapman2021/41586_2021_3548_MOESM18_ESM.xlsx",
    usecols=[0, 1],
    sheet_name=0,
)
# Append "_hum" to every sample name (presumably so that the names match the
# cell names of the mutation matrix -- TODO confirm).
CELL_TYPES["Sample"] = CELL_TYPES["Sample"] + "_hum"
CELL_TYPES["Cell_type"] = CELL_TYPES["Cell_type"].astype("category")
# Cell output: number of samples per cell type.
CELL_TYPES["Cell_type"].value_counts()

In [None]:
class DonorChapman:
    """Pair a `donor.Donor` object with a human-readable name.

    The constructor only stores its two arguments, as the attributes
    `donor` and `name`.
    """

    def __init__(self, donor_: donor.Donor, name: str):
        # NOTE: a correction for polytomies (cells with the same genome) was
        # considered here and is currently disabled:
        #     self.donor.sfs[1] = 2.4 * len(self.donor.cells)
        # where 2.4 is the mutation rate estimated in the paper.
        self.name, self.donor = name, donor_


def load_donor(
    donor_8pcw: bool, name: str, cells2keep: List[str] | None = None
) -> DonorChapman:
    """Read a donor's binary mutation matrix and wrap it in a `DonorChapman`.

    Args:
        donor_8pcw: forwarded to `read_data` to choose which donor is read.
        name: label stored on the returned `DonorChapman`.
        cells2keep: optional cell names. When given and non-empty, the matrix
            is restricted to these cells with
            `summary.filter_cells_from_matrix`; `None` or an empty list keeps
            the matrix as read.

    Returns:
        A `DonorChapman` built from `donor.Donor(binary_mut)` and `name`.

    Raises:
        AssertionError: if, after filtering, the number of matrix columns
            differs from the number of requested cells.
    """
    binary_mut = read_data(donor_8pcw)
    if cells2keep:
        binary_mut = summary.filter_cells_from_matrix(binary_mut, cells2keep)
        # Sanity check: every requested cell must be present after filtering.
        kept = binary_mut.matrix.shape[1]
        assert len(cells2keep) == kept, (
            f"expected {len(cells2keep)} cells after filtering, found {kept}"
        )
    print(f"{binary_mut.polytomies} cells with the same genotypes (polytomies)")
    return DonorChapman(donor.Donor(binary_mut), name)


def load_donor_14pwc(organs: str = "all") -> DonorChapman:
    """Load the second donor restricted to the cells of the selected organs.

    Args:
        organs: one of "femur_1", "femur_2", "femurs", "liver" or "all".

    Returns:
        The `DonorChapman` returned by `load_donor`, named "14pwc " followed
        by `organs`.

    Raises:
        ValueError: if `organs` is not one of the accepted values.
        AssertionError: for "all", if the number of selected cells differs
            from the number of rows of `CELL_TYPES`.
    """
    # Accepted selections and the `Cell_type` codes each one maps to.
    # NOTE(review): the label below spells "pwc" while the other donor is
    # labelled "8pcw" -- confirm which spelling is intended.
    codes_by_selection = {
        "femur_1": {"F1"},
        "femur_2": {"F2"},
        "femurs": {"F1", "F2"},
        "liver": {"L"},
        "all": {"F1", "F2", "L"},
    }
    if organs not in codes_by_selection:
        raise ValueError(f"unknown value {organs} for `organs`")

    cells2keep = load_cell_types_donor_14pwc(codes_by_selection[organs])
    if organs == "all":
        # Selecting every organ must return every annotated sample.
        assert len(cells2keep) == CELL_TYPES.shape[0]
    return load_donor(False, "14pwc " + organs, cells2keep)


def load_donor_8pwc() -> DonorChapman:
    """Load the donor named "8pcw" without any cell filtering."""
    return load_donor(donor_8pcw=True, name="8pcw")


def read_data(donor_8pcw: bool = True) -> summary.BinaryMutationMatrix:
    """Read one donor's sheet of the workbook into a `BinaryMutationMatrix`.

    Args:
        donor_8pcw: when true the first sheet (index 0) is read, otherwise
            the third sheet (index 2).

    Returns:
        `summary.BinaryMutationMatrix` built from the sheet, whose first
        column is used as the index.
    """
    sheet = 0 if donor_8pcw else 2
    table = pd.read_excel(
        "/mnt/c/Users/fra_t/Documents/PhD/hsc/chapman2021/41586_2021_3548_MOESM8_ESM.xlsx",
        index_col=0,
        sheet_name=sheet,
    )
    return summary.BinaryMutationMatrix(table)


def load_cell_types_donor_14pwc(organs_set: Set[str]) -> List[str]:
    """Return the sample names whose `Cell_type` is in `organs_set`.

    Donor 14 pwc has cells from femur 1, femur 2 and liver.

    Raises:
        AssertionError: if no sample of `CELL_TYPES` matches `organs_set`.
    """
    in_requested_organs = CELL_TYPES.Cell_type.isin(organs_set)
    samples = CELL_TYPES.loc[in_requested_organs, "Sample"].to_list()
    assert len(samples), f"Cannot find {organs_set} in cell types data set"
    return samples

In [None]:
# First the "8pcw" donor, then the other donor: all of its cells followed by
# each organ selection on its own.
donors = [load_donor_8pwc()]
donors.extend(
    load_donor_14pwc(selection)
    for selection in ("all", "femur_1", "femur_2", "femurs", "liver")
)

In [None]:
# Cell output: total number of cells of the first two entries of `donors`.
sum(len(d.donor.cells) for d in donors[:2])

In [None]:
# For every donor draw two figures: the distribution of the number of SNVs
# per cell, then the site frequency spectrum on log-log axes.
for d in donors:
    b = d.donor.burden
    fig, ax = plt.subplots(figsize=(4, 3), layout="constrained")
    ax.bar(x=b.keys(), height=b.values())
    ax.set(
        title=f"{d.name} with {len(d.donor.cells)} cells",
        ylabel="Counts",
        xlabel="Number of SNVs in cells",
    )
    plt.show()

    sfs = d.donor.sfs
    fig, ax = plt.subplots(figsize=(4, 3), layout="constrained")
    ax.scatter(x=sfs.keys(), y=sfs.values())
    # TODO: overlay the corrected 1/f^2 expectation (previously sketched with
    # `sfs_1_over_f_squared_corrected(500, cells)`, currently disabled).
    ax.set(
        ylabel="Number of variants",
        xlabel="Number of cells",
        title=f"{d.name} with {len(d.donor.cells)} cells",
    )
    ax.set_yscale("log")
    ax.set_xscale("log")
    plt.show()

In [None]:
# Overlay the site frequency spectra of the first and the last entry of
# `donors`, each rescaled by its own maxima so both fit on the same axes.
fig, ax = plt.subplots(figsize=(4, 3), layout="constrained")
markers = (".", "x", "v", "1", "2", "3")
for d, m in zip((donors[0], donors[-1]), markers):
    sfs = d.donor.sfs
    # Normalise: divide the keys by the largest key and the values by the
    # largest value.
    max_cell = max(sfs.keys())
    max_var = max(sfs.values())
    cell_freq = [ele / max_cell for ele in sfs.keys()]
    var_freq = [ele / max_var for ele in sfs.values()]
    ax.scatter(x=cell_freq, y=var_freq, label=d.name, alpha=0.8, marker=m)
# TODO: overlay the corrected 1/f^2 expectation (currently disabled).
ax.set(
    ylabel="Variant density",
    xlabel="Variant frequency",
    yscale="log",
    xscale="log",
)
ax.legend()
plt.show()