In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from pathlib import Path
from scipy import stats
from typing import Tuple

# Notebook-wide plotting configuration.
# SAVE: when True, the plotting cells also write their figure to disk.
SAVE = False
# BIGLABELS: selects the smaller [5, 3] figure size instead of matplotlib's
# default (presumably so labels look larger once the figure is scaled in a
# document — TODO confirm).
BIGLABELS = False
FIGSIZE = [5, 3] if BIGLABELS else [6.4, 4.8]  # default matplotlib
# PDF: chooses the file extension appended to saved figure names.
PDF = True
EXTENSION = ".pdf" if PDF else ".png"

In [None]:
# Read the summary table: try the copy one directory up first and fall back to
# the absolute local path when that file does not exist.
try:
    summary = pd.read_csv("../Summary_cut.csv", index_col=0)
except FileNotFoundError:
    summary = pd.read_csv(
        "/mnt/c/Users/fra_t/Documents/PhD/Summary_cut.csv", index_col=0
    )

# Categorical dtypes, rows ordered by age, and the CSV index moved to a column.
summary = (
    summary.astype({"cell_type": "category", "sample_type": "category"})
    .sort_values(by="age")
    .reset_index()
)

# neglect some duplicated colonies e.g. summary.colony_ID == "11_E07"
# `cells` is the number of rows with a non-null age for each donor.
summary = summary.merge(
    summary.groupby("donor_id")["age"].count().rename("cells").reset_index(),
    how="left",
    on="donor_id",
    validate="many_to_one",
)
summary.dtypes

In [None]:
# Descriptive statistics of the numeric columns of the summary table.
summary.describe()

In [None]:
# Number of rows for each cell_type category.
summary.cell_type.value_counts()

In [None]:
# Number of rows for each sample_type category.
summary.sample_type.value_counts()

In [None]:
# Number of rows for each distinct timepoint value.
summary.timepoint.value_counts()

In [None]:
# Display the prepared table (pandas truncates the view for long frames).
# NOTE(review): `summary.head()` would keep the rendered output smaller.
summary

In [None]:
# One row per distinct (donor, cell count, age) combination.
summary[["donor_id", "cells", "age"]].drop_duplicates()

In [None]:
# Total number of mutations summed over all rows of each donor.
summary.groupby("donor_id")[["number_mutations"]].sum()  # mutations per donor

In [None]:
# Single-cell mutational burden against donor age with a linear regression fit.
# The marker must be given through `marker=`: regplot overwrites any "marker"
# entry of `scatter_kws` with its own `marker` argument (default "o"), so
# `scatter_kws={"marker": "x"}` had no effect.
sns.regplot(data=summary, x="age", y="number_mutations", marker="x")

In [None]:
# Mean number of mutations per donor, joined with the donor's age and ordered
# by age. `validate="one_to_one"` makes the merge fail if a donor appears with
# more than one distinct age.
mean_mutations = (
    summary.groupby("donor_id", as_index=False)["number_mutations"]
    .mean()
    .merge(
        summary[["donor_id", "age"]].drop_duplicates(),
        how="inner",
        on="donor_id",
        validate="one_to_one",
    )
    .sort_values(by="age")
)

In [None]:
# Ordinary least-squares fit of the straight line y = m * x + c, with x the
# donor age and y the donor's mean number of mutations.
x = mean_mutations["age"].to_numpy()
y = mean_mutations["number_mutations"].to_numpy()
# Design matrix: one column of ages and one column of ones for the intercept.
A = np.column_stack((x, np.ones(len(x))))
m, c = np.linalg.lstsq(A, y, rcond=None)[0]

In [None]:
# Mean burden per donor against age, with the least-squares line on top.
fig, ax = plt.subplots(1, 1, figsize=FIGSIZE)
ax.plot(x, y, "o")
ax.plot(x, m * x + c, "b", linestyle="--")
# ax.plot(x, 16*x + c, 'r', linestyle="--")
ax.set_xlabel("age [years]")
ax.set_ylabel("avg number of SNVs")
ax.set_title(f"y=mx+c with m={m:.2f}, c={c:.2f}")
# plt.show() instead of fig.show(): Figure.show() warns on non-GUI backends
# such as the notebook inline backend, and the other cells use plt.show().
plt.show()

In [None]:
# Histogram (with KDE) of the single-cell mutational burden, coloured by donor.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=FIGSIZE, tight_layout=True)
sns.histplot(
    data=summary,
    x="number_mutations",
    hue="donor_id",
    stat="count",
    binwidth=10,
    kde=True,
    ax=ax,
)
# Place the legend outside the axes, to the right of the plot.
sns.move_legend(ax, loc="upper left", bbox_to_anchor=(1.01, 1), frameon=False)
if SAVE:
    fig.savefig(f"./mitchell_burden{EXTENSION}")
plt.show()

In [None]:
# One burden histogram (with KDE) per donor, optionally saved to disk.
for donor in summary.donor_id.unique():
    donor_cells = summary.loc[summary.donor_id == donor]
    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(6, 4), tight_layout=True)
    sns.histplot(
        data=donor_cells,
        x="number_mutations",
        hue="donor_id",
        stat="count",
        bins=50,
        kde=True,
        ax=ax,
    )
    if SAVE:
        fig.savefig(f"./{donor}_burden{EXTENSION}")
    plt.show()

## Entropy
Based on the code they [developed](https://github.com/emily-mitchell/normal_haematopoiesis/blob/23d221e8d125d78c1e8bcbe05d41d0f3594b0cfb/4_phylogeny_analysis/scripts/shannon_diversity.Rmd#L147), I think they define entropy as in [here](http://math.bu.edu/people/mkon/J6A.pdf), using the phylogenetic tree.
Here we instead compute the entropy from the number of cells: a class is the set of cells with the same number of mutations, and we compute the abundance of each class, that is, the number of cells sharing the same number of mutations.

In [None]:
# For every donor, the number of rows (cells) sharing each value of
# `number_mutations`, as a flat table.
summary[["donor_id", "number_mutations"]].groupby(
    "donor_id"
).value_counts().reset_index()

In [None]:
# NOTE(review): this is the same expression as in the previous cell, only
# formatted on one line; one of the two cells can be removed.
summary[["donor_id", "number_mutations"]].groupby("donor_id").value_counts().reset_index()

In [None]:
# Number of distinct mutational-burden classes per donor, i.e. how many
# different values of `number_mutations` the donor's cells take. Despite the
# variable name this is a class count, not an entropy; the entropy itself is
# computed by the cell that follows, which reassigns `entropies`.
#
# `nunique()` replaces value_counts().reset_index()[["donor_id", 0]]: the
# counts column produced by value_counts is labelled 0 before pandas 2.0 and
# "count" from 2.0 on, so selecting 0 fails on recent pandas while the
# rename of "count" did nothing on older pandas. The single column keeps the
# label 0, as before.
entropies = (
    summary.groupby("donor_id")["number_mutations"]
    .nunique()
    .rename(0)
    .to_frame()
)

In [None]:
# count the number of cells with the same number of mutations
# (one class per distinct burden) and take the Shannon entropy of the class
# sizes of each donor.
#
# `size()` is used instead of value_counts().reset_index()[["donor_id", 0]]
# because the implicit counts column is labelled 0 before pandas 2.0 and
# "count" from 2.0 on; selecting 0 fails on recent pandas, and the former
# rename of "count" did nothing on older pandas.
burden_class_sizes = summary.groupby(["donor_id", "number_mutations"]).size()
entropies = (
    burden_class_sizes.groupby(level="donor_id")
    .agg(stats.entropy)
    .rename(0)  # keep the column label 0 that the plotting cell below reads
    .to_frame()
)
# Drop a previous result first so that re-running this cell does not create
# suffixed duplicates of the column.
summary = summary.drop(columns=[0], errors="ignore").merge(
    entropies, how="left", on="donor_id", validate="many_to_one"
)

In [None]:
# Burden-class entropy (column 0 of `summary`) against donor age, one point
# per distinct (entropy, donor) pair.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(6, 4), tight_layout=True)
entropy_per_donor = summary.drop_duplicates(subset=[0, "donor_id"])
entropy_per_donor.plot.scatter(x="age", y=0, ax=ax)
plt.show()

## SFS and entropy using the genotype matrix?
Based on the code they [developed](https://github.com/emily-mitchell/normal_haematopoiesis/blob/23d221e8d125d78c1e8bcbe05d41d0f3594b0cfb/4_phylogeny_analysis/scripts/shannon_diversity.Rmd#L147), I think they define entropy as in [here](http://math.bu.edu/people/mkon/J6A.pdf), using the phylogenetic tree?

I think they consider the number of cells carrying a given variant at blood collection, restricted to variants that were already present before 100 mutations had appeared in the stem cell compartment. That is, old variants that are still present today.

In [None]:
def load_patient(
    patient: str, data_dir: Path = Path("..")
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Load the genotype matrix and the mutation types of one donor.

    Reads ``mutMatrix{patient}.csv`` (first column used as the index) and the
    second column of ``mutType{patient}.csv`` (as a categorical) from
    ``data_dir``.

    Matrix entries are truncated towards zero when cast to integers, so an
    entry of 0.5 becomes 0. To count 0.5 as 1 instead, apply
    ``mut_matrix.where(mut_matrix != 0.5, 1)`` before the cast.

    Parameters
    ----------
    patient:
        Donor identifier used in the two file names.
    data_dir:
        Directory holding the CSV files; defaults to the parent directory,
        which is where the files were read from before this parameter existed.

    Returns
    -------
    (mut_matrix, mut_type):
        The integer genotype matrix and the one-column frame of mutation types.

    Raises
    ------
    FileNotFoundError
        If one of the two CSV files does not exist.
    ValueError
        If the matrix holds values that cannot be cast to ``int`` (e.g. NaN).
    """
    data_dir = Path(data_dir)
    mut_matrix = pd.read_csv(data_dir / f"mutMatrix{patient}.csv", index_col=0)
    mut_type = pd.read_csv(
        data_dir / f"mutType{patient}.csv", usecols=[1], dtype="category"
    )
    # Vectorised cast: same truncation as applying int() to every element,
    # without the deprecated DataFrame.applymap and without rebuilding the frame.
    mut_matrix = mut_matrix.astype(int)  # map 0.5 to 0
    return mut_matrix, mut_type


def filter_mutations(m_matrix: pd.DataFrame, m_type: pd.DataFrame) -> pd.DataFrame:
    """Keep only the rows of ``m_matrix`` whose mutation type is "SNV".

    The two inputs are matched by row position: row ``k`` of ``m_type``
    describes row ``k`` of ``m_matrix``. A row is kept when every column of
    ``m_type`` equals "SNV" on that row (missing types never match).

    The selection uses true row positions, so it no longer depends on
    ``m_type`` carrying a default 0..n-1 index; previously the index labels
    of ``m_type`` were passed to ``iloc`` as if they were positions.

    Parameters
    ----------
    m_matrix:
        Genotype matrix with one row per mutation.
    m_type:
        Mutation types, one row per mutation, in the same order.

    Returns
    -------
    The selected rows of ``m_matrix``, all columns included.
    """
    is_snv = m_type == "SNV"
    if is_snv.ndim == 2:
        # several type columns: the row must be "SNV" in all of them
        is_snv = is_snv.all(axis=1)
    snv_positions = np.flatnonzero(is_snv.to_numpy(dtype=bool))
    return m_matrix.iloc[snv_positions, :]


def compute_entropy(m_matrix: pd.DataFrame) -> float:
    """Entropy of the per-variant cell counts of a genotype matrix.

    Every row of ``m_matrix`` is summed over its columns (the number of cells
    for each variant) and the resulting counts are passed to
    ``scipy.stats.entropy`` with its default settings.
    """
    cells_per_variant = m_matrix.sum(axis="columns")
    # optional filter left disabled: cells_per_variant[cells_per_variant > 3]
    return stats.entropy(np.asarray(cells_per_variant))

In [None]:
# One entry per donor, initialised to None until its entropy is computed.
entropies = {donor_id: None for donor_id in summary.donor_id.unique()}

In [None]:
# For every donor: plot the site frequency spectrum (SFS) and the single-cell
# burden from the genotype matrix, then store the entropy of the matrix.
for donor in entropies.keys():
    # NOTE(review): KX003 is deliberately skipped and keeps the value None —
    # presumably its genotype files are missing or unusable; TODO confirm.
    if donor == "KX003":
        continue
    filtered_matrix = filter_mutations(*load_patient(donor))

    # SFS: number of variants (y) found in a given number of cells (x).
    fig, ax = plt.subplots(1, 1)
    sfs = filtered_matrix.sum(axis=1).value_counts()
    ax.scatter(sfs.index.to_numpy(), sfs.to_numpy(), label=donor)
    ncells = filtered_matrix.shape[1]
    # Dedicated names for the reference curve: reusing `x` and `y` would
    # overwrite the arrays of the age/burden regression cells.
    sfs_x = np.arange(1, ncells, dtype=int)
    sfs_y = 1.0 / sfs_x * ncells
    ax.plot(sfs_x, sfs_y, linestyle="--", c="black", alpha=0.7, label="1/f")
    ax.set_yscale("log")
    ax.set_xscale("log")
    ax.set_xlabel("number of cells carrying the variant")
    ax.set_ylabel("number of variants")
    ax.legend()
    ax.set_title("SFS")
    # plt.show() instead of fig.show(): Figure.show() warns on non-GUI
    # backends such as the notebook inline backend.
    plt.show()

    # Burden of each cell: column sums of the genotype matrix.
    fig, ax = plt.subplots(1, 1)
    filtered_matrix.sum(axis=0).plot(kind="hist", bins=100, ax=ax)
    ax.set_title(f"single-cell mutational burden for donor {donor}")
    ax.set_xlabel("SNV from the genotype matrix")
    plt.show()
    entropies[donor] = compute_entropy(filtered_matrix)

In [None]:
# Attach the genotype-matrix entropy of each donor to every row of `summary`.
# Mapping donor_id through the `entropies` dict replaces the former merge:
# assigning the column overwrites an existing "entropy" column, so the cell can
# be re-run without producing "entropy_x"/"entropy_y" duplicates. Donors whose
# value is still None get NaN.
summary = summary.assign(entropy=summary["donor_id"].map(entropies))

In [None]:
# Genotype-matrix entropy against age, one labelled point per donor.
fig, ax = plt.subplots(1, 1)
for donor in entropies:
    toplot = summary.loc[summary.donor_id == donor, ["age", "entropy"]].drop_duplicates()
    # Label with the donor itself: the first index label of `toplot` is a row
    # label of `summary`, not a donor id, and indexing it fails when the
    # selection is empty.
    ax.scatter(toplot.age, toplot.entropy, label=donor)
ax.set_xlabel("age [years]")
ax.set_ylabel("entropy")
ax.legend()
# plt.show() instead of fig.show(): Figure.show() warns on non-GUI backends
# such as the notebook inline backend.
plt.show()