In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
from scipy import stats

# Switches shared by the plotting cells of this notebook.
SAVE = False  # when True, the figure cells also write their figure to disk
BIGLABELS = False  # when True, the smaller [5, 3] figure size is selected

if BIGLABELS:
    FIGSIZE = [5, 3]
else:
    FIGSIZE = [6.4, 4.8]  # default matplotlib

PDF = True  # picks the file extension used for saved figures
if PDF:
    EXTENSION = ".pdf"
else:
    EXTENSION = ".png"

In [None]:
# Candidate locations of the summary table, tried in order: the copy one
# directory above the notebook first, then an absolute fallback path.
SUMMARY_PATHS = [
    Path("../Summary_cut.csv"),
    Path("/mnt/c/Users/fra_t/Documents/PhD/Summary_cut.csv"),
]

summary_raw = None
for candidate in SUMMARY_PATHS:
    try:
        summary_raw = pd.read_csv(candidate, index_col=0)
        break
    except FileNotFoundError:
        continue
if summary_raw is None:
    raise FileNotFoundError(
        "Summary_cut.csv not found in any of: "
        + ", ".join(str(path) for path in SUMMARY_PATHS)
    )

# Build the working table without mutating the freshly loaded frame:
# categorical dtypes for the two type columns, rows ordered by age, and the
# index read from the file's first column moved back into a regular column.
summary = (
    summary_raw.astype({"cell_type": "category", "sample_type": "category"})
    .sort_values(by="age")
    .reset_index()
)

# Attach to every row the number of rows (non-null `age` entries) recorded for
# its donor, stored as `cells`.
# Caveat carried over from the original comment: some colonies are duplicated
# (e.g. summary.colony_ID == "11_E07"); nothing here removes them, so they are
# included in the count.
cells_per_donor = (
    summary.groupby("donor_id")["age"].count().rename("cells").reset_index()
)
summary = summary.merge(
    cells_per_donor,
    on="donor_id",
    validate="many_to_one",
    how="left",
)
summary.dtypes

In [None]:
# Descriptive statistics of `summary`, as a quick sanity check of the loaded table.
summary.describe()

In [None]:
# Number of rows for each value of `cell_type`.
summary["cell_type"].value_counts()

In [None]:
# Number of rows for each value of `sample_type`.
summary["sample_type"].value_counts()

In [None]:
# Number of rows for each value of `timepoint`.
summary["timepoint"].value_counts()

In [None]:
# Display the prepared table (the notebook renders the frame's rich repr).
summary

In [None]:
# One row per distinct (donor_id, cells, age) combination.
summary.loc[:, ["donor_id", "cells", "age"]].drop_duplicates()

In [None]:
# Mutation burden against age, with seaborn's regression fit overlaid.
# The marker is passed through regplot's own `marker` argument: regplot
# overwrites any "marker" entry of `scatter_kws` with that argument (default
# "o"), so requesting "x" via `scatter_kws` silently left the points as circles.
sns.regplot(data=summary, x="age", y="number_mutations", marker="x")

In [None]:
# Histogram (with KDE overlay) of `number_mutations` over all rows of
# `summary`, coloured by donor, in bins 10 mutations wide.
fig, ax = plt.subplots(figsize=FIGSIZE, tight_layout=True)
sns.histplot(
    summary,
    x="number_mutations",
    hue="donor_id",
    stat="count",
    binwidth=10,
    kde=True,
    ax=ax,
)
# Anchor the legend's upper-left corner just right of the axes, without a frame.
sns.move_legend(ax, loc="upper left", bbox_to_anchor=(1.01, 1), frameon=False)
if SAVE:
    fig.savefig(f"./mitchell_burden{EXTENSION}")
plt.show()

In [None]:
# One figure per donor: histogram (50 bins, KDE overlay) of `number_mutations`
# restricted to that donor's rows, optionally saved as "<donor>_burden<ext>".
for donor in summary["donor_id"].unique():
    fig, ax = plt.subplots(figsize=(6, 4), tight_layout=True)
    donor_rows = summary.loc[summary["donor_id"] == donor]
    sns.histplot(
        donor_rows,
        x="number_mutations",
        hue="donor_id",
        stat="count",
        bins=50,
        kde=True,
        ax=ax,
    )
    if SAVE:
        fig.savefig(f"./{donor}_burden{EXTENSION}")
    plt.show()

## Entropy
Based on the code they [developed](https://github.com/emily-mitchell/normal_haematopoiesis/blob/23d221e8d125d78c1e8bcbe05d41d0f3594b0cfb/4_phylogeny_analysis/scripts/shannon_diversity.Rmd#L147), I think they define entropy as in [here](http://math.bu.edu/people/mkon/J6A.pdf), using the phylogenetic tree.
Here we simply compute the entropy from the number of cells: we define a class as the set of cells with the same number of mutations, and compute the abundance of each class, that is, the number of cells sharing that number of mutations.

In [None]:
# For each donor, how many rows share each value of `number_mutations`.
(
    summary.loc[:, ["donor_id", "number_mutations"]]
    .groupby(by="donor_id")
    .value_counts()
    .reset_index()
)

In [None]:
# Per donor, count the distinct `number_mutations` classes: tally rows per
# (donor, number_mutations) pair, then count the tallies of each donor.
# NOTE(review): the resulting column is labelled "entropy" although it holds
# the number of classes, and `entropies` is reassigned by the next cell before
# being used -- confirm whether this cell is still needed.
entropies = (
    summary.loc[:, ["donor_id", "number_mutations"]]
    .groupby(by="donor_id")
    .value_counts()
    .reset_index()
    .loc[:, ["donor_id", "count"]]
    .groupby(by="donor_id")
    .count()
    .rename(columns={"count": "entropy"})
)

In [None]:
# Entropy per donor of the mutation-burden classes: count the number of cells
# with the same number of mutations, then pass each donor's class counts to
# scipy.stats.entropy.
entropies = (
    summary[["donor_id", "number_mutations"]]
    .groupby("donor_id")
    .value_counts()
    .reset_index()[["donor_id", "count"]]
    .groupby("donor_id")
    .agg(stats.entropy)
    .rename(columns={"count": "entropy"})
)
# Drop any `entropy` column left by a previous run of this cell before merging;
# otherwise a re-run would produce `entropy_x` / `entropy_y` instead of a
# single `entropy` column and break the cells that read it.
summary = summary.drop(columns="entropy", errors="ignore").merge(
    entropies, how="left", on="donor_id", validate="many_to_one"
)

In [None]:
# Scatter of entropy against age, one point per distinct (entropy, donor_id) pair.
fig, ax = plt.subplots(figsize=(6, 4), tight_layout=True)
summary.drop_duplicates(subset=["entropy", "donor_id"]).plot.scatter(
    x="age", y="entropy", ax=ax
)
plt.show()

### The genotype matrix?