In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import datetime
import matplotlib as mpl
from matplotlib.lines import Line2D

In [None]:
def plot_trajectories_gene(ax, df, genes, colors):
    """Draw one VAF-versus-age line per donor for each requested gene.

    Parameters
    ----------
    ax : matplotlib Axes
        Axes to draw on; it is modified in place and returned.
    df : pandas.DataFrame
        Must provide the columns ``Gene``, ``Age_binned``, ``VAF`` and
        ``SardID``.
    genes : sequence of str
        Genes to plot; rows of ``df`` with other genes are ignored.
    colors : sequence
        One colour per gene, paired positionally with ``genes`` (``zip``
        stops at the shorter of the two sequences).

    Returns
    -------
    The same ``ax`` object, with the lines, x-label and legend added.
    """
    for gene, c in zip(genes, colors):
        df_per_gene = df.loc[df.Gene == gene, ["Age_binned", "VAF", "SardID"]]
        for id_ in df_per_gene.SardID.unique():
            # Matplotlib joins points in the order given, so sort each donor's
            # rows by age; otherwise an unsorted frame produces lines that
            # jump back and forth in time. The stable sort keeps the original
            # row order for measurements sharing the same age bin.
            donor_rows = df_per_gene.loc[df_per_gene.SardID == id_].sort_values(
                "Age_binned", kind="stable"
            )
            ax.plot(
                donor_rows["Age_binned"],
                donor_rows["VAF"],
                color=c,
                marker=".",
                alpha=0.8,
            )
    ax.set_xlabel("Age (years)")
    # Proxy artists: one legend entry per gene instead of one per donor line.
    ax.legend(
        handles=[
            Line2D([0], [0], color=c, lw=1, marker=".", label=g)
            for g, c in zip(genes, colors)
        ],
        frameon=True,
        fontsize=8,
    )
    return ax

In [None]:
# Load the tab-separated variant table and derive the helper columns used by
# the rest of the notebook.
fabre = pd.read_csv(
    "/mnt/c/Users/fra_t/Documents/PhD/hsc/fabre2022/ALLvariants_exclSynonymous_Xadj.txt",
    sep="\t",
)
# Two-year age bin edges: 50, 52, ..., 106.
bins = range(50, 108, 2)

# Variants with a VAF below 2% are flagged as sequencing artefacts.
fabre["is_sequencing_artefact"] = fabre["VAF"] < 0.02
tot_donors = len(fabre["SardID"].unique())
tot_donors_valid = len(fabre.loc[~fabre["is_sequencing_artefact"], "SardID"].unique())
print(f"{tot_donors_valid} donors with valid variant over a total of {tot_donors} donors")

# Label every row with the upper edge of its age bin (interval midpoint + 1
# for bins of width 2). Binning uses the raw ages, before they are rounded.
# NOTE(review): an age outside (50, 106] would give a missing bin, which
# cannot be cast to int - confirm the cohort's age range.
age_intervals = pd.cut(fabre["Age"], bins=bins)
fabre["Age_binned"] = age_intervals.map(lambda interval: interval.mid + 1).astype(int)
fabre["Age"] = fabre["Age"].map(round).astype(int)

# Missing variant types become an explicit "Unknown" category.
fabre["Type"] = fabre["Type"].fillna("Unknown").astype("category")

print(f"{len(fabre['Gene'].unique())} genes")
print(f"[{fabre['Age'].min()}-{fabre['Age'].max()}] age interval")
fabre

In [None]:
# Show the variant-type column.
fabre["Type"]

In [None]:
# Rows whose variant type is a frameshift deletion.
fabre.loc[fabre["Type"] == "frameshift_deletion"]

In [None]:
# Age distribution of the non-artefact variants, using the 2-year bin edges.
fabre.loc[~fabre["is_sequencing_artefact"], "Age"].hist(bins=bins)

In [None]:
# VAF distribution of the non-artefact variants in steps of 0.02 below 0.5.
fabre[~fabre["is_sequencing_artefact"]]["VAF"].hist(bins=np.arange(0, 0.5, 0.02))

In [None]:
# For each gene, count the donors carrying at least one non-artefact variant:
# first collapse to one row per (donor, gene), then count rows per gene.
valid_variants = fabre.loc[~fabre["is_sequencing_artefact"], ["Gene", "SardID", "VAF"]]
variants_per_donor_gene = valid_variants.groupby(["SardID", "Gene"]).count()
variants_per_donor_gene = variants_per_donor_gene.sort_values(by="VAF", ascending=False)
variants_per_donor_gene.reset_index()["Gene"].value_counts()

In [None]:
# VAF against binned age for every non-artefact variant.
sns.lineplot(
    data=fabre.loc[~fabre["is_sequencing_artefact"]],
    x="Age_binned",
    y="VAF",
)

In [None]:
# Flag, for every donor at every (rounded) age, the non-artefact variant(s)
# with the highest VAF. Ties all receive the flag.
largest_clones = (
    fabre.loc[~fabre.is_sequencing_artefact, ["Age", "SardID", "VAF"]]
    .groupby(["SardID", "Age"])
    .max()
    .dropna()
    .reset_index()
)
largest_clones["is_largest_clone"] = True
shape_bf_merge = fabre.shape
fabre = pd.merge(
    # Drop a flag left over from a previous run of this cell: merging onto a
    # frame that already has the column would create `is_largest_clone_x` /
    # `is_largest_clone_y` and break the lines below, so the cell could not
    # be executed twice.
    left=fabre.drop(columns="is_largest_clone", errors="ignore"),
    right=largest_clones,
    how="left",
    on=["SardID", "Age", "VAF"],
    validate="many_to_one",
)
print(fabre.shape)
# A left many-to-one merge must neither add nor remove rows.
assert fabre.shape[0] == shape_bf_merge[0], "merge changed the number of rows"
# The right-hand flag is always True, so a missing value after the left merge
# means the row is not a largest clone.
fabre["is_largest_clone"] = fabre["is_largest_clone"].notna()
fabre

In [None]:
# VAF against binned age, restricted to the non-artefact largest clones.
largest_valid_rows = (~fabre["is_sequencing_artefact"]) & fabre["is_largest_clone"]
sns.lineplot(
    data=fabre.loc[largest_valid_rows],
    x="Age_binned",
    y="VAF",
)

In [None]:
# One VAF histogram of the non-artefact largest clones per age bin.
vaf_bins = np.arange(0, 0.5, 0.02)
largest_valid = fabre.loc[(~fabre.is_sequencing_artefact) & (fabre.is_largest_clone)]
for age in sorted(fabre.Age_binned.unique()):
    fig, ax = plt.subplots(1, 1, figsize=(4, 3), layout="constrained")
    vaf_at_age = largest_valid.loc[largest_valid.Age_binned == age, "VAF"]
    ax.hist(vaf_at_age, bins=vaf_bins)
    ax.set_title(age)
    ax.set_xlabel("VAF")
    ax.set_ylabel("counts")
    plt.show()

In [None]:
# Trajectories of the largest clones, keeping only those with VAF > THRSH
# (strictly greater, matching the axis label below).
THRSH = 0.1
above_threshold = fabre.is_largest_clone & (fabre.VAF > THRSH)
largest_largest_clones = fabre.loc[
    above_threshold, ["VAF", "Age_binned", "SardID", "Gene"]
]

# One colour of the "Dark2" palette per gene.
genes = ["ASXL1", "DNMT3A", "JAK2", "TET2", "TP53", "NF1"]
colors = mpl.colormaps["Dark2"].colors[: len(genes)]

fig, ax = plt.subplots(figsize=(4, 3), layout="constrained")
ax = plot_trajectories_gene(ax, largest_largest_clones, genes, colors)
ax.set_ylabel(f"Largest clones VAF (VAF>{THRSH})")
# plt.savefig("largest_clones_genes_trajectories.png", dpi=600)
plt.show()

In [None]:
# Number of donors with at least one detected clone (non-artefact variant
# with VAF >= 0.01). This is computed directly from `fabre` because
# `detected_clones` is only built in a later cell, so referring to it here
# raises a NameError when the notebook is run top to bottom on a fresh kernel.
fabre.loc[
    (~fabre.is_sequencing_artefact) & (fabre.VAF >= 0.01), "SardID"
].dropna().unique().shape

In [None]:
# Number of distinct donor identifiers in the full table.
fabre["SardID"].unique().shape

In [None]:
# Count, per donor and age bin, the non-artefact variants with VAF >= 0.01.
detected_clones = (
    fabre.loc[
        (~fabre.is_sequencing_artefact) & (fabre.VAF >= 0.01),
        ["VAF", "Age_binned", "SardID"],
    ]
    .groupby(["SardID", "Age_binned"])
    .count()
    .rename(columns={"VAF": "Detected clones"})
    .reset_index()
)
fabre = pd.merge(
    # Drop the column left by a previous run of this cell: merging onto a
    # frame that already has it would create `Detected clones_x` /
    # `Detected clones_y` and the fillna below would raise a KeyError.
    left=fabre.drop(columns="Detected clones", errors="ignore"),
    right=detected_clones,
    how="left",
    on=["SardID", "Age_binned"],
    validate="many_to_one",
)
# Donor/age combinations without any counted variant have zero detected clones.
fabre["Detected clones"] = fabre["Detected clones"].fillna(0).astype(int)

fig, ax = plt.subplots(1, 1, figsize=(4, 3), layout="constrained")
sns.lineplot(
    # One point per donor and age bin, not one per variant row.
    fabre[["SardID", "Age_binned", "Detected clones"]].drop_duplicates(),
    x="Age_binned",
    y="Detected clones",
    legend=False,
    marker="x",
    markeredgewidth=1.2,
    markeredgecolor="orange",
    markerfacecolor="orange",
    ax=ax,
)
ax.set_ylabel("Number of expanded clones")
ax.set_xlabel("Age (years)")
ax.set_ylim([0, 4])
plt.savefig("expanded_clones_robertson.png", dpi=600)
plt.show()
# Summary statistics of the per-donor clone counts within each age bin.
detected_clones.drop(columns="SardID").groupby("Age_binned").describe()

In [None]:
# Write the cleaned table to a date-stamped CSV (day_month_year) and read it
# back to display the exported column names.
today = f"{datetime.date.today():%d_%m_%Y}"
cleaned_csv_path = f"fabre_cleaned_{today}.csv"
fabre.to_csv(cleaned_csv_path, index=False)
pd.read_csv(cleaned_csv_path).columns