In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import datetime
import matplotlib as mpl
from matplotlib.lines import Line2D

## Bick et al. 2020 Nature
The problem is that we don't have age in this table :(

In [None]:
# Supplementary spreadsheet from Bick et al. 2020; row index 7 of the sheet
# is used as the column header.
bick_table_path = (
    "/mnt/c/Users/fra_t/Documents/PhD/hsc/bick2020/41586_2020_2819_MOESM1_ESM.xlsx"
)
header = pd.read_excel(bick_table_path, header=7)
# Print the content of the `Title` column to see which fields are described.
print(header["Title"].tolist())
header

## Uddin et al. 2022 Immunity & Ageing
Here we have age, but only 11 genes... :(

In [None]:
# Supplementary spreadsheet from Uddin et al. 2022; row index 4 of the sheet
# is used as the column header.
uddin_table_path = (
    "/mnt/c/Users/fra_t/Documents/PhD/hsc/uddin2022/12979_2022_278_MOESM2_ESM.xlsx"
)
data = pd.read_excel(uddin_table_path, header=4)
# Discard the first column of the sheet.
data = data.drop(columns=data.columns[0])
data

In [None]:
# Histogram of all VAF values, followed by their summary statistics.
vaf = data["VAF"]
vaf.hist()
vaf.describe()

In [None]:
# Number of rows (variant calls) recorded for each sample.
sample_ids = data["Sample ID"]
sample_ids.value_counts()

In [None]:
# VAF trajectories over age, one line per sample, for a few selected variants.
# Keys are the exact `Annotation` strings used to filter the table; values are
# the titles shown on the corresponding figure.
variants_to_plot = {
    "JAK2:NM_004972:exon14:c.G1849T:p.V617F": "JAK2 V617F",
    "DNMT3A:NM_022552:exon22:c.G2580A:p.W860X": "DNMT3A W860X",
    "ASXL1:NM_015338:exon12:c.3911delT:p.F1305Lfs*145": "ASXL1",
    "DNMT3A:NM_022552:exon23:c.G2645A:p.R882H": "DNMT3A R882H",
}
# Lower VAF cut-off; also used as the lower limit of the (log) y-axis.
min_vaf = 0.001

for annotation, title in variants_to_plot.items():
    fig, ax = plt.subplots(1, 1, figsize=(4, 3))
    sns.lineplot(
        data[(data.Annotation == annotation) & (data.VAF >= min_vaf)],
        x="Age (Blood Draw)",
        y="VAF",
        hue="Sample ID",
        ax=ax,
        # estimator=None draws every observation instead of aggregating per x
        estimator=None,
        # one hue level per sample: the legend is not informative, so skip it
        legend=False,
    )
    ax.set_yscale("log")
    ax.set_ylim([min_vaf, 1])
    ax.set_title(title)
    plt.show()

In [None]:
# All variants: VAF against age at blood draw, coloured by gene, on a log y-axis.
fig, ax = plt.subplots()
sns.scatterplot(data=data, x="Age (Blood Draw)", y="VAF", hue="Gene", ax=ax)
ax.set_yscale("log")
vaf_axis_limits = [9 * 10 ** (-5), 1]
ax.set_ylim(vaf_axis_limits)
plt.show()

## Robertson et al. 2022 Nature Medicine
Get the data from [here](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE178936), use only `1PCT` because we are interested in VAF > 0.01.
Indeed, the other tables `2PCT` are filtered versions of the tables `1PCT`, where only variants with VAF > 0.02 are kept.

Use both synonymous and non-synonymous tables.

To get the age of the donors, we need to do a bit of work. In the paper, they do not distinguish between the cohort `LBC1936` and the cohort `LBC1921`, but the two are different.
To assign the average age to the correct donors, we can use the column `participant_id`.

In [None]:
# set a threshold on valid mutations (see analysis below at the protein level: `search hitchhiking`)
# Variants with an allele frequency (AF) below this value are flagged as
# sequencing artefacts when the Robertson data are processed.
VALID_VARIANT_AF = 0.02

In [None]:
def plot_trajectories_gene(ax, df, genes, colors):
    """Plot AF against age, one line per participant and gene.

    Args:
        ax: matplotlib axes to draw on.
        df: table with the columns `PreferredSymbol`, `Age`, `AF` and
            `participant_id`.
        genes: gene symbols to plot (matched against `PreferredSymbol`).
        colors: one colour per gene, paired with `genes` by position.

    Returns:
        The same `ax`, with the trajectories, an x-label and a gene legend.
    """
    for symbol, colour in zip(genes, colors):
        gene_rows = df.loc[df.PreferredSymbol == symbol, ["Age", "AF", "participant_id"]]
        for donor in gene_rows.participant_id.unique():
            # all rows of this donor for this gene are joined into one line
            is_donor = gene_rows.participant_id == donor
            ax.plot(
                gene_rows.loc[is_donor, "Age"],
                gene_rows.loc[is_donor, "AF"],
                color=colour,
                marker=".",
                alpha=0.8,
            )
    ax.set_xlabel("Age (years)")
    # one proxy handle per gene, since many lines share the same colour
    legend_handles = [
        Line2D([0], [0], color=colour, lw=1, marker=".", label=symbol)
        for symbol, colour in zip(genes, colors)
    ]
    ax.legend(handles=legend_handles, frameon=True, fontsize=8)
    return ax

In [None]:
# A) load data
# The two tables are read separately and tagged with a `synonymous` flag so
# that they can be told apart after concatenation.
# 1. synonymous
data_synonymous = pd.read_csv(
    "/mnt/c/Users/fra_t/Documents/PhD/hsc/robertson2022/GSE178936_LBC_ARCHER.1PCT_VAF.Feb22.synonymous.tsv",
    sep="\t",
)
data_synonymous["synonymous"] = True
# 2. non-synonymous
data = pd.read_csv(
    "/mnt/c/Users/fra_t/Documents/PhD/hsc/robertson2022/GSE178936_LBC_ARCHER.1PCT_VAF.Feb22.non-synonymous.tsv",
    sep="\t",
)
data["synonymous"] = False
# 3. concat
data = pd.concat([data, data_synonymous], axis=0)
data.reset_index(inplace=True, drop=True)

# B) processing
# cohort = the part of `participant_id` between "CHIP_" and the last underscore;
# expand=False returns a Series (not a one-column DataFrame) for the assignment
data["cohort"] = data.participant_id.str.extract(
    r"CHIP_(.+)_.*", expand=False
).astype("category")
# waves start at timepoint 0 and increase by 3 years at every timepoint
# (wave 1 -> 0, wave 2 -> 3, ..., wave 5 -> 12)
mapping = {i: ele for i, ele in enumerate((range(0, 3 * 5, 3)), 1)}
data["Age"] = data.wave.map(mapping)
# shift by the starting age used for each cohort
data.loc[data.cohort == "LBC21", "Age"] += 79
data.loc[data.cohort == "LBC36", "Age"] += 70
# keep only the variants that have a protein-level annotation
data.dropna(subset="HGVSp", inplace=True)
# drop patients who received chemotherapy (see Robertson et al. 2022 Fig1H):
# find the two participants that received it based on the VAF of JAK2
# (to inspect the candidates:
#  data[data.PreferredSymbol == "JAK2"].sort_values(by="AF", ascending=False))
# this will remove data that have mutations on many genes such as TP53, KMT2A, DNMT3A and NOTCH1
data.drop(
    index=data[
        data.participant_id.isin(
            data[
                (data.PreferredSymbol == "JAK2") & (data.AF > 0.4)
            ].participant_id.to_list()
        )
    ].index,
    inplace=True,
)
# drop donors at age 82 because not enough data
print(data.Age.value_counts().sort_index())
data.drop(index=data[data.Age == 82].index, inplace=True)
print("Dropping donors age 82 because not enough data")
# set artefact based on the paper and the figures shown later on
data["is_sequencing_artefact"] = data.AF < VALID_VARIANT_AF
print(
    f"{data.is_sequencing_artefact.sum() / data.shape[0]:.2%} of entries with VAF lower than {VALID_VARIANT_AF}"
)
# NOTE(review): the author was not sure about the direction of this test —
# confirm how `AF_Outlier_Pvalue` should be interpreted
data["is_outlier"] = data.AF_Outlier_Pvalue >= 0.05  # not sure about this
print(f"{data.is_outlier.sum() / data.shape[0]:.2%} percentage of outliers?")
print(f"{data.PreferredSymbol.unique().shape[0]} unique genes")
# clone frequency is taken as twice the allele frequency
data["clone frequency"] = data.AF * 2
print(f"{data.participant_id.unique().shape[0]} donors")
print(
    f"{data.participant_id.unique().shape[0] - data.loc[~data.is_sequencing_artefact, 'participant_id'].unique().shape[0]} donors without any valid variant"
)
print(f"[{data.Age.min()}-{data.Age.max()}] age interval")
# age distribution of the retained (non-artefact) entries
data[~data.is_sequencing_artefact].Age.hist(bins=range(50, 105, 2))
data

In [None]:
# AF distribution of the retained variants, split by synonymous status.
valid_variants = data[~data.is_sequencing_artefact]
fig, ax = plt.subplots(1, 1, figsize=(4, 3))
sns.histplot(valid_variants, x="AF", hue="synonymous", binwidth=0.01, ax=ax)
plt.show()

# LBC21 are older donors
af_by_cohort = valid_variants[["AF", "cohort"]].groupby("cohort", observed=False)
af_by_cohort.describe()

In [None]:
# the most present genes at patient level?
data.loc[
    ~data.is_sequencing_artefact, ["participant_id", "PreferredSymbol", "AF"]
].groupby(["participant_id", "PreferredSymbol"]).count().sort_values(
    by="AF", ascending=False
).reset_index().PreferredSymbol.value_counts()

In [None]:
# increasing VAF at first, then decreasing
fig, ax = plt.subplots(1, 1, layout="constrained", figsize=(4, 3))
sns.lineplot(
    data.loc[(~data.is_sequencing_artefact) & (~data.synonymous), ["AF", "Age"]],
    x="Age",
    y="AF",
    marker=".",
    markersize=5,
    markeredgewidth=2,
    markeredgecolor="orange",
    markerfacecolor="orange",
)
data.loc[~data.is_sequencing_artefact, ["AF", "Age"]].groupby("Age").describe()

In [None]:
# Per-donor AF trajectories for four genes, on retained non-synonymous variants.
genes = ["TET2", "DNMT3A", "JAK2", "NF1"]
colors = mpl.colormaps["Dark2"].colors[: len(genes)]
valid_nonsynonymous = data[(~data.is_sequencing_artefact) & (~data.synonymous)]

fig, ax = plt.subplots(1, 1, figsize=(4, 3), layout="constrained")
ax = plot_trajectories_gene(ax, valid_nonsynonymous, genes, colors)
ax.set_ylabel("Clones VAF")
plt.savefig("genes_trajectories.png", dpi=600)
plt.show()

### Biggest clones assuming donors are the same process

In [None]:
# for each donor, get the largest frequency. Then, aggregate per age to compute
# some stats on those data
# largest clone per patient: maximum AF per (donor, age) among the retained,
# non-synonymous variants
largest_clones = (
    data.loc[
        (~data.is_sequencing_artefact) & (~data.synonymous),
        ["participant_id", "AF", "Age"],
    ]
    .groupby(["participant_id", "Age"])
    .max()
).reset_index()
largest_clones["is_largest_clone"] = True

# Drop a flag left by a previous run of this cell: otherwise the merge would
# create `is_largest_clone_x` / `is_largest_clone_y` and the cell would fail.
data = data.drop(columns="is_largest_clone", errors="ignore")
shape_bf_merge = data.shape
print(shape_bf_merge)
# Left merge on the AF value itself: every row of `data` whose AF equals the
# maximum of its (donor, age) group receives the flag.
data = pd.merge(
    left=data,
    right=largest_clones,
    how="left",
    on=["participant_id", "Age", "AF"],
    validate="many_to_one",
)
print(data.shape)
# a many-to-one left merge must not add or remove rows
assert data.shape[0] == shape_bf_merge[0]
# unmatched rows hold NaN after the merge, matched rows hold True
data["is_largest_clone"] = data["is_largest_clone"].notna()

In [None]:
# Largest clone frequency as a function of age, with a bootstrapped 95% CI.
largest_clone_rows = data.loc[
    data.is_largest_clone, ["participant_id", "Age", "clone frequency"]
]
orange_dot = dict(
    marker=".",
    markersize=5,
    markeredgewidth=2,
    markeredgecolor="orange",
    markerfacecolor="orange",
)
fig, ax = plt.subplots(1, 1, figsize=(4, 3), layout="constrained")
sns.lineplot(
    largest_clone_rows,
    x="Age",
    y="clone frequency",
    errorbar=("ci", 95),
    # fixed seed so that the bootstrapped interval is reproducible
    n_boot=10000,
    seed=10,
    legend=False,
    ax=ax,
    **orange_dot,
)
ax.set_xlabel("Age (years)")
ax.set_ylabel("Largest clone frequency")
ax.set_xlim([68, 92])
plt.savefig("largest_clone_avg_robertson.png", dpi=600)
plt.show()

# summary statistics of the largest AF per age
largest_af_by_age = largest_clones.drop(
    columns=["participant_id", "is_largest_clone"]
).groupby("Age")
largest_af_by_age.describe()

In [None]:
# are the genes from 79 different from genes at age 88?
# Count, for each age and gene, the rows flagged as largest clone.
largest_clone_genes = data.loc[
    data.is_largest_clone, ["Age", "participant_id", "PreferredSymbol"]
]
gene_occurrences = (
    largest_clone_genes.groupby(["Age", "PreferredSymbol"]).count().reset_index()
)
for age in (79, 88):
    at_this_age = gene_occurrences.loc[
        gene_occurrences.Age == age, ["PreferredSymbol", "participant_id"]
    ]
    print(age)
    print(at_this_age.sort_values(by="PreferredSymbol"))

In [None]:
# Distribution of the largest clone frequency across donors, one figure per age.
for age in sorted(data.Age.unique()):
    fig, ax = plt.subplots(1, 1, layout="constrained", figsize=(4, 3))
    # Several rows of one donor can be flagged (ties on AF), so deduplicate per
    # donor. Deduplicating on the frequency alone would also collapse different
    # donors that happen to share the same value and under-count them.
    data_age = data.loc[
        (data.Age == age) & (data.is_largest_clone),
        ["participant_id", "clone frequency"],
    ].drop_duplicates()
    sns.histplot(data_age, x="clone frequency", bins=np.arange(0, 1.1, 0.05), ax=ax)
    ax.set_xlim([0, 1])
    ax.set_title(f"{age} y.o.")
    ax.set_xlabel("Largest clone frequency")
    plt.savefig(f"largest_clone_distr_per_timepoint_robertson_{age}years.svg")
    plt.show()

In [None]:
# largest clone per donor
# Only the largest clones with AF of at least 0.06 are shown, one line per gene.
sizeable_largest_clones = data.loc[
    data.is_largest_clone & (data.AF >= 0.06),
    ["AF", "Age", "participant_id", "PreferredSymbol"],
]
blue_cross = dict(
    marker="x", markersize=3, markeredgewidth=1, markeredgecolor="blue"
)
fig, ax = plt.subplots(1, 1, layout="constrained")
g = sns.lineplot(
    sizeable_largest_clones,
    x="Age",
    y="AF",
    hue="PreferredSymbol",
    palette="Dark2",
    errorbar=("ci", 95),
    ax=ax,
    **blue_cross,
)
ax.set_ylim([0.05, 0.5])
plt.show()

In [None]:
# largest clone per donor, select only the rows with AF > 0.0601
# NOTE(review): the cut-off here is a strict 0.0601 while the axis label says
# "VAF>0.06" — confirm which threshold is intended.
largest_largest_clones = data.loc[
    (data.is_largest_clone) & (data.AF > 0.0601),
    ["AF", "Age", "participant_id", "PreferredSymbol"],
]
genes = largest_largest_clones.PreferredSymbol.unique()
# Dark2 is a short qualitative palette. Cycle through it so that every gene
# gets a colour: a plain slice would be shorter than `genes` when there are
# more genes than colours, and `zip(genes, colors)` inside
# `plot_trajectories_gene` would then silently skip the remaining genes.
palette = mpl.colormaps["Dark2"].colors
colors = [palette[i % len(palette)] for i in range(len(genes))]
fig, ax = plt.subplots(1, 1, figsize=(4, 3), layout="constrained")
ax = plot_trajectories_gene(ax, largest_largest_clones, genes, colors)
ax.set_ylabel("Largest clones VAF (VAF>0.06)")
plt.savefig("largest_clones_genes_trajectories.png", dpi=600)
plt.show()

# largest clone per donor, select all genes (one line per donor)
fig, ax = plt.subplots(1, 1, figsize=(4, 3), layout="constrained")
sns.lineplot(
    data.loc[data.is_largest_clone, ["AF", "Age", "participant_id"]],
    x="Age",
    y="AF",
    hue="participant_id",
    legend=False,
    ax=ax,
    marker="x",
    markersize=3,
    markeredgewidth=1,
    markeredgecolor="blue",
)
ax.set_ylim([0, 0.5])
ax.set_ylabel("Largest clones VAF")
ax.set_xlabel("Age (years)")
plt.show()

In [None]:
# Number of variants with AF >= 0.01 per donor and age, *including* the
# variants flagged as sequencing artefacts.
artefact_count_col = "Detected clones with artefacts"
detected_clones = (
    data.loc[data.AF >= 0.01, ["AF", "Age", "participant_id"]]
    .groupby(["participant_id", "Age"])
    .count()
    .rename(columns={"AF": artefact_count_col})
    .reset_index()
)
data = pd.merge(
    # drop the count left by a previous run of this cell, otherwise the merge
    # would produce `_x` / `_y` suffixed columns and the next line would fail
    left=data.drop(columns=artefact_count_col, errors="ignore"),
    right=detected_clones,
    how="left",
    on=["participant_id", "Age"],
    validate="many_to_one",
)
# (donor, age) pairs absent from `detected_clones` have no such variant: count 0
data[artefact_count_col] = data[artefact_count_col].fillna(0).astype(int)
detected_clones.groupby("Age").describe()

In [None]:
# Number of retained (non-artefact) variants with AF >= 0.01 per donor and age.
clone_count_col = "Detected clones"
detected_clones = (
    data.loc[
        (~data.is_sequencing_artefact) & (data.AF >= 0.01),
        ["AF", "Age", "participant_id"],
    ]
    .groupby(["participant_id", "Age"])
    .count()
    .rename(columns={"AF": clone_count_col})
    .reset_index()
)
data = pd.merge(
    # drop the count left by a previous run of this cell, otherwise the merge
    # would produce `_x` / `_y` suffixed columns and the next line would fail
    left=data.drop(columns=clone_count_col, errors="ignore"),
    right=detected_clones,
    how="left",
    on=["participant_id", "Age"],
    validate="many_to_one",
)
# (donor, age) pairs absent from `detected_clones` have no retained variant: count 0
data[clone_count_col] = data[clone_count_col].fillna(0).astype(int)
detected_clones.groupby("Age").describe()

In [None]:
# TODO: I think we should drop age > 81 if we want to compare to Fig3F
# One row per (donor, age, count) so that each donor contributes once per age.
clones_per_donor_age = data[
    ["participant_id", "Age", "Detected clones"]
].drop_duplicates()
black_cross = dict(marker="x", markeredgewidth=1.2, markeredgecolor="black")

fig, ax = plt.subplots(1, 1, figsize=(4, 3), layout="constrained")
sns.lineplot(
    clones_per_donor_age,
    x="Age",
    y="Detected clones",
    legend=False,
    ax=ax,
    **black_cross,
)
ax.set_xlabel("Age (years)")
ax.set_ylabel("Number of expanded clones")
ax.set_ylim([0, 3])
plt.savefig("expanded_clones_robertson.png", dpi=600)
plt.show()

In [None]:
# Write the processed table to a CSV stamped with today's date (dd_mm_yyyy),
# then read it back to show the columns that were exported.
today = datetime.date.today().strftime("%d_%m_%Y")
cleaned_csv = f"robertson_cleaned_{today}.csv"
data.to_csv(cleaned_csv, index=False)
pd.read_csv(cleaned_csv).columns

In [None]:
# hitchhiking: how can we distinguish between hitchhikers and clones?
# this is probably not hitchhiking otherwise we would observe that also
# with higher AF, which is not the case.
# Based on this we discard all variants with AF < 0.02 for the largest clones
genes = sorted(["KMT2A", "DNMT3A", "NOTCH1", "TP53", "NF1", "RAD21"])
genes_tag = "_".join(genes)  # prefix of the saved figure names
# donors with more than 30 detected variants (artefacts included) at some timepoint
many_variant_donors = data.loc[
    data["Detected clones with artefacts"] > 30, "participant_id"
].unique()

for participant in many_variant_donors:
    is_participant = data["participant_id"] == participant
    participant_ages = data.loc[is_participant, "Age"]
    xlims = (participant_ages.min() - 1, participant_ages.max() + 1)
    # variants of this donor in the selected genes
    tmp = data.loc[
        is_participant & (data["PreferredSymbol"].isin(genes)),
        ["AF", "HGVSp", "Age", "PreferredSymbol"],
    ]
    ylims = 0, tmp.AF.max() + 0.001

    # one panel per gene, one line per protein-level variant (HGVSp)
    fig, axes = plt.subplots(
        2, 3, figsize=(6, 3.5), layout="constrained", sharey=True, sharex=True
    )
    for gene, ax in zip(genes, axes.ravel()):
        sns.lineplot(
            data=tmp[tmp.PreferredSymbol == gene],
            x="Age",
            y="AF",
            hue="HGVSp",
            legend=False,
            ax=ax,
            marker="x",
            markersize=3,
            markeredgewidth=1,
            markeredgecolor="blue",
        )
        ax.set_title(gene)
        ax.set_ylim(ylims)
        ax.set_xlim(xlims)
        # shade the band of AF values between 0 and 0.01 in red
        ax.fill_between(
            x=np.arange(xlims[0], xlims[1] + 1),
            y1=0,
            y2=0.01,
            color="red",
            alpha=0.1,
            edgecolor="face",
        )
    fig.suptitle(participant)
    plt.savefig(f"{genes_tag}_{participant}.png", dpi=600)
    plt.show()

In [None]:
# JAK2 variants: AF against age, one line per donor and one panel per variant
# classification, leaving out the "RNA" and "5'UTR" classes.
is_jak2 = data["PreferredSymbol"] == "JAK2"
is_excluded_class = data.Variant_Classification.isin(["RNA", "5'UTR"])
jak2_variants = data.loc[is_jak2 & (~is_excluded_class)]
sns.relplot(
    jak2_variants,
    kind="line",
    x="Age",
    y="AF",
    hue="participant_id",
    col="Variant_Classification",
    marker=".",
)
plt.show()