In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import datetime

## Bick et al. 2020 Nature
The problem is that we don't have the donors' age in this table :(

In [None]:
# Load the supplementary table of Bick et al. 2020; `header=7` makes pandas take
# the column labels from the 8th row of the sheet.
bick_table = "/mnt/c/Users/fra_t/Documents/PhD/hsc/bick2020/41586_2020_2819_MOESM1_ESM.xlsx"
header = pd.read_excel(bick_table, header=7)
# Print the full content of the `Title` column, then display the table itself.
print(header["Title"].tolist())
header

## Uddin et al. 2022 Immunity & Ageing
Here we have age, but only 11 genes... :(

In [None]:
# Load the supplementary table of Uddin et al. 2022; `header=4` makes pandas take
# the column labels from the 5th row of the sheet.
uddin_table = "/mnt/c/Users/fra_t/Documents/PhD/hsc/uddin2022/12979_2022_278_MOESM2_ESM.xlsx"
data = pd.read_excel(uddin_table, header=4)
# Discard the first column of the sheet.
first_column = data.columns[0]
data = data.drop(columns=first_column)
data

In [None]:
# Distribution of the variant allele frequencies: histogram plus summary statistics.
vaf = data["VAF"]
vaf.hist()
vaf.describe()

In [None]:
# Number of rows recorded for each sample.
sample_ids = data["Sample ID"]
sample_ids.value_counts()

In [None]:
def plot_variant_trajectories(variants, annotation, title, min_vaf=0.001):
    """Plot VAF against age for one annotated variant, one line per sample.

    Parameters
    ----------
    variants : pandas.DataFrame
        Table with the columns `Annotation`, `VAF`, `Age (Blood Draw)` and
        `Sample ID`.
    annotation : str
        Exact value of the `Annotation` column selecting the variant to plot.
    title : str
        Title of the figure.
    min_vaf : float, default 0.001
        Rows with a VAF below this value are not plotted; it is also the lower
        limit of the (log-scaled) y axis.
    """
    fig, ax = plt.subplots(1, 1, figsize=(4, 3))
    selected = variants[
        (variants.Annotation == annotation) & (variants.VAF >= min_vaf)
    ]
    sns.lineplot(
        selected,
        x="Age (Blood Draw)",
        y="VAF",
        hue="Sample ID",
        ax=ax,
        # no aggregation: every observation is drawn
        estimator=None,
    )
    # the hue legend would list every sample, hide it
    ax.legend().set_visible(False)
    ax.set_yscale("log")
    ax.set_ylim([min_vaf, 1])
    ax.set_title(title)
    plt.show()


# (annotation in the table, figure title) for the variants of interest
VARIANTS_TO_PLOT = [
    ("JAK2:NM_004972:exon14:c.G1849T:p.V617F", "JAK2 V617F"),
    ("DNMT3A:NM_022552:exon22:c.G2580A:p.W860X", "DNMT3A W860X"),
    ("ASXL1:NM_015338:exon12:c.3911delT:p.F1305Lfs*145", "ASXL1"),
    ("DNMT3A:NM_022552:exon23:c.G2645A:p.R882H", "DNMT3A R882H"),
]

for annotation, title in VARIANTS_TO_PLOT:
    plot_variant_trajectories(data, annotation, title)

In [None]:
# All variants of the table: VAF against age at blood draw, coloured by gene,
# on a logarithmic y axis.
fig, ax = plt.subplots(1, 1)
sns.scatterplot(data=data, x="Age (Blood Draw)", y="VAF", hue="Gene", ax=ax)
ax.set_yscale("log")
lower_vaf, upper_vaf = 9 * 10 ** (-5), 1
ax.set_ylim([lower_vaf, upper_vaf])
plt.show()

## Robertson et al. 2022 Nature Medicine
Get the data from [here](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE178936), use only `1PCT` because we are interested in VAF > 0.01.
Indeed, the other tables `2PCT` are filtered versions of the tables `1PCT`, where only variants with VAF > 0.02 are kept.

Use both synonymous and non-synonymous tables.

To get the age of the donors, we need to do a bit of work. In the paper, they do not discriminate between the cohort `LBC1936` and the cohort `LBC1921` but those two are different.
To assign the average age to the correct donors, we can use the column `participant_id`.

In [None]:
# Second sheet of the supplementary workbook of Robertson et al. 2022, with the
# column labels taken from the 7th row of the sheet.
robertson_workbook = "/mnt/c/Users/fra_t/Documents/PhD/hsc/robertson2022/41591_2022_1883_MOESM3_ESM.xlsx"
drivers = pd.read_excel(robertson_workbook, sheet_name=1, header=6)
drivers

In [None]:
# Summary statistics of `Largest_VAF` for every wave.
wave_vaf = drivers[["wave", "Largest_VAF"]]
wave_vaf.groupby("wave").describe()

In [None]:
# Mean `Largest_VAF` per wave with a bootstrapped 95% confidence interval
# (seeded, so the interval is reproducible).
fig, ax = plt.subplots(1, 1)
sns.lineplot(
    data=drivers,
    x="wave",
    y="Largest_VAF",
    marker="x",
    errorbar=("ci", 95),
    n_boot=10000,
    seed=10,
    legend=False,
    ax=ax,
)
wave_ticks = [1, 2, 3, 4, 5]
ax.set_xticks(wave_ticks)
ax.set_ylim([0, 0.4])
plt.show()

In [None]:
# --- Load ---------------------------------------------------------------------
# Synonymous and non-synonymous `1PCT` tables; each gets a boolean `synonymous`
# flag before the two are stacked into a single frame.
data_synonymous = pd.read_csv(
    "/mnt/c/Users/fra_t/Documents/PhD/hsc/robertson2022/GSE178936_LBC_ARCHER.1PCT_VAF.Feb22.synonymous.tsv",
    sep="\t",
)
data_synonymous["synonymous"] = True

data_non_synonymous = pd.read_csv(
    "/mnt/c/Users/fra_t/Documents/PhD/hsc/robertson2022/GSE178936_LBC_ARCHER.1PCT_VAF.Feb22.non-synonymous.tsv",
    sep="\t",
)
data_non_synonymous["synonymous"] = False

# non-synonymous rows first, then the synonymous ones, with a fresh 0..n-1 index
data = pd.concat([data_non_synonymous, data_synonymous], axis=0, ignore_index=True)

# --- Process ------------------------------------------------------------------
# The cohort label is the part of `participant_id` between "CHIP_" and the last "_".
cohort_labels = data.participant_id.str.extract(r"CHIP_(.+)_.*").astype("category")
data["cohort"] = cohort_labels

# Waves 1..5 are mapped to 0, 3, 6, 9, 12 years elapsed since the first wave.
mapping = {wave: 3 * (wave - 1) for wave in range(1, 6)}
data["Age"] = data.wave.map(mapping)
# Shift by a cohort-specific offset to obtain the age
# (presumably the average age of each cohort at the first wave; confirm).
for cohort_name, age_offset in (("LBC21", 79), ("LBC36", 70)):
    data.loc[data.cohort == cohort_name, "Age"] += age_offset

# Keep only the rows with a protein-level annotation.
data = data.dropna(subset="HGVSp")
data

In [None]:
# Drop the participants that received chemotherapy (see Robertson et al. 2022 Fig1H).
# They are identified through their JAK2 allele frequency: every participant with
# a JAK2 variant at AF > 0.4 is removed, together with ALL their variants.
# This also removes mutations on many other genes such as TP53, KMT2A, DNMT3A
# and NOTCH1; the affected genes are printed below.
# To inspect the JAK2 frequencies, run in a separate cell:
#   data[data.PreferredSymbol == "JAK2"].sort_values(by="AF", ascending=False)
chemo_participants = data.loc[
    (data.PreferredSymbol == "JAK2") & (data.AF > 0.4), "participant_id"
].unique()
# computed once and reused for both the report and the removal
is_chemo = data.participant_id.isin(chemo_participants)

lost = data.loc[is_chemo, "PreferredSymbol"].unique().tolist()
print("loosing data on genes: ", lost)
data.drop(index=data.index[is_chemo], inplace=True)

In [None]:
# For every (participant, age) pair keep the row with the largest AF among the
# variants with AF > 0.01, then summarise those maxima per age.
above_threshold = data[data.AF > 0.01]
ranked = above_threshold.sort_values(["participant_id", "Age", "AF"], ascending=False)
# rows are sorted by decreasing AF within each group, so the first one is the maximum
t = ranked[["participant_id", "AF", "Age"]].groupby(["participant_id", "Age"]).head(1)
t[["AF", "Age"]].groupby("Age").describe()

In [None]:
# Trajectory of the largest AF (among variants with AF > 0.01) of every donor.
largest_af_per_visit = (
    data[data.AF > 0.01]
    .sort_values(["participant_id", "Age", "AF"], ascending=False)[
        ["participant_id", "AF", "Age"]
    ]
    .groupby(["participant_id", "Age"])
    # sorted by decreasing AF, so the first row of each group holds the maximum
    .head(1)
)
fig, ax = plt.subplots(1, 1)
sns.lineplot(
    data=largest_af_per_visit,
    x="Age",
    y="AF",
    hue="participant_id",
    palette="Blues",
    marker="x",
    markersize=3,
    markeredgewidth=1,
    markeredgecolor="blue",
    legend=False,
    ax=ax,
)
plt.show()

In [None]:
# Histogram of the strictly positive AF values (bins of width 0.01), split by
# the `synonymous` flag.
positive_af = data[data.AF > 0]
fig, ax = plt.subplots(1, 1, figsize=(4, 3))
sns.histplot(data=positive_af, x="AF", hue="synonymous", binwidth=0.01, ax=ax)
plt.show()

In [None]:
# Summary statistics of AF for synonymous vs non-synonymous variants.
af_and_flag = data[["AF", "synonymous"]]
af_and_flag.groupby("synonymous", observed=False).describe()

In [None]:
# Summary statistics of AF for each cohort (`observed=False` keeps every category).
af_and_cohort = data[["AF", "cohort"]]
af_and_cohort.groupby("cohort", observed=False).describe()

In [None]:
# Summary statistics of AF for each age.
af_and_age = data[["AF", "Age"]]
af_and_age.groupby("Age").describe()

In [None]:
# Summary statistics of AF for each age (grouping key given as a one-element list).
af_and_age = data[["AF", "Age"]]
af_and_age.groupby(["Age"]).describe()

In [None]:
# The number of rows per age (printed before and after) motivates discarding
# the observations taken at age 82.
print(data.Age.value_counts())
rows_at_82 = data.index[data.Age == 82]
data.drop(index=rows_at_82, inplace=True)
print(data.Age.value_counts())

### Biggest clones assuming donors are the same process

In [None]:
# For each donor and age, take the largest allele frequency among the variants
# with AF >= 0.01, then aggregate per age to compute some statistics.
above_threshold = data.loc[data.AF >= 0.01, ["participant_id", "AF", "Age"]]
largest_clones = above_threshold.groupby(["participant_id", "Age"]).max().reset_index()
# The clone frequency is taken as twice the allele frequency
# (presumably assuming heterozygous mutations; confirm).
largest_clones["Largest clone frequency"] = 2 * largest_clones.AF
largest_clones = largest_clones.drop(columns="AF")

# Attach the value to every variant row of the same donor and age.
# * Any `Largest clone frequency` column left by a previous run of this cell is
#   dropped first, otherwise the merge would produce `_x`/`_y` suffixed columns
#   and the cells below would fail.
# * The merge is an inner join (pandas default): rows of (participant, age)
#   pairs without any variant at AF >= 0.01 are removed from `data`.
data = pd.merge(
    left=data.drop(columns="Largest clone frequency", errors="ignore"),
    right=largest_clones,
    on=["participant_id", "Age"],
    validate="many_to_one",
)
largest_clones.groupby("Age").describe()

In [None]:
# One histogram of the largest clone frequency per age, saved as SVG.
frequency_bins = np.arange(0, 1.1, 0.1)
donor_columns = ["participant_id", "Age", "Largest clone frequency"]
for age in sorted(data.Age.unique()):
    # duplicated (participant, age, frequency) rows are removed before plotting
    data_age = data.loc[data.Age == age, donor_columns].drop_duplicates()
    fig, ax = plt.subplots(1, 1, layout="constrained", figsize=(4, 3))
    sns.histplot(data=data_age, x="Largest clone frequency", bins=frequency_bins, ax=ax)
    ax.set_xlim([0, 1])
    ax.set_title(f"{age} y.o.")
    ax.set_xlabel("Largest clone frequency")
    plt.savefig(f"largest_clone_distr_per_timepoint_robertson_{age}years.svg")
    plt.show()

In [None]:
# Mean largest clone frequency per age with a bootstrapped 95% confidence
# interval (seeded); duplicated (participant, age, frequency) rows are removed first.
largest_per_donor = data[
    ["participant_id", "Age", "Largest clone frequency"]
].drop_duplicates()
fig, ax = plt.subplots(1, 1, figsize=(4, 3), layout="constrained")
sns.lineplot(
    data=largest_per_donor,
    x="Age",
    y="Largest clone frequency",
    errorbar=("ci", 95),
    n_boot=10000,
    seed=10,
    marker="x",
    markersize=3,
    markeredgewidth=1,
    markeredgecolor="blue",
    legend=False,
    ax=ax,
)
ax.set_xlabel("Age (years)")
ax.set_xlim([68, 92])
ax.set_ylim([0, 1])
plt.savefig("largest_clone_avg_robertson.png", dpi=600)
plt.show()

In [None]:
# Number of variants with AF >= 0.01 for every donor and age.
above_threshold = data.loc[data.AF >= 0.01, ["AF", "Age", "participant_id"]]
detected_clones = (
    above_threshold.groupby(["participant_id", "Age"])
    .count()
    .rename(columns={"AF": "Detected clones"})
    .reset_index()
)

# Attach the count to every variant row of the same donor and age.
# * Any `Detected clones` column left by a previous run of this cell is dropped
#   first, otherwise the merge would produce `_x`/`_y` suffixed columns and the
#   cells below would fail.
# * The merge is an inner join (pandas default): rows of (participant, age)
#   pairs without any variant at AF >= 0.01 are removed from `data`.
data = pd.merge(
    left=data.drop(columns="Detected clones", errors="ignore"),
    right=detected_clones,
    on=["participant_id", "Age"],
    validate="many_to_one",
)
detected_clones.groupby("Age").describe()

In [None]:
# Number of detected clones against age, one value per distinct
# (participant, age, count) row.
# TODO: I think we should drop age > 81 if we want to compare to Fig3F
clones_per_donor = data[["participant_id", "Age", "Detected clones"]].drop_duplicates()
fig, ax = plt.subplots(1, 1, figsize=(4, 3), layout="constrained")
sns.lineplot(
    data=clones_per_donor,
    x="Age",
    y="Detected clones",
    marker="x",
    markeredgewidth=1.2,
    markeredgecolor="black",
    legend=False,
    ax=ax,
)
ax.set_xlabel("Age (years)")
ax.set_ylabel("Number of expanded clones")
ax.set_ylim([10, 40])
plt.savefig("expanded_clones_robertson.png", dpi=600)
plt.show()

In [None]:
# Number of NON-synonymous variants with AF >= 0.01 for every donor and age.
non_syn_above_threshold = data.loc[
    (data.AF >= 0.01) & (~data.synonymous), ["AF", "Age", "participant_id"]
]
detected_clones_non_syn = (
    non_syn_above_threshold.groupby(["participant_id", "Age"])
    .count()
    .rename(columns={"AF": "Detected clones non synonymous"})
    .reset_index()
)

# Attach the count to every variant row of the same donor and age.
# * Any `Detected clones non synonymous` column left by a previous run of this
#   cell is dropped first, otherwise the merge would produce `_x`/`_y` suffixed
#   columns and the cells below would fail.
# * The merge is an inner join (pandas default): rows of (participant, age)
#   pairs without any non-synonymous variant at AF >= 0.01 are removed from `data`.
data = pd.merge(
    left=data.drop(columns="Detected clones non synonymous", errors="ignore"),
    right=detected_clones_non_syn,
    on=["participant_id", "Age"],
    validate="many_to_one",
)
# 446?? hitchhiking!
print(data["Detected clones"].unique())
print(data["Detected clones non synonymous"].unique())
detected_clones_non_syn.groupby("Age").describe()

In [None]:
# Number of detected non-synonymous clones against age, one value per distinct
# (participant, age, count) row.
# TODO: I think we should drop age > 81 if we want to compare to Fig3F
non_syn_per_donor = data[
    ["participant_id", "Age", "Detected clones non synonymous"]
].drop_duplicates()
fig, ax = plt.subplots(1, 1, figsize=(4, 3), layout="constrained")
sns.lineplot(
    data=non_syn_per_donor,
    x="Age",
    y="Detected clones non synonymous",
    marker="x",
    markeredgewidth=1.2,
    markeredgecolor="black",
    legend=False,
    ax=ax,
)
ax.set_xlabel("Age (years)")
ax.set_ylabel("Number of expanded clones")
ax.set_ylim([0, 35])
plt.savefig("expanded_clones_nonsynonymous_robertson.png", dpi=600)
plt.show()

In [None]:
# Largest clone frequency against age, one line per participant.
trajectories = data[
    ["participant_id", "Age", "Largest clone frequency"]
].drop_duplicates()
fig, ax = plt.subplots(1, 1, figsize=(4, 3), layout="constrained")
sns.lineplot(
    data=trajectories,
    x="Age",
    y="Largest clone frequency",
    hue="participant_id",
    marker="x",
    markersize=3,
    markeredgewidth=1,
    markeredgecolor="blue",
    legend=False,
    ax=ax,
)
ax.set_xlabel("Age (years)")
ax.set_ylim([0, 1])
plt.savefig("largest_clone_robertson.png", dpi=600)
plt.show()

In [None]:
# Save the cleaned table to a CSV stamped with today's date (day_month_year),
# then read it back to display the column labels that were written.
today = datetime.date.today().strftime("%d_%m_%Y")
cleaned_csv = f"robertson_cleaned_{today}.csv"
data.to_csv(cleaned_csv, index=False)
pd.read_csv(cleaned_csv).columns

### Trajectories

#### Protein substitution

In [None]:
# JAK2 p.Val617Phe: AF against age (line plot) and per-age summary statistics.
is_jak2_v617f = (data["PreferredSymbol"] == "JAK2") & (
    data["protein_substitution"] == "p.Val617Phe"
)
sns.lineplot(data=data[is_jak2_v617f], x="Age", y="AF")
data.loc[is_jak2_v617f, ["AF", "Age"]].groupby("Age").describe()

In [None]:
# AF against age for one protein-level variant, followed by the number of rows
# recorded for every HGVSp annotation.
selected_variant = data[data.HGVSp == "NP_005924.2:p.Pro773ArgfsTer8"]
sns.lineplot(data=selected_variant, x="Age", y="AF")
data.HGVSp.value_counts()

#### Gene level

In [None]:
# Rows describing missense mutations on DNMT3A.
is_dnmt3a = data["PreferredSymbol"] == "DNMT3A"
is_missense = data.Variant_Classification == "Missense_Mutation"
data[is_dnmt3a & is_missense]

In [None]:
# Participant CHIP_LBC36_037: summary statistics of AF for every variant
# (HGVSp) with AF >= 0.01.
is_lbc36_037 = data["participant_id"] == "CHIP_LBC36_037"
is_detected = data.AF >= 0.01
data.loc[is_lbc36_037 & is_detected, ["AF", "HGVSp"]].groupby("HGVSp").describe()

In [None]:
# Participant CHIP_LBC36_037: AF, gene and age of every variant with AF >= 0.01.
is_lbc36_037 = data["participant_id"] == "CHIP_LBC36_037"
is_detected = data.AF >= 0.01
data.loc[is_lbc36_037 & is_detected, ["AF", "PreferredSymbol", "Age"]]

In [None]:
# Which genes are found in the largest number of participants?
# Step 1: one row per (participant, gene) pair, ordered by decreasing number of
# AF values recorded for the pair.
rows_per_donor_gene = (
    data[["participant_id", "PreferredSymbol", "AF"]]
    .groupby(["participant_id", "PreferredSymbol"])
    .count()
    .sort_values(by="AF", ascending=False)
    .reset_index()
)
# Step 2: count in how many of those pairs each gene appears; show the top 20.
rows_per_donor_gene.PreferredSymbol.value_counts().head(n=20)

In [None]:
# hitchhiking: how can we distinguish between hitchhikers and clones?
# For every participant with more than 30 detected clones, draw a 2x3 grid (one
# panel per gene) with the AF trajectory of each variant of that gene.
genes = sorted(["KMT2A", "DNMT3A", "NOTCH1", "TP53", "NF1", "RAD21"])
genes_tag = "_".join(genes)
participants_many_clones = data.loc[
    data["Detected clones"] > 30, "participant_id"
].unique()
for participant in participants_many_clones:
    of_participant = data["participant_id"] == participant
    # x range: one year of margin around all the ages of the participant
    participant_ages = data.loc[of_participant, "Age"]
    xlims = (participant_ages.min() - 1, participant_ages.max() + 1)
    fig, axes = plt.subplots(
        2, 3, figsize=(6, 3.5), layout="constrained", sharey=True, sharex=True
    )
    tmp = data.loc[
        of_participant & (data["PreferredSymbol"].isin(genes)),
        ["AF", "HGVSp", "Age", "PreferredSymbol"],
    ]
    # y range shared by all panels: up to just above the largest AF plotted
    ylims = 0, tmp.AF.max() + 0.001
    for gene, ax in zip(genes, axes.ravel()):
        gene_variants = tmp[tmp.PreferredSymbol == gene]
        sns.lineplot(
            data=gene_variants,
            x="Age",
            y="AF",
            hue="HGVSp",
            marker="x",
            markersize=3,
            markeredgewidth=1,
            markeredgecolor="blue",
            legend=False,
            ax=ax,
        )
        ax.set_title(gene)
        ax.set_xlim(xlims)
        ax.set_ylim(ylims)
        # shade in red the band of AF between 0 and 0.01
        ax.fill_between(
            x=np.arange(xlims[0], xlims[1] + 1),
            y1=0,
            y2=0.01,
            color="red",
            alpha=0.1,
            edgecolor="face",
        )
    fig.suptitle(participant)
    plt.savefig(f"{genes_tag}_{participant}.png", dpi=600)
    plt.show()

In [None]:
# Report how many distinct participants have more than 40 detected clones.
n_donors_over_40 = (
    data.loc[data["Detected clones"] > 40, "participant_id"].drop_duplicates().shape[0]
)
print(f"There are {n_donors_over_40} donors with more than 40 expanded clones")

# Participant CHIP_LBC36_037: AF against age of the non-synonymous variants
# with AF >= 0.01, one line per variant (HGVSp).
lbc36_037_non_syn = data.loc[
    (data["participant_id"] == "CHIP_LBC36_037")
    & (data.AF >= 0.01)
    & (~data.synonymous),
    ["AF", "Age", "HGVSp", "PreferredSymbol"],
]
sns.lineplot(
    data=lbc36_037_non_syn,
    x="Age",
    y="AF",
    hue="HGVSp",
    marker="x",
    markersize=3,
    markeredgewidth=1,
    markeredgecolor="blue",
    legend=False,
)
plt.show()

In [None]:
# JAK2 variants, excluding those classified as RNA or 5'UTR: AF against age.
is_jak2 = data["PreferredSymbol"] == "JAK2"
is_rna_or_utr = data.Variant_Classification.isin(["RNA", "5'UTR"])
sns.relplot(
    data=data.loc[is_jak2 & (~is_rna_or_utr)],
    x="Age",
    y="AF",
    kind="line",
    marker=".",
)
plt.show()

In [None]:
# JAK2 variants, excluding those classified as RNA or 5'UTR: AF against age,
# one column of panels per variant classification and one colour per participant.
is_jak2 = data["PreferredSymbol"] == "JAK2"
is_rna_or_utr = data.Variant_Classification.isin(["RNA", "5'UTR"])
sns.relplot(
    data=data.loc[is_jak2 & (~is_rna_or_utr)],
    x="Age",
    y="AF",
    hue="participant_id",
    col="Variant_Classification",
    kind="line",
    marker=".",
)
plt.show()

In [None]:
# TET2 variants, excluding those classified as RNA or 5'UTR: AF against age,
# one column of panels per variant classification and one colour per participant.
is_tet2 = data["PreferredSymbol"] == "TET2"
is_rna_or_utr = data.Variant_Classification.isin(["RNA", "5'UTR"])
sns.relplot(
    data=data.loc[is_tet2 & (~is_rna_or_utr)],
    x="Age",
    y="AF",
    hue="participant_id",
    col="Variant_Classification",
    kind="line",
    legend=False,
)
plt.show()

In [None]:
# JAK2 SNPs: the aggregated trajectory (dash-dotted) and, on the same axes, one
# line per participant.
jak2_snps = data[(data["PreferredSymbol"] == "JAK2") & (data.type == "snp")]
sns.lineplot(data=jak2_snps, x="Age", y="AF", ls="-.")
sns.lineplot(data=jak2_snps, x="Age", y="AF", hue="participant_id", legend=False)
plt.show()

# TET2: one aggregated trajectory per variant classification.
tet2_variants = data[data["PreferredSymbol"] == "TET2"]
sns.lineplot(data=tet2_variants, x="Age", y="AF", hue="Variant_Classification")
plt.show()

# DNMT3A SNPs: aggregated trajectory.
dnmt3a_snps = data[(data["PreferredSymbol"] == "DNMT3A") & (data.type == "snp")]
sns.lineplot(data=dnmt3a_snps, x="Age", y="AF")
plt.show()