# Aggregate statistics on mutations

In [None]:
# File paths are taken from the `snakemake` object available to this notebook.

# Output: CSV that the aggregated mutation statistics are written to.
mutation_stats_csv = snakemake.output.mutation_stats

# Inputs: CSV files that are read and merged in the cells below.
alignment_counts_csv = snakemake.input.alignment_counts
pango_lineage_counts_csv = snakemake.input.pango_lineage_counts
sequential_to_reference_csv = snakemake.input.sequential_to_reference
usher_mut_counts_csv = snakemake.input.usher_mut_counts
usher_recent_mut_counts_csv = snakemake.input.usher_recent_mut_counts

In [None]:
import pandas as pd

Read the data:

In [None]:
# Load each input CSV into its own data frame using pandas' default parsing.
# The targets on the left line up one-to-one with the paths on the right.
(
    sequential_to_reference,
    usher_mut_counts,
    usher_recent_mut_counts,
    pango_lineage_counts,
    alignment_counts,
) = [
    pd.read_csv(csv_path)
    for csv_path in (
        sequential_to_reference_csv,
        usher_mut_counts_csv,
        usher_recent_mut_counts_csv,
        pango_lineage_counts_csv,
        alignment_counts_csv,
    )
]

In [None]:
# Show the alignment counts table as this cell's output (bare last expression).
alignment_counts

Aggregate the different counts of the mutations:

In [None]:
# Characters enumerated as possible mutant states at every site
# (presumably the 20 amino acids plus `*` for stop and `-` for gap -- confirm).
aas = list("ACDEFGHIKLMNPQRSTVWY*-")

# Build one row per (site, mutant_aa) and attach each count table to it.
df = (
    sequential_to_reference
    .rename(columns={"aa": "wildtype_aa"})
    # cross join: every site is paired with every character in `aas`
    .merge(pd.DataFrame({"mutant_aa": aas}), how="cross")
    # site labels are cast to `str` on both sides of every merge below so the
    # join keys always have the same dtype
    .assign(reference_site=lambda x: x["reference_site"].astype(str))
    .merge(
        usher_mut_counts
        .rename(columns={"site": "reference_site", "count": "UShER_count"})
        .assign(reference_site=lambda x: x["reference_site"].astype(str)),
        how="outer",
        on=["reference_site", "mutant_aa"],
        validate="one_to_one",
    )
    .merge(
        usher_recent_mut_counts
        .rename(columns={"site": "reference_site", "count": "UShER_recent_count"})
        .assign(reference_site=lambda x: x["reference_site"].astype(str)),
        how="outer",
        on=["reference_site", "mutant_aa"],
        validate="one_to_one",
    )
    .merge(
        alignment_counts
        .assign(reference_site=lambda x: x["site"].astype(str))
        .rename(columns={"count": "alignment_count", "mutant": "mutant_aa"})
        [["reference_site", "alignment_count", "mutant_aa"]],
        how="outer",
        on=["reference_site", "mutant_aa"],
        validate="one_to_one",
    )
    .merge(
        pango_lineage_counts
        [["reference_site", "mutant_aa"]]
        .assign(
            reference_site=lambda x: x["reference_site"].astype(str),
            # marker column: present (True) only for rows found in this table
            in_pango_lineage=True,
        ),
        how="outer",
        on=["reference_site", "mutant_aa"],
        validate="one_to_one",
    )
    # the outer merges keep rows that exist only in a count table; those have
    # no `sequential_site` and are dropped here
    .query("sequential_site.notnull()")
    .assign(
        mutated_from_reference=lambda x: x["wildtype_aa"] != x["reference_aa"],
        # a mutation missing from a count table has a count of zero
        UShER_count=lambda x: x["UShER_count"].fillna(0).astype(int),
        UShER_recent_count=lambda x: x["UShER_recent_count"].fillna(0).astype(int),
        alignment_count=lambda x: x["alignment_count"].fillna(0).astype(int),
        # the marker is either True or missing after the outer merge, so
        # `notna()` yields the same booleans as `fillna(False).astype(bool)`
        # without the object-dtype downcasting that newer pandas warns about
        in_pango_lineage=lambda x: x["in_pango_lineage"].notna(),
        # the outer merges may have introduced missing values and hence a
        # float dtype; restore integers now that those rows are gone
        sequential_site=lambda x: x["sequential_site"].astype(int),
    )
    .sort_values(["sequential_site", "mutant_aa"])
)

# Sanity checks on the merged table.
assert df["sequential_site"].nunique() == len(
    df.groupby(["sequential_site", "reference_site", "wildtype_aa", "reference_aa"])
), "a sequential site maps to more than one reference site / wildtype / reference amino acid"
assert df["sequential_site"].max() * len(aas) == len(df), (
    "row count differs from max(sequential_site) * number of characters in `aas`"
)

df.to_csv(mutation_stats_csv, index=False)

df