# Get information on Pango lineages

In [None]:
# Get variables from snakemake. The `snakemake` object is not defined in this
# notebook; presumably it is injected by Snakemake when the notebook is run as
# part of a rule -- TODO confirm against the workflow.
variant_mutations_csv = snakemake.input.variant_mutations  # CSV read below with `pd.read_csv`
pango_lineages_json = snakemake.params.pango_lineages_json  # opened below with `urllib.request.urlopen`
variant_lineage = snakemake.params.variant_lineage  # key looked up in the parsed lineage definitions
mutations_by_lineage_csv = snakemake.output.mutations_by_lineage  # written below with `DataFrame.to_csv`
mutations_all_csv = snakemake.output.mutations_all  # written below with `DataFrame.to_csv`

In [None]:
import json
import urllib
import urllib.request

import pandas as pd

Read Pango lineage definitions:

In [None]:
# Download and parse the Pango lineage definitions.
# `urlopen` lives in the `urllib.request` submodule, which has to be imported
# explicitly: a bare `import urllib` does not load it.
print(f"Reading Pango lineage definitions from {pango_lineages_json}")
# The object yielded by `urlopen` is the opened response (a file-like object),
# not a URL, so it is named accordingly.
with urllib.request.urlopen(pango_lineages_json) as response:
    pango_lineages = json.load(response)
print(f"Read definitions for {len(pango_lineages)} lineages")

In [None]:
def parse_spike_muts(d):
    """Extract the spike (``S:``) mutations of one lineage.

    The entries of ``d["aaSubstitutions"]`` and ``d["aaDeletions"]`` are
    concatenated (substitutions first); empty entries and entries not starting
    with ``S:`` are skipped. For each remaining entry the text after the gene
    prefix is split into ``(first_char, int(middle), last_char)``, so
    ``"S:A67V"`` becomes ``("A", 67, "V")`` and ``"S:H69-"`` becomes
    ``("H", 69, "-")``.
    """
    parsed = []
    for entry in d["aaSubstitutions"] + d["aaDeletions"]:
        if not entry or not entry.startswith("S:"):
            continue
        # text between the first and second ":" (i.e. right after the gene name)
        change = entry.split(":")[1]
        parsed.append((change[0], int(change[1:-1]), change[-1]))
    return parsed

# Summarize every lineage: designation date, parent lineage, and its spike
# mutations (as strings such as "A67V") together with their count.
lineages_d = {}
for name, entry in pango_lineages.items():
    mut_strs = [f"{wt}{r}{m}" for wt, r, m in parse_spike_muts(entry)]
    lineages_d[name] = {
        # falsy values (e.g. empty strings) are recorded as missing
        "date": entry["designationDate"] or pd.NA,
        "parent": entry["parent"] or pd.NA,
        "mutations": mut_strs,
        "n_mutations": len(mut_strs),
    }

# One row per lineage, with the lineage name moved from the index to a column.
lineages_df = pd.DataFrame.from_dict(lineages_d, orient="index")
lineages_df = lineages_df.reset_index(names="lineage")

# In the CSV the list of mutations is flattened to a space-delimited string.
lineages_df.assign(mutations=lineages_df["mutations"].map(" ".join)).to_csv(
    mutations_by_lineage_csv, index=False
)

Check that the variant used in the experiments has the expected mutations (note that the Pango lineage definitions do not include insertions):

In [None]:
print(f"Checking that {variant_lineage=} used in experiments has expected mutations")

variant_mutations = pd.read_csv(variant_mutations_csv)
mutations_from_lineage = lineages_d[variant_lineage]["mutations"]

# Every mutation in the lineage definition must also be listed for the variant.
missing_from_variant = set(mutations_from_lineage).difference(
    variant_mutations["mutation"]
)
if missing_from_variant:
    raise ValueError(
        f"Variant for experiment is missing mutations {missing_from_variant=}"
    )
print("Variant for experiment has all the expected mutations")

# Mutations listed only for the variant are accepted only if all are insertions.
extra_in_variant = variant_mutations.query("mutation not in @mutations_from_lineage")

if len(extra_in_variant) == 0:
    print("Variant in experiment has no extra mutations")
else:
    print("Variant for experiment also has the following extra mutations:")
    display(extra_in_variant)
    if not (extra_in_variant["mutation_type"] == "insertion").all():
        raise ValueError(f"extra non-insertion mutations:\n{extra_in_variant}")
    print("Extra mutations are insertions; OK as not annotated in Pango definitions")

Get all amino-acid identities in any Pango lineage that differ from the variant used in the experiments:

In [None]:
# Build a table with one row per (site, amino acid) found in any Pango lineage
# where that amino acid differs from the one in the experimental variant.
# Output columns: reference_site, reference_aa, variant_aa, mutant_aa.
mutations_all = (
    lineages_df
    # one row per (lineage, spike mutation); a lineage with an empty mutation
    # list explodes to a null entry, which is dropped
    .explode("mutations")
    [["mutations"]]
    .query("mutations.notnull()")
    .drop_duplicates()
    # split strings such as "A67V" into reference aa, site number, and mutant aa
    .assign(
        reference_aa=lambda x: x["mutations"].str[0],
        reference_site=lambda x: x["mutations"].str[1: -1].astype(int),
        mutant=lambda x: x["mutations"].str[-1],
    )
    .drop(columns="mutations")
    # attach the variant's amino acid at each site it mutates (insertions are
    # excluded); `many_to_one` makes the merge fail if the variant lists more
    # than one non-insertion mutation at the same site
    .merge(
        (
            variant_mutations
            .query("mutation_type != 'insertion'")
            .assign(
                reference_site=lambda x: x["mutation"].str[1: -1].astype(int),
                variant_aa=lambda x: x["mutation"].str[-1],
            )
            [["reference_site", "variant_aa"]]
        ),
        how="left",
        validate="many_to_one",
        on="reference_site",
    )
    # drop sites deleted in the variant; sites the variant does not mutate have
    # a null `variant_aa` here and are kept
    .query("variant_aa != '-'")
    # the callables are evaluated in order, so `mutant_aa` sees the filled-in
    # `variant_aa`
    .assign(
        # where the variant has no mutation it carries the reference amino acid
        variant_aa=lambda x: x["variant_aa"].where(
            x["variant_aa"].notnull(), x["reference_aa"]
        ),
        # if the lineage's amino acid equals the variant's, the amino acid that
        # differs from the variant at this site is the reference one
        mutant_aa=lambda x: x["mutant"].where(
            x["mutant"] != x["variant_aa"],
            x["reference_aa"],
        )
    )
    [["reference_site", "reference_aa", "variant_aa", "mutant_aa"]]
    .drop_duplicates()
    .sort_values(["reference_site", "mutant_aa"])
)

# sanity check: every retained row must differ from the variant
assert (mutations_all["variant_aa"] != mutations_all["mutant_aa"]).all()

print(f"Found {len(mutations_all)} unique mutations in Pango lineages")

mutations_all.to_csv(mutations_all_csv, index=False)