# Aggregate the mutation counts from GISAID and UShER

In [1]:
import os

import pandas as pd

import yaml

In [2]:
# If config.yaml is not in the current working directory (e.g. the notebook is
# being run interactively from a subdirectory), step up to the parent directory.
config_in_cwd = os.path.isfile("config.yaml")
if not config_in_cwd:
    os.chdir("../")

# Parse the YAML configuration; later cells look up file paths in `config`.
with open("config.yaml") as f:
    config = yaml.safe_load(f)

In [3]:
# Load the four input tables. Each CSV path is stored in `config` under the
# same key as the variable it is bound to; the files are read in this order.
(
    sequential_to_reference,
    alignment_counts,
    usher_mut_counts,
    usher_recent_mut_counts,
) = (
    pd.read_csv(config[table_name])
    for table_name in (
        "sequential_to_reference",
        "alignment_counts",
        "usher_mut_counts",
        "usher_recent_mut_counts",
    )
)

In [15]:
# Keep only the rows whose `reference_site` label is purely numeric; any other
# label is dropped.
# - `.astype(str)` lets the check run whatever dtype `read_csv` inferred (and
#   makes the cell safe to re-run after the column has been cast to int),
#   instead of raising AttributeError on non-string values.
# - `.copy()` yields an independent frame, so assigning to its columns later
#   does not raise SettingWithCopyWarning.
sequential_to_reference = sequential_to_reference[
    sequential_to_reference["reference_site"].astype(str).str.isnumeric()
].copy()

In [23]:
# Cast the `reference_site` labels to integers. The column is later used as a
# merge key against the count tables (presumably integer-typed there; confirm).
# `.assign` builds a new frame rather than writing into what may be a slice of
# the originally loaded table, which avoids SettingWithCopyWarning.
sequential_to_reference = sequential_to_reference.assign(
    reference_site=lambda x: x["reference_site"].astype(str).astype(int)
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sequential_to_reference['reference_site']= sequential_to_reference['reference_site'].astype(str).astype(int)


In [25]:
# Characters that every site is tiled over further below (20 amino-acid
# letters plus "*" and "-"; presumably stop codon and gap -- confirm).
aas = "ACDEFGHIKLMNPQRSTVWY*-"

# Keys shared by the three count tables, and the final column order.
merge_keys = ["reference_site", "mutant_aa"]
output_cols = [
    "sequential_site",
    "reference_site",
    "wildtype_aa",
    "reference_aa",
    "mutant_aa",
    "GISAID_count",
    "UShER_count",
    "UShER_recent_count",
]

# Bring each input onto a common naming scheme before merging.
gisaid_counts = (
    alignment_counts
    .rename(
        columns={
            "site": "reference_site",
            "mutant": "mutant_aa",
            "count": "GISAID_count",
        }
    )
    .drop(columns="wildtype")
)
usher_counts = usher_mut_counts.rename(
    columns={"site": "reference_site", "count": "UShER_count"}
)
usher_recent_counts = usher_recent_mut_counts.rename(
    columns={"site": "reference_site", "count": "UShER_recent_count"}
)
site_numbering = sequential_to_reference.rename(columns={"aa": "wildtype_aa"})

# Outer merges keep a mutation that appears in any one of the count tables.
merged_counts = gisaid_counts.merge(usher_counts, how="outer", on=merge_keys)
merged_counts = merged_counts.merge(usher_recent_counts, how="outer", on=merge_keys)

# Default (inner) merge: only sites present in the site-numbering table are
# kept. `validate` raises if that table has more than one row per site.
merged_counts = merged_counts.merge(
    site_numbering,
    on="reference_site",
    validate="many_to_one",
)

df = merged_counts[output_cols]
# Flag sites where the wildtype residue differs from the reference residue.
df = df.assign(mutated_from_reference=df["wildtype_aa"] != df["reference_aa"])

# Sanity check: at every site whose wildtype differs from the reference, the
# reference residue must appear as a mutant in at least one row.
reference_seen_per_site = (
    df
    .loc[df["mutated_from_reference"]]
    .assign(reference_as_mutant=lambda x: x["reference_aa"] == x["mutant_aa"])
    .groupby("reference_site")
    ["reference_as_mutant"]
    .any()
)
assert (
    reference_seen_per_site.all()
), "some mutations from reference not in data frame as they have no counts"

# Expand to include all mutations, even those with zero counts.
cols_to_tile = ["sequential_site", "reference_site", "wildtype_aa", "reference_aa"]
count_cols = ["GISAID_count", "UShER_count", "UShER_recent_count"]

# One row for every (distinct site, character in `aas`) combination.
unique_sites = df[cols_to_tile].drop_duplicates()
all_site_mutations = pd.DataFrame(
    [
        (*site, aa)
        for site in unique_sites.itertuples(index=False)
        for aa in aas
    ],
    columns=[*cols_to_tile, "mutant_aa"],
)

df = (
    df
    # no `on`: merges on every shared column (the tiled columns + mutant_aa)
    .merge(all_site_mutations, how="outer")
    # rows that came only from the tiling have no counts: treat them as zero
    .fillna({col: 0 for col in count_cols})
    .astype({col: int for col in count_cols})
    # recompute the flag so the newly added rows get a value too
    .assign(mutated_from_reference=lambda x: x["wildtype_aa"] != x["reference_aa"])
    .sort_values(["sequential_site", "mutant_aa"])
    .reset_index(drop=True)
)
    
# The finished table must not contain any missing values.
assert not df.isnull().any().any()

# Write the table (without the row index) to the path configured under
# "mutation_stats".
output_path = config["mutation_stats"]
df.to_csv(output_path, index=False)

# Last expression of the cell: display the resulting frame.
df

Unnamed: 0,sequential_site,reference_site,wildtype_aa,reference_aa,mutant_aa,GISAID_count,UShER_count,UShER_recent_count,mutated_from_reference
0,1,1,M,M,*,0,0,0,False
1,1,1,M,M,-,0,0,0,False
2,1,1,M,M,A,0,0,0,False
3,1,1,M,M,C,0,0,0,False
4,1,1,M,M,D,0,0,0,False
...,...,...,...,...,...,...,...,...,...
28458,1248,1252,S,S,S,0,447,4,False
28459,1248,1252,S,S,T,106,8,0,False
28460,1248,1252,S,S,V,0,0,0,False
28461,1248,1252,S,S,W,0,0,0,False
