# Visualize distribution of amino-acid variants

In [None]:
# Imports
import pandas as pd
import altair as alt

# Plotting colors
# re-arranged for plot
# Hex color palette; judging by the name it is based on the Tol "muted"
# scheme with the order adjusted for plotting (TODO confirm the origin of
# each entry -- some may come from other palettes).
# NOTE(review): this list is not referenced by the cells visible in this
# part of the notebook; the bar chart below uses a fixed black fill.
tol_muted_adjusted = [
    "#AA4499",
    "#88CCEE",
    "#EE7733",
    "#44AA99",
    "#1f78b4",
    "#CC6677",
    "#117733",
    "#999933",
    "#DDCC77",
    "#CC3311",
    "#882255",
    "#000000",
    "#DDDDDD",
]

# Allow more rows for Altair
# Turns off Altair's maximum-row check so that the full variant table can be
# passed to a chart. The result is bound to `_`, presumably to keep the
# returned object from being echoed as cell output.
_ = alt.data_transformers.disable_max_rows()

In [None]:
# this cell is tagged as `parameters` for papermill parameterization
# `variant_data` is the path handed to `pd.read_csv` when the data is loaded.
# The `None` placeholder must be replaced before loading, either by papermill
# or by un-commenting the interactive cell that follows.
variant_data = None

In [None]:
# # Uncomment to run interactively
# variant_data = "../results/variants/codon_variants.csv"

In [None]:
# Load data
# Fail early with an actionable message when the papermill parameter was
# never filled in, instead of relying on the reader's generic error.
if variant_data is None:
    raise ValueError(
        "`variant_data` is not set; supply it via papermill or un-comment "
        "the interactive cell above."
    )
variant_df = pd.read_csv(variant_data)

# Group all variants with >= 8 amino-acid mutations into a single bin (8).
# `clip` is vectorized: values below the cap (and missing values) are left
# unchanged, everything above is replaced by 8.
variant_df["n_aa_substitutions"] = variant_df["n_aa_substitutions"].clip(upper=8)

# Axis styling applied identically to the x and y axes
shared_axis_style = dict(
    domainWidth=1,
    domainColor="black",
    tickColor="black",
    labelFontSize=8,
    labelFontWeight="normal",
    titleFontWeight="normal",
)

# x: number of amino-acid substitutions per variant, one tick per value 0-8
x_channel = alt.X(
    "n_aa_substitutions",
    axis=alt.Axis(
        title="AA muts",
        values=[0, 1, 2, 3, 4, 5, 6, 7, 8],
        **shared_axis_style,
    ),
)

# y: number of rows (variants) per x value, on a fixed 0-15000 scale
y_channel = alt.Y(
    "count()",
    axis=alt.Axis(
        title="number of variants",
        values=[0, 5000, 10000, 15000],
        **shared_axis_style,
    ),
    scale=alt.Scale(domain=[0, 15000]),
)

# One small panel per value of the `library` column, wrapped into two columns
facet_channel = alt.Facet(
    "library",
    title=None,
    columns=2,
    header=alt.Header(
        labelFontSize=8,
        labelFontWeight="bold",
    ),
)

# Black bar chart of the variant counts, faceted by library
distribution_plot = (
    alt.Chart(variant_df)
    .mark_bar(color="#000000", size=7)
    .encode(x=x_channel, y=y_channel, facet=facet_channel)
    .properties(width=75, height=100)
    .configure_axis(
        grid=False,
        labelFontSize=8,
        titleFontSize=8,
        labelFontWeight="normal",
    )
)

# Bare expression as the last statement so the notebook renders the chart
distribution_plot