# Configure structure based analysis for `dms-viz`

Imports:

In [None]:
import gzip
import os
import requests
import subprocess
import textwrap
import warnings

import Bio.PDB.PDBParser
import Bio.PDB.Polypeptide

import pandas as pd

Define variables. This next cell is tagged `parameters` for `papermill` parameterization:

In [None]:
# Papermill parameters: every value below is a `None` placeholder that is
# expected to be overridden when the notebook is executed.

# PDB identifier; interpolated into the RCSB download URL for the biological assembly.
pdb_id = None
# Input CSV of phenotypes; read with `pd.read_csv` and must provide at least
# the `site`, `wildtype` and `mutant` columns used later in this notebook.
phenotypes_csv = None
# Input CSV with `site`, `wildtype`, `mutant`, `antibody` and `escape` columns.
per_antibody_escape_csv = None
# Input CSV mapping between site numbering schemes (read to build the sitemap).
site_numbering_map = None
# Output: combined JSON written by `configure-dms-viz join`.
dms_viz_json = None
# Output: sitemap CSV passed to `configure-dms-viz format --sitemap`.
dms_viz_sitemap = None
# Output: phenotypes CSV passed to `configure-dms-viz format --input`.
dms_viz_phenotypes = None
# Output: path the downloaded PDB structure is written to.
pdb_file = None
# Output directory for the per-phenotype JSONs and `description.md`.
dms_viz_subdir = None

Build the [sitemap](https://dms-viz.github.io/dms-viz-docs/preparing-data/data-requirements/#reference-site) used by `dms-viz`:

In [None]:
phenotypes = pd.read_csv(phenotypes_csv)

# Load the numbering map in sequential order and derive the per-chain columns
# one step at a time.
numbering = pd.read_csv(site_numbering_map).sort_values("sequential_site")

# The second whitespace-separated token of `HA1_HA2_H5_site` carries the chain
# name wrapped in one character on each side; strip that wrapper.
numbering["HA_chain"] = numbering["HA1_HA2_H5_site"].str.split().str[1].str[1:-1]

# Smallest mature-H5 site number that belongs to HA2.
first_ha2_site = numbering.loc[numbering["HA_chain"] == "HA2", "mature_H5_site"].min()

# HA1 keeps its mature-H5 number; other sites are renumbered so that the first
# HA2 site becomes 1.
numbering["protein_site"] = numbering["mature_H5_site"].where(
    numbering["HA_chain"] == "HA1",
    numbering["mature_H5_site"] - first_ha2_site + 1,
)

# Chain identifiers assigned to HA1 / HA2.
numbering["chains"] = numbering["HA_chain"].map({"HA1": "A", "HA2": "B"})

# One wildtype residue per reference site, taken from the phenotypes table.
wildtype_by_site = (
    phenotypes[["site", "wildtype"]]
    .drop_duplicates()
    .rename(columns={"site": "reference_site"})
)

sitemap_columns = [
    "sequential_site",
    "reference_site",
    "protein_site",
    "wildtype",
    "HA_chain",
    "chains",
]
sitemap = numbering.merge(wildtype_by_site)[sitemap_columns]

sitemap.to_csv(dms_viz_sitemap, index=False)

Get the biological assembly (see https://pdb101.rcsb.org/learn/guide-to-understanding-pdb-data/biological-assemblies#Anchor-download), because the crystallographic asymmetric unit does not correspond to the biological assembly:

In [None]:
# Download the gzipped first biological assembly for `pdb_id` from RCSB and
# store it uncompressed at `pdb_file`.
pdb_url = f"https://files.rcsb.org/download/{pdb_id}.pdb1.gz"

# A timeout prevents the notebook from hanging forever if the server stalls.
response = requests.get(pdb_url, timeout=60)
if response.status_code != 200:
    # Fail loudly with the URL and status rather than a bare assertion.
    raise RuntimeError(
        f"Failed to download {pdb_url}: HTTP status {response.status_code}"
    )

pdb_content = gzip.decompress(response.content).decode("utf-8")

# Write with the same encoding used to decode, independent of the locale.
with open(pdb_file, "w", encoding="utf-8") as f:
    f.write(pdb_content)

Check for sites where the residue identity in the sitemap differs from that in the protein structure:

In [None]:
# Parse the structure with parser warnings silenced and keep only the first model.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    pdb_obj = Bio.PDB.PDBParser().get_structure(id=pdb_id, file=pdb_file)[0]

# Collect (chain, residue number, one-letter amino acid) for chains A and B,
# keeping only residues whose first id field is blank.
three_to_one = Bio.PDB.Polypeptide.protein_letters_3to1
pdb_df = pd.DataFrame(
    [
        (chain_id, residue.id[1], three_to_one[residue.resname])
        for chain_id in ["A", "B"]
        for residue in pdb_obj[chain_id].get_residues()
        if residue.id[0].isspace()
    ],
    columns=["chains", "protein_site", "pdb_aa"],
)

# Left merge so that sitemap sites absent from the structure get a null `pdb_aa`.
mismatched_sites = sitemap.merge(pdb_df, how="left")

print(
    f"Of {len(sitemap)} sites, {len(mismatched_sites.query('wildtype == pdb_aa'))} match, "
    f"{len(mismatched_sites.query('pdb_aa.isnull()'))} are missing from PDB, and "
    f"{len(mismatched_sites.query('pdb_aa.notnull()').query('wildtype != pdb_aa'))} differ."
)

print("Sites that differ:")
differing_sites = mismatched_sites.query("pdb_aa.notnull() and (wildtype != pdb_aa)")
display(differing_sites.reset_index(drop=True))

Add the per-species (ferret and mouse) sera escape to the phenotypes, then write them to a CSV file:

In [None]:
# Reshape the per-antibody escape table from long to wide: one row per
# (site, wildtype, mutant) and one column per value of `antibody`.
antibody_escape = pd.read_csv(per_antibody_escape_csv)
escape_wide = pd.pivot_table(
    antibody_escape,
    index=["site", "wildtype", "mutant"],
    columns="antibody",
    values="escape",
)

# Give the species columns their final names and restore the index columns.
species_column_names = {"ferret": "ferret_sera_escape", "mouse": "mouse_sera_escape"}
per_species_escape = escape_wide.rename(columns=species_column_names).reset_index()

In [None]:
# Attach the per-species escape columns; each (site, wildtype, mutant) must
# appear at most once on either side.
merged_phenotypes = phenotypes.merge(
    per_species_escape,
    on=["site", "wildtype", "mutant"],
    how="outer",
    validate="one_to_one",
)

# Rows where the mutant equals the wildtype get an escape of 0.
is_mutated = merged_phenotypes["mutant"] != merged_phenotypes["wildtype"]
for escape_col in ("ferret_sera_escape", "mouse_sera_escape"):
    merged_phenotypes[escape_col] = merged_phenotypes[escape_col].where(is_mutated, 0)

# Drop the original combined escape column.
merged_phenotypes = merged_phenotypes.drop(columns="species sera escape")

# Mutation label in the form <wildtype><site><mutant>.
merged_phenotypes["mutation"] = (
    merged_phenotypes["wildtype"]
    + merged_phenotypes["site"].astype(str)
    + merged_phenotypes["mutant"]
)

# Use identifier-style names for the columns consumed downstream.
phenotypes = merged_phenotypes.rename(
    columns={
        "entry in 293T cells": "cell_entry",
        "SA26 usage increase": "a26_usage",
    }
)

print(f"Phenotypes has following columns: {phenotypes.columns.tolist()}")

phenotypes.to_csv(dms_viz_phenotypes, index=False)

Run [configure-dms-viz](https://dms-viz.github.io/dms-viz-docs/preparing-data/command-line-api/).
First, set up some options:

In [None]:
phenotype_cols = {
    # phenotype columns and additional arguments to `configure-dms-viz`
    "cell_entry": ["--floor", "False", "--summary-stat", "mean"],
    "ferret_sera_escape": ["--floor", "True", "--summary-stat", "sum"],
    "mouse_sera_escape": ["--floor", "True", "--summary-stat", "sum"],
    "stability": ["--floor", "True", "--summary-stat", "mean"],
    "a26_usage": ["--floor", "True", "--summary-stat", "sum"],
}

# additional tooltips to show: column name -> label with underscores as spaces
tooltip_cols = {
    c: c.replace("_", " ")
    for c in list(phenotype_cols) + ["mutation", "sequential_site", "mature_H5_site", "region"]
}

# Every tooltip column must exist in the phenotypes table; report which ones
# are missing instead of failing with an uninformative bare assertion.
missing_tooltip_cols = sorted(set(tooltip_cols) - set(phenotypes.columns))
if missing_tooltip_cols:
    raise ValueError(
        f"Tooltip columns missing from phenotypes: {missing_tooltip_cols}"
    )

filter_cols = ["cell_entry"]
# The limits are later turned into a command-line argument with `str(...)`.
# Convert the NumPy scalar returned by `.min()` to a built-in float so that it
# is rendered as a plain number (NumPy >= 2 renders scalars as `np.float64(...)`).
filter_limits = {"cell_entry": [float(phenotypes["cell_entry"].min()), -3, 0]}

Now make the JSONs for each phenotype, and then combine them:

In [None]:
# Make sure the output directory exists before anything is written into it.
os.makedirs(dms_viz_subdir, exist_ok=True)

# Run `configure-dms-viz format` once per phenotype, collecting the JSON paths.
pheno_jsons = []
for pheno_col, pheno_args in phenotype_cols.items():
    pheno_json = os.path.join(dms_viz_subdir, f"{pheno_col}.json")
    print(f"Writing phenotype {pheno_col} to {pheno_json}")

    # Show every tooltip column except the metric being plotted.
    pheno_tooltip_cols = {k: v for (k, v) in tooltip_cols.items() if k != pheno_col}

    cmds = [
        "configure-dms-viz", "format",
        "--name", pheno_col.replace("_", " "),
        "--input", dms_viz_phenotypes,
        "--metric", pheno_col,
        "--structure", pdb_file,
        "--sitemap", dms_viz_sitemap,
        "--included-chains", "A B",
        "--tooltip-cols", str(pheno_tooltip_cols),
        "--alphabet", "RKHDEQNSTYWFAILMVGPC",
        "--output", pheno_json,
        "--title", f"Effects of mutations to an influenza H5 HA (clade 2.3.4.4b) on {pheno_col.replace('_', ' ')}",
        "--description", pheno_col,
        *pheno_args,
    ]

    # Filter on the configured columns, but never on the metric itself.
    pheno_filter_cols = {c: c for c in filter_cols if c != pheno_col}
    if pheno_filter_cols:
        cmds += ["--filter-cols", str(pheno_filter_cols)]
        cmds += ["--filter-limits", str({c: filter_limits[c] for c in pheno_filter_cols})]

    # `check=True` aborts the notebook if the command exits with an error.
    subprocess.run(cmds, check=True)
    pheno_jsons.append(pheno_json)

# Write the markdown description that accompanies the combined visualization.
markdown_description = os.path.join(dms_viz_subdir, "description.md")
with open(markdown_description, "w") as description_file:
    # `end=""` so that exactly the dedented text is written, with nothing appended.
    print(
        textwrap.dedent(
            """\
            # Effects of mutations to influenza H5 HA as measured by deep mutational scanning
            This is an interactive structure-based visualization of the effects of mutations to
            a clade 2.3.4.4b H5 HA on several different key protein phenotypes.
            See the paper ([Dadonaite et al (2024)](https://www.biorxiv.org/content/10.1101/2024.05.23.595634v1)) and
            the interactive homepage with results at
            [https://dms-vep.org/Flu_H5_American-Wigeon_South-Carolina_2021-H5N1_DMS/](https://dms-vep.org/Flu_H5_American-Wigeon_South-Carolina_2021-H5N1_DMS/)
            for more details about this study.

            Use the *Dataset* dropdown in the left toolbar to choose which phenotype to show.
            You can click on points on the site lineplot to highlight them on the structure
            and show the effects of individual mutations.
            
            Specifically see [https://dms-vep.org/Flu_H5_American-Wigeon_South-Carolina_2021-H5N1_DMS/numbering.html](https://dms-vep.org/Flu_H5_American-Wigeon_South-Carolina_2021-H5N1_DMS/numbering.html)
            to understand the H3 numbering scheme used here.
            """
        ),
        end="",
        file=description_file,
    )

print(f"Concatenating phenotype JSONs to {dms_viz_json}")

# Assemble the `configure-dms-viz join` command piece by piece; the
# per-phenotype JSON paths are passed as one ", "-separated argument.
join_cmd = ["configure-dms-viz", "join"]
join_cmd += ["--input", ", ".join(pheno_jsons)]
join_cmd += ["--output", dms_viz_json]
join_cmd += ["--description", markdown_description]

# `check=True` raises if the command exits with a non-zero status.
subprocess.run(join_cmd, check=True)