# Configure structure based analysis for `dms-viz`

Imports:

In [None]:
import gzip
import os
import requests
import subprocess
import textwrap
import warnings

import Bio.PDB.PDBParser
import Bio.PDB.Polypeptide

import matplotlib

import pandas as pd

import seaborn

Define variables. This next cell is tagged `parameters` for `papermill` parameterization:

In [None]:
# Placeholders for the papermill `parameters` cell; each is expected to be
# overridden with a real value when the notebook is executed.
pdb_id = None  # PDB identifier interpolated into the RCSB download URL
phenotypes_csv = None  # input CSV read into the `phenotypes` data frame
per_antibody_escape_csv = None  # input CSV of per-antibody escape values
site_numbering_map = None  # input CSV mapping between site numbering schemes
dms_viz_json = None  # passed to `configure-dms-viz format` as `--output`
dms_viz_sitemap = None  # path the sitemap CSV is written to (later `--sitemap`)
dms_viz_phenotypes = None  # path the merged phenotypes CSV is written to (later `--input`)
pdb_file = None  # path the downloaded structure is written to (later `--structure`)
dms_viz_subdir = None  # not referenced by the cells visible in this notebook

Build the [sitemap](https://dms-viz.github.io/dms-viz-docs/preparing-data/data-requirements/#reference-site) used by `dms-viz`:

In [None]:
# Per-mutation phenotypes; also the source of the wildtype identity per site.
phenotypes = pd.read_csv(phenotypes_csv)

# One (reference_site, wildtype) row per site, renamed to join with the numbering map.
site_wildtypes = (
    phenotypes[["site", "wildtype"]]
    .drop_duplicates()
    .rename(columns={"site": "reference_site"})
)

site_numbering = pd.read_csv(site_numbering_map).sort_values("sequential_site")

# The chain label is the second whitespace-separated token of `HA1_HA2_H5_site`
# with its first and last characters removed.
ha_chain = site_numbering["HA1_HA2_H5_site"].str.split().str[1].str[1:-1]
mature_site = site_numbering["mature_H5_site"]

# HA2 sites are renumbered so that the first HA2 site becomes 1; HA1 sites keep
# their mature numbering.
first_ha2_site = mature_site[ha_chain == "HA2"].min()
protein_site = mature_site.where(ha_chain == "HA1", mature_site - first_ha2_site + 1)

sitemap = (
    site_numbering
    .assign(
        HA_chain=ha_chain,
        protein_site=protein_site,
        chains=ha_chain.map({"HA1": "A", "HA2": "B"}),
    )
    .merge(site_wildtypes)
    [["sequential_site", "reference_site", "protein_site", "wildtype", "HA_chain", "chains"]]
)

sitemap.to_csv(dms_viz_sitemap, index=False)

Get the biological assembly (see https://pdb101.rcsb.org/learn/guide-to-understanding-pdb-data/biological-assemblies#Anchor-download) as the crystallographic unit doesn't correspond to that:

In [None]:
# Download the gzipped biological assembly (".pdb1") for `pdb_id` from RCSB.
pdb_url = f"https://files.rcsb.org/download/{pdb_id}.pdb1.gz"
# Always pass a timeout: without one `requests` can wait forever on a stalled
# connection, which would hang the whole notebook run.
r = requests.get(pdb_url, timeout=60)
# Fail with an informative error rather than a bare `assert`, which is skipped
# under `python -O` and does not say what went wrong.
if r.status_code != 200:
    raise RuntimeError(f"Failed to download {pdb_url}: HTTP status {r.status_code}")
pdb_content = gzip.decompress(r.content).decode("utf-8")
# Write with the same encoding used for decoding so the result does not depend
# on the platform's default locale encoding.
with open(pdb_file, "w", encoding="utf-8") as f:
    f.write(pdb_content)

Check the sites mismatched between the sitemap and the protein structure in terms of residue identity:

In [None]:
# Parse the downloaded structure with parser warnings silenced and keep the
# first model.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    pdb_obj = Bio.PDB.PDBParser().get_structure(id=pdb_id, file=pdb_file)[0]

# Collect (chain, residue number, one-letter amino acid) for chains A and B,
# keeping only residues whose hetero-flag field is blank.
three_to_one = Bio.PDB.Polypeptide.protein_letters_3to1
records = [
    (chain_id, residue.id[1], three_to_one[residue.resname])
    for chain_id in ("A", "B")
    for residue in pdb_obj[chain_id].get_residues()
    if residue.id[0].isspace()
]
pdb_df = pd.DataFrame(records, columns=["chains", "protein_site", "pdb_aa"])

# Left merge so that sitemap sites absent from the structure get a null `pdb_aa`.
mismatched_sites = sitemap.merge(pdb_df, how="left")

n_match = len(mismatched_sites.query("wildtype == pdb_aa"))
n_missing = len(mismatched_sites.query("pdb_aa.isnull()"))
differing_sites = mismatched_sites.query("pdb_aa.notnull()").query("wildtype != pdb_aa")

print(
    f"Of {len(sitemap)} sites, {n_match} match, "
    f"{n_missing} are missing from PDB, and "
    f"{len(differing_sites)} differ."
)

print("Sites that differ:")
display(differing_sites.reset_index(drop=True))

Add the antibody escape to the phenotypes and write the result to a CSV file:

In [None]:
# Phenotype columns attached to every escape measurement; `site` and `mutant`
# are the join keys.
phenotype_cols = [
    "site",
    "mutant",
    "entry in 293T cells",
    "sequential_site",
    "mature_H5_site",
    "HA1_HA2_H5_site",
]

escape_measurements = pd.read_csv(per_antibody_escape_csv).drop(columns="antibody_set")

# `validate="many_to_one"` makes the merge fail if `phenotypes` has more than
# one row for a (site, mutant) pair.
per_antibody_escape = (
    escape_measurements
    .merge(
        phenotypes[phenotype_cols],
        how="left",
        on=["site", "mutant"],
        validate="many_to_one",
    )
    .rename(columns={"entry in 293T cells": "cell_entry"})
    .assign(
        mutation=lambda df: df["wildtype"] + df["site"].astype(str) + df["mutant"],
    )
)

antibodies = list(per_antibody_escape["antibody"].unique())
print(f"Read escape for {antibodies=}")

print(f"Writing the phenotypes to {dms_viz_phenotypes}")
per_antibody_escape.to_csv(dms_viz_phenotypes, float_format="%.4g", index=False)

print(f"{per_antibody_escape.columns=}")

Get enough colors:

In [None]:
def get_hex_color_palette(num_colors):
    """Return `num_colors` colors from seaborn's "hls" palette as hex strings.

    Parameters
    ----------
    num_colors : int
        Number of colors requested from the palette.

    Returns
    -------
    list of str
        One hex string per palette color, in palette order.
    """
    palette = seaborn.color_palette("hls", num_colors)
    return list(map(matplotlib.colors.to_hex, palette))

# One color is needed per antibody condition. A fixed four-color set is used
# when it suffices; otherwise a palette of the required size is generated.
default_colors = "#0072B2,#CC79A7,#4C3549,#009E73"
nconditions = len(antibodies)
colors = (
    ",".join(get_hex_color_palette(nconditions))
    if nconditions > 4
    else default_colors
)

print(f"Using {colors=}")
seaborn.palplot(seaborn.color_palette(colors.split(",")))

Run [configure-dms-viz](https://dms-viz.github.io/dms-viz-docs/preparing-data/command-line-api/).
First, set up some options:

In [None]:
# Columns shown in the dms-viz tooltip; labels are the column names with
# underscores replaced by spaces.
tooltip_cols = [
    "mutation",
    "sequential_site",
    "mature_H5_site",
    "cell_entry",
]
tooltip_labels = {col: col.replace("_", " ") for col in tooltip_cols}

# The first filter limit is the smallest observed cell entry value.
min_cell_entry = float(per_antibody_escape["cell_entry"].min())
filter_limits = f"{{'cell_entry': [{min_cell_entry}, -3, 0]}}"

# The same text is used for both the title and the description.
title_text = "H5 HA escape from VRC antibodies as measured by pseudovirus deep mutational scanning"

# Argument vector for `configure-dms-viz format`, as flag/value pairs.
cmds = [
    "configure-dms-viz",
    "format",
    *("--name", "VRC antibodies"),
    *("--input", dms_viz_phenotypes),
    *("--output", dms_viz_json),
    *("--structure", pdb_file),
    *("--metric", "escape"),
    *("--condition", "antibody"),
    *("--sitemap", dms_viz_sitemap),
    *("--colors", colors),
    *("--alphabet", "RKHDEQNSTYWFAILMVGPC"),
    *("--summary-stat", "sum"),
    *("--floor", "True"),
    *("--tooltip-cols", str(tooltip_labels)),
    *("--filter-cols", "{'cell_entry': 'cell entry'}"),
    *("--filter-limits", filter_limits),
    *("--title", title_text),
    *("--description", title_text),
]

print(f"Running the following commands:\n{cmds}")
# `check=True` raises if the command exits with a non-zero status.
subprocess.run(cmds, check=True)