# Configure structure-based analysis for `dms-viz`

Imports:

In [1]:
import gzip
import os
import requests
import subprocess
import warnings

import Bio.PDB.PDBParser
import Bio.PDB.Polypeptide

import pandas as pd

Define variables:

In [2]:
# Identifier of the structure to download and visualize on.
pdb_id = "4kwm"

# Input files.
phenotypes_csv = "results/summaries/phenotypes.csv"
per_antibody_escape_csv = "results/summaries/phenotypes_per_antibody_escape.csv"
site_numbering_map = "data/site_numbering_map.csv"

# Output files: everything is written beneath `dms_viz_subdir`, which is
# created here if it does not exist yet.
dms_viz_subdir = "results/dms-viz/"
os.makedirs(dms_viz_subdir, exist_ok=True)
pdb_file, dms_viz_sitemap, dms_viz_json, dms_viz_phenotypes = (
    os.path.join(dms_viz_subdir, basename)
    for basename in (f"{pdb_id}.pdb", "sitemap.csv", "dms-viz.json", "phenotypes.csv")
)

Build the [sitemap](https://dms-viz.github.io/dms-viz-docs/preparing-data/data-requirements/#reference-site) used by `dms-viz`:

In [3]:
phenotypes = pd.read_csv(phenotypes_csv)

# Wildtype amino acid at each site of the phenotype data, keyed the way the
# site numbering map names that site (`reference_site`).
reference_wildtypes = (
    phenotypes[["site", "wildtype"]]
    .drop_duplicates()
    .rename(columns={"site": "reference_site"})
)

numbering = pd.read_csv(site_numbering_map).sort_values("sequential_site")

# Chain label: the second whitespace-separated token of `HA1_HA2_H5_site`
# with its first and last characters removed.
ha_chain = numbering["HA1_HA2_H5_site"].str.split().str[1].str[1:-1]

# Smallest mature-H5 number among HA2 rows; non-HA1 rows are renumbered
# relative to it so that this site becomes protein site 1.
first_ha2_site = numbering.loc[ha_chain == "HA2", "mature_H5_site"].min()

sitemap_cols = ["sequential_site", "reference_site", "protein_site", "wildtype", "HA_chain", "chains"]

sitemap = (
    numbering
    .assign(
        HA_chain=ha_chain,
        protein_site=numbering["mature_H5_site"].where(
            ha_chain == "HA1",
            numbering["mature_H5_site"] - first_ha2_site + 1,
        ),
        chains=ha_chain.map({"HA1": "A", "HA2": "B"}),
    )
    # merges on whatever columns the two frames share
    .merge(reference_wildtypes)
    [sitemap_cols]
)

sitemap.to_csv(dms_viz_sitemap, index=False)

Download the biological assembly (see https://pdb101.rcsb.org/learn/guide-to-understanding-pdb-data/biological-assemblies#Anchor-download), because the crystallographic unit in the standard PDB file does not correspond to the biological assembly:

In [4]:
# Download the gzipped first biological assembly (`.pdb1.gz`) for `pdb_id`.
# The timeout keeps the notebook from hanging indefinitely on a stalled
# connection. `raise_for_status()` reports the HTTP error itself and, unlike
# an `assert`, is not stripped when Python runs with `-O`.
r = requests.get(f"https://files.rcsb.org/download/{pdb_id}.pdb1.gz", timeout=60)
r.raise_for_status()
pdb_content = gzip.decompress(r.content).decode("utf-8")
# Write with the same encoding used to decode, independent of the locale.
with open(pdb_file, "w", encoding="utf-8") as f:
    f.write(pdb_content)

Check which sites have a different residue identity in the sitemap than in the protein structure:

In [5]:
# Parse the downloaded structure with parser warnings silenced and keep the
# first model.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    pdb_obj = Bio.PDB.PDBParser().get_structure(id=pdb_id, file=pdb_file)[0]

# One (chain, residue number, one-letter amino acid) record per residue of
# chains A and B; residues whose hetero flag is not blank are skipped.
three_to_one = Bio.PDB.Polypeptide.protein_letters_3to1
records = [
    (chain_id, residue.id[1], three_to_one[residue.resname])
    for chain_id in ("A", "B")
    for residue in pdb_obj[chain_id].get_residues()
    if residue.id[0].isspace()
]
pdb_df = pd.DataFrame(records, columns=["chains", "protein_site", "pdb_aa"])

# Left merge keeps every sitemap row; `pdb_aa` is null where the structure
# has no residue for that chain / protein site.
mismatched_sites = sitemap.merge(pdb_df, how="left")

in_pdb = mismatched_sites["pdb_aa"].notnull()
same_aa = mismatched_sites["wildtype"] == mismatched_sites["pdb_aa"]
differs = in_pdb & ~same_aa

n_match = int(same_aa.sum())
n_missing = int((~in_pdb).sum())
n_differ = int(differs.sum())

print(
    f"Of {len(sitemap)} sites, {n_match} match, "
    f"{n_missing} are missing from PDB, and "
    f"{n_differ} differ."
)

print("Sites that differ:")
display(mismatched_sites[differs].reset_index(drop=True))

Of 566 sites, 453 match, 80 are missing from PDB, and 33 differ.
Sites that differ:


Unnamed: 0,sequential_site,reference_site,protein_site,wildtype,HA_chain,chains,pdb_aa
0,16,10,-1,S,HA1,A,P
1,61,54a,45,N,HA1,A,D
2,69,62,53,K,HA1,A,R
3,88,81,72,R,HA1,A,N
4,98,90,82,R,HA1,A,K
5,110,101,94,S,HA1,A,N
6,111,102,95,L,HA1,A,F
7,120,111,104,M,HA1,A,L
8,131,122,115,L,HA1,A,Q
9,139,128,123,P,HA1,A,S


Add the per-species (ferret and mouse) sera escape to the phenotypes as separate columns, then write the phenotypes to a CSV file:

In [6]:
# Reshape the per-antibody escape table from long to wide: one row per
# (site, wildtype, mutant) and one column per value of `antibody`.
escape_by_antibody = pd.pivot_table(
    pd.read_csv(per_antibody_escape_csv),
    index=["site", "wildtype", "mutant"],
    columns="antibody",
    values="escape",
)

# Give the species columns explicit names and turn the index levels back
# into ordinary columns so the frame can be merged on them.
per_species_escape = (
    escape_by_antibody
    .rename(columns={"ferret": "ferret_sera_escape", "mouse": "mouse_sera_escape"})
    .reset_index()
)

In [7]:
# Build the phenotype table written for dms-viz.
#
# Start from the phenotypes as stored in `phenotypes_csv` rather than from the
# `phenotypes` variable: reassigning `phenotypes` from itself made this cell
# non-idempotent (a second run merges the already-merged frame and fails on
# the suffixed / already-dropped columns). Re-reading the CSV gives the same
# input on every run.
phenotypes = (
    pd.read_csv(phenotypes_csv)
    .merge(
        per_species_escape,
        on=["site", "wildtype", "mutant"],
        how="outer",
        # each mutation may appear at most once on either side
        validate="one_to_one",
    )
    .assign(
        # rows where mutant equals wildtype get an escape of exactly 0
        ferret_sera_escape=lambda x: x["ferret_sera_escape"].where(x["mutant"] != x["wildtype"], 0),
        mouse_sera_escape=lambda x: x["mouse_sera_escape"].where(x["mutant"] != x["wildtype"], 0),
    )
    # replaced by the two per-species escape columns added above
    .drop(columns="species sera escape")
    .assign(
        # mutation label: wildtype + site + mutant
        mutation=lambda x: x["wildtype"] + x["site"].astype(str) + x["mutant"],
    )
    .rename(
        columns={
            "entry in 293T cells": "cell_entry",
            "SA26 usage increase": "a26_usage",
        }
    )
)

print(f"Phenotypes has following columns: {phenotypes.columns.tolist()}")

phenotypes.to_csv(dms_viz_phenotypes, index=False)

Phenotypes has following columns: ['site', 'wildtype', 'mutant', 'cell_entry', 'stability', 'a26_usage', 'sequential_site', 'reference_H1_site', 'mature_H5_site', 'HA1_HA2_H5_site', 'region', 'nt changes to codon', 'ferret_sera_escape', 'mouse_sera_escape', 'mutation']


Run [configure-dms-viz](https://dms-viz.github.io/dms-viz-docs/preparing-data/command-line-api/).
First, set up some options:

In [8]:
# Phenotype columns to visualize, one dms-viz dataset per column. The values
# are per-phenotype option dicts (currently all empty).
phenotype_cols = {
    "cell_entry": {},
    "ferret_sera_escape": {},
    "mouse_sera_escape": {},
    "stability": {},
    "a26_usage": {},
}

# additional tooltips to show: column name -> label (underscores become spaces)
tooltip_cols = {
    c: c.replace("_", " ")
    for c in list(phenotype_cols) + ["mutation", "sequential_site", "mature_H5_site", "region"]
}

# fail early if any requested column is absent from the phenotypes table
assert set(tooltip_cols).issubset(phenotypes.columns)

filter_cols = ["cell_entry"]
# `Series.min()` returns a NumPy scalar. These limits are later passed to the
# command line via `str(dict)`, and under NumPy >= 2 a NumPy scalar renders as
# `np.float64(...)` rather than a plain number, so convert to a built-in float.
filter_limits = {"cell_entry": [float(phenotypes["cell_entry"].min()), -3, 0]}

Now make the JSONs for each phenotype, and then combine them:

In [9]:
# Run `configure-dms-viz format` once per phenotype column, collecting the
# per-phenotype JSON paths so they can be joined afterwards.
pheno_jsons = []
for pheno_col in phenotype_cols:
    pheno_json = os.path.join(dms_viz_subdir, f"{pheno_col}.json")
    print(f"Writing phenotype {pheno_col} to {pheno_json}")

    # The phenotype being plotted is excluded from its own tooltips and filters.
    other_tooltips = {col: label for col, label in tooltip_cols.items() if col != pheno_col}
    pheno_filter_cols = {col: col for col in filter_cols if col != pheno_col}

    # Flag -> value, in the order the arguments are passed on the command line.
    format_args = {
        "--name": pheno_col.replace("_", " "),
        "--input": dms_viz_phenotypes,
        "--metric": pheno_col,
        "--structure": pdb_file,
        "--sitemap": dms_viz_sitemap,
        "--included-chains": "A B",
        "--tooltip-cols": str(other_tooltips),
        "--alphabet": "RKHDEQNSTYWFAILMVGPC",
        "--output": pheno_json,
    }
    if pheno_filter_cols:
        format_args["--filter-cols"] = str(pheno_filter_cols)
        format_args["--filter-limits"] = str(
            {col: filter_limits[col] for col in pheno_filter_cols}
        )

    cmds = ["configure-dms-viz", "format"]
    for flag, value in format_args.items():
        cmds.extend((flag, value))

    subprocess.run(cmds, check=True)
    pheno_jsons.append(pheno_json)

# Combine the per-phenotype JSONs into the single file loaded by dms-viz.
print(f"Concatenating phenotype JSONs to {dms_viz_json}")
join_cmd = ["configure-dms-viz", "join", "--input", ", ".join(pheno_jsons), "--output", dms_viz_json]
subprocess.run(join_cmd, check=True)

Writing phenotype cell_entry to results/dms-viz/cell_entry.json

Formatting data for visualization using the 'cell_entry' column from 'results/dms-viz/phenotypes.csv'...

Using sitemap from 'results/dms-viz/sitemap.csv'.


About 93.20% (452 of 485) of the wildtype residues in the data match the corresponding residues in the structure.
About 14.16% (80 of 565) of the data sites are missing from the structure.

Success! The visualization JSON was written to 'results/dms-viz/cell_entry.json'
Writing phenotype ferret_sera_escape to results/dms-viz/ferret_sera_escape.json

Formatting data for visualization using the 'ferret_sera_escape' column from 'results/dms-viz/phenotypes.csv'...

Using sitemap from 'results/dms-viz/sitemap.csv'.



About 92.62% (414 of 447) of the wildtype residues in the data match the corresponding residues in the structure.
About 14.53% (76 of 523) of the data sites are missing from the structure.

Success! The visualization JSON was written to 'results/dms-viz/ferr

CompletedProcess(args=['configure-dms-viz', 'join', '--input', 'results/dms-viz/cell_entry.json, results/dms-viz/ferret_sera_escape.json, results/dms-viz/mouse_sera_escape.json, results/dms-viz/stability.json, results/dms-viz/a26_usage.json', '--output', 'results/dms-viz/dms-viz.json'], returncode=0)