# Configure structure-based analysis for `dms-viz`

Imports:

In [1]:
import os
import tempfile
import warnings

import Bio.PDB.PDBParser
import Bio.PDB.Polypeptide

import pandas as pd

Define variables:

In [2]:
# PDB entry that is downloaded below and passed to `configure-dms-viz --structure`
pdb_id = "4kwm"

# inputs read by this notebook
phenotypes_csv = "results/summaries/phenotypes.csv"
site_numbering_map = "data/site_numbering_map.csv"

# sitemap CSV written by this notebook and passed to `configure-dms-viz --sitemap`
dms_viz_sitemap = "_dms_viz_site_map.csv"

Build the [sitemap](https://dms-viz.github.io/dms-viz-docs/preparing-data/data-requirements/#reference-site) used by `dms-viz`:

In [3]:
# Wildtype residue recorded for each site in the phenotype summary, with `site`
# renamed to `reference_site` so it can be merged onto the numbering table.
phenotypes = pd.read_csv(phenotypes_csv)
wildtype_by_site = (
    phenotypes[["site", "wildtype"]]
    .drop_duplicates()
    .rename(columns={"site": "reference_site"})
)

# Site-numbering table in sequential order.
numbering = pd.read_csv(site_numbering_map).sort_values("sequential_site")

# Chain label: second whitespace-separated token of `HA1_HA2_H5_site` with its
# first and last characters stripped.
ha_chain = numbering["HA1_HA2_H5_site"].str.split().str[1].str[1:-1]
is_ha1 = ha_chain == "HA1"

# HA1 sites keep their mature H5 number; every other site is renumbered so that
# the lowest-numbered HA2 site becomes 1.
first_ha2_site = numbering.loc[ha_chain == "HA2", "mature_H5_site"].min()
renumbered = numbering["mature_H5_site"] - first_ha2_site + 1

numbering = numbering.assign(
    HA_chain=ha_chain,
    protein_site=numbering["mature_H5_site"].where(is_ha1, renumbered),
    chains=ha_chain.map({"HA1": "A", "HA2": "B"}),
)

# Inner merge on the shared column(s) attaches the wildtype residue; only the
# columns needed for the sitemap are kept.
sitemap = numbering.merge(wildtype_by_site)[
    ["sequential_site", "reference_site", "protein_site", "wildtype", "HA_chain", "chains"]
]

sitemap.to_csv(dms_viz_sitemap, index=False)

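Get the biological assembly. For background, see the RCSB PDB-101 guides:

- [Introduction to biological assemblies](https://pdb101.rcsb.org/learn/guide-to-understanding-pdb-data/biological-assemblies)
- [Downloading biological assembly files](https://pdb101.rcsb.org/learn/guide-to-understanding-pdb-data/biological-assemblies#Anchor-download)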

Check which sites differ in residue identity between the sitemap and the protein structure:

In [4]:
# Download the PDB entry into a throwaway directory and parse it. Only the parsed
# model is needed afterwards, so the file is discarded when the `with` block exits.
with tempfile.TemporaryDirectory() as tempdir:
    # Use the path reported by `retrieve_pdb_file` rather than rebuilding the
    # file name by hand, so the check does not depend on the naming convention.
    temppdbfile = Bio.PDB.PDBList().retrieve_pdb_file(
        pdb_id, pdir=tempdir, file_format="pdb"
    )
    assert temppdbfile and os.path.isfile(temppdbfile), (
        f"failed to download PDB entry {pdb_id!r}"
    )
    with warnings.catch_warnings():
        # silence warnings emitted while parsing the PDB file
        warnings.simplefilter("ignore")
        # keep only the first model of the structure
        pdb_obj = Bio.PDB.PDBParser().get_structure(id=pdb_id, file=temppdbfile)[0]

# One (chain, residue number, one-letter amino acid) record per residue in
# chains A and B, the chain IDs assigned to HA1 and HA2 in the sitemap.
records = []
for chain in ["A", "B"]:
    for res in pdb_obj[chain].get_residues():
        # `res.id[0]` is the hetero flag; it is blank for regular residues, so
        # anything else (waters, ligands, ...) is skipped.
        if not res.id[0].isspace():
            continue
        aa = Bio.PDB.Polypeptide.protein_letters_3to1[res.resname]
        records.append((chain, res.id[1], aa))
pdb_df = pd.DataFrame(records, columns=["chains", "protein_site", "pdb_aa"])

# Left merge keeps every sitemap site; `pdb_aa` is null where the structure has
# no residue at that chain / site.
mismatched_sites = sitemap.merge(pdb_df, how="left", on=["chains", "protein_site"])
# The summary below reports counts out of `len(sitemap)`, which is only valid if
# the merge did not duplicate rows (e.g. several PDB residues sharing a number).
assert len(mismatched_sites) == len(sitemap), (
    "PDB residues are not unique per (chain, site); merge duplicated sitemap rows"
)

in_pdb = mismatched_sites["pdb_aa"].notnull()
same_aa = mismatched_sites["wildtype"] == mismatched_sites["pdb_aa"]
differing_sites = mismatched_sites[in_pdb & ~same_aa]

print(
    f"Of {len(sitemap)} sites, {same_aa.sum()} match, "
    f"{(~in_pdb).sum()} are missing from PDB, and "
    f"{len(differing_sites)} differ."
)

print("Sites that differ:")
display(differing_sites.reset_index(drop=True))

/tmp/tmpsyos_2ml
Downloading PDB structure '4kwm'...
Of 566 sites, 453 match, 80 are missing from PDB, and 33 differ.
Sites that differ:


Unnamed: 0,sequential_site,reference_site,protein_site,wildtype,HA_chain,chains,pdb_aa
0,16,10,-1,S,HA1,A,P
1,61,54a,45,N,HA1,A,D
2,69,62,53,K,HA1,A,R
3,88,81,72,R,HA1,A,N
4,98,90,82,R,HA1,A,K
5,110,101,94,S,HA1,A,N
6,111,102,95,L,HA1,A,F
7,120,111,104,M,HA1,A,L
8,131,122,115,L,HA1,A,Q
9,139,128,123,P,HA1,A,S


In [5]:
# Show every chain present in the parsed model (only A and B are used above).
[chain for chain in pdb_obj.get_chains()]

[<Chain id=A>,
 <Chain id=B>,
 <Chain id=C>,
 <Chain id=D>,
 <Chain id=E>,
 <Chain id=F>]

Run [configure-dms-viz](https://dms-viz.github.io/dms-viz-docs/preparing-data/command-line-api/):

In [6]:
! configure-dms-viz format \
  --name "entry in 293T cells" \
  --input {phenotypes_csv} \
  --metric stability \
  --structure {pdb_id} \
  --sitemap {dms_viz_sitemap} \
  --included-chains "A B" \
  --output dms_viz.json

[32m
Formatting data for visualization using the 'stability' column from 'results/summaries/phenotypes.csv'...[0m
[32m
Using sitemap from '_dms_viz_site_map.csv'.[0m
[31m
[31m
[33mAbout 92.24% (392 of 425) of the wildtype residues in the data match the corresponding residues in the structure.[0m
[33mAbout 14.14% (70 of 495) of the data sites are missing from the structure.[0m
[32m
Success! The visualization JSON was written to 'dms_viz.json'[0m
