# Configure `dms-viz` JSONs

Imports:

In [None]:
import gzip
import os
import requests
import subprocess
import warnings

import Bio.PDB.PDBParser
import Bio.PDB.Polypeptide

import matplotlib.colors

import pandas as pd

import seaborn

Get variables from `snakemake`:

In [None]:
# ---- parameters and wildcards ----
config = snakemake.params.config
description_suffix = snakemake.params.description_suffix
name = snakemake.wildcards.struct  # taken from the `struct` wildcard

# ---- settings read out of the configuration ----
pdb_id, escape_or_phenotype = config["pdb_id"], config["escape_or_phenotype"]
phenotypes_list, antibody_list = config["phenotypes"], config["antibodies"]

# ---- input files ----
phenotypes_csv = snakemake.input.phenotypes_csv
# the per-antibody escape CSV is only read as an input in "escape" mode
if escape_or_phenotype == "escape":
    per_antibody_escape_csv = snakemake.input.per_antibody_escape_csv

# ---- output files ----
dms_viz_json, dms_viz_json_no_description = (
    snakemake.output.json,
    snakemake.output.json_no_description,
)
dms_viz_sitemap, dms_viz_phenotypes = (
    snakemake.output.sitemap,
    snakemake.output.phenotypes,
)
pdb_file, description_md = snakemake.output.pdb_file, snakemake.output.description_md

Build the [sitemap](https://dms-viz.github.io/dms-viz-docs/preparing-data/data-requirements/#reference-site) used by `dms-viz`:

In [None]:
# Read the full table of phenotypes; later cells also use this data frame.
phenotypes_all = pd.read_csv(phenotypes_csv)

# The sitemap has one row per site: sequential numbering, reference numbering
# (the `site` column, renamed to `reference_site`), and the wildtype identity.
sitemap = phenotypes_all[["sequential_site", "site", "wildtype"]]
sitemap = sitemap.rename(columns={"site": "reference_site"})
sitemap = sitemap.drop_duplicates().sort_values("sequential_site")

# Each sequential site and each reference site must appear in exactly one row.
assert (
    len(sitemap)
    == sitemap["sequential_site"].nunique()
    == sitemap["reference_site"].nunique()
)

sitemap.to_csv(dms_viz_sitemap, index=False)

Download the biological assembly (see https://pdb101.rcsb.org/learn/guide-to-understanding-pdb-data/biological-assemblies#Anchor-download) rather than the default PDB entry, since the crystallographic asymmetric unit does not always correspond to the biological assembly:

In [None]:
# Re-read the PDB ID from the configuration (same value as read at the top).
pdb_id = config["pdb_id"]

print(f"Getting PDB file for {pdb_id}")

# Download the gzipped biological assembly file from RCSB.
pdb_url = f"https://files.rcsb.org/download/{pdb_id}.pdb1.gz"
# Always set a timeout: without one `requests` can wait on the server forever.
response = requests.get(pdb_url, timeout=60)
# Raise explicitly rather than `assert`, which is skipped under `python -O`
# and gives no hint about which download failed.
if response.status_code != 200:
    raise RuntimeError(
        f"Failed to download {pdb_url}: HTTP status {response.status_code}"
    )

# Decompress in memory and write the plain-text PDB file.
pdb_content = gzip.decompress(response.content).decode("utf-8")
with open(pdb_file, "w") as f:
    f.write(pdb_content)

Check for sites where the wildtype residue in the sitemap differs from the residue at that position in the protein structure:

In [None]:
# Parse the structure with all warnings silenced, keeping only the first model.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    pdb_obj = Bio.PDB.PDBParser().get_structure(id=pdb_id, file=pdb_file)[0]

all_chains = [chain.id for chain in pdb_obj]
print(f"PDB has {all_chains=}")

# Use the configured chains if any are given; a missing, null, or empty
# "included-chains" entry means all chains in the structure are used.
included_chains = config.get("included-chains")
if included_chains:
    print(f"{included_chains=}")
    assert set(included_chains).issubset(all_chains), (
        f"{included_chains=} not all in {all_chains=}"
    )
else:
    included_chains = all_chains

# Collect (chain, residue number, one-letter amino acid) for each residue.
records = []
for chain in included_chains:
    for res in pdb_obj[chain].get_residues():
        # `res.id` is (hetero flag, residue number, insertion code) in Biopython;
        # only residues with a blank hetero flag are kept.
        if not res.id[0].isspace():
            continue
        # raises `KeyError` if the residue name is not in Biopython's table
        aa = Bio.PDB.Polypeptide.protein_letters_3to1[res.resname]
        records.append((chain, res.id[1], aa))
pdb_df = pd.DataFrame(records, columns=["chain", "reference_site", "pdb_aa"])

# Left merge keeps every sitemap site; `pdb_aa` is null for sites not in the PDB.
# `reference_site` is the only column shared by the two data frames.
mismatched_sites = sitemap.merge(pdb_df, how="left", on="reference_site")

print(
    f"Of {len(sitemap)} sites, {len(mismatched_sites.query('wildtype == pdb_aa'))} match, "
    f"{len(mismatched_sites.query('pdb_aa.isnull()'))} are missing from PDB, and "
    f"{len(mismatched_sites.query('pdb_aa.notnull()').query('wildtype != pdb_aa'))} differ."
)

print("Sites that differ:")
display(mismatched_sites.query("pdb_aa.notnull() and (wildtype != pdb_aa)").reset_index(drop=True))

Process the phenotypes, or the per-antibody escape values if running in escape mode:

In [None]:
# `dms-viz` does not allow column names with spaces, so replace spaces with
# underscores in the tooltip and filter columns. Both the data frame and the
# matching `config` entries are updated to use the new names.
extra_cols = []
for coltype in ["tooltip-cols", "filter-cols"]:
    # iterate over a copy of the keys since `config[coltype]` is modified below
    for col in list(config[coltype]):
        col_nospace = col.replace(" ", "_")
        assert (col in phenotypes_all.columns) or (col_nospace in phenotypes_all.columns), (
            f"no column {col} in {phenotypes_all.columns=}"
        )
        if col != col_nospace:
            # the data frame must not contain both the spaced and un-spaced name
            assert (col_nospace not in phenotypes_all.columns) or (col not in phenotypes_all.columns)
            assert col_nospace not in config[coltype]
            # `errors="ignore"` tolerates `col` being absent from the data frame
            # (e.g., it was already renamed on an earlier pass of this loop)
            phenotypes_all = phenotypes_all.rename(columns={col: col_nospace}, errors="ignore")
            config[coltype][col_nospace] = config[coltype][col]
            del config[coltype][col]
            extra_cols.append(col_nospace)
            if coltype == "filter-cols":
                # keep any limits for this filter column keyed by the new name
                if col in config["filter-limits"]:
                    config["filter-limits"][col_nospace] = config["filter-limits"][col]
                    del config["filter-limits"][col]
        else:
            extra_cols.append(col)

# A column may be listed as both a tooltip and a filter column; keep it only
# once (preserving order) so it is not duplicated in the selections below.
extra_cols = list(dict.fromkeys(extra_cols))
print(f"Keeping {extra_cols=} for filters and tooltips")
assert set(extra_cols).issubset(phenotypes_all.columns)

# columns that identify a mutation
id_cols = ["site", "wildtype", "mutant"]

if escape_or_phenotype == "phenotype":
    # phenotype mode: each listed phenotype column becomes a condition
    assert phenotypes_list and not antibody_list, f"{phenotypes_list=}, {antibody_list=}"
    assert set(phenotypes_list).issubset(phenotypes_all.columns), f"{phenotypes_list=}\n{phenotypes_all.columns=}"
    metric_column = "effect"
    condition_column = "phenotype"
    # wide to long: one row per mutation and phenotype
    phenotypes = (
        phenotypes_all
        .melt(
            id_vars=id_cols + extra_cols,
            value_vars=phenotypes_list,
            var_name=condition_column,
            value_name=metric_column,
        )
    )

elif escape_or_phenotype == "escape":
    # escape mode: each antibody is a condition and escape is the metric
    assert not phenotypes_list

    phenotypes = pd.read_csv(per_antibody_escape_csv)

    antibodies_all = list(phenotypes["antibody"].unique())
    if antibody_list is not None:
        antibodies = antibody_list
        print(f"Using just {antibodies=} from {antibodies_all=}")
        assert set(antibodies).issubset(antibodies_all), f"{antibodies=}\n{antibodies_all=}"
        assert len(antibodies) >= 1
    else:
        antibodies = antibodies_all
        print(f"Using all {antibodies=}")
    metric_column = "escape"
    condition_column = "antibody"
    phenotypes = phenotypes[id_cols + [metric_column, condition_column]]
    phenotypes = phenotypes[phenotypes[condition_column].isin(antibodies)]

    if extra_cols:
        # add the tooltip / filter columns from the phenotypes table
        join_data = phenotypes_all[id_cols + extra_cols]
        # there must be exactly one row of extra data per mutation
        assert len(join_data) == len(join_data.groupby(id_cols))
        assert set(join_data.columns).intersection(phenotypes.columns) == set(id_cols)
        phenotypes = phenotypes.merge(join_data, on=id_cols, validate="many_to_one", how="left")

else:
    raise ValueError(f"invalid {escape_or_phenotype=}")

print(f"Writing the phenotypes to {dms_viz_phenotypes}")
phenotypes.to_csv(dms_viz_phenotypes, index=False)
display(phenotypes)

Get enough colors:

In [None]:
def get_hex_color_palette(num_colors):
    """Return a list of `num_colors` hex color strings from seaborn's "hls" palette."""
    return [
        matplotlib.colors.to_hex(rgb)
        for rgb in seaborn.color_palette("hls", num_colors)
    ]

# Use the fixed four-color palette unless there are more than four conditions,
# in which case generate one color per condition.
nconditions = phenotypes[condition_column].nunique()
colors = (
    ",".join(get_hex_color_palette(nconditions))
    if nconditions > 4
    else "#0072B2,#CC79A7,#4C3549,#009E73"
)

print(f"Using {colors=}")
# show the chosen palette
seaborn.palplot(seaborn.color_palette(colors.split(",")))

Get the description:

In [None]:
# Sections of the Markdown description; they are joined with blank lines between.
title_section = f"## {config['title']}"
# a null description in the configuration contributes an empty section
body_section = "" if config["description"] is None else config["description"]
structure_section = f"Structure shown is from PDB {pdb_id}."

description_text = "\n\n".join(
    (title_section, body_section, structure_section, description_suffix)
).lstrip()

print(description_text)

with open(description_md, "w") as description_handle:
    description_handle.write(description_text)

Now run [configure-dms-viz](https://dms-viz.github.io/dms-viz-docs/preparing-data/command-line-api/):

In [None]:
# Build the `configure-dms-viz format` command as an argument list (no shell).
cmds = [
    "configure-dms-viz", "format",
    "--name", name,
    "--input", dms_viz_phenotypes,
    "--output", dms_viz_json_no_description,
    "--structure", pdb_file,
    "--metric", metric_column,
    "--condition", condition_column,
    "--sitemap", dms_viz_sitemap,
    "--colors", colors,
]

# Chain selections are optional: only pass them when present and non-empty.
# `.get` is used so a configuration without these keys does not raise `KeyError`.
for col in [
    "included-chains",
    "excluded-chains",
]:
    val = config.get(col)
    if val:
        cmds += [f"--{col}", " ".join(val)]

if config["heatmap-limits"]:
    cmds += ["--heatmap-limits", ", ".join(str(x) for x in config["heatmap-limits"])]

# These options are always passed, each converted to a string with `str`.
# NOTE(review): a null value (e.g. `description`) is passed as the literal
# string "None" -- confirm that is what `configure-dms-viz` should receive.
for col in [
    "alphabet",
    "floor",
    "summary-stat",
    "tooltip-cols",
    "filter-cols",
    "filter-limits",
    "title",
    "description",
]:
    cmds += [f"--{col}", str(config[col])]

print(f"Running the following commands:\n{cmds}")
subprocess.run(cmds, check=True)

# Join the Markdown description onto the formatted JSON to make the final JSON.
cmds_join = [
    "configure-dms-viz", "join",
    "--input", dms_viz_json_no_description,
    "--description", description_md,
    "--output", dms_viz_json,
]

print(f"Running the following commands:\n{cmds_join}")
subprocess.run(cmds_join, check=True)