### This script is not currently used in the main pipeline; it simply calculates the atomic distances between residues in Nipah F and different monoclonal antibodies from the .pdb files

In [None]:
import altair as alt
import numpy as np
import pandas as pd
from Bio import PDB

# allow more rows for Altair
_ = alt.data_transformers.disable_max_rows()

In [None]:
# Lookup from three-letter residue names (as stored in PDB records) to
# one-letter amino-acid codes for the 20 standard amino acids.
three_to_one_letter = {
    "ALA": "A", "ARG": "R", "ASN": "N", "ASP": "D", "CYS": "C",
    "GLU": "E", "GLN": "Q", "GLY": "G", "HIS": "H", "ILE": "I",
    "LEU": "L", "LYS": "K", "MET": "M", "PHE": "F", "PRO": "P",
    "SER": "S", "THR": "T", "TRP": "W", "TYR": "Y", "VAL": "V",
}

In [None]:
def calculate_min_distances(pdb_path, f_chain_ids, mab_chain_ids):
    """
    Find, for every F residue number, the closest antibody residue.

    For each residue in the F chains, every atom is compared against every
    atom of every residue in the antibody chains, and the smallest atom-atom
    distance is kept. Sites are keyed by residue number only, so the same
    residue number occurring in several F chains is pooled into one row that
    reports whichever chain gave the shortest distance.

    Only the first model of the structure (index 0) is used.

    Args:
        pdb_path (str): Path to the PDB file.
        f_chain_ids (list): Chain IDs of the F chains.
        mab_chain_ids (list): Chain IDs of the antibody chains.

    Returns:
        pandas.DataFrame: One row per F site, sorted by ``site``, with the
            columns ``site``, ``f_chain``, ``f_residue_name``, ``mab_chain``,
            ``mab_site``, ``mab_residue_name`` and ``min_distance`` (in the
            units of the PDB coordinates). When no F/antibody atom pair is
            found, an empty DataFrame with these columns is returned.
    """
    # Residue names that are skipped on each side.
    # NOTE(review): "NAG" is excluded for F but not for the antibody chains;
    # this asymmetry is kept as-is - confirm it is intentional.
    f_excluded = {"HOH", "WAT", "IPA", "NAG", "SO4"}
    mab_excluded = {"HOH", "WAT", "IPA", "SO4"}

    columns = [
        "site",
        "f_chain",
        "f_residue_name",
        "mab_chain",
        "mab_site",
        "mab_residue_name",
        "min_distance",
    ]

    # Initialize the PDB parser and load the structure from the given pdb_path
    parser = PDB.PDBParser(QUIET=True)
    structure = parser.get_structure("structure_id", pdb_path)

    # Retrieve the F chains and antibody chains from the first model
    f_chains = [structure[0][chain_id] for chain_id in f_chain_ids]
    mab_chains = [structure[0][chain_id] for chain_id in mab_chain_ids]

    # The antibody residues do not depend on the F residue being examined, so
    # filter them once instead of once per F residue. Order is preserved
    # (chain order, then residue order within each chain).
    mab_residues = [
        (mab_chain.get_id(), mab_residue)
        for mab_chain in mab_chains
        for mab_residue in mab_chain
        if mab_residue.resname not in mab_excluded
    ]

    # Key: F residue number, Value: info about the closest antibody residue
    site_distances = {}

    for f_chain in f_chains:
        f_chain_id = f_chain.get_id()
        for f_residue in f_chain:
            if f_residue.resname in f_excluded:
                continue

            f_site_id = f_residue.id[1]  # residue number (insertion code ignored)

            # First time this site is seen: register a placeholder with no
            # antibody partner; it is dropped later if nothing closer is found.
            current_min = site_distances.setdefault(
                f_site_id,
                {
                    "min_distance": float("inf"),
                    "f_chain": f_chain_id,
                    "f_residue_name": f_residue.resname,
                    "mab_chain": None,
                    "mab_residue_id": None,
                    "mab_residue_name": None,
                },
            )["min_distance"]

            for mab_chain_id, mab_residue in mab_residues:
                for f_atom in f_residue:
                    for mab_atom in mab_residue:
                        # Subtracting two Bio.PDB atoms gives their distance
                        distance = f_atom - mab_atom
                        if distance < current_min:
                            current_min = distance
                            # NOTE(review): mab_residue.id[1] drops the
                            # insertion code, so e.g. 100A and 100B would both
                            # be reported as 100 - confirm this is acceptable.
                            site_distances[f_site_id] = {
                                "min_distance": distance,
                                "f_chain": f_chain_id,
                                "f_residue_name": f_residue.resname,
                                "mab_chain": mab_chain_id,
                                "mab_residue_id": mab_residue.id[1],
                                "mab_residue_name": mab_residue.resname,
                            }

    # Keep only sites for which an antibody residue was actually found
    data = [
        {
            "site": f_site_id,
            "f_chain": info["f_chain"],
            "f_residue_name": info["f_residue_name"],
            "mab_chain": info["mab_chain"],
            "mab_site": info["mab_residue_id"],
            "mab_residue_name": info["mab_residue_name"],
            "min_distance": info["min_distance"],
        }
        for f_site_id, info in site_distances.items()
        if info["mab_chain"] is not None
    ]

    # Passing the columns explicitly keeps the frame well-formed when `data`
    # is empty; otherwise sort_values("site") would raise KeyError.
    df = pd.DataFrame(data, columns=columns)
    return df.sort_values("site")  # Sort by F site ID for clarity

In [None]:
# Run calculate_min_distances on the 12B2 structure and save the result
ab = "12B2"
path = "../../data/pdb/12B2_7ki4.pdb"
source_chain = ["B", "A"]  # passed as the F chain IDs
target_chains = ["H", "L"]  # passed as the antibody chain IDs

df_dist = calculate_min_distances(path, source_chain, target_chains)

# Add one-letter codes alongside the three-letter residue names
df_dist = df_dist.assign(
    wildtype=df_dist["f_residue_name"].replace(three_to_one_letter),
    mab_residue=df_dist["mab_residue_name"].replace(three_to_one_letter),
)

display(df_dist)

# Export the selected columns, rounded to two decimals
df_dist[
    ["wildtype", "site", "f_chain", "mab_site", "mab_residue", "mab_chain", "min_distance"]
].round(2).to_csv(f"../../results/atomic_distances/{ab}_distances.csv", index=False)

In [None]:
# Run calculate_min_distances on the 2D3 structure and save the result
ab = "2D3"
path = "../../data/pdb/2D3_7up9.pdb"
source_chain = ["A", "G"]  # passed as the F chain IDs
target_chains = ["H", "L"]  # passed as the antibody chain IDs

df_dist = calculate_min_distances(path, source_chain, target_chains)

# Add one-letter codes alongside the three-letter residue names
df_dist = df_dist.assign(
    wildtype=df_dist["f_residue_name"].replace(three_to_one_letter),
    mab_residue=df_dist["mab_residue_name"].replace(three_to_one_letter),
)

display(df_dist)

# Export the selected columns, rounded to two decimals
df_dist[
    ["wildtype", "site", "f_chain", "mab_site", "mab_residue", "mab_chain", "min_distance"]
].round(2).to_csv(f"../../results/atomic_distances/{ab}_distances.csv", index=False)

In [None]:
# Run calculate_min_distances on the 4H3 structure and save the result
ab = "4H3"
path = "../../data/pdb/4H3_7uop.pdb"
source_chain = ["A"]  # passed as the F chain IDs
target_chains = ["H", "L"]  # passed as the antibody chain IDs

df_dist = calculate_min_distances(path, source_chain, target_chains)

# Add one-letter codes alongside the three-letter residue names
df_dist = df_dist.assign(
    wildtype=df_dist["f_residue_name"].replace(three_to_one_letter),
    mab_residue=df_dist["mab_residue_name"].replace(three_to_one_letter),
)

display(df_dist)

# Export the selected columns, rounded to two decimals
df_dist[
    ["wildtype", "site", "f_chain", "mab_site", "mab_residue", "mab_chain", "min_distance"]
].round(2).to_csv(f"../../results/atomic_distances/{ab}_distances.csv", index=False)


In [None]:
# Run calculate_min_distances on the 1A9 structure and save the result
ab = "1A9"
path = "../../data/pdb/1A9_7upk.pdb"
source_chain = ["B", "D"]  # passed as the F chain IDs
target_chains = ["H", "L"]  # passed as the antibody chain IDs

df_dist = calculate_min_distances(path, source_chain, target_chains)

# Add one-letter codes alongside the three-letter residue names
df_dist = df_dist.assign(
    wildtype=df_dist["f_residue_name"].replace(three_to_one_letter),
    mab_residue=df_dist["mab_residue_name"].replace(three_to_one_letter),
)

display(df_dist)

# Export the selected columns, rounded to two decimals
df_dist[
    ["wildtype", "site", "f_chain", "mab_site", "mab_residue", "mab_chain", "min_distance"]
].round(2).to_csv(f"../../results/atomic_distances/{ab}_distances.csv", index=False)

In [None]:
# Run calculate_min_distances on the 2B12 structure and save the result
ab = "2B12"
path = "../../data/pdb/2B12_7upd.pdb"
source_chain = ["D", "G"]  # passed as the F chain IDs
target_chains = ["H", "L"]  # passed as the antibody chain IDs

df_dist = calculate_min_distances(path, source_chain, target_chains)

# Add one-letter codes alongside the three-letter residue names
df_dist = df_dist.assign(
    wildtype=df_dist["f_residue_name"].replace(three_to_one_letter),
    mab_residue=df_dist["mab_residue_name"].replace(three_to_one_letter),
)

display(df_dist)

# Export the selected columns, rounded to two decimals
df_dist[
    ["wildtype", "site", "f_chain", "mab_site", "mab_residue", "mab_chain", "min_distance"]
].round(2).to_csv(f"../../results/atomic_distances/{ab}_distances.csv", index=False)

In [None]:
# Run calculate_min_distances for the antibody labelled 1F2 and save the result
# NOTE(review): the label is "1F2" but the structure file is named
# "1H8_7UPA.pdb" - confirm that this pairing is intended.
ab = "1F2"
path = "../../data/pdb/1H8_7UPA.pdb"
source_chain = ["A", "G"]  # passed as the F chain IDs
target_chains = ["H", "L"]  # passed as the antibody chain IDs

df_dist = calculate_min_distances(path, source_chain, target_chains)

# Add one-letter codes alongside the three-letter residue names
df_dist = df_dist.assign(
    wildtype=df_dist["f_residue_name"].replace(three_to_one_letter),
    mab_residue=df_dist["mab_residue_name"].replace(three_to_one_letter),
)

display(df_dist)

# Export the selected columns, rounded to two decimals
df_dist[
    ["wildtype", "site", "f_chain", "mab_site", "mab_residue", "mab_chain", "min_distance"]
].round(2).to_csv(f"../../results/atomic_distances/{ab}_distances.csv", index=False)

In [None]:
# Reload each per-antibody distance table and tag its rows with the antibody name
df_1A9 = pd.read_csv("../../results/atomic_distances/1A9_distances.csv")
df_1A9["antibody"] = "1A9"

df_2B12 = pd.read_csv("../../results/atomic_distances/2B12_distances.csv")
df_2B12["antibody"] = "2B12"

df_2D3 = pd.read_csv("../../results/atomic_distances/2D3_distances.csv")
df_2D3["antibody"] = "2D3"

df_4H3 = pd.read_csv("../../results/atomic_distances/4H3_distances.csv")
df_4H3["antibody"] = "4H3"

df_12B2 = pd.read_csv("../../results/atomic_distances/12B2_distances.csv")
df_12B2["antibody"] = "12B2"

df_1F2 = pd.read_csv("../../results/atomic_distances/1F2_distances.csv")
df_1F2["antibody"] = "1F2"

# Stack all tables into one frame with a fresh 0..n-1 index, then save it
combined_df = pd.concat(
    (df_1A9, df_2B12, df_2D3, df_4H3, df_12B2, df_1F2),
    ignore_index=True,
)
display(combined_df)
combined_df.to_csv("../../results/atomic_distances/combined_distances.csv", index=False)