In [1]:
import pandas as pd
import numpy as np
import itertools
import string

import sys, os
from pathlib import Path
# Directory the notebook is launched from; the input files and the local
# `scripts` package are looked up relative to it.
source_location = Path().resolve()
# sys.path entries must be plain strings: the import system ignores entries of
# any other type, so appending the Path object itself would have no effect.
# The membership test keeps re-runs of this cell from adding duplicates.
if str(source_location) not in sys.path:
    sys.path.append(str(source_location))
from scripts.utils import get_sabdab_details

import mdtraj as md
# NOTE(review): absolute, machine-specific path. The PDB file loaded further
# down is read from its "tempo" subdirectory, so this has to be adjusted
# whenever the notebook runs on another machine.
casa_dir = Path("/home/pbarletta/labo/22/AbAgInterface")

In [2]:
# SabDab summary tables (tab-separated), read relative to the notebook directory.
df_sabdab_all = pd.read_csv(
    source_location.joinpath('structures', 'sabdab_summary_all.tsv'), sep="\t")
df_sabdab_90 = pd.read_csv(
    source_location.joinpath('structures', 'sabdab_summary_90.tsv'), sep="\t")

# Precomputed tables stored as pickles.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
df_buried = pd.read_pickle(source_location.joinpath('data', 'epitope_buried.pickle'))
df_interactions = pd.read_pickle(source_location.joinpath('data', 'interactions.pickle'))

In [3]:
# Keep the entries whose antigen_type mentions 'protein'. The
# `antigen_type == antigen_type` clause drops rows where the field is NaN,
# because NaN never compares equal to itself.
protein_antigens = (
    df_sabdab_90
    .query("antigen_type == antigen_type and antigen_type.str.contains('protein')", engine = 'python')
    .drop_duplicates()
)

# Unique PDB codes: those with a protein antigen, and all of them.
ab_protein_antigens = set(protein_antigens["pdb"].values)
all_saddab_proteins = set(df_sabdab_90["pdb"].values)

protein_share = round(len(ab_protein_antigens) / len(all_saddab_proteins) * 100, 1)
print(
    "SabDab protein antigen:\n"
    f"{len(ab_protein_antigens)} proteins out of {len(all_saddab_proteins)}, "
    f"{protein_share}%"
)

SabDab protein antigen:
1154 proteins out of 2017, 57.2%


In [4]:
# `col == col` is False only for NaN, so each query keeps the rows in which
# the corresponding chain identifier is present. Despite their names, the two
# "single" sets hold the PDB codes that HAVE an H (resp. L) chain.
ab_both_chains = set(protein_antigens.query("Hchain == Hchain and Lchain == Lchain")["pdb"].values)
ab_single_H_chain = set(protein_antigens.query("Hchain == Hchain")["pdb"].values)
ab_single_L_chain = set(protein_antigens.query("Lchain == Lchain")["pdb"].values)

# Codes lacking a chain = all protein-antigen codes minus those that have it.
n_ab_no_Hchain = len(ab_protein_antigens) - len(ab_single_H_chain)
n_ab_no_Lchain = len(ab_protein_antigens) - len(ab_single_L_chain)

print(
    f"All: {len(ab_protein_antigens)}\n"
    f"No Hchain: {n_ab_no_Hchain}\n"
    f"No Lchain: {n_ab_no_Lchain}\n"
    f"Both chains: {len(ab_both_chains)}"
)

All: 1154
No Hchain: 0
No Lchain: 0
Both chains: 1154


In [5]:
# Restrict the buried-surface table to the PDB codes that have both chains.
has_both_chains = df_buried["idcode"].isin(ab_both_chains)
buried_fullab = df_buried[has_both_chains]

n_codes_all = len(set(df_buried["idcode"].values))
n_codes_fullab = len(set(buried_fullab["idcode"].values))
print(
    f"Buried surfaces of {n_codes_all} proteins\n"
    f"with both chains: {n_codes_fullab}"
)

Buried surfaces of 2492 proteins
with both chains: 867


In [6]:
# Inspect a single row by position; per the recorded output this is the
# heavy-chain CDR 3 entry of PDB 4ydk.
# NOTE(review): 4805 is a hard-coded position and will point at a different row
# if `buried_fullab` is ever filtered or ordered differently.
buried_fullab.iloc[4805]

idcode                                                                    4ydk
chainID                                                                      H
chain_type                                                                   H
cdr                                                                          3
cdr_seq                                                   VTFYHEGSGYYYRAGNYFDS
cdr_begin                                                                   95
cdr_end                                                                    102
cdr_atoms                    [6701, 6702, 6703, 6704, 6705, 6706, 6707, 670...
epitope_atoms                [950, 2128, 2133, 2134, 2135, 2414, 2483, 2484...
epitope_residues             [105, 257, 257, 257, 257, 275, 280, 280, 280, ...
ag_ab_interface              {3584, 3592, 3593, 3596, 3597, 3599, 3600, 360...
ag_cdrchain_interface        {3584, 4738, 3582, 4742, 4358, 3592, 3593, 436...
ag_cdr_interface             {3584, 4738, 4742, 4358

In [70]:
def _split_C_and_ONS(topology, serial_to_index, serials):
    """Sort atoms, given by PDB serial number, into carbon and O/N/S indices.

    Parameters
    ----------
    topology : mdtraj topology
        Used to look up the element symbol of each atom.
    serial_to_index : dict
        Maps an atom serial number to its index in `topology`.
    serials : iterable
        Atom serial numbers to classify.

    Returns
    -------
    (list, list)
        Topology indices of the carbon atoms and of the O, N or S atoms, in the
        order they appear in `serials`. Atoms of any other element end up in
        neither list.

    Raises
    ------
    KeyError
        If a serial number is not a key of `serial_to_index`.
    """
    carbon_ids, ons_ids = [], []
    for serial in serials:
        atom_index = serial_to_index[serial]
        symbol = topology.atom(atom_index).element.symbol
        if symbol == 'C':
            carbon_ids.append(atom_index)
        elif symbol in ('O', 'N', 'S'):
            ons_ids.append(atom_index)
    return carbon_ids, ons_ids


pdb_idcode = '4ydk'
pdb_filename = Path.joinpath(casa_dir, "tempo", pdb_idcode + ".pdb")
trj_in = md.load(pdb_filename)

# The table stores atoms by PDB serial number, while mdtraj addresses them by
# topology index, hence this translation table.
serial_to_id = {atomo.serial: atomo.index for atomo in trj_in.topology.atoms}

# Rows of the selected PDB entry, filtered once and used for both atom sets.
filas_pdb = buried_fullab[buried_fullab.idcode == pdb_idcode]

# Unique atom serials over all rows of the entry.
all_epitope_atoms = np.unique(list(itertools.chain.from_iterable(filas_pdb.epitope_atoms)))
all_cdr_atoms = np.unique(list(itertools.chain.from_iterable(filas_pdb.cdr_atoms)))

ids_C_epitope_atoms, ids_ONS_epitope_atoms = _split_C_and_ONS(
    trj_in.topology, serial_to_id, all_epitope_atoms)
ids_C_cdr_atoms, ids_ONS_cdr_atoms = _split_C_and_ONS(
    trj_in.topology, serial_to_id, all_cdr_atoms)

# O/N/S atoms from both sides of the interface: CDR ones first, then epitope ones.
ids_ONS_atoms = ids_ONS_cdr_atoms + ids_ONS_epitope_atoms

# Every (CDR carbon, other atom) combination. The reshape keeps the
# (n_pairs, 2) layout even when one of the lists is empty.
C_ONS_pairs = np.array(list(itertools.product(ids_C_cdr_atoms, ids_ONS_atoms))).reshape(-1, 2)
C_C_pairs = np.array(list(itertools.product(ids_C_cdr_atoms, ids_C_epitope_atoms))).reshape(-1, 2)

# compute_distances returns one row per frame; folding it into a
# (CDR carbon, partner) matrix therefore only works for a single-frame
# structure. itertools.product varies the partner fastest, which matches the
# row-major reshape.
C_ONS_distancias = md.compute_distances(trj_in, C_ONS_pairs).reshape(
    (len(ids_C_cdr_atoms), len(ids_ONS_atoms)))

C_C_distancias = md.compute_distances(trj_in, C_C_pairs).reshape(
    (len(ids_C_cdr_atoms), len(ids_C_epitope_atoms)))

# Contact threshold, in the same unit as the distances above (mdtraj works in
# nanometers, so 0.4 corresponds to 4 angstrom).
cutoff = 0.4

In [96]:
def is_shielded(positions, C_cdr_id, C_epi_id, surrounding_ONS_ids,
                cdr_cos_min=0.9, epi_cos_max=-0.1):
    """Tell whether an O/N/S atom sits between a CDR carbon and an epitope carbon.

    For each candidate atom two cosines are measured against the unit vector
    pointing from the CDR carbon to the epitope carbon:

    * the direction CDR carbon -> candidate must have a cosine above
      `cdr_cos_min` (with the default 0.9, within about 26 degrees of the
      carbon-carbon axis);
    * the direction epitope carbon -> candidate must have a cosine below
      `epi_cos_max` (with the default -0.1, pointing back towards the CDR
      carbon).

    Parameters
    ----------
    positions : array of shape (n_atoms, 3)
        Atom coordinates, indexed by atom index.
    C_cdr_id, C_epi_id : int
        Row indices in `positions` of the CDR and epitope carbon atoms.
    surrounding_ONS_ids : iterable of int
        Row indices of the candidate shielding atoms, tested in order.
    cdr_cos_min : float, optional
        Lower bound (exclusive) on the cosine seen from the CDR carbon.
    epi_cos_max : float, optional
        Upper bound (exclusive) on the cosine seen from the epitope carbon.

    Returns
    -------
    (bool, int)
        `(True, id)` with the first candidate meeting both conditions, else
        `(False, 0)`. Note that 0 is only a placeholder in the second case; it
        is also a valid atom index, so callers must check the flag before
        using the id.
    """
    C_cdr_xyz = positions[C_cdr_id, :]
    C_epi_xyz = positions[C_epi_id, :]

    vec_C_C = C_epi_xyz - C_cdr_xyz
    norm_C_C = np.linalg.norm(vec_C_C)
    if norm_C_C == 0:
        # Coincident carbons define no axis, so nothing can lie "between" them.
        return False, 0
    n_vec_C_C = vec_C_C / norm_C_C

    for ONS_id in surrounding_ONS_ids:
        ONS_xyz = positions[ONS_id, :]
        vec_cdr_ONS = ONS_xyz - C_cdr_xyz
        vec_epi_ONS = ONS_xyz - C_epi_xyz
        norm_cdr_ONS = np.linalg.norm(vec_cdr_ONS)
        norm_epi_ONS = np.linalg.norm(vec_epi_ONS)
        if norm_cdr_ONS == 0 or norm_epi_ONS == 0:
            # A candidate sitting exactly on one of the carbons has no
            # direction; skip it instead of dividing by zero.
            continue
        n_vec_cdr_ONS = vec_cdr_ONS / norm_cdr_ONS
        n_vec_epi_ONS = vec_epi_ONS / norm_epi_ONS

        if (np.dot(n_vec_C_C, n_vec_cdr_ONS) > cdr_cos_min
                and np.dot(n_vec_C_C, n_vec_epi_ONS) < epi_cos_max):
            return True, ONS_id

    return False, 0

In [95]:
# Go through every CDR-carbon / epitope-carbon pair closer than `cutoff` and
# report, by PDB serial number, whether an O/N/S atom shields the contact.
k = 0  # running count of shielded contacts
for i, j in zip(*np.where(C_C_distancias < cutoff)):
    C_cdr_id = ids_C_cdr_atoms[i]
    C_epi_id = ids_C_epitope_atoms[j]
    # O/N/S atoms within `cutoff` of the CDR carbon (row i of the matrix).
    surrounding_ONS_ids = [ids_ONS_atoms[col] for col in np.where(C_ONS_distancias[i, :] < cutoff)[0]]

    shielded, ONS_id = is_shielded(trj_in.xyz[0], C_cdr_id, C_epi_id, surrounding_ONS_ids)

    C1 = trj_in.topology.atom(C_cdr_id).serial
    C2 = trj_in.topology.atom(C_epi_id).serial
    if shielded:
        # ONS_id is only meaningful here: for unshielded pairs is_shielded
        # returns the placeholder 0, which must not be reported as an atom.
        ONS = trj_in.topology.atom(ONS_id).serial
        print(f'{k=} -- shielded: {C1=}  --  {C2=} -- {ONS=}')
        k+=1
    else:
        print(f'hydro: {C1=}  --  {C2=}')

hydro: C1=5988  --  C2=4279 -- ONS=1
k=0 -- shielded: C1=5988  --  C2=4280 -- ONS=4281
hydro: C1=5992  --  C2=4210 -- ONS=1
hydro: C1=5992  --  C2=4211 -- ONS=1
hydro: C1=6010  --  C2=4311 -- ONS=1
hydro: C1=6777  --  C2=4653 -- ONS=1
hydro: C1=6779  --  C2=4653 -- ONS=1
hydro: C1=6796  --  C2=2512 -- ONS=1
hydro: C1=6807  --  C2=4888 -- ONS=1
hydro: C1=6814  --  C2=4878 -- ONS=1
hydro: C1=6814  --  C2=4879 -- ONS=1
hydro: C1=6824  --  C2=4885 -- ONS=1
hydro: C1=6824  --  C2=4888 -- ONS=1
hydro: C1=6834  --  C2=4279 -- ONS=1
hydro: C1=6835  --  C2=4279 -- ONS=1
k=1 -- shielded: C1=6836  --  C2=4263 -- ONS=4278
hydro: C1=6836  --  C2=4279 -- ONS=1
k=2 -- shielded: C1=6838  --  C2=4263 -- ONS=4264
hydro: C1=6838  --  C2=4279 -- ONS=1
hydro: C1=6855  --  C2=3581 -- ONS=1
hydro: C1=6856  --  C2=3581 -- ONS=1
hydro: C1=6857  --  C2=3564 -- ONS=1
hydro: C1=6858  --  C2=3581 -- ONS=1
hydro: C1=6858  --  C2=4878 -- ONS=1
hydro: C1=6859  --  C2=3564 -- ONS=1
hydro: C1=6859  --  C2=3565 -- ONS=1

In [57]:
# Sanity check: rows are CDR carbons, columns are epitope carbons.
C_C_distancias.shape

(297, 92)

In [62]:
# Number of carbon atoms and of O/N/S atoms found among the CDR atoms.
(len(ids_C_cdr_atoms), len(ids_ONS_cdr_atoms))

(297, 178)

In [63]:
# Number of carbon atoms and of O/N/S atoms found among the epitope atoms.
(len(ids_C_epitope_atoms), len(ids_ONS_epitope_atoms))

(92, 70)

In [65]:
# Sanity check: rows are CDR carbons, columns are the CDR O/N/S atoms followed
# by the epitope O/N/S atoms.
C_ONS_distancias.shape

(297, 248)

In [10]:
# Append one PyMOL `cmd.show` line per carbon-carbon contact: shielded contacts
# (with their shielding atom) go to shielded.py, all others to hydro.py.
# Both files are opened in append mode, so the cell that creates them with
# their header has to run first, and re-running this cell adds the lines again.
# A single pass classifies each pair once and routes it to the right file.
with open("tempo/shielded.py", "a") as fil_shielded, open("tempo/hydro.py", "a") as fil_hydro:
    for i, j in zip(*np.where(C_C_distancias < cutoff)):
        C_cdr_id = ids_C_cdr_atoms[i]
        C_epi_id = ids_C_epitope_atoms[j]
        # O/N/S atoms within `cutoff` of the CDR carbon (row i of the matrix).
        surrounding_ONS_ids = [ids_ONS_atoms[col] for col in np.where(C_ONS_distancias[i, :] < cutoff)[0]]

        shielded, ONS_id = is_shielded(trj_in.xyz[0], C_cdr_id, C_epi_id, surrounding_ONS_ids)

        # PyMOL's `id` selector uses PDB serial numbers, not topology indices.
        C1 = trj_in.topology.atom(C_cdr_id).serial
        C2 = trj_in.topology.atom(C_epi_id).serial
        if shielded:
            ONS = trj_in.topology.atom(ONS_id).serial
            fil_shielded.write(f'cmd.show("spheres", "id {C1}+{C2}+{ONS}")\n')
        else:
            fil_hydro.write(f'cmd.show("spheres", "id {C1}+{C2}")\n')

In [9]:
def _write_pymol_header(script_path, cdr_row, extra_cdr_residues):
    """Create a PyMOL script that loads a structure and outlines the interface.

    The file at `script_path` is overwritten. The script loads
    "<idcode>.pdb", colours chain G, and shows as lines the epitope residues
    on chain G and the CDR residues on chain H.

    Parameters
    ----------
    script_path : str or path-like
        File to write.
    cdr_row : row of the buried-surface table
        Must provide `idcode`, `epitope_residues`, `cdr_begin` and `cdr_end`
        (the last two are used as integer bounds of an inclusive range).
    extra_cdr_residues : iterable
        Further residue identifiers on chain H to show as lines.
    """
    with open(script_path, "w") as fil:
        fil.write('from pymol import cmd\n\n')
        fil.write(f'cmd.load("{cdr_row.idcode}.pdb")\n')
        fil.write('cmd.color("salmon", "chain G")\n')
        fil.write('cmd.color("atomic", "(not elem C)")\n\n')

        # epitope_residues repeats a residue once per atom; show each one once.
        for resi in np.unique(cdr_row.epitope_residues):
            fil.write(f'cmd.show("lines", "resi {resi} and chain G")\n')

        for resi in range(cdr_row.cdr_begin, cdr_row.cdr_end + 1):
            fil.write(f'cmd.show("lines", "resi {resi} and chain H")\n')

        for resi in extra_cdr_residues:
            fil.write(f'cmd.show("lines", "resi {resi} and chain H")\n')


# NOTE(review): row 4805 and the chain letters G / H are specific to the
# structure inspected in this notebook.
cdr_row = buried_fullab.iloc[4805]

# Residue identifiers 100A ... 100L (number plus letter), which the integer
# range cdr_begin..cdr_end cannot produce.
cdrH3_extra_residues = [f"100{letra}" for letra in string.ascii_uppercase[0:12]]

# Both scripts start from the same header; the per-contact lines are appended
# to them by a separate cell.
for script_path in ("tempo/shielded.py", "tempo/hydro.py"):
    _write_pymol_header(script_path, cdr_row, cdrH3_extra_residues)



In [17]:
# All rows of the buried-surface table that belong to PDB entry 4ydk.
buried_fullab[buried_fullab["idcode"] == '4ydk']

Unnamed: 0,idcode,chainID,chain_type,cdr,cdr_seq,cdr_begin,cdr_end,cdr_atoms,epitope_atoms,epitope_residues,ag_ab_interface,ag_cdrchain_interface,ag_cdr_interface,ab_ag_interface,ag_ab_interface_res,ag_cdrchain_interface_res,ag_cdr_interface_res,ab_ag_interface_res
10761,4ydk,H,H,1,GFTFNNY,26,32,"[5552, 5553, 5554, 5555, 5556, 5557, 5558, 555...",[],[],"{3584, 3592, 3593, 3596, 3597, 3599, 3600, 360...","{3584, 4738, 3582, 4742, 4358, 3592, 3593, 436...",{},"{6323, 6338, 6851, 6853, 6855, 6857, 6859, 686...","[(G, 370, GLU, HG2, 1), (G, 371, ILE, CG2, 1),...","[(G, 370, GLU, HG2, 1), (G, 455, THR, HB, 1), ...",[],"[(H, 71, ARG, N, 1), (H, 71, ARG, HG2, 1), (H,..."
10762,4ydk,H,H,2,SAHGGS,52,56,"[5963, 5964, 5965, 5966, 5967, 5968, 5969, 597...","[1209, 1233, 1238, 1243, 1244, 3530, 3533, 353...","[122, 123, 123, 124, 124, 368, 368, 368, 368, ...","{3584, 3592, 3593, 3596, 3597, 3599, 3600, 360...","{3584, 4738, 3582, 4742, 4358, 3592, 3593, 436...","{4397, 4378, 4379, 4380, 4381, 4383}","{6323, 6338, 6851, 6853, 6855, 6857, 6859, 686...","[(G, 370, GLU, HG2, 2), (G, 371, ILE, CG2, 2),...","[(G, 370, GLU, HG2, 2), (G, 455, THR, HB, 2), ...","[(G, 432, GLN, HB3, 2), (G, 431, GLY, N, 2), (...","[(H, 71, ARG, N, 2), (H, 71, ARG, HG2, 2), (H,..."
10763,4ydk,H,H,3,VTFYHEGSGYYYRAGNYFDS,95,102,"[6701, 6702, 6703, 6704, 6705, 6706, 6707, 670...","[950, 2128, 2133, 2134, 2135, 2414, 2483, 2484...","[105, 257, 257, 257, 257, 275, 280, 280, 280, ...","{3584, 3592, 3593, 3596, 3597, 3599, 3600, 360...","{3584, 4738, 3582, 4742, 4358, 3592, 3593, 436...","{3584, 4738, 4742, 4358, 3592, 3593, 4745, 474...","{6323, 6338, 6851, 6853, 6855, 6857, 6859, 686...","[(G, 370, GLU, HG2, 3), (G, 371, ILE, CG2, 3),...","[(G, 370, GLU, HG2, 3), (G, 455, THR, HB, 3), ...","[(G, 370, GLU, HG2, 3), (G, 455, THR, HB, 3), ...","[(H, 71, ARG, N, 3), (H, 71, ARG, HG2, 3), (H,..."
10764,4ydk,L,K,1,RASQDIANYLN,24,34,"[8957, 8958, 8959, 8960, 8961, 8962, 8963, 896...","[2489, 2494, 2495, 3508, 3509, 3510, 3512, 351...","[280, 280, 280, 365, 365, 365, 365, 365, 365, ...","{3584, 3592, 3593, 3596, 3597, 3599, 3600, 360...","{3523, 3524, 4772, 4771, 4778, 4781, 4912, 491...","{4912, 4913, 4772, 4781}","{6323, 6338, 6851, 6853, 6855, 6857, 6859, 686...","[(G, 370, GLU, HG2, 1), (G, 371, ILE, CG2, 1),...","[(G, 365, SER, HB2, 1), (G, 457, ASP, CG, 1), ...","[(G, 469, ARG, NE, 1), (G, 469, ARG, CZ, 1), (...","[(H, 71, ARG, N, 1), (H, 71, ARG, HG2, 1), (H,..."
10765,4ydk,L,K,2,GATNLHH,50,56,"[9393, 9394, 9395, 9396, 9397, 9398, 9399, 940...","[2488, 2494, 4708, 4711, 4713, 4714, 4715, 471...","[280, 280, 459, 459, 460, 460, 460, 460, 460, ...","{3584, 3592, 3593, 3596, 3597, 3599, 3600, 360...","{3523, 3524, 4772, 4771, 4778, 4781, 4912, 491...",{},"{6323, 6338, 6851, 6853, 6855, 6857, 6859, 686...","[(G, 370, GLU, HG2, 2), (G, 371, ILE, CG2, 2),...","[(G, 365, SER, HB2, 2), (G, 457, ASP, CG, 2), ...",[],"[(H, 71, ARG, N, 2), (H, 71, ARG, HG2, 2), (H,..."
10766,4ydk,L,K,3,QQSFQTVGS,89,97,"[9962, 9963, 9964, 9965, 9966, 9967, 9968, 996...","[3504, 3505, 3506, 3507, 3508, 3509, 3510, 351...","[365, 365, 365, 365, 365, 365, 365, 365, 365, ...","{3584, 3592, 3593, 3596, 3597, 3599, 3600, 360...","{3523, 3524, 4772, 4771, 4778, 4781, 4912, 491...","{3523, 3524, 3519}","{6323, 6338, 6851, 6853, 6855, 6857, 6859, 686...","[(G, 370, GLU, HG2, 3), (G, 371, ILE, CG2, 3),...","[(G, 365, SER, HB2, 3), (G, 457, ASP, CG, 3), ...","[(G, 365, SER, HB2, 3), (G, 365, SER, HB3, 3),...","[(H, 71, ARG, N, 3), (H, 71, ARG, HG2, 3), (H,..."
