In [68]:
import sys, os
from pathlib import Path
import itertools
import pickle
import pandas as pd
import mdtraj as md
import networkx as nx
import plotly.graph_objects as go
import plotly.express as px
import logging
from collections import namedtuple
# Two-field record types for ring interactions. Each one names the two
# partners of a pair: (antibody, antigen) for PiPiPair, (ring, ion) for PionPair.
PiPiPair = namedtuple('PiPiPair', 'antibody antigen')
PionPair = namedtuple('PionPair', 'ring ion')
from collections import Counter

# Put the notebook's working directory on the import path so that the
# project-local `scripts` package can be imported.
# sys.path entries must be plain strings: non-str objects such as pathlib.Path
# are ignored by the import machinery, so the path is converted explicitly.
source_location = Path().resolve()
if str(source_location) not in sys.path:
    # Guard keeps re-running this cell from piling up duplicate entries.
    sys.path.append(str(source_location))
from scripts.utils import get_sabdab_details

from scripts.abag_interactions_hydrophobic import *
from scripts.abag_interactions_rings import *
from scripts.more_utils import *

# Root directory holding the pickled interaction data and raw structures.
# It can be overridden with the ABAG_INTERFACE_DIR environment variable so the
# notebook is not tied to a single machine; the original location stays the
# default, so existing setups keep working unchanged.
casa_dir = Path(
    os.environ.get("ABAG_INTERFACE_DIR", "/home/pbarletta/labo/22/AbAgInterface")
)
# Sub-directory with the raw structure files.
str_dir = casa_dir / "structures" / "raw"

In [2]:
# Tab-separated SAbDab summary tables, read relative to `source_location`.
df_sabdab_all = pd.read_csv(
    source_location / 'structures/sabdab_summary_all.tsv', sep="\t")
df_sabdab_90 = pd.read_csv(
    source_location / 'structures/sabdab_summary_90.tsv', sep="\t")

# Pickled DataFrames produced elsewhere in the project.
# NOTE: read_pickle executes pickle payloads; only load files you generated yourself.
df_buried = pd.read_pickle(source_location / 'data/epitope_buried.pickle')

df_interactions = pd.read_pickle(source_location / 'data/interactions.pickle')

# Rows of the 90% table whose antigen type mentions 'protein'.
# `antigen_type == antigen_type` is a NaN filter (NaN never equals itself); it
# has to come first so that `.str.contains` only sees string values.
protein_antigens = (
    df_sabdab_90
    .query(
        "antigen_type == antigen_type and antigen_type.str.contains('protein')",
        engine = 'python',
    )
    .drop_duplicates()
)
ab_protein_antigens = set(protein_antigens.pdb.values)
all_saddab_proteins = set(df_sabdab_90.pdb.values)
print(
    f"SabDab protein antigen:\n"
    f"{len(ab_protein_antigens)} proteins out of {len(all_saddab_proteins)}, "
    f"{round(len(ab_protein_antigens) / len(all_saddab_proteins) * 100, 1)}%"
)

# PDB ids grouped by which chain columns are filled in (non-NaN).
# Despite their names, the "single" sets hold the entries that HAVE an H
# (resp. L) chain annotated, regardless of the other chain.
ab_both_chains, ab_single_H_chain, ab_single_L_chain = (
    set(protein_antigens.query(chain_filter).pdb.values)
    for chain_filter in (
        "Hchain == Hchain and Lchain == Lchain",
        "Hchain == Hchain",
        "Lchain == Lchain",
    )
)

# Both "single" sets are subsets of ab_protein_antigens, so the size of the
# set difference equals the difference of the sizes.
n_ab_no_Hchain = len(ab_protein_antigens - ab_single_H_chain)
n_ab_no_Lchain = len(ab_protein_antigens - ab_single_L_chain)

print(
    f"All: {len(ab_protein_antigens)}",
    f"No Hchain: {n_ab_no_Hchain}",
    f"No Lchain: {n_ab_no_Lchain}",
    f"Both chains: {len(ab_both_chains)}",
    sep="\n",
)

# Restrict the buried-surface table to PDB ids that have both antibody chains.
buried_fullab = df_buried.loc[df_buried.idcode.isin(ab_both_chains)]
print(
    f"Buried surfaces of {len(set(df_buried.idcode.values))} proteins",
    f"with both chains: {len(set(buried_fullab.idcode.values))}",
    sep="\n",
)

SabDab protein antigen:
1154 proteins out of 2017, 57.2%
All: 1154
No Hchain: 0
No Lchain: 0
Both chains: 1154
Buried surfaces of 2492 proteins
with both chains: 867


------

There are some PiAnion and PiCation interactions where the ion or the ring does not belong to a CDR.
This does not happen with either the hydrophobic or the Pi-Pi interactions.

In [13]:
# Unique PDB ids that have a buried-surface entry and both antibody chains.
# A set has no guaranteed order, so the list order may differ between kernels.
pdb_list = list(set(buried_fullab.idcode))


def _load_interaction_pickle(filename):
    """Return the object pickled in `<casa_dir>/data/<filename>`.

    NOTE: pickle can execute arbitrary code while loading; these files are
    assumed to be generated by this project, never fetched from elsewhere.
    """
    with open(Path.joinpath(casa_dir, 'data', filename), 'rb') as file:
        return pickle.load(file)


# Each pickle unpacks into six objects, in this fixed order:
# atom indices, atom serials, resSeq, chain ID, chain type, CDR.
(df_hydrophobic_atom_indices, df_hydrophobic_atom_serials,
 df_hydrophobic_resSeq, df_hydrophobic_chain_ID,
 df_hydrophobic_chain_type, df_hydrophobic_cdr) = _load_interaction_pickle('hydrophobic.pkl')

(df_PiPi_atom_indices, df_PiPi_atom_serials,
 df_PiPi_resSeq, df_PiPi_chain_ID,
 df_PiPi_chain_type, df_PiPi_cdr) = _load_interaction_pickle('PiPi.pkl')

(df_PiAnion_atom_indices, df_PiAnion_atom_serials,
 df_PiAnion_resSeq, df_PiAnion_chain_ID,
 df_PiAnion_chain_type, df_PiAnion_cdr) = _load_interaction_pickle('PiAnion.pkl')

(df_PiCation_atom_indices, df_PiCation_atom_serials,
 df_PiCation_resSeq, df_PiCation_chain_ID,
 df_PiCation_chain_type, df_PiCation_cdr) = _load_interaction_pickle('PiCation.pkl')

In [88]:
# Tallies over all hydrophobic contacts. They are rebuilt from scratch on every
# run of this cell, so re-executing it does not double count.

# Antibody carbons per chain type ('K' and 'L' are both plotted as light chain).
chain_type_dict = {'H': 0, 'K': 0, 'L': 0}
# Antibody carbons per "<chain type><CDR>" key.
# Suffix 0 presumably means "outside any CDR" -- TODO confirm against the
# script that produced the pickles.
cdr_dict = {'H1': 0, 'H2': 0, 'H3': 0, 'L1': 0, 'L2': 0, 'L3': 0,
'K1': 0, 'K2': 0, 'K3': 0, 'H0': 0, 'K0': 0, 'L0': 0}
# Same keys; counts where the first (largest) cluster of each PDB sits.
cdr_dict_big_cluster = dict.fromkeys(cdr_dict, 0)
# Number of carbons in the first cluster of each PDB.
size_largest_cluster = []
# Number of clusters of each PDB.
clusters_count = []

for pdb_idcode in pdb_list:
    idcode_filter = f"idcode == '{pdb_idcode}'"
    # .iloc[0] takes the first matching row by position. The previous `[0]`
    # depended on pandas' label-vs-position guessing, whose positional
    # fallback is deprecated.
    chain_type_of_each_contact = (
        df_hydrophobic_chain_type.query(idcode_filter).chain_type.iloc[0])
    cdr_of_each_contact = df_hydrophobic_cdr.query(idcode_filter).CDR.iloc[0]
    # A float here is a NaN placeholder: no cluster data for this PDB.
    if isinstance(chain_type_of_each_contact, float):
        print(f"-- BAD: {pdb_idcode} -- ")
        continue

    clusters_count.append(len(cdr_of_each_contact))

    # "<chain type><CDR>" of every antibody carbon seen so far in this PDB.
    list_chain_type_cdr = []
    for cluster_idx, (cluster_cdr, cluster_chain_type) in enumerate(
            zip(cdr_of_each_contact, chain_type_of_each_contact)):
        for contact_cdr, contact_chain_type in zip(cluster_cdr, cluster_chain_type):
            # Discard antigen carbons (they carry an empty chain type).
            if contact_chain_type == '':
                continue
            chain_type_dict[contact_chain_type] += 1

            chain_type_cdr = contact_chain_type + str(contact_cdr)
            cdr_dict[chain_type_cdr] += 1

            list_chain_type_cdr.append(chain_type_cdr)

        if cluster_idx == 0:
            # Clusters are sorted by size, so the 1st one is the largest.
            # At this point the list only holds carbons of that first cluster.
            if list_chain_type_cdr:
                big_cluster_location = Counter(list_chain_type_cdr).most_common(1)[0][0]
                cdr_dict_big_cluster[big_cluster_location] += 1
            # else: the first cluster has no antibody carbons, so there is no
            # location to record (this used to raise IndexError).

            # Size counts every carbon of the cluster, antigen ones included.
            size_largest_cluster.append(len(cluster_cdr))

-- BAD: 6mam -- 
-- BAD: 6mar -- 
-- BAD: 7ebr -- 
-- BAD: 6gv4 -- 
-- BAD: 6gg0 -- 
-- BAD: 6mid -- 
-- BAD: 3ztn -- 
-- BAD: 3g6j -- 
-- BAD: 7kcr -- 
-- BAD: 6aj7 -- 
-- BAD: 6ulc -- 
-- BAD: 6p95 -- 
-- BAD: 5tq0 -- 
-- BAD: 2b4c -- 
-- BAD: 6p91 -- 
-- BAD: 5whk -- 
-- BAD: 7dnl -- 
-- BAD: 6vkn -- 
-- BAD: 3o0r -- 
-- BAD: 7c81 -- 
-- BAD: 7ec5 -- 
-- BAD: 6cxc -- 
-- BAD: 1xiw -- 
-- BAD: 4xi5 -- 
-- BAD: 6eay -- 
-- BAD: 5vod -- 
-- BAD: 3sdy -- 
-- BAD: 5w3e -- 
-- BAD: 6lht -- 
-- BAD: 3s88 -- 
-- BAD: 4nm8 -- 
-- BAD: 6ea7 -- 
-- BAD: 3pjs -- 
-- BAD: 1kb5 -- 
-- BAD: 7dnk -- 
-- BAD: 6v4p -- 
-- BAD: 1w72 -- 
-- BAD: 6e3h -- 
-- BAD: 6mjz -- 
-- BAD: 3v4v -- 
-- BAD: 7c80 -- 
-- BAD: 6lhq -- 
-- BAD: 6wds -- 
-- BAD: 6wdt -- 
-- BAD: 6s5a -- 
-- BAD: 6edu -- 
-- BAD: 3gjf -- 
-- BAD: 6ous -- 
-- BAD: 6apb -- 
-- BAD: 6ad0 -- 
-- BAD: 5uwe -- 
-- BAD: 6c5v -- 
-- BAD: 6lhp -- 
-- BAD: 7kr5 -- 
-- BAD: 6qd7 -- 
-- BAD: 7dnh -- 
-- BAD: 3ncy -- 
-- BAD: 6pv7 -- 
-- BAD: 6ujc -

### Origin of carbons in hydrophobic interactions, by chain

In [82]:
# Share of antibody carbons per chain; 'K' and 'L' counts are merged into the
# 'L chain' slice.
# Reads `chain_type_dict`, the tally filled by the hydrophobic-contact loop.
# The previous name `hydro_dict` is not assigned by that loop, so the cell
# failed with NameError on a fresh kernel.
go.Figure(data = go.Pie(labels = ['H chain', 'L chain'],
    values = [chain_type_dict['H'], chain_type_dict['K'] + chain_type_dict['L']],
    hole = .4) )

### Origin of carbons in hydrophobic interactions, by CDR

In [61]:
# Share of antibody carbons per CDR. The light-chain slices add the 'K*' and
# 'L*' tallies together, matching the chain-level pie that reports 'K' + 'L'
# as 'L chain'. Using only the 'L*' keys silently dropped every 'K*' contact.
go.Figure(data = go.Pie(labels = ['H1', 'H2', 'H3', 'L1', 'L2', 'L3'],
    values = [cdr_dict['H1'], cdr_dict['H2'], cdr_dict['H3'],
    cdr_dict['L1'] + cdr_dict['K1'],
    cdr_dict['L2'] + cdr_dict['K2'],
    cdr_dict['L3'] + cdr_dict['K3']], hole = .4) )

### Origin of the biggest hydrophobic cluster

In [78]:
# CDR hosting the largest hydrophobic cluster of each PDB. The light-chain
# slices add the 'K*' and 'L*' tallies together, matching the chain-level pie
# that reports 'K' + 'L' as 'L chain'. Using only the 'L*' keys silently
# dropped every cluster located on a 'K*' CDR.
go.Figure(data = go.Pie(labels = ['H1', 'H2', 'H3', 'L1', 'L2', 'L3'],
    values = [cdr_dict_big_cluster['H1'], cdr_dict_big_cluster['H2'],
    cdr_dict_big_cluster['H3'],
    cdr_dict_big_cluster['L1'] + cdr_dict_big_cluster['K1'],
    cdr_dict_big_cluster['L2'] + cdr_dict_big_cluster['K2'],
    cdr_dict_big_cluster['L3'] + cdr_dict_big_cluster['K3']], hole = .4) )

### Number of clusters per PDB entry (not per complex)

In [114]:
# Distribution of the number of hydrophobic clusters per PDB, normalised to
# probabilities. update_xaxes returns the same figure, so `figu` is the
# already-updated figure and the bare name on the last line displays it.
figu = px.histogram(
    data_frame = pd.DataFrame({'count': clusters_count}),
    x = 'count',
    nbins = 120,
    histnorm = 'probability',
    labels = {'count': "# of clusters"},
).update_xaxes(dtick = 5)
figu

### Size of the largest cluster of each PDB

In [121]:
# Distribution of the size of each PDB's largest hydrophobic cluster,
# normalised to probabilities. update_xaxes returns the same figure, so
# `figu` is the already-updated figure and the bare name displays it.
figu = px.histogram(
    data_frame = pd.DataFrame({'size': size_largest_cluster}),
    x = 'size',
    nbins = 20,
    histnorm = 'probability',
    labels = {'size': "# of carbons on largest cluster"},
).update_xaxes(tick0 = 5, dtick = 10)
figu