In [1]:
import pandas as pd
import numpy as np
import itertools

import sys, os
from pathlib import Path
source_location = Path().resolve()
sys.path.append(source_location)
from scripts.utils import get_sabdab_details

In [2]:
df_sabdab_all = pd.read_csv(Path.joinpath(source_location,
    'structures/sabdab_summary_all.tsv'), sep="\t")
df_sabdab_90 = pd.read_csv(Path.joinpath(source_location,
    'structures/sabdab_summary_90.tsv'), sep="\t")

df_buried = pd.read_pickle(Path.joinpath(source_location,
    'data/epitope_buried.pickle'))

df_interactions = pd.read_pickle(Path.joinpath(source_location,
    'data/interactions.pickle'))

In [3]:
# protein_antigens = df_sabdab_all[df_sabdab_all['antigen_type'].str.contains("protein", na = False)
protein_antigens = df_sabdab_all.query("antigen_type == antigen_type and antigen_type.str.contains('protein')", engine = 'python').drop_duplicates()
ab_protein_antigens = set(protein_antigens.pdb.values)
all_saddab_proteins = set(df_sabdab_all.pdb.values)
print(
    f"SabDab protein antigen:\n"
    f"{len(ab_protein_antigens)} proteins out of {len(all_saddab_proteins)}, "
    f"{round(len(ab_protein_antigens) / len(all_saddab_proteins) * 100, 1)}%"
)

SabDab protein antigen:
2715 proteins out of 4979, 54.5%


In [4]:
ab_both_chains = set(protein_antigens.query("Hchain == Hchain and Lchain == Lchain").pdb.values)
ab_single_H_chain = set(protein_antigens.query("Hchain == Hchain").pdb.values)
ab_single_L_chain = set(protein_antigens.query("Lchain == Lchain").pdb.values)

n_ab_no_Hchain = len(ab_protein_antigens) - len(ab_single_H_chain)
n_ab_no_Lchain = len(ab_protein_antigens) - len(ab_single_L_chain)

print(f"All: {len(ab_protein_antigens)}\nNo Hchain: {n_ab_no_Hchain}\nNo Lchain: {n_ab_no_Lchain}\nBoth chains: {len(ab_both_chains)}")

All: 2715
No Hchain: 10
No Lchain: 528
Both chains: 2177


In [5]:
buried_fullab = df_buried[df_buried.idcode.isin(ab_both_chains)]
print(
    f"Buried surfaces of {len(set(df_buried.idcode.values))} proteins\n"
    f"with both chains: {len(set(buried_fullab.idcode.values))}"
)

Buried surfaces of 2492 proteins
with both chains: 1974


In [6]:
buried_ab_ag_interface_res = buried_fullab[~buried_fullab[['idcode', 'ab_ag_interface']].astype(str).duplicated(keep='first')][['idcode', 'chainID', 'chain_type', 'ab_ag_interface_res', 'ag_ab_interface_res', 'cdr_seq']]
buried_ab_ag_interface_res = buried_ab_ag_interface_res[~buried_ab_ag_interface_res[['idcode', 'cdr_seq']].astype(str).duplicated(keep='first')][['idcode', 'chainID', 'chain_type', 'ab_ag_interface_res', 'ag_ab_interface_res']]
len(buried_ab_ag_interface_res)

2148

In [7]:
# protein_antigens = df_sabdab_90[df_sabdab_90['antigen_type'].str.contains('protein', na = False)]
protein_antigens = df_sabdab_90.query("antigen_type == antigen_type and antigen_type.str.contains('protein')", engine = 'python').drop_duplicates()
ab_protein_antigens = set(protein_antigens.pdb.values)
all_saddab_proteins = set(df_sabdab_90.pdb.values)
print(
    f"SabDab protein antigen:\n"
    f"{len(ab_protein_antigens)} proteins out of {len(all_saddab_proteins)}, "
    f"{round(len(ab_protein_antigens) / len(all_saddab_proteins) * 100, 1)}%"
)

SabDab protein antigen:
1154 proteins out of 2017, 57.2%


In [8]:
ab_both_chains = set(protein_antigens.query("Hchain == Hchain and Lchain == Lchain").pdb.values)
ab_single_H_chain = set(protein_antigens.query("Hchain == Hchain").pdb.values)
ab_single_L_chain = set(protein_antigens.query("Lchain == Lchain").pdb.values)

n_ab_no_Hchain = len(ab_protein_antigens) - len(ab_single_H_chain)
n_ab_no_Lchain = len(ab_protein_antigens) - len(ab_single_L_chain)

print(f"All: {len(ab_protein_antigens)}\nNo Hchain: {n_ab_no_Hchain}\nNo Lchain: {n_ab_no_Lchain}\nBoth chains: {len(ab_both_chains)}")

All: 1154
No Hchain: 0
No Lchain: 0
Both chains: 1154


In [9]:
buried_fullab = df_buried[df_buried.idcode.isin(ab_both_chains)]
print(
    f"Buried surfaces of {len(set(df_buried.idcode.values))} proteins\n"
    f"with both chains: {len(set(buried_fullab.idcode.values))}"
)

Buried surfaces of 2492 proteins
with both chains: 867


In [10]:
buried_ab_ag_interface_res = buried_fullab[~buried_fullab[['idcode', 'ab_ag_interface']].astype(str).duplicated(keep='first')][['idcode', 'chainID', 'chain_type', 'ab_ag_interface_res', 'ag_ab_interface_res', 'cdr_seq']]
buried_ab_ag_interface_res = buried_ab_ag_interface_res[~buried_ab_ag_interface_res[['idcode', 'cdr_seq']].astype(str).duplicated(keep='first')][['idcode', 'chainID', 'chain_type', 'ab_ag_interface_res', 'ag_ab_interface_res']]
len(buried_ab_ag_interface_res)

947

In [11]:
def count_chains(x):
    ab_chains = protein_antigens.query(f"pdb == '{x['idcode']}' and {'L' if x['chain_type'] in 'LK' else 'H'}chain == '{x['chainID']}'")[['Hchain', 'Lchain']].values
    if len(ab_chains) != 1:
        return (None, None, None, None, None, None, None, None, None, None, None)
    assert len(ab_chains) == 1, print(x['idcode'], x['chain_type'], x['chainID'], ab_chains)
    ab_chains = list(ab_chains[0])

    cdrs = []
    incomplete = False
    for ab_chain in ab_chains:
        if ab_chain != ab_chain:
            cdrs.append((None, None))
            incomplete = True
            continue
        for cdrnumb in (1, 2, 3):
            cdr_info = buried_fullab.query(f"idcode == '{x['idcode']}' and chainID == '{ab_chain}' and cdr == {cdrnumb}")[['cdr_begin', 'cdr_end']].values
            if len(cdr_info) == 1:
                (cdr_b, cdr_e) = cdr_info[0]
                cdrs.append((cdr_b, cdr_e))
            else:
                cdrs.append((None, None))
                incomplete = True

    ag_res = []
    for (chain, resnumb, resname, aname, cdrnumb) in x['ag_ab_interface_res']:
        if aname.startswith('H'):
            continue

        ag_res.append(resname)
    
    ab_res = []
    chains_res = [set(), set()]
    cdrnumbs_res = [set(), set(),  set(), set(), set(), set()]
    for (chain, resnumb, resname, aname, cdrnumb) in x['ab_ag_interface_res']:
        if aname.startswith('H'):
            continue

        ab_res.append(resname)

        if chain == ab_chains[0]:
            i_chain = 0
        elif chain == ab_chains[1]:
            i_chain = 1
        else:
            raise Exception
        
        chains_res[i_chain].add((resnumb))
        for i_cdr in range(i_chain * 3, i_chain * 3 + 3):
            b, e = cdrs[i_cdr]
            if not b:
                continue
            if resnumb >= b and resnumb <= e:
                cdrnumbs_res[i_cdr].add((resnumb))
    
    chains = [len(i) for i in chains_res]
    cdrnumbs = [len(i) for i in cdrnumbs_res]
    
    return (*chains, *cdrnumbs, ab_res, ag_res, incomplete)

buried_ab_ag_interface_res['chains'] =\
    buried_ab_ag_interface_res.apply(lambda x: count_chains(x), axis=1)

In [12]:
buried_ab_ag_interface_res['hchains'], buried_ab_ag_interface_res['lchains'],\
    buried_ab_ag_interface_res['hcdr1'], buried_ab_ag_interface_res['hcdr2'],\
    buried_ab_ag_interface_res['hcdr3'], buried_ab_ag_interface_res['lcdr1'],\
    buried_ab_ag_interface_res['lcdr2'], buried_ab_ag_interface_res['lcdr3'],\
    buried_ab_ag_interface_res['ab_res'], buried_ab_ag_interface_res['ag_res'],\
    buried_ab_ag_interface_res['incomplete'] = zip(*buried_ab_ag_interface_res.chains)

# lost due to incompatibility between Full SabDab and Redudant nomenclature 
len(buried_ab_ag_interface_res.query('hchains == hchains')) 

946

In [13]:
buried_ab_ag_interface_res = buried_ab_ag_interface_res.query('incomplete == 0')

In [14]:
n_interfaces = len(buried_ab_ag_interface_res.query('incomplete == 0'))
n_interfaces_bothchains = len(buried_ab_ag_interface_res.query('hchains != 0 and lchains != 0 and incomplete == 0'))
n_interfaces_hchainonly = len(buried_ab_ag_interface_res.query('hchains != 0 and incomplete == 0')) - n_interfaces_bothchains
n_interfaces_lchainonly = len(buried_ab_ag_interface_res.query('lchains != 0 and incomplete == 0')) - n_interfaces_bothchains
print(
    f"Total interfaces: {n_interfaces}\n",
    f"H+L chains present: {n_interfaces_bothchains} {round(n_interfaces_bothchains / n_interfaces * 100, 1)}\n",
    f"Only H chain present: {n_interfaces_hchainonly} {round(n_interfaces_hchainonly / n_interfaces * 100, 1)}\n",
    f"Only L chain present: {n_interfaces_lchainonly} {round(n_interfaces_lchainonly / n_interfaces * 100, 1)}",
)

import plotly.graph_objects as go

labels = ['H+L chains','Only H chain','Only L']
values = [n_interfaces_bothchains, n_interfaces_hchainonly, n_interfaces_lchainonly]

fig = go.Figure(data=[go.Pie(labels=labels, values=values)])
fig.show()

Total interfaces: 946
 H+L chains present: 897 94.8
 Only H chain present: 47 5.0
 Only L chain present: 2 0.2


In [15]:
heavy_hist = np.histogram(buried_ab_ag_interface_res['hchains'], range=(0, 20), bins=10, density=True)
light_hist = np.histogram(buried_ab_ag_interface_res['lchains'], range=(0, 20), bins=10, density=True)
heavy_mean, light_mean = buried_ab_ag_interface_res['hchains'].mean(), buried_ab_ag_interface_res['lchains'].mean()

print(f'Number of Residues\nAvg Heavy chain: {round(heavy_mean, 1)}\nAvg Light chain: {round(light_mean, 1)}')
print(heavy_hist)
print(light_hist)

fig = go.Figure()
fig.add_trace(go.Scatter(y=heavy_hist[0], x=heavy_hist[1],
                    mode='lines+markers', line_shape='spline',
                    name='Heavy Chain'))
fig.add_trace(go.Scatter(y=light_hist[0], x=light_hist[1],
                    mode='lines+markers', line_shape='spline',
                    name='Light Chain'))
fig.update_layout(
    xaxis_title="# Residues",
    yaxis_title="Probability Density",
    template="plotly_white",
    autosize=False,
    width=500,
    height=500,
    margin=dict(
        l=50,
        r=50,
        b=100,
        t=100,
        pad=4
    ),
)
fig.show()

Number of Residues
Avg Heavy chain: 9.7
Avg Light chain: 5.1
(array([0.01804671, 0.014862  , 0.03821656, 0.06794055, 0.09766454,
       0.10721868, 0.07165605, 0.05467091, 0.01963907, 0.01008493]), array([ 0.,  2.,  4.,  6.,  8., 10., 12., 14., 16., 18., 20.]))
(array([0.05720339, 0.10222458, 0.13930085, 0.10752119, 0.05773305,
       0.02330508, 0.00635593, 0.00317797, 0.00264831, 0.00052966]), array([ 0.,  2.,  4.,  6.,  8., 10., 12., 14., 16., 18., 20.]))


In [16]:

tot_hcdr1, tot_hcdr2, tot_hcdr3 = buried_ab_ag_interface_res['hcdr1'].sum(), buried_ab_ag_interface_res['hcdr2'].sum(), buried_ab_ag_interface_res['hcdr3'].sum()
tot_hchain_cdr = tot_hcdr1 + tot_hcdr2 + tot_hcdr3
tot_lcdr1, tot_lcdr2, tot_lcdr3 = buried_ab_ag_interface_res['lcdr1'].sum(), buried_ab_ag_interface_res['lcdr2'].sum(), buried_ab_ag_interface_res['lcdr3'].sum()
tot_lchain_cdr = tot_lcdr1 + tot_lcdr2 + tot_lcdr3
tot_all = tot_hchain_cdr + tot_lchain_cdr
print(
    '        TOTAL\n'
    f'HEAVY: {int(tot_hchain_cdr)} ({round(tot_hchain_cdr / tot_all * 100, 1)}%)  \n'
    f'LIGHT: {int(tot_lchain_cdr)} ({round(tot_lchain_cdr / tot_all * 100, 1)}%)  \n'
)
print(
    f'       CDR1    CDR2    CDR3\n'
    'HEAVY: '
    f'{int(tot_hcdr1)} ({round(tot_hcdr1 / tot_all * 100, 1)}%) '
    f'{int(tot_hcdr2)} ({round(tot_hcdr2 / tot_all * 100, 1)}%) '
    f'{int(tot_hcdr3)} ({round(tot_hcdr3 / tot_all * 100, 1)}%)\n'
    f'LIGHT: {int(tot_lcdr1)} ({round(tot_lcdr1 / tot_all * 100, 1)}%) '
    f' {int(tot_lcdr2)}  ({round(tot_lcdr2 / tot_all * 100, 1)}%) '
    f'{int(tot_lcdr3)} ({round(tot_lcdr3 / tot_all * 100, 1)}%)\n'
    'TOTAL: '
    f'{int(tot_hcdr1 + tot_lcdr1)} ({round((tot_hcdr1 + tot_lcdr1) / tot_all * 100, 1)}%) '
    f'{int(tot_hcdr2 + tot_lcdr2)} ({round((tot_hcdr2 + tot_lcdr2) / tot_all * 100, 1)}%) '
    f'{int(tot_hcdr3 + tot_lcdr3)} ({round((tot_hcdr3 + tot_lcdr3) / tot_all * 100, 1)}%)\n'
)

        TOTAL
HEAVY: 7144 (62.6%)  
LIGHT: 4261 (37.4%)  

       CDR1    CDR2    CDR3
HEAVY: 1446 (12.7%) 2009 (17.6%) 3689 (32.3%)
LIGHT: 1608 (14.1%)  707  (6.2%) 1946 (17.1%)
TOTAL: 3054 (26.8%) 2716 (23.8%) 5635 (49.4%)



In [17]:
hrange = (0, 10)
hcdr1_hist = np.histogram(buried_ab_ag_interface_res['hcdr1'], range=hrange, bins=10)
hcdr2_hist = np.histogram(buried_ab_ag_interface_res['hcdr2'], range=hrange, bins=10)
hcdr3_hist = np.histogram(buried_ab_ag_interface_res['hcdr3'], range=hrange, bins=10)
lcdr1_hist = np.histogram(buried_ab_ag_interface_res['lcdr1'], range=hrange, bins=10)
lcdr2_hist = np.histogram(buried_ab_ag_interface_res['lcdr2'], range=hrange, bins=10)
lcdr3_hist = np.histogram(buried_ab_ag_interface_res['lcdr3'], range=hrange, bins=10)

print('Average CDR length (number of residues)')
print(
    '   CDR1 CDR2 CDR3\n'
    f'H : {round(buried_ab_ag_interface_res["hcdr1"].mean(), 1)} '
    f' {round(buried_ab_ag_interface_res["hcdr2"].mean(), 1)} '
    f' {round(buried_ab_ag_interface_res["hcdr3"].mean(), 1)}\n'
    f'L : {round(buried_ab_ag_interface_res["lcdr1"].mean(), 1)} '
    f' {round(buried_ab_ag_interface_res["lcdr2"].mean(), 1)} '
    f' {round(buried_ab_ag_interface_res["lcdr3"].mean(), 1)}'
)
print(hcdr1_hist)
print(hcdr2_hist)
print(hcdr3_hist)
print(lcdr1_hist)
print(lcdr2_hist)
print(lcdr3_hist)

fig = go.Figure()
fig.add_trace(go.Scatter(y=hcdr1_hist[0][1:], x=hcdr1_hist[1][1:],
                    mode='lines+markers', line_shape='spline',
                    name='H CDR 1'))
fig.add_trace(go.Scatter(y=hcdr2_hist[0][1:], x=hcdr2_hist[1][1:],
                    mode='lines+markers', line_shape='spline',
                    name='H CDR 2'))
fig.add_trace(go.Scatter(y=hcdr3_hist[0][1:], x=hcdr3_hist[1][1:],
                    mode='lines+markers', line_shape='spline',
                    name='H CDR 3'))
fig.add_trace(go.Scatter(y=lcdr1_hist[0][1:], x=lcdr1_hist[1][1:],
                    mode='lines+markers', line_shape='spline',
                    name='L CDR 1'))
fig.add_trace(go.Scatter(y=lcdr2_hist[0][1:], x=lcdr2_hist[1][1:],
                    mode='lines+markers', line_shape='spline',
                    name='L CDR 2'))
fig.add_trace(go.Scatter(y=lcdr3_hist[0][1:], x=lcdr3_hist[1][1:],
                    mode='lines+markers', line_shape='spline',
                    name='L CDR 3'))
fig.update_layout(
    xaxis_title="# Residues",
    yaxis_title="Probability Density",
    template="plotly_white",
    autosize=False,
    width=500,
    height=500,
    margin=dict(
        l=50,
        r=50,
        b=100,
        t=100,
        pad=4
    ),
    xaxis_range=(0.5, 9.5),
    xaxis = dict(
        tickmode = 'linear',
        tick0 = 1,
        dtick = 1
    )
)
fig.show()

Average CDR length (number of residues)
   CDR1 CDR2 CDR3
H : 1.5  2.1  3.9
L : 1.7  0.7  2.1
(array([341, 162, 206, 131,  63,  33,   8,   2,   0,   0]), array([ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10.]))
(array([179, 160, 202, 214, 156,  31,   4,   0,   0,   0]), array([ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10.]))
(array([ 23,  91, 109, 198, 214, 134,  81,  43,  28,  19]), array([ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10.]))
(array([272, 212, 191, 151,  67,  30,  19,   3,   1,   0]), array([ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10.]))
(array([557, 218,  88,  47,  17,  13,   3,   3,   0,   0]), array([ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10.]))
(array([237, 146, 182, 176, 138,  53,  11,   1,   0,   2]), array([ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10.]))


In [18]:
only_framework = len(buried_ab_ag_interface_res.query('(hcdr1 + hcdr2 + hcdr3 + lcdr1 + lcdr2 + lcdr3) == 0'))
only_cdr = len(buried_ab_ag_interface_res.query('(hcdr1 + hcdr2 + hcdr3 + lcdr1 + lcdr2 + lcdr3) == (hchains + lchains)'))

print(
    f'Only FRAMEWORK: {round(only_framework / len(buried_ab_ag_interface_res) * 100, 1)}%\n'
    f'Only CDR: {round(only_cdr / len(buried_ab_ag_interface_res) * 100, 1)}%\n'
    f'CDR + Framework: {round((len(buried_ab_ag_interface_res) - (only_framework + only_cdr)) / len(buried_ab_ag_interface_res) * 100, 1)}%'
)

Only FRAMEWORK: 0.2%
Only CDR: 14.0%
CDR + Framework: 85.8%


In [19]:
def get_cdr_perc(x):
    hcdr = x['hcdr1'] + x['hcdr2'] + x['hcdr3']
    lcdr = x['lcdr1'] + x['lcdr2'] + x['lcdr3']
    cdr = hcdr + lcdr
    total = x['hchains'] + x['lchains']
    return cdr / total

cdr_perc = buried_ab_ag_interface_res.apply(lambda x: get_cdr_perc(x), axis=1)
cdr_hist = np.histogram(cdr_perc, bins=10, density=True)

print(f'Avg CDR interaction: {round(cdr_perc.mean() * 100, 1)}%')
print(cdr_hist)


fig = go.Figure()
fig.add_trace(go.Scatter(y=cdr_hist[0], x=cdr_hist[1]*100,
                    mode='lines+markers', line_shape='spline',
                    name='Heavy Chain'))
fig.update_layout(
    xaxis_title="CDR Interactions (%)",
    yaxis_title="Probability Density",
    template="plotly_white",
    autosize=False,
    width=500,
    height=500,
    margin=dict(
        l=50,
        r=50,
        b=100,
        t=100,
        pad=4
    ),
    xaxis_range=[0,101]
)
fig.show()

Avg CDR interaction: 82.8%
(array([0.02114165, 0.        , 0.01057082, 0.0845666 , 0.26427061,
       0.28541226, 0.6448203 , 1.80761099, 3.75264271, 3.12896406]), array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ]))


In [20]:
tot_hchain_all = buried_ab_ag_interface_res['hchains'].sum()
tot_lchain_all = buried_ab_ag_interface_res['lchains'].sum()
print(
    "          All    CDR  Framework\n "
    f"HEAVY:  {int(tot_hchain_all)}   {int(tot_hchain_cdr)}  {int(tot_hchain_all - tot_hchain_cdr)}\n",
    f"LIGHT:  {int(tot_lchain_all)}   {int(tot_lchain_cdr)}   {int(tot_lchain_all - tot_lchain_cdr)}\n",
    f"TOTAL: {int(tot_hchain_all + tot_lchain_all)}  {int(tot_hchain_cdr + tot_lchain_cdr)}  {int((tot_hchain_all - tot_hchain_cdr) + (tot_lchain_all - tot_lchain_cdr))}"
)


          All    CDR  Framework
 HEAVY:  9189   7144  2045
 LIGHT:  4820   4261   559
 TOTAL: 14009  11405  2604


In [21]:
def count_cdrs(x):
    n_cdrs = 0
    for cdr_i in x[['hcdr1', 'hcdr2', 'hcdr3', 'lcdr1', 'lcdr2', 'lcdr3']]:
        if cdr_i > 0:
            n_cdrs += 1
    return n_cdrs


buried_ab_ag_interface_res['n_cdrs'] = buried_ab_ag_interface_res.apply(lambda x: count_cdrs(x), axis=1)

n_cdrs_hist = np.histogram(buried_ab_ag_interface_res['n_cdrs'], range=(0.5,6.5), bins=6, density=True)
print(f"Avg number of CDRs: {round(buried_ab_ag_interface_res['n_cdrs'].mean(), 1)}")
print(n_cdrs_hist)

fig = go.Figure()
fig.add_trace(go.Scatter(y=n_cdrs_hist[0], x=n_cdrs_hist[1],
                    mode='lines+markers', line_shape='spline',
                    name='Heavy Chain'))
fig.update_layout(
    xaxis_title="Number of Interacting CDR Chains",
    yaxis_title="Probability Density",
    template="plotly_white",
    autosize=False,
    width=500,
    height=500,
    margin=dict(
        l=50,
        r=50,
        b=100,
        t=100,
        pad=4
    )
)
fig.show()


Avg number of CDRs: 4.3
(array([0.00211864, 0.04555085, 0.1875    , 0.31038136, 0.31567797,
       0.13877119]), array([0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5]))


In [22]:
def get_cdr3_len(x):
    ab_chains = protein_antigens.query(
        f"pdb == '{x['idcode']}' and {'L' if x['chain_type'] in 'LK' else 'H'}chain == '{x['chainID']}'", engine = 'python')[['Hchain', 'Lchain']].values
    assert len(ab_chains) == 1, print(ab_chains)
    ab_chains = list(ab_chains[0])
    
    cdr3 = buried_fullab.query(
        f'idcode == "{x["idcode"]}" and chainID.isin({ab_chains}) and cdr == 3', engine = 'python')['cdr_seq'].values[0]
    return len(cdr3)


buried_ab_ag_interface_res['cdr3_len'] = buried_ab_ag_interface_res.apply(lambda x: get_cdr3_len(x), axis=1)

In [23]:
cdr3_hist, cdr3_bins = np.histogram(buried_ab_ag_interface_res['cdr3_len'], bins=10, range=(0, 30), density=True)
print(cdr3_hist, cdr3_bins)
fig = go.Figure()
fig.add_trace(go.Scatter(y=cdr3_hist, x=cdr3_bins,
                    mode='lines+markers', line_shape='spline',
                    name='Heavy Chain'))
fig.update_layout(
    xaxis_title="# of Residues in CDR3",
    yaxis_title="Probability Density",
    template="plotly_white",
    autosize=False,
    width=500,
    height=500,
    margin=dict(
        l=50,
        r=50,
        b=100,
        t=100,
        pad=4
    )
)
fig.show()

[0.         0.00812147 0.03742938 0.09922316 0.08933616 0.05049435
 0.02860169 0.01129944 0.00847458 0.00035311] [ 0.  3.  6.  9. 12. 15. 18. 21. 24. 27. 30.]


In [24]:
bins = [[] for i in cdr3_bins[1:]]
bins_i = list(enumerate(cdr3_bins[1:]))
for n_int_cdrs, cdr3_len in buried_ab_ag_interface_res[['n_cdrs', 'cdr3_len']].values:
    for i, upper_bound in bins_i:
        if cdr3_len <= upper_bound:
            bins[i].append(n_int_cdrs)

In [25]:
avg_n_cdrs_bins = [round(sum(i)/len(i), 2) for i in bins]
print(avg_n_cdrs_bins)

[5.0, 5.25, 4.86, 4.66, 4.52, 4.42, 4.37, 4.31, 4.31, 4.3]


In [26]:
def aa_dist(aa_lists):
    dist = {}
    for aas in aa_lists:
        for aa in aas:
            if aa not in dist:
                dist[aa] = 0
            dist[aa] += 1

    return dist

allaas_dist = aa_dist(buried_ab_ag_interface_res['ab_res'].values)
allaas_dist

{'THR': 1574,
 'SER': 3022,
 'TRP': 3841,
 'ASN': 1922,
 'ARG': 1793,
 'ILE': 697,
 'GLY': 2215,
 'PHE': 2160,
 'LEU': 870,
 'TYR': 10620,
 'GLN': 496,
 'GLU': 818,
 'VAL': 680,
 'ASP': 1763,
 'LYS': 408,
 'MET': 180,
 'PRO': 554,
 'CYS': 78,
 'ALA': 758,
 'HIS': 734}

In [27]:
allaas_dist = aa_dist(buried_ab_ag_interface_res['ag_res'].values)
allaas_dist

{'GLU': 1869,
 'THR': 1923,
 'PHE': 1744,
 'HIS': 976,
 'ARG': 2328,
 'SER': 1691,
 'PRO': 1474,
 'LEU': 1464,
 'GLN': 1555,
 'VAL': 933,
 'MET': 385,
 'CYS': 377,
 'ILE': 952,
 'ALA': 1151,
 'LYS': 1920,
 'ASP': 2017,
 'ASN': 2334,
 'GLY': 2118,
 'TRP': 1173,
 'TYR': 2328}

In [28]:
import sys
sys.path.append('scripts')

from scripts.utils import read_pdb_line
from Bio.Data.IUPACData import protein_letters_3to1 as AA_CONVERTER

def get_sequence(pdb_path, chains):
    prot_aas = []
    last = None
    with open(pdb_path) as f:
        for line in f:
            if line.startswith("ATOM"):
                chain, cdr_id, res, *_ = read_pdb_line(line)
                (resname, resnumb) = res
                new = resname, chain, resnumb, cdr_id
                if chain in chains and last != new:
                    res = AA_CONVERTER[resname.capitalize()]
                    prot_aas.append(resname)
                last = new[:]
    return prot_aas

aas = []
for idcode in buried_ab_ag_interface_res.idcode.unique():
    pdb_path = f"structures/raw/{idcode}.pdb"
    chains = protein_antigens.query(f'pdb == "{idcode}"')['antigen_chain'].unique()
    prot_aas = get_sequence(pdb_path, chains)
    aas.append(prot_aas)

aa_dist(aas)

{'MET': 7413,
 'LEU': 39638,
 'PRO': 22740,
 'ARG': 19670,
 'TRP': 7233,
 'GLU': 23752,
 'ALA': 27632,
 'TYR': 18171,
 'SER': 33797,
 'GLY': 31418,
 'PHE': 21932,
 'HIS': 9456,
 'VAL': 33016,
 'LYS': 25870,
 'ASP': 22109,
 'GLN': 19620,
 'THR': 33282,
 'ILE': 28738,
 'CYS': 14257,
 'ASN': 30231}

In [29]:
def filter_res(residues):
    res = set()
    for chain, resnumb, resname, aname, cdrnumb in residues:
        if aname.startswith('H'):
            continue
        res.add((chain, resnumb, resname))
    return res

def count_interchain(x):
    ab_chains = protein_antigens.query(
        f"pdb == '{x['idcode']}' and {'L' if x['chain_type'] in 'LK' else 'H'}chain == '{x['chainID']}'", engine = 'python')[['Hchain', 'Lchain']].values
    assert len(ab_chains) == 1, print(ab_chains)
    ab_chains = list(ab_chains[0])
    
    
    cdrs = buried_fullab.query(
        f'idcode == "{x["idcode"]}" and chainID.isin({ab_chains})', engine = 'python')
    assert len(cdrs) == 6, print(cdrs)
    cdr1, cdr2, cdr3, cdr4, cdr5, cdr6 = [filter_res(i) for i in cdrs['ag_cdr_interface_res'].values]
    
    shared_cdr = []
    all_cdrs = (cdr1, cdr2, cdr3, cdr4, cdr5, cdr6)
    for (i, j) in itertools.combinations(range(len(all_cdrs)), 2):
        if i == j:
            continue
        cdr_i = all_cdrs[i]
        cdr_j = all_cdrs[j]
        shared_cdrij = len(cdr_i.intersection(cdr_j))
        shared_cdr.append(shared_cdrij)

    ag_cdr_res = filter_res([ii for i in cdrs['ag_cdr_interface_res'].values for ii in i])
    ag_chain_res = filter_res([ii for i in cdrs['ag_cdrchain_interface_res'].values for ii in i])
    ag_res = filter_res([ii for i in cdrs['ag_ab_interface_res'].values for ii in i])
    inter_chain = ag_res - ag_chain_res
    inter_cdr = ag_chain_res - ag_cdr_res
    
    return (len(inter_chain), len(inter_cdr), len(ag_res), *shared_cdr)

buried_ab_ag_interface_res["cdr_info"] = buried_ab_ag_interface_res.apply(lambda x: count_interchain(x), axis=1)
buried_ab_ag_interface_res["inter_chain"], buried_ab_ag_interface_res["inter_cdr"], buried_ab_ag_interface_res["ag_res"], buried_ab_ag_interface_res["cdr_12"], buried_ab_ag_interface_res["cdr_13"], buried_ab_ag_interface_res["cdr_14"], buried_ab_ag_interface_res["cdr_15"], buried_ab_ag_interface_res["cdr_16"], buried_ab_ag_interface_res["cdr_23"], buried_ab_ag_interface_res["cdr_24"], buried_ab_ag_interface_res["cdr_25"], buried_ab_ag_interface_res["cdr_26"], buried_ab_ag_interface_res["cdr_34"], buried_ab_ag_interface_res["cdr_35"], buried_ab_ag_interface_res["cdr_36"], buried_ab_ag_interface_res["cdr_45"], buried_ab_ag_interface_res["cdr_46"], buried_ab_ag_interface_res["cdr_56"] = zip(*buried_ab_ag_interface_res.cdr_info)

In [30]:
print(np.histogram(buried_ab_ag_interface_res['inter_chain'], bins=[-0.5, 0.5, 1.5, 2.5, 3.5]))
print(np.histogram(buried_ab_ag_interface_res['inter_cdr'], bins=[-0.5, 0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 26]))
print(np.histogram(buried_ab_ag_interface_res['ag_res'], range=(0, 40), bins=10))

print(buried_ab_ag_interface_res['ag_res'].mean())
print(buried_ab_ag_interface_res['inter_cdr'].mean() / 6)

(array([774, 154,  16,   2]), array([-0.5,  0.5,  1.5,  2.5,  3.5]))
(array([314, 272, 143,  99,  37,  16,  64]), array([-0.5,  0.5,  1.5,  2.5,  3.5,  4.5,  5.5, 26. ]))
(array([  4,  31, 220, 309, 251,  90,  31,   6,   1,   2]), array([ 0.,  4.,  8., 12., 16., 20., 24., 28., 32., 36., 40.]))
14.662790697674419
0.3023255813953488


In [31]:
print(np.histogram(buried_ab_ag_interface_res['cdr_12'], bins=[-0.5, 0.5, 1.5, 2.5, 3.5, 4.5]), buried_ab_ag_interface_res['cdr_12'].mean())
print(np.histogram(buried_ab_ag_interface_res['cdr_13'], bins=[-0.5, 0.5, 1.5, 2.5, 3.5, 4.5]), buried_ab_ag_interface_res['cdr_13'].mean())
print(np.histogram(buried_ab_ag_interface_res['cdr_14'], bins=[-0.5, 0.5, 1.5, 2.5, 3.5, 4.5]), buried_ab_ag_interface_res['cdr_14'].mean())
print(np.histogram(buried_ab_ag_interface_res['cdr_15'], bins=[-0.5, 0.5, 1.5, 2.5, 3.5, 4.5]), buried_ab_ag_interface_res['cdr_15'].mean())
print(np.histogram(buried_ab_ag_interface_res['cdr_16'], bins=[-0.5, 0.5, 1.5, 2.5, 3.5, 4.5]), buried_ab_ag_interface_res['cdr_16'].mean())
print(np.histogram(buried_ab_ag_interface_res['cdr_23'], bins=[-0.5, 0.5, 1.5, 2.5, 3.5, 4.5]), buried_ab_ag_interface_res['cdr_23'].mean())
print(np.histogram(buried_ab_ag_interface_res['cdr_24'], bins=[-0.5, 0.5, 1.5, 2.5, 3.5, 4.5]), buried_ab_ag_interface_res['cdr_24'].mean())
print(np.histogram(buried_ab_ag_interface_res['cdr_25'], bins=[-0.5, 0.5, 1.5, 2.5, 3.5, 4.5]), buried_ab_ag_interface_res['cdr_25'].mean())
print(np.histogram(buried_ab_ag_interface_res['cdr_26'], bins=[-0.5, 0.5, 1.5, 2.5, 3.5, 4.5]), buried_ab_ag_interface_res['cdr_26'].mean())
print(np.histogram(buried_ab_ag_interface_res['cdr_34'], bins=[-0.5, 0.5, 1.5, 2.5, 3.5, 4.5]), buried_ab_ag_interface_res['cdr_34'].mean())
print(np.histogram(buried_ab_ag_interface_res['cdr_35'], bins=[-0.5, 0.5, 1.5, 2.5, 3.5, 4.5]), buried_ab_ag_interface_res['cdr_35'].mean())
print(np.histogram(buried_ab_ag_interface_res['cdr_36'], bins=[-0.5, 0.5, 1.5, 2.5, 3.5, 4.5]), buried_ab_ag_interface_res['cdr_36'].mean())
print(np.histogram(buried_ab_ag_interface_res['cdr_45'], bins=[-0.5, 0.5, 1.5, 2.5, 3.5, 4.5]), buried_ab_ag_interface_res['cdr_45'].mean())
print(np.histogram(buried_ab_ag_interface_res['cdr_46'], bins=[-0.5, 0.5, 1.5, 2.5, 3.5, 4.5]), buried_ab_ag_interface_res['cdr_46'].mean())
print(np.histogram(buried_ab_ag_interface_res['cdr_45'], bins=[-0.5, 0.5, 1.5, 2.5, 3.5, 4.5]), buried_ab_ag_interface_res['cdr_45'].mean())

print(max(buried_ab_ag_interface_res['cdr_12']), max(buried_ab_ag_interface_res['cdr_13']), max(buried_ab_ag_interface_res['cdr_14']), max(buried_ab_ag_interface_res['cdr_15']), max(buried_ab_ag_interface_res['cdr_16']), max(buried_ab_ag_interface_res['cdr_23']), max(buried_ab_ag_interface_res['cdr_24']), max(buried_ab_ag_interface_res['cdr_25']), max(buried_ab_ag_interface_res['cdr_26']), max(buried_ab_ag_interface_res['cdr_34']), max(buried_ab_ag_interface_res['cdr_35']), max(buried_ab_ag_interface_res['cdr_36']), max(buried_ab_ag_interface_res['cdr_45']), max(buried_ab_ag_interface_res['cdr_46']), max(buried_ab_ag_interface_res['cdr_45']))

(array([768, 156,  21,   1,   0]), array([-0.5,  0.5,  1.5,  2.5,  3.5,  4.5])) 0.2124735729386892
(array([734, 154,  52,   6,   0]), array([-0.5,  0.5,  1.5,  2.5,  3.5,  4.5])) 0.2917547568710359
(array([943,   2,   1,   0,   0]), array([-0.5,  0.5,  1.5,  2.5,  3.5,  4.5])) 0.004228329809725159
(array([941,   5,   0,   0,   0]), array([-0.5,  0.5,  1.5,  2.5,  3.5,  4.5])) 0.005285412262156448
(array([943,   3,   0,   0,   0]), array([-0.5,  0.5,  1.5,  2.5,  3.5,  4.5])) 0.003171247357293869
(array([728, 180,  34,   4,   0]), array([-0.5,  0.5,  1.5,  2.5,  3.5,  4.5])) 0.2748414376321353
(array([943,   3,   0,   0,   0]), array([-0.5,  0.5,  1.5,  2.5,  3.5,  4.5])) 0.003171247357293869
(array([943,   3,   0,   0,   0]), array([-0.5,  0.5,  1.5,  2.5,  3.5,  4.5])) 0.003171247357293869
(array([896,  47,   3,   0,   0]), array([-0.5,  0.5,  1.5,  2.5,  3.5,  4.5])) 0.056025369978858354
(array([663, 229,  52,   2,   0]), array([-0.5,  0.5,  1.5,  2.5,  3.5,  4.5])) 0.358350951374207

In [32]:
z = [
    [None, buried_ab_ag_interface_res['cdr_12'].mean(),buried_ab_ag_interface_res['cdr_13'].mean(),buried_ab_ag_interface_res['cdr_14'].mean(),buried_ab_ag_interface_res['cdr_15'].mean(),buried_ab_ag_interface_res['cdr_16'].mean()],
    [buried_ab_ag_interface_res['cdr_12'].mean(),None,buried_ab_ag_interface_res['cdr_23'].mean(),buried_ab_ag_interface_res['cdr_24'].mean(),buried_ab_ag_interface_res['cdr_25'].mean(),buried_ab_ag_interface_res['cdr_26'].mean()],
    [buried_ab_ag_interface_res['cdr_13'].mean(),buried_ab_ag_interface_res['cdr_23'].mean(),None,buried_ab_ag_interface_res['cdr_34'].mean(),buried_ab_ag_interface_res['cdr_35'].mean(),buried_ab_ag_interface_res['cdr_36'].mean()],
    [buried_ab_ag_interface_res['cdr_14'].mean(),buried_ab_ag_interface_res['cdr_24'].mean(),buried_ab_ag_interface_res['cdr_34'].mean(),None,buried_ab_ag_interface_res['cdr_45'].mean(),buried_ab_ag_interface_res['cdr_46'].mean()],
    [buried_ab_ag_interface_res['cdr_15'].mean(),buried_ab_ag_interface_res['cdr_25'].mean(),buried_ab_ag_interface_res['cdr_35'].mean(),buried_ab_ag_interface_res['cdr_45'].mean(),None,buried_ab_ag_interface_res['cdr_56'].mean()],
    [buried_ab_ag_interface_res['cdr_16'].mean(),buried_ab_ag_interface_res['cdr_26'].mean(),buried_ab_ag_interface_res['cdr_36'].mean(),buried_ab_ag_interface_res['cdr_46'].mean(),buried_ab_ag_interface_res['cdr_56'].mean(),None],
]

fig = go.Figure(data=go.Heatmap(colorscale=('white', 'black'), z=z, x=['H CDR 1', 'H CDR 2', 'H CDR 3', 'L CDR 1', 'L CDR 2', 'L CDR 3'], y=['H CDR 1', 'H CDR 2', 'H CDR 3', 'L CDR 1', 'L CDR 2', 'L CDR 3']))
fig.show()

In [33]:
clean_idcodes = set(buried_ab_ag_interface_res.idcode.values)
hbonds = []
wms = []
for (idcode, ab_h_chain, ab_l_chain, ag_chain, interactions) in df_interactions:
    if idcode not in clean_idcodes:
        continue
    _, _ , _, _, _, _, hhbonds, lhbonds, hwm, lwm = interactions
    i_hbonds = len(hhbonds) + len(lhbonds)
    hbonds.append(i_hbonds)

    i_wms = len(hwm) + len(lwm)
    wms.append(i_wms)

print(np.histogram(hbonds, range=(0, 40), bins=10))
print(sum(hbonds) / len(hbonds), len(hbonds))

print(np.histogram(wms, range=(0, 280), bins=10))
print(sum(wms) / len(wms), len(wms))


(array([ 7, 20, 53, 55, 48, 29, 13,  9,  3,  1]), array([ 0.,  4.,  8., 12., 16., 20., 24., 28., 32., 36., 40.]))
14.974789915966387 238
(array([ 1,  9, 46, 65, 50, 42, 13,  4,  6,  2]), array([  0.,  28.,  56.,  84., 112., 140., 168., 196., 224., 252., 280.]))
116.8109243697479 238


In [34]:
def count_linear_seqs(x, tolerance):
    ab_chains = protein_antigens.query(
        f"pdb == '{x['idcode']}' and {'L' if x['chain_type'] in 'LK' else 'H'}chain == '{x['chainID']}'")[['Hchain', 'Lchain']].values
    assert len(ab_chains) == 1, print(ab_chains)
    ab_chains = list(ab_chains[0])
    
    cdr_epitope = buried_fullab.query(
        f'idcode == "{x["idcode"]}"')['ag_ab_interface_res'].values[0]
    
    epitope = {}
    for atom in cdr_epitope:
        chain, resnumb, resname, aname, _ = atom
        if aname.startswith('H'):
            continue
        if chain not in epitope:
            epitope[chain] = []
        epitope[chain].append(resnumb)

    seq_lens = []
    for chain, res in epitope.items():
        n_seqs = 0
        prev = None
        ep_seq = 0
        res = sorted(set(res))
        for i, resnumb in enumerate(res):
            if prev and (resnumb != prev + 1 and resnumb > prev + 1 + tolerance):
                seq_lens.append(ep_seq)
                ep_seq = 0
            prev = resnumb
            ep_seq += 1
        seq_lens.append(ep_seq)
    
    return seq_lens

buried_ab_ag_interface_res["epi_seq_len_0"] = buried_ab_ag_interface_res.apply(lambda x: count_linear_seqs(x, 0), axis=1)
buried_ab_ag_interface_res["epi_seq_len_1"] = buried_ab_ag_interface_res.apply(lambda x: count_linear_seqs(x, 1), axis=1)
buried_ab_ag_interface_res["epi_seq_len_3"] = buried_ab_ag_interface_res.apply(lambda x: count_linear_seqs(x, 3), axis=1)


In [35]:
all_epi_seqs, epi_seqs_per_ab = [], []
for epi_seqs in buried_ab_ag_interface_res["epi_seq_len_0"]:
    all_epi_seqs += epi_seqs
    epi_seqs_per_ab.append(len(epi_seqs))

print(np.histogram(all_epi_seqs, bins=10, range=(0, 20)))
print(np.histogram(epi_seqs_per_ab, bins=10, range=(0, 30)))

all_epi_seqs, epi_seqs_per_ab = [], []
for epi_seqs in buried_ab_ag_interface_res["epi_seq_len_1"]:
    all_epi_seqs += epi_seqs
    epi_seqs_per_ab.append(len(epi_seqs))

print(np.histogram(all_epi_seqs, bins=10, range=(0, 20)))
print(np.histogram(epi_seqs_per_ab, bins=10, range=(0, 30)))

all_epi_seqs, epi_seqs_per_ab = [], []
for epi_seqs in buried_ab_ag_interface_res["epi_seq_len_3"]:
    all_epi_seqs += epi_seqs
    epi_seqs_per_ab.append(len(epi_seqs))

print(np.histogram(all_epi_seqs, bins=10, range=(0, 20)))
print(np.histogram(epi_seqs_per_ab, bins=10, range=(0, 30)))


(array([3892, 2160,  630,  204,   86,   13,    1,    0,    0,    0]), array([ 0.,  2.,  4.,  6.,  8., 10., 12., 14., 16., 18., 20.]))
(array([ 25, 233, 393, 203,  80,   8,   3,   0,   1,   0]), array([ 0.,  3.,  6.,  9., 12., 15., 18., 21., 24., 27., 30.]))
(array([1990, 1591,  776,  399,  150,   51,   14,    5,    0,    0]), array([ 0.,  2.,  4.,  6.,  8., 10., 12., 14., 16., 18., 20.]))
(array([ 68, 511, 276,  83,   6,   2,   0,   0,   0,   0]), array([ 0.,  3.,  6.,  9., 12., 15., 18., 21., 24., 27., 30.]))
(array([ 926, 1151,  759,  445,  225,   99,   46,   15,    6,    1]), array([ 0.,  2.,  4.,  6.,  8., 10., 12., 14., 16., 18., 20.]))
(array([193, 609, 129,  13,   2,   0,   0,   0,   0,   0]), array([ 0.,  3.,  6.,  9., 12., 15., 18., 21., 24., 27., 30.]))


In [36]:
def count_atomtypes(x):
    ab_chains = protein_antigens.query(
        f"pdb == '{x['idcode']}' and {'L' if x['chain_type'] in 'LK' else 'H'}chain == '{x['chainID']}'", engine = 'python')[['Hchain', 'Lchain']].values
    assert len(ab_chains) == 1, print(ab_chains)
    ab_chains = list(ab_chains[0])
    
    epitope = buried_fullab.query(
        f'idcode == "{x["idcode"]}" and chainID.isin({ab_chains})', engine = 'python')['ag_ab_interface_res'].values[0]
    
    ag_atom_dist = {}
    
    for atom in epitope:
        chain, resnumb, resname, aname, _ = atom
        if aname.startswith('H'):
            continue
        if aname[0] not in ag_atom_dist:
            ag_atom_dist[aname[0]] = 0
        ag_atom_dist[aname[0]] += 1
    
    return ag_atom_dist

buried_ab_ag_interface_res['atom_hist'] = buried_ab_ag_interface_res.apply(lambda x: count_atomtypes(x), axis=1)

In [37]:
atom_hist = {'N': [], 'C': [], 'O': [], 'S': []}
for atoms in buried_ab_ag_interface_res['atom_hist']:
    for atom, count in atoms.items():
        atom_hist[atom].append(count)

In [38]:
print(np.histogram(atom_hist['C'], range=(0, 50), bins=10))

(array([  5,  30, 117, 230, 271, 169,  84,  28,   7,   4]), array([ 0.,  5., 10., 15., 20., 25., 30., 35., 40., 45., 50.]))
