In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the tab-separated SAbDab summary table (read_table defaults to sep="\t").
df_sabdab_90 = pd.read_table('structures/sabdab_summary_90_May2022.tsv')

In [3]:
# Baseline before any filtering: number of table rows and of distinct `pdb` values.
print(
    "Original SabDab\n"
    f"AbAg Complexes: {len(df_sabdab_90)}\n"
    f"PDB Files: {len(set(df_sabdab_90.pdb.values))}"
)

Original SabDab
AbAg Complexes: 3890
PDB Files: 1800


In [4]:
# Keep rows where both Hchain and Lchain are present (a missing value never
# compares equal to itself, so `x == x` drops it), then drop rows where the
# two chain ids are the same.
df_sabdab_90_ = (
    df_sabdab_90
    .query("Hchain == Hchain and Lchain == Lchain")
    .query("Hchain != Lchain")
)
print(
    "With both chains\n"
    f"AbAg Complexes: {len(df_sabdab_90_)}\n"
    f"PDB Files: {len(set(df_sabdab_90_.pdb.values))}"
)

With both chains
AbAg Complexes: 3735
PDB Files: 1743


In [5]:
# Remove rows that duplicate an earlier row across every column (first one kept).
df_sabdab_90_ = df_sabdab_90_.drop_duplicates(keep="first")
print(
    "Dropped duplicates\n"
    f"AbAg Complexes: {len(df_sabdab_90_)}\n"
    f"PDB Files: {len(set(df_sabdab_90_.pdb.values))}"
)

Dropped duplicates
AbAg Complexes: 3581
PDB Files: 1743


In [6]:
# Keep rows with a non-missing antigen_type whose text contains 'protein'.
# This is a substring match, so any value that merely contains the word passes.
df_sabdab_90_ = df_sabdab_90_.query("antigen_type == antigen_type and antigen_type.str.contains('protein')")
print(
    "With a protein antigen\n"
    f"AbAg Complexes: {len(df_sabdab_90_)}\n"
    f"PDB Files: {len(set(df_sabdab_90_.pdb.values))}"
)

With a protein antigen
AbAg Complexes: 2786
PDB Files: 1309


In [7]:
# Write the filtered summary table to disk as a pickle.
# NOTE(review): the file name is spelled "sabdad" rather than "sabdab". It is left
# unchanged here because other code may read this exact path — confirm before renaming.
df_sabdab_90_.to_pickle('data/sabdad_90_cleaned.pickle')

In [8]:
# Load the CDR/epitope table from disk.
# NOTE(review): nothing in this notebook writes this pickle — presumably it is
# produced by an external step run on the cleaned summary table; confirm.
df_cdrs = pd.read_pickle('data/cdr_epitope.pickle')

In [9]:
# The complex count is the row count divided by 6.
# NOTE(review): assumes exactly six rows (one per CDR) per complex — TODO confirm.
print(
    "Processed, CDR-specific epitopes calculated and removed redundant Abs in the same structure \n"
    f"AbAg Complexes: {len(df_cdrs)/6}\n"
    f"PDB Files: {len(set(df_cdrs.idcode.values))}"
)

Processed, CDR-specific epitopes calculated and removed redundant Abs in the same structure 
AbAg Complexes: 1483.0
PDB Files: 1309


In [10]:
# Load the table that carries the buried-interface columns.
df_buried = pd.read_pickle('data/epitope_buried_.pickle')

# Row count divided by 6 is reported as the number of complexes.
# NOTE(review): assumes exactly six rows per complex — TODO confirm.
print(
    "Calculated buried surfaces, paratopes and epitopes \n"
    f"AbAg Complexes: {len(df_buried)/6}\n"
    f"PDB Files: {len(set(df_buried.idcode.values))}"
)

Calculated buried surfaces, paratopes and epitopes 
AbAg Complexes: 1426.0
PDB Files: 1261


In [11]:
# Collect the entries of the PDB2PQR failure list, one per line.
# Blank lines are skipped so an empty string is never counted as an idcode.
with open("scripts/PDB2PQR_FAILS") as f:
    failed_idcodes = {code for code in map(str.strip, f) if code}

# "Only partially" = failed idcodes that are nevertheless present in df_buried,
# i.e. the intersection of the two idcode sets.
print(f"Failed PDB2PQR: {len(failed_idcodes)}\nOnly partially: {len(failed_idcodes & set(df_buried.idcode.values))}")

Failed PDB2PQR: 49
Only partially: 2


In [12]:
# Drop every row whose ag_ab_interface or ab_ag_interface value equals the empty set.
df_buried = df_buried[
    (df_buried["ag_ab_interface"] != set())
    & (df_buried["ab_ag_interface"] != set())
]
# NOTE(review): the division by 6 assumes six rows per complex — TODO confirm.
print(
    "Bound (non-empty paratopes and epitopes) \n"
    f"AbAg Complexes: {len(df_buried)/6}\n"
    f"PDB Files: {len(set(df_buried.idcode.values))}"
)

# Persist the filtered table.
df_buried.to_pickle("data/epitope_buried_cleaned.pickle")

Bound (non-empty paratopes and epitopes) 
AbAg Complexes: 1425.0
PDB Files: 1260


In [13]:
# Keep the first row of each distinct (idcode, ab_ag_interface) pair. The pair is
# compared through its string form (presumably because the interface values are
# unhashable sets — confirm), and only the residue-level columns are retained.
buried_ab_ag_interface_res = df_buried.loc[
    ~df_buried[['idcode', 'ab_ag_interface']].astype(str).duplicated(keep='first'),
    ['idcode', 'chainID', 'chain_type', 'ab_ag_interface_res', 'ag_ab_interface_res'],
]

print(
    "Final \n"
    f"AbAg Complexes: {len(buried_ab_ag_interface_res)}"
)

# Persist the one-row-per-complex table.
buried_ab_ag_interface_res.to_pickle("data/buried_interface_res.pickle")

Final 
AbAg Complexes: 1425


In [171]:
# Re-read the CDR/epitope pickle, replacing any df_cdrs already in memory.
# NOTE(review): the same file is loaded by an earlier cell; this reload only
# matters if the pickle changed on disk in between.
df_cdrs = pd.read_pickle('data/cdr_epitope.pickle')

In [172]:
# Show the df_cdrs rows for structure 1adq.
df_cdrs.query("idcode == '1adq'")

Unnamed: 0,idcode,chainID,chain_type,cdr,cdr_seq,cdr_begin,cdr_end,cdr_atoms,epitope_atoms,epitope_residues
2496,1adq,H,H,1,GFTFDDY,26,32,"[3422, 3423, 3424, 3425, 3426, 3427, 3428, 342...","[120, 130, 131, 132, 133, 134, 135, 140, 141, ...","[ 252 , 254 , 254 , 254 , 254 , 254 , 25..."
2497,1adq,H,H,2,SWNTGT,52,56,"[3633, 3634, 3635, 3636, 3637, 3638, 3639, 364...","[123, 124, 126, 127, 129, 130, 131, 132, 133]","[ 253 , 253 , 253 , 253 , 254 , 254 , 25..."
2498,1adq,H,H,3,TRSYVVAAEYYFHY,95,102,"[4011, 4012, 4013, 4014, 4015, 4016, 4017, 401...","[107, 108, 113, 114, 115, 117, 118, 119, 120, ...","[ 251 , 251 , 252 , 252 , 252 , 252 , 25..."
2499,1adq,L,L,1,GGNNIGSKSVH,24,34,"[1807, 1808, 1809, 1810, 1811, 1812, 1813, 181...",[1569],[ 433 ]
2500,1adq,L,L,2,DDSDRPP,50,56,"[2007, 2008, 2009, 2010, 2011, 2012, 2013, 201...","[1482, 1483, 1499, 1500, 1565, 1567, 1568, 156...","[ 422 , 422 , 424 , 424 , 433 , 433 , 43..."
2501,1adq,L,L,3,QVWDSSSDHAV,89,97,"[2298, 2299, 2300, 2301, 2302, 2303, 2304, 230...",[],[]


In [170]:
# Show the df_cdrs rows for structure 1adq.
# NOTE(review): this repeats the previous cell's query; the differing saved output
# suggests df_cdrs held different data when each cell was run — confirm which is current.
df_cdrs.query("idcode == '1adq'")

Unnamed: 0,idcode,chainID,chain_type,cdr,cdr_seq,cdr_begin,cdr_end,cdr_atoms,epitope_atoms,epitope_residues
2496,1adq,H,H,1,GFTFDDY,26,32,"[3422, 3423, 3424, 3425, 3426, 3427, 3428, 342...","[120, 130, 131, 132, 133, 134, 135, 140, 141, ...","[252, 254, 254, 254, 254, 254, 255, 255, 255, ..."
2497,1adq,H,H,2,SWNTGT,52,56,"[3633, 3634, 3635, 3636, 3637, 3638, 3639, 364...","[123, 124, 126, 127, 129, 130, 131, 132, 133]","[253, 253, 253, 253, 254, 254, 254, 254, 254]"
2498,1adq,H,H,3,TRSYVVAAEYYFHY,95,102,[],[],[]
2499,1adq,L,L,1,GGNNIGSKSVH,24,34,"[1807, 1808, 1809, 1810, 1811, 1812, 1813, 181...",[1569],[433]
2500,1adq,L,L,2,DDSDRPP,50,56,"[2007, 2008, 2009, 2010, 2011, 2012, 2013, 201...","[1482, 1483, 1499, 1500, 1565, 1567, 1568, 156...","[422, 422, 424, 424, 433, 433, 433, 433, 434, ..."
2501,1adq,L,L,3,QVWDSSSDHAV,89,97,"[2298, 2299, 2300, 2301, 2302, 2303, 2304, 230...",[],[]


In [169]:
# Load the old buried-interface table in this cell so it runs top to bottom;
# otherwise df_buried_old is only assigned by a cell further down the notebook.
df_buried_old = pd.read_pickle('data/epitope_buried_old.pickle')

# Show the old table's rows for structure 1adq.
df_buried_old.query("idcode == '1adq'")

Unnamed: 0,idcode,chainID,chain_type,cdr,cdr_seq,cdr_begin,cdr_end,cdr_atoms,epitope_atoms,epitope_residues,ag_ab_interface,ag_cdrchain_interface,ag_cdr_interface,ab_ag_interface,ag_ab_interface_res,ag_cdrchain_interface_res,ag_cdr_interface_res,ab_ag_interface_res
24678,1adq,H,H,1,GFTFDDY,26,32,"[3422, 3423, 3424, 3425, 3426, 3427, 3428, 342...","[120, 130, 131, 132, 133, 134, 135, 140, 141, ...","[252, 254, 254, 254, 254, 254, 255, 255, 255, ...","{266, 274, 275, 2322, 277, 278, 2967, 3107, 31...","{266, 274, 275, 277, 278, 3107, 3108, 3110, 31...","{274, 278}","{3713, 4866, 3717, 4870, 3719, 4873, 7492, 749...","[(A, 253, ILE, HG23, 1), (A, 254, SER, CB, 1),...","[(A, 253, ILE, HG23, 1), (A, 254, SER, CB, 1),...","[(A, 254, SER, CB, 1), (A, 254, SER, HB2, 1)]","[(H, 31, ASP, CG, 1), (H, 99, VAL, CG2, 1), (H..."
24679,1adq,H,H,2,SWNTGT,52,56,"[3633, 3634, 3635, 3636, 3637, 3638, 3639, 364...","[123, 124, 126, 127, 129, 130, 131, 132, 133]","[253, 253, 253, 253, 254, 254, 254, 254, 254]","{266, 274, 275, 2322, 277, 278, 2967, 3107, 31...","{266, 274, 275, 277, 278, 3107, 3108, 3110, 31...","{266, 277}","{3713, 4866, 3717, 4870, 3719, 4873, 7492, 749...","[(A, 253, ILE, HG23, 2), (A, 254, SER, CB, 2),...","[(A, 253, ILE, HG23, 2), (A, 254, SER, CB, 2),...","[(A, 253, ILE, HG23, 2), (A, 254, SER, HA, 2)]","[(H, 31, ASP, CG, 2), (H, 99, VAL, CG2, 2), (H..."
24680,1adq,H,H,3,TRSYVVAAEYYFHY,95,102,"[4011, 4012, 4013, 4014, 4015, 4016, 4017, 401...","[107, 108, 113, 114, 115, 117, 118, 119, 120, ...","[251, 251, 252, 252, 252, 252, 252, 252, 252, ...","{266, 274, 275, 2322, 277, 278, 2967, 3107, 31...","{266, 274, 275, 277, 278, 3107, 3108, 3110, 31...","{3107, 3108, 3110, 3111, 3112, 3114, 3124, 312...","{3713, 4866, 3717, 4870, 3719, 4873, 7492, 749...","[(A, 253, ILE, HG23, 3), (A, 254, SER, CB, 3),...","[(A, 253, ILE, HG23, 3), (A, 254, SER, CB, 3),...","[(A, 434, ASN, C, 3), (A, 434, ASN, O, 3), (A,...","[(H, 31, ASP, CG, 3), (H, 99, VAL, CG2, 3), (H..."
24681,1adq,L,L,1,GGNNIGSKSVH,24,34,"[1807, 1808, 1809, 1810, 1811, 1812, 1813, 181...",[1569],[433],"{266, 274, 275, 2322, 277, 278, 2967, 3107, 31...","{3171, 3109, 3176, 3184, 2322, 2967, 3161, 2941}",{},"{3713, 4866, 3717, 4870, 3719, 4873, 7492, 749...","[(A, 253, ILE, HG23, 1), (A, 254, SER, CB, 1),...","[(A, 438, GLN, N, 1), (A, 434, ASN, CB, 1), (A...",[],"[(H, 31, ASP, CG, 1), (H, 99, VAL, CG2, 1), (H..."
24682,1adq,L,L,2,DDSDRPP,50,56,"[2007, 2008, 2009, 2010, 2011, 2012, 2013, 201...","[1482, 1483, 1499, 1500, 1565, 1567, 1568, 156...","[422, 422, 424, 424, 433, 433, 433, 433, 434, ...","{266, 274, 275, 2322, 277, 278, 2967, 3107, 31...","{3171, 3109, 3176, 3184, 2322, 2967, 3161, 2941}","{3109, 2322, 2967, 3161, 2941}","{3713, 4866, 3717, 4870, 3719, 4873, 7492, 749...","[(A, 253, ILE, HG23, 2), (A, 254, SER, CB, 2),...","[(A, 438, GLN, N, 2), (A, 434, ASN, CB, 2), (A...","[(A, 434, ASN, CB, 2), (A, 382, GLU, HG3, 2), ...","[(H, 31, ASP, CG, 2), (H, 99, VAL, CG2, 2), (H..."
24683,1adq,L,L,3,QVWDSSSDHAV,89,97,"[2298, 2299, 2300, 2301, 2302, 2303, 2304, 230...",[],[],"{266, 274, 275, 2322, 277, 278, 2967, 3107, 31...","{3171, 3109, 3176, 3184, 2322, 2967, 3161, 2941}",{},"{3713, 4866, 3717, 4870, 3719, 4873, 7492, 749...","[(A, 253, ILE, HG23, 3), (A, 254, SER, CB, 3),...","[(A, 438, GLN, N, 3), (A, 434, ASN, CB, 3), (A...",[],"[(H, 31, ASP, CG, 3), (H, 99, VAL, CG2, 3), (H..."


In [163]:
# Run count_chains on the first row only (row-wise, axis=1) and display the result.
# NOTE(review): count_chains is not defined in the visible part of this notebook, and
# buried_ab_ag_interface_res_old is only loaded by a cell further down — both must
# exist in the kernel before this cell is run.
buried_ab_ag_interface_res_old.iloc[:1].apply(count_chains, axis=1)

0 45 51
1 71 76
2 118 127
3 43 58
4 74 80
5 113 121
0 45 51
1 71 76
2 118 127
0 45 51
1 71 76
2 118 127
0 45 51
1 71 76
2 118 127
0 45 51
1 71 76
2 118 127
3 43 58
4 74 80
5 113 121
3 43 58
4 74 80
5 113 121
0 45 51
1 71 76
2 118 127
0 45 51
1 71 76
2 118 127
0 45 51
1 71 76
2 118 127
0 45 51
1 71 76
2 118 127
0 45 51
1 71 76
2 118 127
0 45 51
1 71 76
2 118 127
0 45 51
1 71 76
2 118 127
3 43 58
4 74 80
5 113 121
0 45 51
1 71 76
2 118 127
0 45 51
1 71 76
2 118 127
0 45 51
1 71 76
2 118 127
0 45 51
1 71 76
2 118 127
0 45 51
1 71 76
2 118 127
0 45 51
1 71 76
2 118 127
0 45 51
1 71 76
2 118 127
0 45 51
1 71 76
2 118 127
0 45 51
1 71 76
2 118 127
0 45 51
1 71 76
2 118 127
0 45 51
1 71 76
2 118 127


0    (9, 3, 3, 0, 5, 0, 2, 0, [THR, SER, TRP, ASN, ...
dtype: object

In [154]:
# Show the old interface-residue table's row(s) for structure 1adq.
# NOTE(review): in file order buried_ab_ag_interface_res_old is first loaded by a
# cell further down the notebook; run that load before this cell.
buried_ab_ag_interface_res_old.query("idcode == '1adq'")

Unnamed: 0,idcode,chainID,chain_type,ab_ag_interface_res,ag_ab_interface_res,chains,hchains,lchains,hcdr1,hcdr2,hcdr3,lcdr1,lcdr2,lcdr3,ab_res,ag_res,incomplete
24678,1adq,H,H,"[(H, 31, ASP, CG, 1), (H, 99, VAL, CG2, 1), (H...","[(A, 253, ILE, HG23, 1), (A, 254, SER, CB, 1),...","(6, 3, 1, 1, 0, 1, 2, 0, [ASP, VAL, PRO, PRO, ...",6.0,3.0,1.0,1.0,0.0,1.0,2.0,0.0,"[ASP, VAL, PRO, PRO, ARG, PRO, PRO, ARG, ARG, ...","[SER, SER, SER, ASN, ASN, ASN, ASN, ASN, ASN, ...",False


In [141]:
# Display the ab_ag_interface_res value of the first 1adq row in the current table.
buried_ab_ag_interface_res.query("idcode == '1adq'")["ab_ag_interface_res"].iloc[0]

[('H', '31', 'ASP', 'CG', 1),
 ('H', '99', 'VAL', 'CG2', 1),
 ('H', '31', 'ASP', 'HA', 1),
 ('H', '99', 'VAL', 'HG11', 1),
 ('H', '31', 'ASP', 'HB3', 1),
 ('H', '99', 'VAL', 'HG21', 1),
 ('L', '56', 'PRO', 'C', 1),
 ('L', '56', 'PRO', 'CB', 1),
 ('H', '96', 'ARG', 'O', 1),
 ('L', '56', 'PRO', 'CG', 1),
 ('L', '56', 'PRO', 'CD', 1),
 ('H', '96', 'ARG', 'CD', 1),
 ('H', '96', 'ARG', 'NE', 1),
 ('H', '96', 'ARG', 'CZ', 1),
 ('L', '32', 'SER', 'OG', 1),
 ('H', '96', 'ARG', 'NH2', 1),
 ('L', '56', 'PRO', 'HB3', 1),
 ('L', '56', 'PRO', 'HD3', 1),
 ('H', '96', 'ARG', 'HD2', 1),
 ('H', '52A', 'TRP', 'CG', 1),
 ('H', '96', 'ARG', 'HD3', 1),
 ('H', '52A', 'TRP', 'CD2', 1),
 ('H', '52A', 'TRP', 'CE2', 1),
 ('H', '52A', 'TRP', 'CE3', 1),
 ('H', '100E', 'TYR', 'OH', 1),
 ('H', '97', 'SER', 'HA', 1),
 ('H', '97', 'SER', 'HB3', 1),
 ('H', '2', 'VAL', 'HG23', 1),
 ('H', '98', 'TYR', 'C', 1),
 ('H', '98', 'TYR', 'CB', 1),
 ('H', '98', 'TYR', 'CG', 1),
 ('H', '98', 'TYR', 'CD1', 1),
 ('H', '98', 'TYR', 

In [86]:
# Load the old interface-residue table used for the old-vs-current comparison.
# NOTE(review): earlier cells in this file already reference this variable, so this
# load has to run before them.
buried_ab_ag_interface_res_old = pd.read_pickle('data/buried_ab_ag_interface_res_old.pickle')

In [166]:
# Load the old buried-interface table.
# NOTE(review): an earlier cell in this file already queries df_buried_old, so this
# load has to run before it.
df_buried_old = pd.read_pickle('data/epitope_buried_old.pickle')

In [16]:
# Preview the first three rows of the old table when ordered by idcode.
buried_ab_ag_interface_res_old.sort_values("idcode").head(3)

NameError: name 'buried_ab_ag_interface_res_old' is not defined

In [129]:
# Preview the first three rows of the shared-idcode subset when ordered by idcode.
# NOTE(review): buried_ab_ag_interface_res_ is assigned by a cell further down the
# notebook; run that cell before this one.
buried_ab_ag_interface_res_.sort_values("idcode").head(3)

Unnamed: 0,idcode,chainID,chain_type,ab_ag_interface_res,ag_ab_interface_res,chains,hchains,lchains,hcdr1,hcdr2,hcdr3,lcdr1,lcdr2,lcdr3,ab_res,ag_res,incomplete,bothchains
2388,1adq,H,H,"[(H, 31, ASP, CG, 1), (H, 99, VAL, CG2, 1), (H...","[(A, 253, ILE, HG23, 1), (A, 254, SER, CB, 1),...","(6, 3, 1, 1, 0, 1, 2, 0, [ASP, VAL, PRO, PRO, ...",6,3,1,1,0,1,2,0,"[ASP, VAL, PRO, PRO, ARG, PRO, PRO, ARG, ARG, ...","[SER, SER, SER, ASN, ASN, ASN, ASN, ASN, ASN, ...",False,9
696,1afv,H,H,"[(H, 99, TRP, CZ2, 1), (L, 96, LYS, C, 1), (H,...","[(A, 76, GLU, OE2, 1), (A, 102, SER, N, 1), (A...","(8, 7, 0, 0, 0, 3, 0, 0, [TRP, LYS, PHE, TRP, ...",8,7,0,0,0,3,0,0,"[TRP, LYS, PHE, TRP, TRP, GLU, VAL, VAL, ASN, ...","[GLU, SER, PRO, ALA, ALA, ALA, GLU, GLU, ASP, ...",False,15
7068,1ahw,E,H,"[(E, 52, ASP, OD1, 1), (E, 52, ASP, HB2, 1), (...","[(F, 152, ILE, CG1, 1), (F, 152, ILE, CD1, 1),...","(14, 7, 4, 4, 0, 1, 1, 5, [ASP, GLU, GLU, ASN,...",14,7,4,4,0,1,1,5,"[ASP, GLU, GLU, ASN, ASN, ASN, PHE, ILE, ILE, ...","[ILE, ILE, THR, TYR, TYR, TYR, TYR, GLN, GLN, ...",False,21


In [111]:
# Restrict the current table to idcodes that also occur in the old table.
# An `isin` mask selects the same rows as the previous query string without
# interpolating a Python list repr into query text, which would break on any
# value containing a quote and grows with the number of idcodes.
buried_ab_ag_interface_res_ = buried_ab_ag_interface_res[
    buried_ab_ag_interface_res["idcode"].isin(
        set(buried_ab_ag_interface_res_old["idcode"])
    )
]

In [123]:
# '7mhy', '7mjs', '4xx1', '6wj1', '6q18', '6mph', '6oor', '6bdz'
# buried_ab_ag_interface_res.query("idcode == '7mhy'")
# set(buried_ab_ag_interface_res.idcode.values) & set(buried_ab_ag_interface_res_old.idcode.values)
# Ratio of the summed hcdr3 column to the summed hcdr1 column, printed first for
# the shared-idcode subset of the current table and then for the old table.
# NOTE(review): the hcdr* columns are not created in any visible cell — confirm
# where they are added.
print(
    buried_ab_ag_interface_res_['hcdr3'].sum() / buried_ab_ag_interface_res_['hcdr1'].sum(),
    buried_ab_ag_interface_res_old['hcdr3'].sum() / buried_ab_ag_interface_res_old['hcdr1'].sum(),
    sep="\n",
)

0.4178082191780822
2.5511756569847854
