In [57]:
from Bio.PDB import *
import pandas as pd
import os
import importlib
import glob as glob

from SAbDab_downloader import *
import Interactions
import PPIDatabase

# Re-execute the local modules so edits made to their source files are picked up
# without restarting the kernel.
importlib.reload(Interactions)
importlib.reload(PPIDatabase)

<module 'PPIDatabase' from 'c:\\Users\\chekm\\OneDrive\\Documents\\Python Scripts\\Nanobody-Protein-Interface-Data\\PPIDatabase.py'>

In [43]:
'''
Setup Directories
'''

# Metadata is only generated on a first run, i.e. when "Data" does not exist yet.
# Record that before any directory is created below.
needs_metadata = not os.path.exists("Data")

# exist_ok=True keeps the cell re-runnable and also repairs a partially created
# tree (e.g. "Data" present but "Data/Nanobody" missing).
for directory in ("Data/Antibody", "Data/Nanobody", "pdbFiles"):
    os.makedirs(directory, exist_ok=True)

if needs_metadata:
    # Star import kept as in the original cell: other names from getPDBs may be
    # relied on elsewhere in the notebook.
    from getPDBs import *
    # presumably writes the metadata CSVs under Data/ - confirm in getPDBs
    createMetadata()

In [58]:
'''
Data preparation that creates a pandas dataframe structured
|PDB|Chain Pairings|
some PDB ID, Nanobody Chains
'''

nanobody_metadata = pd.read_csv("Data/Nanobody/Nanobody_metadata.csv")
# Strip the unit symbol and any blanks before converting to float.
# Both steps must go through the .str accessor: a plain Series.replace(' ', '')
# only replaces values that are exactly ' ', it does not remove embedded spaces.
nanobody_metadata['Resolution'] = (
    nanobody_metadata['Resolution']
    .str.replace('Å', '', regex=False)
    .str.replace(' ', '', regex=False)
    .astype(float)
)
# Keep structures with a resolution of at most 7 (rows without a resolution are dropped).
# .copy() makes `filtered` an independent frame so the assignment below does not
# operate on a slice of `nanobody_metadata`.
filtered = nanobody_metadata.query('Resolution <= 7')[['PDB', 'Chain Pairings']].copy()
# This step is necessary for ScanNet Data. It just converts the str into a list of nanobody chains
filtered['Chain Pairings'] = filtered['Chain Pairings'].str.findall(r"VH: (\w)")

print(filtered['Chain Pairings'])

0       [B]
1       [B]
2       [B]
3       [B]
4       [B]
       ... 
1004    [B]
1005    [B]
1006    [C]
1007    [B]
1008    [N]
Name: Chain Pairings, Length: 1006, dtype: object


In [59]:
'''
Reproducible in that all you need is a pandas dataframe with a 'PDB' column paired with a 'Chain Pairings' column
that contains a list of all the VH Nanobodies in the PDB file.
'''
# `filtered` is the frame built in the data-preparation cell (columns 'PDB' and 'Chain Pairings').
database = PPIDatabase.PPIDatabase(metadata = filtered)

In [195]:

# Top-level `await` only works inside IPython/Jupyter, which runs the cell on its event loop.
# Presumably fetches the PDB files for the entries in `database` - confirm in PPIDatabase.getpdbs.
# NOTE(review): this cell's execution count (195) is out of sequence with its neighbours;
# verify the notebook under Restart Kernel -> Run All.
await database.getpdbs()

In [60]:
# Prints progress (the PDB IDs being processed) and returns the generated label records.
# NOTE(review): as the last expression of the cell the full return value is echoed as output;
# consider assigning it to a name or ending the line with `;` to keep the notebook small.
database.generatetxtLabelFile()

Generating Labels
1bzq; 1g6v; 1jto; 1jtp; 1jtt; 1kxq; 1kxt; 1kxv; 1mel; 1op9; 1ri8; 1rjc;
1xfp; 1zmy; 1zv5; 1zvh; 1zvy; 2bse; 2p42; 2p43; 2p44; 2p45; 2p46; 2p47;
2p48; 2p49; 2p4a; 2wzp; 2x89; 2xt1; 2xv6; 2xxm; 3cfi; 3eba; 3ezj; 3g9a;
3jbc; 3jbd; 3k1k; 3k3q; 3k74; 3k7u; 3k80; 3k81; 3ogo; 3p0g; 3p9w; 3qsk;
3rjq; 3sn6; 3stb; 3v0a; 3zkq; 3zks; 3zkx; 3zlq; 4bel; 4bfb; 4c57; 4c58;
4c59; 4cdg; 4dk3; 4dk6; 4eig; 4eiz; 4ej1; 4fhb; 4gft; 4grw; 4hem; 4hep;
4i0c; 4i13; 4i1n; 4ios; 4kdt; 4kfz; 4kml; 4krl; 4krm; 4kro; 4krp; 4ksd;
4laj; 4lde; 4ldl; 4ldo; 4lgp; 4lgr; 4lgs; 4lhj; 4lhq; 4m3k; 4mqs; 4mqt;
4n1h; 4n9o; 4nbx; 4nby; 4nbz; 4nc0; 4nc1; 4nc2; 4ocl; 4ocm; 4ocn; 4orz;
4p2c; 4pgj; 4pir; 4pou; 4qkx; 4qo1; 4s10; 4tvs; 4u3x; 4w2o; 4w2q; 4w6w;
4w6x; 4w6y; 4wem; 4wen; 4weu; 4wgv; 4wgw; 4x7c; 4x7d; 4x7e; 4x7f; 4xt1;
4y7m; 4y8d; 4yga; 4z9k; 5bop; 5boz; 5c1m; 5c2u; 5c3l; 5da0; 5dfz; 5dmj;
5e0q; 5e1h; 5e5m; 5e7f; 5eul; 5f1k; 5f1o; 5f21; 5f7k; 5f7l; 5f7m; 5f7n;
5f7w; 5f7y; 5f8q; 5f8r; 5f93; 5f97; 5f9a; 5f9d

(['>8a67_B\nB 1 M 0\nB 2 Q 0\nB 3 I 0\nB 4 F 0\nB 5 V 0\nB 6 K 0\nB 7 T 0\nB 8 L 1\nB 9 T 1\nB 10 G 0\nB 11 K 0\nB 12 T 0\nB 13 I 0\nB 14 T 0\nB 15 L 0\nB 16 E 0\nB 17 V 0\nB 18 E 0\nB 19 P 0\nB 20 S 0\nB 21 D 0\nB 22 T 0\nB 23 I 0\nB 24 E 0\nB 25 N 0\nB 26 V 0\nB 27 K 0\nB 28 A 0\nB 29 K 0\nB 30 I 0\nB 31 Q 0\nB 32 D 0\nB 33 K 0\nB 34 E 0\nB 35 G 0\nB 36 I 0\nB 37 P 0\nB 38 P 0\nB 39 D 0\nB 40 Q 0\nB 41 Q 0\nB 42 R 1\nB 43 L 0\nB 44 I 1\nB 45 F 0\nB 46 A 0\nB 47 G 0\nB 48 R 0\nB 49 Q 1\nB 50 L 0\nB 51 E 0\nB 52 D 0\nB 53 G 0\nB 54 R 0\nB 55 T 0\nB 56 L 0\nB 57 S 0\nB 58 D 0\nB 59 Y 0\nB 60 N 0\nB 61 I 0\nB 62 Q 0\nB 63 R 0\nB 64 E 0\nB 65 S 0\nB 66 T 0\nB 67 L 0\nB 68 H 0\nB 69 L 0\nB 70 V 1\nB 71 L 1\nB 72 R 1\nB 73 L 1\nB 74 R 1\nB 75 G 1\nB 76 G 0\n',
  '>7fg3_D\nD 2 E 0\nD 3 V 0\nD 4 Q 0\nD 5 L 0\nD 6 V 0\nD 7 E 0\nD 8 S 0\nD 9 G 0\nD 10 G 0\nD 11 G 0\nD 12 Q 0\nD 13 V 0\nD 14 E 0\nD 15 T 0\nD 16 G 0\nD 17 G 0\nD 18 S 0\nD 19 L 0\nD 20 R 0\nD 21 L 0\nD 22 S 0\nD 23 C 0\nD 24 Q 0\n

In [146]:
'''
See overlap from nonredundant dataset
'''
pdbsNonredundant1 = glob.glob('NonredundantNb1/Data_set/Nb-Ag_3/*.pdb')

# Characters [-10:-6] of each path are taken as the 4-letter PDB ID
# (assumes file names of the form '<pdb>_<chain>.pdb' - TODO confirm).
pdbsNonredundant1 = [path[-10:-6] for path in pdbsNonredundant1]

# 'PDB' is a column of `filtered`. `.loc['PDB']` would look for a ROW labelled
# 'PDB' and raise KeyError on the integer-indexed frame, so select the column.
filteredPdbs = filtered['PDB'].to_list()
print(filteredPdbs)

# Build the lookup once instead of rebuilding the list for every candidate.
filteredPdbSet = set(filteredPdbs)
different = [pdb for pdb in pdbsNonredundant1 if pdb not in filteredPdbSet]

print(different)


['8dt8', '8jlz', '8fxs', '8fxv', '8h4i', '8h4k', '8h4l', '8hbv', '8hbw', '8hmp', '8ilx', '8im0', '8im1', '8irr', '8j1n', '8aok', '8aom', '8gw8', '7y3g', '8g8w', '7xld', '7xli', '8g2y', '8bb7', '8bf4', '7xk9', '7xka', '8flq', '8flr', '8fls', '8flt', '8flu', '8sbb', '7yit', '8fu6', '8ont', '7uia', '8cxr', '8f8w', '8f8x', '8i2g', '8en1', '7yag', '7yah', '7yai', '7yaj', '8bzy', '8c02', '8f76', '8eqb', '8emy', '8en5', '8en6', '8en4', '8b01', '8c3l', '8emz', '8en0', '8en2', '8en3', '8hix', '8hj0', '8hj2', '7ru6', '7rug', '7ph4', '8bev', '8bgg', '8hmv', '8c8p', '8a67', '7tmw', '7xv3', '8dqu', '8h3x', '8h3y', '7qvk', '7wn0', '7wn1', '8bpk', '8e0e', '8gni', '8gnj', '8gq5', '8hdo', '8hdp', '7x4i', '7ust', '7usv', '8b7w', '8hao', '7qha', '7wcm', '7wcn', '7wd1', '7wd2', '7x2j', '8dam', '8gz3', '8ha0', '8haf', '7x2m', '8b41', '7x2l', '8e99', '8gz5', '7stg', '7ssh', '7st3', '7sqp', '7sr0', '7sr3', '7sr4', '7sr5', '7srk', '7uny', '7unz', '7wki', '8e3x', '8e3y', '8e3z', '8ew6', '7pa5', '7q3q', '7q3r',

In [152]:
'''
See overlap from nonredundant dataset
'''
pdbsNonredundant2 = glob.glob('NonredundantNb2/extended_dataset/complex_3/*.pdb')

# Characters [-10:-6] of each path are taken as the 4-letter PDB ID
# (assumes file names of the form '<pdb>_<chain>.pdb' - TODO confirm).
pdbsNonredundant2 = [path[-10:-6] for path in pdbsNonredundant2]
print(pdbsNonredundant2)

# 'PDB' is a column of `filtered`. `.loc['PDB']` would look for a ROW labelled
# 'PDB' and raise KeyError on the integer-indexed frame, so select the column.
# The set is built once instead of rebuilding the list for every candidate.
filteredPdbSet2 = set(filtered['PDB'].to_list())
different2 = [pdb for pdb in pdbsNonredundant2 if pdb not in filteredPdbSet2]
print(different2)

['5nbl', '5van', '5vnw', '5y80', '6b20', '6c9w', '6dbg', '6ehg', '6ey0', '6ey6', '6fe4', '6fv0', '6h02', '6h6y', '6h6z', '6h71', '6h7n', '6mxt']
['6dbg']


In [56]:
#clear files: opening in 'w' mode truncates an existing file or creates an empty one
for fileName in ('ScanNetData.txt', 'ScanNetData_test.txt', 'ScanNetData_train.txt'):
    # os.path.join instead of a literal 'Data\...' path: the backslash form only
    # works on Windows and '\S' is an invalid escape sequence in a normal string.
    with open(os.path.join('Data', fileName), 'w'):
        pass

In [27]:
'''
Test PeSTo for accuracy using test set.
'''

# Record types that carry a chain identifier in column 22 of the fixed-width PDB format.
_CHAIN_RECORDS = ('ATOM', 'HETATM', 'ANISOU', 'TER')

def splitPDBintoChain(pdb, chainID, pdbDir='pdbFiles', outDir='ScanNetData_test_pdbFiles'):
    '''
    Write the coordinate records of a single chain to its own PDB file.

    Reads `<pdbDir>/<pdb>.pdb` and writes `<outDir>/<pdb>_<chainID>.pdb`
    containing every ATOM/HETATM/ANISOU/TER line whose chain identifier
    equals `chainID`, in the original order.

    Parameters:
        pdb     -- PDB ID, used as the input file stem.
        chainID -- single-character chain identifier to extract.
        pdbDir  -- directory holding the full PDB files.
        outDir  -- directory for the per-chain files; created if missing.

    Raises:
        FileNotFoundError if the input PDB file does not exist.
    '''
    pdbPath = os.path.join(pdbDir, f'{pdb}.pdb')
    outPath = os.path.join(outDir, f'{pdb}_{chainID}.pdb')
    with open(pdbPath, 'r') as fr:
        os.makedirs(outDir, exist_ok=True)
        with open(outPath, 'w') as fw:
            for line in fr:
                # PDB is a fixed-column format: the chain ID is character 22 (index 21).
                # Whitespace splitting breaks on short records ("END") and when
                # the chain ID and a 4-digit residue number fuse ("B1000").
                if line.startswith(_CHAIN_RECORDS) and len(line) > 21 and line[21] == chainID:
                    fw.write(line)

In [29]:
path_to_ScanNetTest = 'Data/ScanNetData_test.txt'

with open(path_to_ScanNetTest, 'r') as f:
    data = f.readlines()

# Header lines start with '>' (e.g. '>8a67_B'); drop the marker and the newline
# to keep just the '<pdb>_<chain>' identifier.
pdbsToSplit = [
    line.replace('>', "").replace('\n', '')
    for line in data
    if line.startswith('>')
]

print(len(pdbsToSplit))

# Disabled on purpose. Kept as real comments rather than a bare string literal,
# which Jupyter would echo as the cell's output.
# Uncomment to write one PDB file per test chain:
# for point in pdbsToSplit:
#     pdb, chain = point.split('_')
#     splitPDBintoChain(pdb, chain)

392


"\nfor point in pdbsToSplit:\n    pdb, chain = point.split('_')\n    splitPDBintoChain(pdb, chain)\n\n"