In [16]:
from Bio.PDB import *
import pandas as pd
import os
import glob as glob

from SAbDab_downloader import *
import Interactions
import PPIDatabase

<module 'PPIDatabase' from 'c:\\Users\\chekm\\OneDrive\\Documents\\Python Scripts\\Nanobody-Protein-Interface-Data\\PPIDatabase.py'>

In [2]:
'''
Setup Directories
'''

# Remember whether the data tree is being created for the first time:
# the metadata is only built in that case, exactly as before.
dataDirIsNew = not os.path.exists("Data")

# exist_ok=True keeps this cell re-runnable and also repairs a tree where
# "Data" exists but one of its sub-directories is missing.
os.makedirs("Data/Antibody", exist_ok=True)
os.makedirs("Data/Nanobody", exist_ok=True)

if dataDirIsNew:
    # Kept as a conditional import so getPDBs is only loaded on first setup.
    # NOTE(review): createMetadata is presumably provided by this star import - confirm.
    from getPDBs import *
    createMetadata()

os.makedirs("pdbFiles", exist_ok=True)

In [4]:
'''
Data Preparation that creates a pandas dataframe structured
|PDB|Chain Pairings|
some PDB ID, Nanobody Chains
'''

nanobody_metadata = pd.read_csv("Data/Nanobody/Nanobody_metadata.csv")
# Strip the angstrom unit and any spaces from the text, then parse as float.
# Both removals go through the .str accessor: a plain Series.replace(' ', '')
# only matches cells that are exactly ' ' and never removes spaces inside a value.
nanobody_metadata['Resolution'] = (
    nanobody_metadata['Resolution']
    .str.replace('Å', '', regex=False)
    .str.replace(' ', '', regex=False)
    .astype(float)
)
# Keep structures with a resolution of at most 7 (inclusive). The .copy() makes
# `filtered` an independent frame so the column assignment below cannot trigger
# SettingWithCopyWarning or write through to nanobody_metadata.
filtered = nanobody_metadata.query('Resolution <= 7')[['PDB', 'Chain Pairings']].copy()
# This step is necessary for ScanNet Data. It converts the str into a list of the
# single-character chain IDs that follow each "VH: " marker.
filtered['Chain Pairings'] = filtered['Chain Pairings'].str.findall(r"VH: (\w)")

print(filtered)

       PDB Chain Pairings
0     8c5h            [N]
1     8f0j            [N]
2     8f0k            [N]
3     8f2a            [N]
4     8f2b            [N]
...    ...            ...
1011  6gwq            [B]
1012  2p48            [B]
1013  6yo6            [C]
1014  4mqt            [B]
1015  7e14            [N]

[1013 rows x 2 columns]


In [17]:
'''
Reproducible in that all you need is a pandas dataframe with a 'PDB' column paired with a 'Chain Pairings' column
that contains a list of all the VH Nanobodies in the PDB file.
'''
# Build the database object from the `filtered` metadata frame (PDB IDs plus their VH chain lists).
database = PPIDatabase.PPIDatabase(metadata = filtered)

In [6]:

# Top-level `await` is only valid here because the notebook kernel runs cells inside an event loop;
# the same line in a plain .py script would be a syntax error.
# NOTE(review): getpdbs() presumably fetches the structure files for the PDB IDs in the metadata - confirm in PPIDatabase.
await database.getpdbs()

In [18]:
# Ask the database object to generate its text label file, then print a completion marker.
# NOTE(review): the output location and format are defined in PPIDatabase, not in this notebook.
database.generatetxtLabelFile()
print('Finished')

Generating Labels
1bzq; 1g6v; 1jto; 1jtp; 1jtt; 1kxq; 1kxt; 1kxv; 1mel; 1op9; 1ri8; 1rjc;
1xfp; 1zmy; 1zv5; 1zvh; 1zvy; 2bse; 2p42; 2p43; 2p44; 2p45; 2p46; 2p47;
2p48; 2p49; 2p4a; 2wzp; 2x89; 2xt1; 2xv6; 2xxm; 3cfi; 3eba; 3ezj; 3g9a;
3jbc; 3jbd; 3k1k; 3k3q; 3k74; 3k7u; 3k80; 3k81; 3ogo; 3p0g; 3p9w; 3qsk;
3rjq; 3sn6; 3stb; 3v0a; 3zkq; 3zks; 3zkx; 3zlq; 4bel; 4bfb; 4c57; 4c58;
4c59; 4cdg; 4dk3; 4dk6; 4eig; 4eiz; 4ej1; 4fhb; 4gft; 4grw; 4hem; 4hep;
4i0c; 4i13; 4i1n; 4ios; 4kdt; 4kfz; 4kml; 4krl; 4krm; 4kro; 4krp; 4ksd;
4laj; 4lde; 4ldl; 4ldo; 4lgp; 4lgr; 4lgs; 4lhj; 4lhq; 4m3k; 4mqs; 4mqt;
4n1h; 4n9o; 4nbx; 4nby; 4nbz; 4nc0; 4nc1; 4nc2; 4ocl; 4ocm; 4ocn; 4orz;
4p2c; 4pgj; 4pir; 4pou; 4qkx; 4qo1; 4s10; 4tvs; 4u3x; 4w2o; 4w2q; 4w6w;
4w6x; 4w6y; 4wem; 4wen; 4weu; 4wgv; 4wgw; 4x7c; 4x7d; 4x7e; 4x7f; 4xt1;
4y7m; 4y8d; 4yga; 4z9k; 5bop; 5boz; 5c1m; 5c2u; 5c3l; 5da0; 5dfz; 5dmj;
5e0q; 5e1h; 5e5m; 5e7f; 5eul; 5f1k; 5f1o; 5f21; 5f7k; 5f7l; 5f7m; 5f7n;
5f7w; 5f7y; 5f8q; 5f8r; 5f93; 5f97; 5f9a; 5f9d

(['>7k7y_E\nE 1 M 0\nE 2 P 0\nE 3 K 0\nE 4 I 0\nE 5 N 0\nE 6 S 0\nE 7 F 0\nE 8 N 0\nE 9 Y 0\nE 10 N 0\nE 11 D 0\nE 12 P 0\nE 13 V 0\nE 14 N 0\nE 15 D 0\nE 16 R 0\nE 17 T 0\nE 18 I 0\nE 19 L 0\nE 20 Y 0\nE 21 I 0\nE 22 K 0\nE 23 P 0\nE 24 G 0\nE 25 G 0\nE 26 C 0\nE 27 Q 0\nE 28 E 0\nE 29 F 0\nE 30 Y 0\nE 31 K 0\nE 32 S 0\nE 33 F 0\nE 34 N 0\nE 35 I 0\nE 36 M 0\nE 37 K 0\nE 38 N 0\nE 39 I 0\nE 40 W 0\nE 41 I 0\nE 42 I 0\nE 43 P 0\nE 44 E 0\nE 45 R 0\nE 46 N 0\nE 47 V 0\nE 48 I 0\nE 49 G 0\nE 50 T 0\nE 51 T 0\nE 52 P 0\nE 53 Q 0\nE 54 D 0\nE 55 F 0\nE 56 H 0\nE 57 P 0\nE 58 P 0\nE 59 T 0\nE 60 S 0\nE 61 L 0\nE 62 K 0\nE 63 N 0\nE 64 G 0\nE 65 D 0\nE 66 S 0\nE 67 S 0\nE 68 Y 0\nE 69 Y 0\nE 70 D 0\nE 71 P 0\nE 72 N 0\nE 73 Y 0\nE 74 L 0\nE 75 Q 0\nE 76 S 0\nE 77 D 0\nE 78 E 0\nE 79 E 0\nE 80 K 0\nE 81 D 0\nE 82 R 0\nE 83 F 0\nE 84 L 0\nE 85 K 0\nE 86 I 0\nE 87 V 0\nE 88 T 0\nE 89 K 0\nE 90 I 0\nE 91 F 0\nE 92 N 0\nE 93 R 0\nE 94 I 0\nE 95 N 0\nE 96 N 0\nE 97 N 0\nE 98 L 0\nE 99 S 0\nE 100 G

In [27]:
'''
Utility Function to split PDBs into chains
'''

def splitPDBintoChain(pdb, chainID, pdbDir='pdbFiles', outDir='ScanNetData_test_pdbFiles'):
    """Copy the coordinate records of one chain of a PDB file into its own file.

    Reads ``{pdbDir}/{pdb}.pdb`` and writes ``{outDir}/{pdb}_{chainID}.pdb``
    holding only the ATOM / HETATM / ANISOU records whose chain identifier
    equals ``chainID``. The output file is created even when no record matches.

    Args:
        pdb: PDB identifier, used as the input file stem.
        chainID: Single-character chain identifier to extract.
        pdbDir: Directory holding the input ``.pdb`` files.
        outDir: Directory receiving the per-chain file; created if missing.

    Raises:
        FileNotFoundError: If the input PDB file does not exist.
    """
    # PDB is a fixed-column format: the record name occupies columns 1-6 and the
    # chain identifier column 22. Whitespace splitting is unreliable because
    # adjacent fields may touch (e.g. "HETATM12345") and short records such as
    # "END" have no fifth token at all.
    coordinateRecords = {'ATOM', 'HETATM', 'ANISOU'}

    pdbPath = f'{pdbDir}/{pdb}.pdb'
    with open(pdbPath, 'r') as fr:
        chainLines = [
            line for line in fr
            if line[:6].strip() in coordinateRecords and line[21:22] == chainID
        ]

    # Create the destination on demand so a fresh checkout does not fail here.
    os.makedirs(outDir, exist_ok=True)
    with open(f'{outDir}/{pdb}_{chainID}.pdb', 'w') as fw:
        fw.writelines(chainLines)

In [3]:
# The whole cell body is one string literal, so none of the statements inside it execute;
# as the cell's last expression, the string itself is what the cell displays.
# NOTE(review): the numeric output recorded for this cell presumably dates from a run made
# before the code was wrapped in quotes - confirm, then re-enable the code or clear the output.
'''
Length of Data


path_to_ScanNetTest = 'Data/ScanNetData.txt'
with open(path_to_ScanNetTest, 'r') as f:
    data = f.read().split('>')
    print(len(data[1:]))

'''

3983
