#### Test PLAbDab-nano functionality

Following the original PLAbDab notebook: https://github.com/oxpig/PLAbDab/blob/main/notebooks/PLAbDab.ipynb

In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
%matplotlib inline
# Set the default matplotlib figure resolution to 300 dpi.
plt.rcParams['figure.dpi'] = 300

Where database is kept currently:

In [2]:
# Load the raw PLAbDab-nano CSV exports directly with pandas.
# The combined table is bound to `pdn_all` rather than `pdn`, because `pdn`
# is rebound to the PLAbDab_nano database object further down; reusing the
# name would silently discard this DataFrame and make `pdn` mean two things.
pdn_all = pd.read_csv('/path/to/plabdabnano/all_sequences.csv.gz')
pdn_vhh = pd.read_csv('/path/to/plabdabnano/vhh_sequences.csv.gz')
pdn_vnar = pd.read_csv('/path/to/plabdabnano/vnar_sequences.csv.gz')

In [3]:
from PLAbDab_nano import PLAbDab_nano

Initiate database

In [4]:
# Directory containing the PLAbDab-nano data files; passed to the PLAbDab_nano
# constructor and again to vnar_seq_search via its data_directory argument.
data_directory = '/path/to/plabdabnano/'

In [5]:
# Instantiate the database interface from the data directory (n_jobs=10).
pdn = PLAbDab_nano(
    data_directory,
    n_jobs=10,
)

In [6]:
# Keep the database object's `all_sequences` attribute as `df` for inspection.
df = pdn.all_sequences

In [7]:
# Number of entries in `df`.
len(df)

4913

#### Search PLAbDab-nano by structure

VNAR sequences cannot be used as queries for the structure search; they are not recognised as acceptable input.

Unusual characters e.g. X, B (sometimes seen in patent sequences) also can't be used

In [8]:
# Query sequence passed to the structure search in the next cell.
seq = 'QVQLQESGGGLVQAGGSLRLSCAASGTISYTPDMGWYRQAPGKEREFVAGITVGTSTYYADSVKGRFTISRDNAKNTVYLQMNSLKPEDTAVYYCAASRQWGPGFYYWGQGTQVTVSS'

In [9]:
# Run the structure search for `seq`; the result is used as a table below
# (drop_duplicates / head) and its displayed output carries an `rmsd` column.
best_struc = pdn.structure_search(seq)

In [10]:
# Number of hits returned by the structure search.
len(best_struc)

116

In [11]:
# Keep the first row for each distinct `model`, then preview five rows.
(
    best_struc
    .drop_duplicates(subset=["model"])
    .head(5)
)

Unnamed: 0,source,model,type,cdr_lengths,cdr_sequences,sequence,ID,definition,reference_authors,reference_title,organism,update_date,targets_mentioned,rmsd
0,GenBank,QGY73589,VHH,8_7_12,"{'CDRH1': 'GTISYTPD', 'CDRH2': 'ITVGTST', 'CDR...",QVQLQESGGGLVQAGGSLRLSCAASGTISYTPDMGWYRQAPGKERE...,QGY73589,"NanobodyNbR1A12, partial [synthetic construct]","Yu,C.; Wang,L.; Rowe,R.G.; Han,A.; Ji,W.; McMa...",A nanobody targeting the LIN28:let-7 interacti...,synthetic construct,27-FEB-2020,TUT4; LIN28; PAPD3; ZCCHC1; CSDD1; TENT3A; LIN...,0.631316
1,SAbDab,7T83_A,VHH,8_7_12,"{'CDRH1': 'GYIYRRYR', 'CDRH2': 'ISGGSST', 'CDR...",EVQLVESGGGLVQPGGSLRLSCAASGYIYRRYRMGWYRQAPGKGRE...,7T83_A,Structure Of Angiotensin Ii Type I Receptor (A...,"Harvey,E.P.;Shin,J.E.;Skiba,M.A.;Nemeth,G.R.;H...",An in silico method to assess antibody fragmen...,Synthetic construct,07-DEC-2022,,1.791303
9,SAbDab,8ET0_C,VHH,8_7_12,"{'CDRH1': 'GSIFSIRE', 'CDRH2': 'ITSGGTT', 'CDR...",QVQLQESGGGLVQPGGSLRLSCAASGSIFSIREWGWYRQAPGKQRE...,8ET0_D,Crystal Complex Of Murine Cyclooxygenase-2 Wit...,"Xu,S.;Uddin,M.J.",Crystal complex of murine cycloxygenase-2 with...,Vicugna pacos,18-OCT-2023,,1.853
11,SAbDab,8FXV_B,VHH,8_7_12,"{'CDRH1': 'GYISNDDV', 'CDRH2': 'ISVGAST', 'CDR...",QVQLQESGGGLVQAGGSLRLSCAASGYISNDDVMGWYRQAPGKERE...,8FXV_B,Crystal Structure Of Human Protgf-Beta2 In Com...,"Le,V.Q.;Zhao,B.;Ramesh,S.;Toohey,C.;DeCosta,A....",A specialized integrin-binding motif enables p...,Synthetic construct,21-JUN-2023,Beta2; TUBB2C; TUBB4B,1.950287
12,SAbDab,8FXS_D,VHH,8_7_12,"{'CDRH1': 'GTIFEANI', 'CDRH2': 'IGYGSST', 'CDR...",QVQLQESGGGLVQAGGSLRLSCAASGTIFEANIMGWYRQAPGKERE...,8FXS_D,Crystal Structure Of Human Pro-Tgf-Beta2 In Co...,"Le,V.Q.;Zhao,B.;Ramesh,S.;Toohey,C.;DeCosta,A....",A specialized integrin-binding motif enables p...,Synthetic construct,21-JUN-2023,Beta2; TUBB2C; TUBB4B,2.041302


#### Search PLAbDab-nano by sequence

##### VHH search (with KA-search)

Only VHH sequences can be used as queries for this search; any other input will cause it to fail.

In [12]:
# VHH query sequence for the sequence search in the next cell.
vhh_query = 'QVQLQESGGGLVQAGGSLRLSCAASGTISYTPDMGWYRQAPGKEREFVAGITVGTSTYYADSVKGRFTISRDNAKNTVYLQMNSLKPEDTAVYYCAASRQWGPGFYYWGQGTQVTVSS'

In [14]:
# VHH sequence search over the 'whole' region, keeping the 10 best hits.
# NOTE(review): length_matched is given the string 'Any' here, whereas the
# VNAR searches in this notebook pass a boolean list — confirm which values
# vhh_seq_search actually accepts.
best_ident = pdn.vhh_seq_search(vhh_query, keep_best_n = 10, regions=['whole'], length_matched=['Any'])

Limiting hmmer search to species ['any'] was requested but hits did not achieve a high enough bitscore. Reverting to using any species


In [15]:
# Display the VHH search hits (at most 10, per keep_best_n above).
best_ident

Unnamed: 0,source,model,type,cdr_lengths,cdr_sequences,sequence,ID,definition,reference_authors,reference_title,organism,update_date,targets_mentioned,Species,Chain,Identity
0,GenBank,,VHH,8_7_12,"{'CDRH1': 'GTISYTPD', 'CDRH2': 'ITVGTST', 'CDR...",QVQLQESGGGLVQAGGSLRLSCAASGTISYTPDMGWYRQAPGKERE...,QGY73589,"NanobodyNbR1A12, partial [synthetic construct]","Yu,C.; Wang,L.; Rowe,R.G.; Han,A.; Ji,W.; McMa...",A nanobody targeting the LIN28:let-7 interacti...,synthetic construct,27-FEB-2020,TUT4; LIN28; PAPD3; ZCCHC1; CSDD1; TENT3A; LIN...,Any,Heavy,1.0
1,GenBank,,VHH,8_7_12,"{'CDRH1': 'GSISVSQA', 'CDRH2': 'ISTGGST', 'CDR...",QVQLQESGGGLVQAGGSLRLSCAASGSISVSQAMGWYRQAPGKERE...,QGY73581,"NanobodyNbS1B4, partial [synthetic construct]","Yu,C.; Wang,L.; Rowe,R.G.; Han,A.; Ji,W.; McMa...",A nanobody targeting the LIN28:let-7 interacti...,synthetic construct,27-FEB-2020,TUT4; LIN28; PAPD3; ZCCHC1; CSDD1; TENT3A; LIN...,Any,Heavy,0.881356
2,SAbDab,,VHH,8_7_12,"{'CDRH1': 'GTIFEANI', 'CDRH2': 'IGYGSST', 'CDR...",QVQLQESGGGLVQAGGSLRLSCAASGTIFEANIMGWYRQAPGKERE...,8FXS_E,Crystal Structure Of Human Pro-Tgf-Beta2 In Co...,"Le,V.Q.;Zhao,B.;Ramesh,S.;Toohey,C.;DeCosta,A....",A specialized integrin-binding motif enables p...,Synthetic construct,21-JUN-2023,Beta2; TUBB2C; TUBB4B,Any,Heavy,0.864407
3,SAbDab,,VHH,8_7_12,"{'CDRH1': 'GYISNDDV', 'CDRH2': 'ISVGAST', 'CDR...",QVQLQESGGGLVQAGGSLRLSCAASGYISNDDVMGWYRQAPGKERE...,8FXV_B,Crystal Structure Of Human Protgf-Beta2 In Com...,"Le,V.Q.;Zhao,B.;Ramesh,S.;Toohey,C.;DeCosta,A....",A specialized integrin-binding motif enables p...,Synthetic construct,21-JUN-2023,Beta2; TUBB2C; TUBB4B,Any,Heavy,0.864407
4,SAbDab,,VHH,8_7_12,"{'CDRH1': 'GTIFEANI', 'CDRH2': 'IGYGSST', 'CDR...",QVQLQESGGGLVQAGGSLRLSCAASGTIFEANIMGWYRQAPGKERE...,8FXS_D,Crystal Structure Of Human Pro-Tgf-Beta2 In Co...,"Le,V.Q.;Zhao,B.;Ramesh,S.;Toohey,C.;DeCosta,A....",A specialized integrin-binding motif enables p...,Synthetic construct,21-JUN-2023,Beta2; TUBB2C; TUBB4B,Any,Heavy,0.864407
5,SAbDab,,VHH,8_7_12,"{'CDRH1': 'GYISDAYY', 'CDRH2': 'ITHGTNT', 'CDR...",QVQLQESGGGLVQAGGSLRLSCAASGYISDAYYMGWYRQAPGKERE...,5VNV_A,Crystal Structure Of Nb.B201,"McMahon,C.;Baier,A.S.;Pascolutti,R.;Wegrecki,M...",Yeast surface display platform for rapid disco...,Synthetic construct,01-MAY-2017,,Any,Heavy,0.855932
6,SAbDab,,VHH,8_7_12,"{'CDRH1': 'GYISDAYY', 'CDRH2': 'ITHGTNT', 'CDR...",QVQLQESGGGLVQAGGSLRLSCAASGYISDAYYMGWYRQAPGKERE...,5VNW_D,Crystal Structure Of Nb.B201 Bound To Human Se...,"McMahon,C.;Baier,A.S.;Pascolutti,R.;Wegrecki,M...",Yeast surface display platform for rapid disco...,Synthetic construct,01-MAY-2017,,Any,Heavy,0.855932
7,SAbDab,,VHH,8_7_12,"{'CDRH1': 'GYISDAYY', 'CDRH2': 'ITHGTNT', 'CDR...",QVQLQESGGGLVQAGGSLRLSCAASGYISDAYYMGWYRQAPGKERE...,5VNW_C,Crystal Structure Of Nb.B201 Bound To Human Se...,"McMahon,C.;Baier,A.S.;Pascolutti,R.;Wegrecki,M...",Yeast surface display platform for rapid disco...,Synthetic construct,01-MAY-2017,,Any,Heavy,0.855932
8,GenBank,,VHH,8_7_12,"{'CDRH1': 'GTIFTYFV', 'CDRH2': 'ITLGGTT', 'CDR...",QVQLQESGGGLVQAGGSLRLSCAASGTIFTYFVMGWYRQAPGKERE...,QGY73586,"NanobodyNbS2A4, partial [synthetic construct]","Yu,C.; Wang,L.; Rowe,R.G.; Han,A.; Ji,W.; McMa...",A nanobody targeting the LIN28:let-7 interacti...,synthetic construct,27-FEB-2020,TUT4; LIN28; PAPD3; ZCCHC1; CSDD1; TENT3A; LIN...,Any,Heavy,0.847458
9,GenBank,,VHH,8_7_12,"{'CDRH1': 'GYIFADSV', 'CDRH2': 'ITYGGST', 'CDR...",QVQLQESGGGLVQAGGSLRLSCAASGYIFADSVMGWYRQAPGKERE...,QGY73582,"NanobodyNbS1B11, partial [synthetic construct]","Yu,C.; Wang,L.; Rowe,R.G.; Han,A.; Ji,W.; McMa...",A nanobody targeting the LIN28:let-7 interacti...,synthetic construct,27-FEB-2020,TUT4; LIN28; PAPD3; ZCCHC1; CSDD1; TENT3A; LIN...,Any,Heavy,0.838983


##### VNAR search (with BLAST)

You can only search with a VNAR sequence; otherwise nothing will be returned, because the database searched against is built only from the existing VNAR sequences.

In [7]:
# VNAR query sequence used by the vnar_seq_search calls below.
vnar_query = 'ARVDQTPRSVTKETGESLTINCVLRDASYALGSTCWYRKKSGSTNEESISKGGRYVETVNSGSKSFSLRINDLTVEDGGTYRCGVCVLVDYCDVLGCMRRWHCRDCECGLRGPPSPSPWRRLQGRRRQVGIQRYQSSNCKKL'

Not length matched, whole sequence, 70% identity cut-off

In [15]:
# Whole-sequence VNAR search with a 70% identity cut-off, NOT length matched.
# length_matched is [False] here: the original call passed [True], which
# contradicted the "Not length matched" description of this cell and made it
# differ from the CDR3 search only in `regions`. The recorded output (hits with
# different CDR3 lengths) is consistent with an unmatched search.
best_ident = pdn.vnar_seq_search(
    vnar_query,
    data_directory=data_directory,
    keep_best_n=10,
    seq_identity_cutoff=0.7,
    regions='whole',
    length_matched=[False],
)
best_ident

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,source,model,type,cdr_lengths,cdr_sequences,sequence,ID,definition,reference_authors,reference_title,organism,update_date,targets_mentioned,Identity
2,GenBank,FAILED,VNAR,8_|_39,"{'CDRH1': 'DASYELGS', 'CDRH2': '', 'CDRH3': 'G...",ARVDQTPRSVTKETGESLTINCVLRDASYELGSTCWYRKKSGSTNE...,QHY91049_H,Sequence 543 from patent US 10479990,"Hasler,J.; Rutkowski,J.L.",Semi-synthetic nurse shark VNAR libraries for ...,Unknown.,10-FEB-2020,10C; X11; mer; BLYS; MINT1; TALL-1; TALL1; TNF...,0.95402
1,GenBank,FAILED,VNAR,8_|_51,"{'CDRH1': 'DASYALGS', 'CDRH2': '', 'CDRH3': 'G...",ARVDQTPRSVTKETGESLTINCVLRDASYALGSTCWYRKKSGSTNE...,QHY91050_H,Sequence 544 from patent US 10479990,"Hasler,J.; Rutkowski,J.L.",Semi-synthetic nurse shark VNAR libraries for ...,Unknown.,10-FEB-2020,10C; X11; mer; BLYS; MINT1; TALL-1; TALL1; TNF...,0.90845


Length matched, CDR3 only 

In [17]:
# CDR3-only VNAR search, length matched, 70% identity cut-off.
# url=True is requested; the displayed result includes a `url` column.
best_ident = pdn.vnar_seq_search(
    vnar_query,
    data_directory=data_directory,
    keep_best_n=10,
    regions='cdr3',
    seq_identity_cutoff=0.7,
    length_matched=[True],
    url=True,
)
best_ident

Unnamed: 0,source,model,type,cdr_lengths,cdr_sequences,sequence,ID,definition,reference_authors,reference_title,organism,update_date,targets_mentioned,url,Identity
1,GenBank,FAILED,VNAR,8_|_51,"{'CDRH1': 'DASYALGS', 'CDRH2': '', 'CDRH3': 'G...",ARVDQTPRSVTKETGESLTINCVLRDASYALGSTCWYRKKSGSTNE...,QHY91050_H,Sequence 544 from patent US 10479990,"Hasler,J.; Rutkowski,J.L.",Semi-synthetic nurse shark VNAR libraries for ...,Unknown.,10-FEB-2020,10C; X11; mer; BLYS; MINT1; TALL-1; TALL1; TNF...,https://patents.google.com/patent/US10479990/en,0.78431


#### Search PLAbDab-nano by text

Searches reference title for keywords

In [20]:
# Text search for the term "SARS-CoV-2"; url=True is requested and the
# displayed result includes a `url` column.
# Note: this rebinds `df`, which previously held pdn.all_sequences.
df = pdn.column_search(term = "SARS-CoV-2", url=True)

Returns both shark and camelid nanobody entries:

In [23]:
# Display the text-search hits.
df

Unnamed: 0,source,model,type,cdr_lengths,cdr_sequences,sequence,ID,definition,reference_authors,reference_title,organism,update_date,targets_mentioned,url
0,GenBank,WAH70826,VHH,8_8_17,"{'CDRH1': 'RGTFRNSR', 'CDRH2': 'ISASGGFE', 'CD...",QVQVVESGGGLTQAGGSLRLSCAGSRGTFRNSRMGWFRQAPGKERE...,WAH70826,anti-SARS-CoV-2 RBD-specific immunoglobulin he...,"Li,M.; Ren,Y.; Aw,Z.Q.; Chen,B.; Yang,Z.; Lei,...",Broadly neutralizing and protective nanobodies...,Vicugna pacos,10-DEC-2022,SARS; RBD; CoV,https://www.ncbi.nlm.nih.gov/protein/WAH70826
1,GenBank,WAH70825,VHH,8_8_18,"{'CDRH1': 'GSRLDYHG', 'CDRH2': 'ISGRGMII', 'CD...",ELQVVESGGGLVQPGESLRLSCQLSGSRLDYHGAGWFRQAPGKERE...,WAH70825,anti-SARS-CoV-2 RBD-specific immunoglobulin he...,"Li,M.; Ren,Y.; Aw,Z.Q.; Chen,B.; Yang,Z.; Lei,...",Broadly neutralizing and protective nanobodies...,Vicugna pacos,10-DEC-2022,SARS; RBD; CoV,https://www.ncbi.nlm.nih.gov/protein/WAH70825
2,GenBank,WAH70824,VHH,8_11_22,"{'CDRH1': 'GFTSDRYS', 'CDRH2': 'IVSSGASTSTT', ...",QLQLVESGGGLVQPGGSLRLSCAVSGFTSDRYSIAWFRQAPGKERE...,WAH70824,anti-SARS-CoV-2 RBD-specific immunoglobulin he...,"Li,M.; Ren,Y.; Aw,Z.Q.; Chen,B.; Yang,Z.; Lei,...",Broadly neutralizing and protective nanobodies...,Vicugna pacos,10-DEC-2022,SARS; RBD; CoV,https://www.ncbi.nlm.nih.gov/protein/WAH70824
3,GenBank,WAH70823,VHH,8_8_20,"{'CDRH1': 'GFTLDHYA', 'CDRH2': 'ISASGGPT', 'CD...",QLQVVESGGGLVQPGGSLRLSCAASGFTLDHYAIAWFRQAAGKERE...,WAH70823,anti-SARS-CoV-2 RBD-specific immunoglobulin he...,"Li,M.; Ren,Y.; Aw,Z.Q.; Chen,B.; Yang,Z.; Lei,...",Broadly neutralizing and protective nanobodies...,Vicugna pacos,10-DEC-2022,SARS; RBD; CoV,https://www.ncbi.nlm.nih.gov/protein/WAH70823
4,GenBank,WAH70822,VHH,8_8_21,"{'CDRH1': 'GFTWDYYA', 'CDRH2': 'ISNSGGRT', 'CD...",ELQLVESGGGEVQPGGSLRLSCAVSGFTWDYYAIGWFRQASGQERE...,WAH70822,anti-SARS-CoV-2 RBD-specific immunoglobulin he...,"Li,M.; Ren,Y.; Aw,Z.Q.; Chen,B.; Yang,Z.; Lei,...",Broadly neutralizing and protective nanobodies...,Vicugna pacos,10-DEC-2022,SARS; RBD; CoV,https://www.ncbi.nlm.nih.gov/protein/WAH70822
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
273,SAbDab,7KGL_B,VHH,8_9_8,"{'CDRH1': 'GFPVAYKT', 'CDRH2': 'IESYGIKWT', 'C...",QVQLVESGGGLVQAGGSLRLSCAASGFPVAYKTMWWYRQAPGKERE...,7MFV_B,Crystal Structure Of Synthetic Nanobody,"Ahmad,J.;Jiang,J.;Boyd,L.F.;Natarajan,K.;Margu...",Synthetic nanobody-SARS-CoV-2 receptor-binding...,Synthetic construct,11-APR-2021,SARS; CoV,https://opig.stats.ox.ac.uk/webapps/newsabdab/...
274,SAbDab,7MJH_D,VHH,8_7_18,"{'CDRH1': 'GFTFDDYA', 'CDRH2': 'MYNNGRT', 'CDR...",EVQLVESGGGLVQPGGSLRLSCAASGFTFDDYAMSWVRQAPGKGLE...,7MJI_E,Cryo-Em Structure Of The Sars-Cov-2 N501Y Muta...,"Zhu,X.;Mannar,D.;Srivastava,S.S.;Berezuk,A.M.;...",Cryo-electron microscopy structures of the N50...,Synthetic construct,20-APR-2021,SARS; ACEH; ACE2; CoV; Ectodomain,https://opig.stats.ox.ac.uk/webapps/newsabdab/...
275,TheraSAbDab,Rimteravimab,VHH/sdAb,8_8_18,"{'CDRH1': 'GRTFSEYA', 'CDRH2': 'ISWSGGAT', 'CD...",DVQLVESGGGLVQPGGSLRLSCAASGRTFSEYAMGWFRQAPGKERE...,Rimteravimab,Mixed Nanobody (VHH-CH2-CH3 dimer) heavy varia...,ExeVir,Therapeutic antibody Rimteravimab: Mixed Nanob...,Unknown.,31-DEC-2023,SARS-CoV-2 Spike RBD,https://opig.stats.ox.ac.uk/webapps/newsabdab/...
276,GenBank,FAILED,VNAR,8_|_17,"{'CDRH1': 'DSNCALAS', 'CDRH2': '', 'CDRH3': 'N...",ARVDQTPQTITKETGESLTINCVLRDSNCALASTDWYRKKSGSTNE...,7S83_B_H,"Chain B, ShAb02 VNAR","Chen,W.H.; Hajduczki,A.; Martinez,E.J.; Bai,H....",Shark nanobodies with potent SARS-CoV-2 neutra...,Ginglymostoma cirratum,25-OCT-2023,SARS; CoV,https://www.ncbi.nlm.nih.gov/protein/7S83_B_H


Check results titles:

In [28]:
# Print the reference titles of the first and last hits.
# Positional access (.iloc) is used instead of hard-coded index labels: the
# displayed result's last label is 276, so the original label 277 was off by
# one and would raise a KeyError. .iloc[-1] also stays correct if the number
# of hits changes when the database is updated.
print(df['reference_title'].iloc[0])
print(df['reference_title'].iloc[-1])

Broadly neutralizing and protective nanobodies against SARS-CoV-2 Omicron subvariants BA.1, BA.2, and BA.4/5 and diverse sarbecoviruses
Shark nanobodies with potent SARS-CoV-2 neutralizing activity and broad sarbecovirus reactivity


In [29]:
# Number of distinct reference titles among the text-search hits.
len(set(df['reference_title']))

67