# Database

In [1]:
import src
from src import *
from src.creation import create_entry, combine_sites, NoModulator

In [2]:
# parameters from https://docs.peewee-orm.com/en/latest/peewee/sqlite_ext.html#getting-started
db.init('database.db')
src.allodb.save_cifs = True

In [3]:
db.drop_tables(tables)
db.create_tables(tables)

# Exploration

In [4]:
import pandas as pd
pd.DF = pd.DataFrame

In [5]:
df = pd.read_csv("ASD_Release_202309_AS.txt", sep="\t")
df

Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
0,ASD00020000_1,aroG,Escherichia coli,P0AB91,1KFL,ASD02170001,PHE,A,Lig,Inhibitor,(2S)-2-amino-3-phenyl-propanoic acid,1354,Inner Protein Regulator,Protein-Protein Interaction,12126632,Allosteric inhibition of 3-deoxy-D-arabino-hep...,No,"Chain A:PRO150,GLN151,ALA154,GLY178,LEU179,SER..."
1,ASD00020000_2,aroF,Escherichia coli,P00888,6AGM,,TYR,A,Lig,Inhibitor,TYROSINE,601,Inner Protein Regulator,Inner Protein,,,No,
2,ASD00020000_4,aroF,Thermotoga maritima,Q9WYH8,3PG9,ASD00690002,TYR,A,Lig,Inhibitor,(2S)-2-amino-3-(4-hydroxyphenyl)propanoic acid,339,Inner Protein Regulator,Protein-Protein Interaction,21282100,Tyrosine latching of a regulatory gate affords...,No,"Chain A:SER31,GLY33,GLN34,GLU35,ARG36,VAL38; C..."
3,ASD00020000_5,ARO4,Saccharomyces cerevisiae,P32449,1OF6,ASD00020003,DTY,A,Lig,Inhibitor,(2R)-2-amino-3-(4-hydroxyphenyl)propanoic acid,1370,Inner Protein Regulator,Protein-Protein Interaction,,Crystal Structure of the Double Complex of the...,No,"Chain A:THR162,GLN166,ALA169,GLY193,LEU194,SER..."
4,ASD00020000_5,ARO4,Saccharomyces cerevisiae,P32449,1OFR,ASD02170001,PHE,H,Lig,Inhibitor,(2S)-2-amino-3-phenyl-propanoic acid,1002,Inner Protein Regulator,Protein-Protein Interaction,15019786,Substrate and Metal Complexes of 3-Deoxy-D-Ara...,No,"Chain G:THR162,PRO165,GLN166,ALA169,LEU190,GLY..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3097,ASD80060000_2,,Vibrio cholerae,,3OWZ,ASD01260003,GLY,A,Lig,Regulator,2-aminoethanoic acid,89,Inner DNA/RNA Regulator,Inner DNA/RNA,21145485,Structural insights into ligand recognition by...,No,Chain A:GLY36
3098,ASD80060000_2,,Vibrio cholerae,,3OXE,ASD01260003,GLY,A,Lig,Regulator,2-aminoethanoic acid,89,Inner DNA/RNA Regulator,Inner DNA/RNA,21145485,Structural insights into ligand recognition by...,No,Chain A:GLY36
3099,ASD80060000_2,,Vibrio cholerae,,3OXJ,ASD01260003,GLY,A,Lig,Regulator,2-aminoethanoic acid,89,Inner DNA/RNA Regulator,Inner DNA/RNA,21145485,Structural insights into ligand recognition by...,No,Chain A:GLY36
3100,ASD80060000_2,,Vibrio cholerae,,3OXM,ASD01260003,GLY,A,Lig,Regulator,2-aminoethanoic acid,89,Inner DNA/RNA Regulator,Inner DNA/RNA,21145485,Structural insights into ligand recognition by...,No,Chain A:GLY36


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3102 entries, 0 to 3101
Data columns (total 18 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   target_id                3102 non-null   object
 1   target_gene              2759 non-null   object
 2   organism                 3102 non-null   object
 3   pdb_uniprot              3088 non-null   object
 4   allosteric_pdb           3102 non-null   object
 5   modulator_serial         2560 non-null   object
 6   modulator_alias          3078 non-null   object
 7   modulator_chain          3069 non-null   object
 8   modulator_class          3088 non-null   object
 9   modulator_feature        3049 non-null   object
 10  modulator_name           3081 non-null   object
 11  modulator_resi           3029 non-null   object
 12  function                 2742 non-null   object
 13  position                 2758 non-null   object
 14  pubmed_id                2497 non-null  

## NAs

In [7]:
df.isna().sum()

target_id                    0
target_gene                343
organism                     0
pdb_uniprot                 14
allosteric_pdb               0
modulator_serial           542
modulator_alias             24
modulator_chain             33
modulator_class             14
modulator_feature           53
modulator_name              21
modulator_resi              73
function                   360
position                   344
pubmed_id                  605
ref_title                  477
site_overlap               362
allosteric_site_residue    211
dtype: int64

The columns needed to identify an allosteric site are the PDB (`allosteric_pdb`) and the **modulator ID fields** (`_alias` (residue name), `_chain` (chain ID) and `_resi` (residue number)). Other interesting column is 
`allosteric_site_residue` (annotated allosteric site).

In [8]:
df[["allosteric_pdb", "modulator_alias", "modulator_chain", "modulator_resi", "allosteric_site_residue"]].isna().sum()

allosteric_pdb               0
modulator_alias             24
modulator_chain             33
modulator_resi              73
allosteric_site_residue    211
dtype: int64

Thre are a lot of entries with missing annotated allosteric site. A considerable amount of entries have missing ID fields of the modulator, which are necessary to identify the modulator molecule in the PDB, but the PDB code is always present.

### Missing modulator ID fields

#### _alias

In [9]:
df[df.modulator_alias.isna()]

Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
482,ASD01140000_3,GABRA1,Homo sapiens,P14867,6I53,,,,Pep,Regulator,Megabody38,,,,30602789.0,Cryo-EM structure of the human alpha 1 beta 3 ...,,6I53
1174,ASD03160000_2,ADORA2A,Escherichia coli,P0ABE7,4EIY,ASD03160008,,A,Ion,Inhibitor,sodium(+1) cation,2402.0,Inner Protein Regulator,Inner Protein,22798613.0,Structural basis for allosteric regulation of ...,No,"Chain A:ASP52,SER91,ASN280"
1175,ASD03160000_2,GRM1,Escherichia coli,P0ABE7,4N6H,ASD03160008,,A,Ion,Inhibitor,sodium(+1) cation,1220.0,Inner Protein Regulator,Inner Protein,24413399.0,Molecular control of delta-opioid receptor sig...,No,"Chain A:ASP95,ASN131,SER135"
1179,ASD03160000_2,cybC,Escherichia coli,P0ABE7,5WIV,ASD03160008,,A,Ion,Regulator,SODIUM ION,1224.0,Allosteric function,Allosteric position,29051383.0,D4 dopamine receptor high-resolution structure...,No,"Chain A:ASP80,LEU118,PHE403,ALA79,TRP407,LEU76..."
1515,ASD04020000_2,kgd,Mycobacterium smegmatis,A0R2B1,6I2P,,,,Pep,Inhibitor,Glycogen accumulation regulator GarA,301.0,Inner Protein Regulator,Protein-Protein Interaction,,,Yes,
1763,ASD05870000_1,F2,Homo sapiens,P00734,1DX5,ASD03160008,,M,Ion,Activator,sodium(+1) cation,2001.0,Inner Protein Regulator,Inner Protein,10761923.0,Structural Basis for the Anticoagulant Activit...,No,1DX5
1764,ASD05870000_1,F2,Homo sapiens,P00734,1JMO,ASD03160008,,H,Ion,Activator,sodium(+1) cation,501.0,Inner Protein Regulator,Inner Protein,12169660.0,Crystal structures of native and thrombin-comp...,No,1JMO
1765,ASD05870000_1,F2,Homo sapiens,P00734,1JOU,ASD03160008,,B,Ion,Activator,sodium(+1) cation,3001.0,Inner Protein Regulator,Inner Protein,12679024.0,The molecular basis of thrombin allostery reve...,No,1JOU
1766,ASD05870000_1,F2,Homo sapiens,P00734,1SFQ,ASD03160008,,E,Ion,Activator,sodium(+1) cation,402.0,Inner Protein Regulator,Inner Protein,15152000.0,Molecular dissection of na+ binding to thrombin.,No,1SFQ
1767,ASD05870000_1,F2,Homo sapiens,P00734,1SG8,ASD03160008,,B,Ion,Activator,sodium(+1) cation,1701.0,Inner Protein Regulator,Inner Protein,15152000.0,Molecular dissection of na+ binding to thrombin.,No,1SG8


#### *Dataset correction

Entries with sodium as the allosteric modulator ligand must recognize the sodium residue name "NA" correctly and not as NaN.

In [10]:
na_before = df.isna().sum()

In [11]:
df = pd.read_csv(
    "ASD_Release_202309_AS.txt", 
    sep="\t", keep_default_na=False,
    na_values = [
        '', '#N/A', '#N/A N/A', '#NA', '-1.#IND', '-1.#QNAN', 
        '-NaN', '-nan', '1.#IND', '1.#QNAN', '<NA>', 'N/A', 
        'NULL', 'NaN', 'n/a', 'nan', 'null'
    ] # all defaults in function documentation except "NA"
)

In [12]:
df[df.modulator_alias.isna()]

Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
482,ASD01140000_3,GABRA1,Homo sapiens,P14867,6I53,,,,Pep,Regulator,Megabody38,,,,30602789.0,Cryo-EM structure of the human alpha 1 beta 3 ...,,6I53
1515,ASD04020000_2,kgd,Mycobacterium smegmatis,A0R2B1,6I2P,,,,Pep,Inhibitor,Glycogen accumulation regulator GarA,301.0,Inner Protein Regulator,Protein-Protein Interaction,,,Yes,
1892,ASD07480000_1,NTRK1,Homo sapiens,P04629,6D22,,,,,,,,,,29672039.0,,,6D22
2911,ASD17660000_1,spuE,Pseudomonas aeruginosa (strain ATCC 15692 / DS...,Q9I6J0,6IKM,,,,Pep,Inhibitor,Anti-SpuE Antibody,,,,,,,
2912,ASD17720000_1,lpxD,Pseudomonas aeruginosa (strain ATCC 15692 / DS...,Q9HXY6,6UEC,,,,Lig,,,,Inner Protein Regulator,Inner Protein,,,No,
3039,ASD21730000_1,lpxA,Pseudomonas aeruginosa,A6V1E4,6UEE,,,,Lig,,,,Inner Protein Regulator,Inner Protein,,,No,
3040,ASD21730000_1,lpxA,Pseudomonas aeruginosa,A6V1E4,6UEG,,,,Lig,,,,Inner Protein Regulator,Inner Protein,,,No,
3079,ASD22000000_1,,Sulfolobus islandicus,Q54324,6GVT,,,,DNA,Regulator,5'-D(*CP*TP*GP*TP*GP*CP*TP*CP*A)-3',,,,,,,
3080,ASD22010000_1,csm2,Streptococcus thermophilus,A0A0A7HIX1,6NUD,,,,RNA,Activator,target ssRNA,,,,,,,


In [13]:
na_after = df.isna().sum()

In [14]:
list(zip(na_before.index, na_before.values - na_after.values))

[('target_id', 0),
 ('target_gene', 3),
 ('organism', 0),
 ('pdb_uniprot', 0),
 ('allosteric_pdb', 0),
 ('modulator_serial', 0),
 ('modulator_alias', 15),
 ('modulator_chain', 0),
 ('modulator_class', 0),
 ('modulator_feature', 0),
 ('modulator_name', 0),
 ('modulator_resi', 0),
 ('function', 0),
 ('position', 0),
 ('pubmed_id', 0),
 ('ref_title', 0),
 ('site_overlap', 0),
 ('allosteric_site_residue', 0)]

There are also 3 less NAs in `target_gene` but that's unimportant. There are still (24-13=) 11 entries with missing `modulator_alias`.

#### _chain

In [15]:
df[df.modulator_chain.isna()]

Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
87,ASD00310000_5,PRKAB2,Homo sapiens,O43741,4RER,ASD00310360,BCD,,Lig,Regulator,BETA-CYCLODEXTRIN,301,Protein-Protein Interaction,Protein-Protein Interaction,25412657.0,Structural basis of AMPK regulation by adenine...,No,4RER
360,ASD01040000_1,ESR1,Homo sapiens,P03372,5DIG,ASD01040012,5CE,,Lig,Regulator,"(1S,3aR,5S,7aS)-5-[4-hydroxy-2-(trifluoromethy...",601,Allosteric function,Allosteric position,27107013.0,Predictive features of ligand-specific signali...,No,5DIG
403,ASD01040000_1,ESR1,Homo sapiens,P03372,5EHJ,ASD01040044,5K5,,Lig,,"4,4'-[(4aR,8aR)-octahydronaphthalen-2(1H)-ylid...",900,Allosteric function,Allosteric position,27107013.0,Predictive features of ligand-specific signali...,No,5EHJ
482,ASD01140000_3,GABRA1,Homo sapiens,P14867,6I53,,,,Pep,Regulator,Megabody38,,,,30602789.0,Cryo-EM structure of the human alpha 1 beta 3 ...,,6I53
741,ASD01510000_2,dnaK,Escherichia coli,P0A6Y8,4R5G,ASD01510131,3JE,,Lig,Inhibitor,triphenyl(phenylethynyl)phosphonium,701,Allosteric function,Allosteric position,25148104.0,Structural Basis for the Inhibition of HSP70 a...,No,"Chain B:ALA503,VAL394,LEU392,ASP393,LEU484,LEU..."
1190,ASD03200000_1,AURKA,Homo sapiens,O14965,5G15,,MB1,,Pep,Activator,MB1 MONOBODY,1-96,,,,"Monobodies as Novel, Highly Potent Allosteric ...",,5G15
1191,ASD03200000_1,AURKA,Homo sapiens,O14965,5L8J,,Ac-MARVDQTPRIATKETGESLTINCVLRDTACALDSTNWYRTKLG...,,Pep,Inhibitor,New antigen receptor variable domain,,Allosteric Function,Protein-Protein,27411893.0,Allosteric inhibition of Aurora-A kinase by a ...,No,5L8J
1192,ASD03200000_1,AURKA,Homo sapiens,O14965,5L8K,,Ac-MARVDQTPRIATKETGESLTINCVLRDTACALDSTNWYRTKLG...,,Pep,Inhibitor,New antigen receptor variable domain,,Allosteric Function,Protein-Protein,27411893.0,Allosteric inhibition of Aurora-A kinase by a ...,No,5L8K
1193,ASD03200000_1,AURKA,Homo sapiens,O14965,5L8L,,Ac-MARVDQTPRIATKETGESLTINCVLRDTACALDSTNWYRTKLG...,,Pep,Inhibitor,New antigen receptor variable domain,,Allosteric Function,Protein-Protein,27411893.0,Allosteric inhibition of Aurora-A kinase by a ...,No,5L8L
1515,ASD04020000_2,kgd,Mycobacterium smegmatis,A0R2B1,6I2P,,,,Pep,Inhibitor,Glycogen accumulation regulator GarA,301,Inner Protein Regulator,Protein-Protein Interaction,,,Yes,


No apparent correction needed.

#### _resi

In [16]:
df[df.modulator_resi.isna()]

Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
353,ASD01020000_1,EGFR,Homo sapiens,P00533,7JXQ,ASD01020081,VNS,"G,I,M,O",compound,,2-(5-fluoro-2-hydroxyphenyl)-2-[5-[4-(1-methyl...,,,,35422503,,,7JXQ
354,ASD01020000_1,EGFR,Homo sapiens,P00533,7VRA,ASD01020069,I0A,B,compound,,25-chloro-11-(ethylsulfonyl)-44-morpholino-11H...,,,,35446588,,,7VRA
355,ASD01020000_1,EGFR,Homo sapiens,P00533,7VRE,ASD01020058,7VH,B,compound,,5-Chloro-N-{5-chloro-2-methoxy-4-[4-(4-methylp...,,,,35446588,,,7VRE
482,ASD01140000_3,GABRA1,Homo sapiens,P14867,6I53,,,,Pep,Regulator,Megabody38,,,,30602789,Cryo-EM structure of the human alpha 1 beta 3 ...,,6I53
483,ASD01140000_3,GABRA1,Homo sapiens,P14867,8DD2,ASD01140041,R5R,E,Compound,,"N,N-dimethyl-2-(6-methyl-2-p-tolylimidazo[1,2-...",,,,35933426,,,8DD2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2982,ASD20880000_1,DHDPS1,Arabidopsis thaliana,Q9LZX6,6VVH,,LYS,"AAA, BBB, CCC, DDD",Lig,Regulator,LYSINE,,,,,,,
3039,ASD21730000_1,lpxA,Pseudomonas aeruginosa,A6V1E4,6UEE,,,,Lig,,,,Inner Protein Regulator,Inner Protein,,,No,
3040,ASD21730000_1,lpxA,Pseudomonas aeruginosa,A6V1E4,6UEG,,,,Lig,,,,Inner Protein Regulator,Inner Protein,,,No,
3079,ASD22000000_1,,Sulfolobus islandicus,Q54324,6GVT,,,,DNA,Regulator,5'-D(*CP*TP*GP*TP*GP*CP*TP*CP*A)-3',,,,,,,


No apparent correction needed.

#### One missing

In [17]:
#             *
((df.modulator_alias.isna() & df.modulator_chain.notna() & df.modulator_resi.notna()).sum(),

 
#                                        *
(df.modulator_alias.notna() & df.modulator_chain.isna() & df.modulator_resi.notna()).sum(),

 
#                                                                     *
(df.modulator_alias.notna() & df.modulator_chain.notna() & df.modulator_resi.isna()).sum())

(0, 17, 58)

`_alias` is never missing, which will allow to identify at least the identity of the allosteric modulator if not the precise molecule in the complete PDB structure; and `_resi` is missing in a considerable deal of entries.

#### Two missing

In [18]:
#           *                             *
((df.modulator_alias.isna() & df.modulator_chain.isna() & df.modulator_resi.notna()).sum(),

 
#                                      *                                *
(df.modulator_alias.notna() & df.modulator_chain.isna() & df.modulator_resi.isna()).sum(),

 
#            *                                                        *
(df.modulator_alias.isna() & df.modulator_chain.notna() & df.modulator_resi.isna()).sum())

(1, 7, 0)

There are 3 entries missing both `_chain` and `_resi`.

#### All missing

In [19]:
((df.modulator_alias.notna() & df.modulator_chain.notna() & df.modulator_resi.notna()).sum(),
(df.modulator_alias.isna() & df.modulator_chain.isna() & df.modulator_resi.isna()).sum())

(3011, 8)

Only 2817 entries have all 3 fields with a non-NaN value, and 6 of them have all NaN in the modulator id fields.

In [20]:
unknown = df[(df.modulator_alias.isna() & df.modulator_chain.isna() & df.modulator_resi.isna())]
unknown

Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
482,ASD01140000_3,GABRA1,Homo sapiens,P14867,6I53,,,,Pep,Regulator,Megabody38,,,,30602789.0,Cryo-EM structure of the human alpha 1 beta 3 ...,,6I53
1892,ASD07480000_1,NTRK1,Homo sapiens,P04629,6D22,,,,,,,,,,29672039.0,,,6D22
2911,ASD17660000_1,spuE,Pseudomonas aeruginosa (strain ATCC 15692 / DS...,Q9I6J0,6IKM,,,,Pep,Inhibitor,Anti-SpuE Antibody,,,,,,,
2912,ASD17720000_1,lpxD,Pseudomonas aeruginosa (strain ATCC 15692 / DS...,Q9HXY6,6UEC,,,,Lig,,,,Inner Protein Regulator,Inner Protein,,,No,
3039,ASD21730000_1,lpxA,Pseudomonas aeruginosa,A6V1E4,6UEE,,,,Lig,,,,Inner Protein Regulator,Inner Protein,,,No,
3040,ASD21730000_1,lpxA,Pseudomonas aeruginosa,A6V1E4,6UEG,,,,Lig,,,,Inner Protein Regulator,Inner Protein,,,No,
3079,ASD22000000_1,,Sulfolobus islandicus,Q54324,6GVT,,,,DNA,Regulator,5'-D(*CP*TP*GP*TP*GP*CP*TP*CP*A)-3',,,,,,,
3080,ASD22010000_1,csm2,Streptococcus thermophilus,A0A0A7HIX1,6NUD,,,,RNA,Activator,target ssRNA,,,,,,,


Some of the cases could be identified thanks to the `modulator_name` field.

### Missing allosteric site residues

In [21]:
df.isna()["allosteric_site_residue"].sum()

211

In [22]:
df[~df.isna()["allosteric_site_residue"]]

Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
0,ASD00020000_1,aroG,Escherichia coli,P0AB91,1KFL,ASD02170001,PHE,A,Lig,Inhibitor,(2S)-2-amino-3-phenyl-propanoic acid,1354,Inner Protein Regulator,Protein-Protein Interaction,12126632,Allosteric inhibition of 3-deoxy-D-arabino-hep...,No,"Chain A:PRO150,GLN151,ALA154,GLY178,LEU179,SER..."
2,ASD00020000_4,aroF,Thermotoga maritima,Q9WYH8,3PG9,ASD00690002,TYR,A,Lig,Inhibitor,(2S)-2-amino-3-(4-hydroxyphenyl)propanoic acid,339,Inner Protein Regulator,Protein-Protein Interaction,21282100,Tyrosine latching of a regulatory gate affords...,No,"Chain A:SER31,GLY33,GLN34,GLU35,ARG36,VAL38; C..."
3,ASD00020000_5,ARO4,Saccharomyces cerevisiae,P32449,1OF6,ASD00020003,DTY,A,Lig,Inhibitor,(2R)-2-amino-3-(4-hydroxyphenyl)propanoic acid,1370,Inner Protein Regulator,Protein-Protein Interaction,,Crystal Structure of the Double Complex of the...,No,"Chain A:THR162,GLN166,ALA169,GLY193,LEU194,SER..."
4,ASD00020000_5,ARO4,Saccharomyces cerevisiae,P32449,1OFR,ASD02170001,PHE,H,Lig,Inhibitor,(2S)-2-amino-3-phenyl-propanoic acid,1002,Inner Protein Regulator,Protein-Protein Interaction,15019786,Substrate and Metal Complexes of 3-Deoxy-D-Ara...,No,"Chain G:THR162,PRO165,GLN166,ALA169,LEU190,GLY..."
5,ASD00020000_5,ARO4,Saccharomyces cerevisiae,P32449,1OG0,ASD02170001,PHE,A,Lig,Inhibitor,(2S)-2-amino-3-phenyl-propanoic acid,1012,Inner Protein Regulator,Protein-Protein Interaction,,Crystal Structure of the Mutant G226S of the T...,No,"Chain A:THR162,GLN166,ALA169,LEU190,GLY193,LEU..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3097,ASD80060000_2,,Vibrio cholerae,,3OWZ,ASD01260003,GLY,A,Lig,Regulator,2-aminoethanoic acid,89,Inner DNA/RNA Regulator,Inner DNA/RNA,21145485,Structural insights into ligand recognition by...,No,Chain A:GLY36
3098,ASD80060000_2,,Vibrio cholerae,,3OXE,ASD01260003,GLY,A,Lig,Regulator,2-aminoethanoic acid,89,Inner DNA/RNA Regulator,Inner DNA/RNA,21145485,Structural insights into ligand recognition by...,No,Chain A:GLY36
3099,ASD80060000_2,,Vibrio cholerae,,3OXJ,ASD01260003,GLY,A,Lig,Regulator,2-aminoethanoic acid,89,Inner DNA/RNA Regulator,Inner DNA/RNA,21145485,Structural insights into ligand recognition by...,No,Chain A:GLY36
3100,ASD80060000_2,,Vibrio cholerae,,3OXM,ASD01260003,GLY,A,Lig,Regulator,2-aminoethanoic acid,89,Inner DNA/RNA Regulator,Inner DNA/RNA,21145485,Structural insights into ligand recognition by...,No,Chain A:GLY36


In some cases the `allosteric_site_residue` field is not NA but it only has 1 residue annotated or, in other examples seen above, it has only the PDB ID:

In [23]:
sum(df["allosteric_site_residue"].str.len() <= 4)

1408

Cases where it has only one residue, e.g. 'Chain A:ALA70':

In [24]:
sum(df["allosteric_site_residue"].str.len() <= len('Chain A:ALA70'))

1429

## Exploration of fields

In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3102 entries, 0 to 3101
Data columns (total 18 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   target_id                3102 non-null   object
 1   target_gene              2762 non-null   object
 2   organism                 3102 non-null   object
 3   pdb_uniprot              3088 non-null   object
 4   allosteric_pdb           3102 non-null   object
 5   modulator_serial         2560 non-null   object
 6   modulator_alias          3093 non-null   object
 7   modulator_chain          3069 non-null   object
 8   modulator_class          3088 non-null   object
 9   modulator_feature        3049 non-null   object
 10  modulator_name           3081 non-null   object
 11  modulator_resi           3029 non-null   object
 12  function                 2742 non-null   object
 13  position                 2758 non-null   object
 14  pubmed_id                2497 non-null  

### modulator_class

In [26]:
df.modulator_class.value_counts(dropna=False)

modulator_class
Lig                2688
Ion                 216
Pep                  76
lig                  55
compound             20
NaN                  14
Compound             13
Lig;Lig               6
Lig;Ion               4
Lig;Lig;Lig;Lig       2
pep                   2
antibody              1
Gas                   1
ion                   1
Lon                   1
DNA                   1
RNA                   1
Name: count, dtype: int64

#### Proteins-peptides

In [27]:
df.query("modulator_class in ['Pep', 'pep', 'antibody']")

Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
24,ASD00060000_1,PDPK1,Homo sapiens,O15530,5LVP,,HM-peptide,E,Pep,Activator,hydrophobic-motif peptide of PKB/Akt,1-15,Allosteric Fuction,Allosteric Position,27693059,Bidirectional Allosteric Communication between...,Yes,5LVP
27,ASD00180000_1,ADORA2A,Homo sapiens,P29274,3VG9,ASD00180006;ASD00180006,Chain_B;Chain_C,B;C,Pep,Inhibitor,,Chain_B;Chain_C,Inner Protein Regulator,Protein-Protein Interaction,22286059,G-protein-coupled receptor inactivation by an ...,No,3VG9
28,ASD00180000_1,ADORA2A,Homo sapiens,P29274,3VGA,ASD00180006;ASD00180006,Chain_B;Chain_C,B;C,Pep,Inhibitor,,Chain_B;Chain_C,Inner Protein Regulator,Protein-Protein Interaction,22286059,G-protein-coupled receptor inactivation by an ...,No,3VGA
482,ASD01140000_3,GABRA1,Homo sapiens,P14867,6I53,,,,Pep,Regulator,Megabody38,,,,30602789,Cryo-EM structure of the human alpha 1 beta 3 ...,,6I53
736,ASD01470000_1,MET,Human immunodeficiency virus type 1,P08581,4O3T,ASD01470017,Chain P,P;P;P;P;P;P;P;P;P,Pep,Activator,IVGGYPWWM,495;496;497;498;499;500;501;502;503,Inner Protein Regulator,Inner Protein,24859116,An allosteric switch for pro-HGF/Met signaling...,No,"Chain P:VAL496,TRP501,GLY497,PRO500,TRP502,ILE..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2905,ASD17430000_1,ERBB3,Homo sapiens,P21860,5CUS,,KTN3379,"L,M,N,O",Pep,Inhibitor,Fab LC region of KTN3379,1-216,Allosteric function,Allosteric position,26460020,Inhibition of ErbB3 by a monoclonal antibody t...,No,5CUS
2907,ASD17450000_1,,Norwalk virus,Q5F4T5,5O03,,Nano-32,"C,D",Pep,,Nanobody (VHH) Nano-32,,,,29095961,Nanobodies targeting norovirus capsid reveal f...,,
2908,ASD17460000_1,DPP8,Homo sapiens,Q6V1X1,6EOP,,SUMO1,D,Pep,Inhibitor,SER-LEU-ARG-PHE-LEU-TYR-GLU-GLY,8,Allosteric function,Allosteric position,29382749,Structures and mechanism of dipeptidyl peptida...,No,6EOP
2911,ASD17660000_1,spuE,Pseudomonas aeruginosa (strain ATCC 15692 / DS...,Q9I6J0,6IKM,,,,Pep,Inhibitor,Anti-SpuE Antibody,,,,,,,


Being proteins, in most cases whole chains of the PDB as annotated as the allosteric modulators, and they have nonstandard annotations in the modulator ID fields in general, so it will be best to process them separately.

In [28]:
prots = df.query("modulator_class in ['Pep', 'pep', 'antibody']")

#### NAs

In [29]:
df[df.modulator_class.isna()]

Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
594,ASD01230000_2,Gria2,Rattus norvegicus,P19491,6ZYU,ASD01230694_2_ZCH@@ASD01230695_2_ZCH,QSZ@@QSW,"A,B,C@@ABC",,,,,,,34242002.0,,,
1244,ASD03330000_1,CNR1,Homo sapiens,P21554,7FEE,ASD03331915_HJX,7IC,E,,,,,,,35637350.0,,,7FEE
1245,ASD03330000_1,CNR1,Homo sapiens,P21554,7WV9,ASD03331915_HJX,7IC,E,,,,,,,35637350.0,,,7WV9
1892,ASD07480000_1,NTRK1,Homo sapiens,P04629,6D22,,,,,,,,,,29672039.0,,,6D22
1954,ASD08170000_1,GLS,Homo sapiens,O94925,5WJ6,ASD08170129,B4A,A,,,"2-phenyl-N-{5-[4-({5-[(phenylacetyl)amino]-1,3...",601.0,,,29317493.0,Characterization of the interactions of potent...,,5WJ6
2102,ASD09690000_1,SAMHD1,Homo sapiens,Q9Y3Z3,4QG1,,GTP,,,Activator,GUANOSINE-5'-TRIPHOSPHATE,,,,25288794.0,Structural Basis of Allosteric Activation of S...,,4QG1
2172,ASD10100000_1,rmlA,Pseudomonas aeruginosa,Q9HU22,5FTS,,P3I,A,,Inhibitor,"N-(6-Amino-1-(2-bromobenzyl)-2,4-dioxo-1,2,3,4...",,,,,Allosteric Competitive Inhibitors of the Gluco...,,
2177,ASD10100000_1,rmlA,Pseudomonas aeruginosa,Q9HU22,5FYE,,LD6,,,Inhibitor,"N-(6-Amino-1-(3-fluorobenzyl)-2,4-dioxo-1,2,3,...",,,,,Allosteric Competitive Inhibitors of the Gluco...,,
2946,ASD18420000_1,DHPS,Homo sapiens,P49366,6P4V,,8XY,A,,Inhibitor,6-bromo-N-(1H-indol-4-yl)-1-benzothiophene-2-c...,401.0,,,,,,6PGR
2958,ASD19110000_1,Gpr52,Homo sapiens,Q9Y2T5,6LI0,,EN6,A,,Activator,N-(2-hydroxyethyl)-5-(hydroxymethyl)-3-methyl-...,1401.0,,,,,,6LI0


Except for the first, they look like normal modulator entries with (at least) a correct `modulator_alias` field. The first one looks like corresponds to two different allosteric molecules that bind together.

#### Multiple modulators

In [30]:
df.query("modulator_class in ['Lig;Lig', 'Lig;Ion', 'Lig;Lig;Lig;Lig']")

Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
545,ASD01230000_2,Gria2,Rattus norvegicus,P19491,3IJO,ASD01230003;ASD01230003,B4D;B4D,B;B,Lig;Lig,Activator,"(3S)-6-chloro-1,1-dioxo-3-(prop-2-enylsulfanyl...",401;800,Inner Protein Regulator,Protein-Protein Interaction,19673491.0,Probing the allosteric modulator binding site ...,No,"Chain B:ILE92,THR93,LYS104,PRO105,PHE106,MET10..."
547,ASD01230000_2,Gria2,Rattus norvegicus,P19491,3IK6,ASD01230004;ASD01230004,HCZ;HCZ,B;B,Lig;Lig,Activator,"6-chloro-1,1-dioxo-3,4-dihydro-2H-benzo[e][1,2...",262;800,Inner Protein Regulator,Protein-Protein Interaction,19673491.0,Probing the allosteric modulator binding site ...,No,"Chain B:ILE92,LYS104,PRO105,SER108,SER217,LYS2..."
548,ASD01230000_2,Gria2,Rattus norvegicus,P19491,3IL1,ASD01230202;ASD01230202,B5D;B5D,B;B,Lig;Lig,Activator,"(3S)-7-chloro-3-methyl-3,4-dihydro-2H-benzo[e]...",262;801,Inner Protein Regulator,Protein-Protein Interaction,19673491.0,Probing the allosteric modulator binding site ...,No,"Chain B:ILE92,LYS104,PRO105,SER108,SER217,LYS2..."
665,ASD01320000_3,PYGM,Oryctolagus cuniculus,P00489,1Z62,ASD01320116;ASD01320116,IAA;IAA,A;A,Lig;Lig,Inhibitor,2-[[(2Z)-2-(2-oxo-7H-indol-3-ylidene)-7H-indol...,990;991,Inner Protein Regulator,Inner Protein,,Indirubin-3'-Aminooxy-Acetate Inhibits Glycoge...,No,"Chain A:TRP67,ILE68,GLN71,GLN72,TYR75,ARG193,P..."
717,ASD01360000_1,groL,Escherichia coli,P0A6F5,1PCQ,ASD01720027;ASD01360010,ADP;AF3,A;A,Lig;Ion,Regulator,"[(2R,3S,4R,5R)-5-(6-aminopurin-9-yl)-3,4-dihyd...",600;602,Inner Protein Regulator,Inner Protein,14517228.0,Role of the gamma-phosphate of ATP in triggeri...,No,"Chain A:THR30,LEU31,GLY32,PRO33,LYS51,ASP52,GL..."
718,ASD01360000_1,groL,Escherichia coli,P0A6F5,1SVT,ASD01720027;ASD01360010,ADP;AF3,A;A,Lig;Ion,Regulator,"[(2R,3S,4R,5R)-5-(6-aminopurin-9-yl)-3,4-dihyd...",600;602,Inner Protein Regulator,Inner Protein,15313620.0,Exploring the structural dynamics of the E.col...,No,"Chain A:THR30,LEU31,GLY32,PRO33,LYS51,ASP52,GL..."
726,ASD01360000_1,groEL,Escherichia coli,Q548M1,4PKN,ASD01720027;ASD01360014,ADP;BEF,A;A,Lig;Ion,Regulator,"[(2R,3S,4R,5R)-5-(6-aminopurin-9-yl)-3,4-dihyd...",601;602,Inner Protein Regulator,Inner Protein,25136110.0,Formation and structures of GroEL:GroES2 chape...,No,"Chain A:THR30,LEU31,GLY32,PRO33,LYS51,ASP52,GL..."
727,ASD01360000_1,groEL,Escherichia coli,Q548M1,4PKO,ASD01720027;ASD01360014,ADP;BEF,A;A,Lig;Ion,Regulator,"[(2R,3S,4R,5R)-5-(6-aminopurin-9-yl)-3,4-dihyd...",601;602,Inner Protein Regulator,Inner Protein,25136110.0,Formation and structures of GroEL:GroES2 chape...,No,"Chain A:THR30,LEU31,GLY32,PRO33,LYS51,ASP52,GL..."
1160,ASD03000000_1,upp,Sulfolobus solfataricus,Q980Q4,1XTU,ASD00150003;ASD00150003;ASD00150003;ASD00150003,CTP;CTP;CTP;CTP,A;D;E;H,Lig;Lig;Lig;Lig,Inhibitor,"[(2R,3S,4R,5R)-5-(4-amino-2-oxo-pyrimidin-1-yl...",1260;4260;5260;8260,Inner Protein Regulator,Protein-Protein Interaction,15654744.0,Allosteric regulation and communication betwee...,No,"Chain A:ARG29,LYS30,VAL33,ARG37,GLU87,LYS91; C..."
1161,ASD03000000_1,upp,Sulfolobus solfataricus,Q980Q4,3G6W,ASD00290006;ASD00290006;ASD00290006;ASD00290006,GTP;GTP;GTP;GTP,A;B;D;D,Lig;Lig;Lig;Lig,Activator,"[[(2R,3S,4R,5R)-5-(2-azanyl-6-oxidanylidene-1H...",301;301;217;301,Inner Protein Regulator,Protein-Protein Interaction,19683539.0,Structural and kinetic studies of the alloster...,No,"Chain A:LYS30,ARG34,ARG37; Chain B:ARG37,LYS91..."


These cases can be detected, together with the previous identified case of two molecules that bind together, by identifying special characters such as ';', ',', '@', '-' or '/' in the modulator ID fields.

In [31]:
temp_df = (
    df.merge(
        pd.concat([
            unknown, prots
        ]),
        how="outer", indicator=True
    )
    .query(f"_merge == 'left_only'").drop("_merge", axis=1)
)

multiple = temp_df[
    temp_df.modulator_alias.str.contains(r"\W", na=False) 
    | temp_df.modulator_chain.str.contains(r"\W", na=False) 
    | temp_df.modulator_resi.str.contains(r"\W", na=False)
]
multiple

Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
56,ASD00250000_6,,Solanum tuberosum,P23509,1YP2,ASD02880002;ASD02880002,SO4;SO4,A;A,Ion,Activator,sulfate;sulfate,2000;2008,Inner Protein Regulator,Protein-Protein Interaction,15692569,Crystal structure of potato tuber ADP-glucose ...,No,"Chain A:ARG41,PRO47,ARG53,HIS84,GLN314,ARG316,..."
58,ASD00250000_6,,Solanum tuberosum,P23509,1YP3,ASD02880002;ASD02880002,SO4;SO4,A;A,Ion,Activator,sulfate;sulfate,1000;1008,Inner Protein Regulator,Protein-Protein Interaction,15692569,Crystal structure of potato tuber ADP-glucose ...,No,"Chain A:ARG41,PRO47,ARG53,SER80,HIS84,GLN314,A..."
60,ASD00250000_6,,Solanum tuberosum,P23509,1YP4,ASD02880002;ASD02880002,SO4;SO4,A;A,Ion,Activator,sulfate;sulfate,1000;1008,Inner Protein Regulator,Protein-Protein Interaction,15692569,Crystal structure of potato tuber ADP-glucose ...,No,"Chain A:ARG41,PRO47,ARG53,SER80,ARG83,HIS84,GL..."
90,ASD00310000_7,PRKAA2,Homo sapiens,P54646,4ZHX,ASD00310181,C2Z,"E,F",Lig,Activator,5-(5-hydroxyl-isoxazol-3-yl)-furan-2-phosphoni...,401402,Allosteric Function,Allosteric Position,26952388,Structural basis of allosteric and synergistic...,No,4ZHX
96,ASD00330000_2,pyrI,Escherichia coli,E8Y329,4KH0,ASD00330030;ASD01140145;ASD00330030,ATP;MG;ATP,B;B;B,Lig,Activator,"[[(2R,3S,4R,5R)-5-(6-aminopurin-9-yl)-3,4-dihy...",202;203;204,Inner Protein Regulator,Inner Protein,24138583,New Paradigm for Allosteric Regulation of Esch...,No,"Chain B:ALA11,ILE12,VAL17,ASP19,HIS20,THR43,GL..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2984,ASD20880000_1,DHDPS1,Arabidopsis thaliana,Q9LZX6,6VVH,,LYS,"AAA, BBB, CCC, DDD",Lig,Regulator,LYSINE,,,,,,,
2993,ASD21390000_1,fumC,Mycobacterium tuberculosis,P9WN93,5F91,,5WJ,"A,C",Lig,Inhibitor,~{N}-[5-(azepan-1-ylsulfonyl)-2-methoxy-phenyl...,468,Inner Protein Regulator,Allosteric Position,,,No,
3001,ASD21400000_1,pfkA,Staphylococcus aureus,P99165,5XZA,,ADP-Mg,A,lig,activator,ADENOSINE-5'-DIPHOSPHATE-MG,323,Protein-Protein Interaction,Allosteric position,,,No,
3010,ASD21480000_1,ppnN,Escherichia coli,P0ADR8,6GFM,,0O2,A,,Regulator,guanosine 5'-(tetrahydrogen triphosphate) 3'-(...,501,,,,,,


##### Examination

In [32]:
multiple[:50]

Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
56,ASD00250000_6,,Solanum tuberosum,P23509,1YP2,ASD02880002;ASD02880002,SO4;SO4,A;A,Ion,Activator,sulfate;sulfate,2000;2008,Inner Protein Regulator,Protein-Protein Interaction,15692569.0,Crystal structure of potato tuber ADP-glucose ...,No,"Chain A:ARG41,PRO47,ARG53,HIS84,GLN314,ARG316,..."
58,ASD00250000_6,,Solanum tuberosum,P23509,1YP3,ASD02880002;ASD02880002,SO4;SO4,A;A,Ion,Activator,sulfate;sulfate,1000;1008,Inner Protein Regulator,Protein-Protein Interaction,15692569.0,Crystal structure of potato tuber ADP-glucose ...,No,"Chain A:ARG41,PRO47,ARG53,SER80,HIS84,GLN314,A..."
60,ASD00250000_6,,Solanum tuberosum,P23509,1YP4,ASD02880002;ASD02880002,SO4;SO4,A;A,Ion,Activator,sulfate;sulfate,1000;1008,Inner Protein Regulator,Protein-Protein Interaction,15692569.0,Crystal structure of potato tuber ADP-glucose ...,No,"Chain A:ARG41,PRO47,ARG53,SER80,ARG83,HIS84,GL..."
90,ASD00310000_7,PRKAA2,Homo sapiens,P54646,4ZHX,ASD00310181,C2Z,"E,F",Lig,Activator,5-(5-hydroxyl-isoxazol-3-yl)-furan-2-phosphoni...,401402,Allosteric Function,Allosteric Position,26952388.0,Structural basis of allosteric and synergistic...,No,4ZHX
96,ASD00330000_2,pyrI,Escherichia coli,E8Y329,4KH0,ASD00330030;ASD01140145;ASD00330030,ATP;MG;ATP,B;B;B,Lig,Activator,"[[(2R,3S,4R,5R)-5-(6-aminopurin-9-yl)-3,4-dihy...",202;203;204,Inner Protein Regulator,Inner Protein,24138583.0,New Paradigm for Allosteric Regulation of Esch...,No,"Chain B:ALA11,ILE12,VAL17,ASP19,HIS20,THR43,GL..."
97,ASD00330000_2,pyrI,Escherichia coli,E8Y329,4KH1,ASD00150003;ASD00150002;ASD00330030,CTP;MG;UTP,B;B;B,Lig,Inhibitor,"[(2R,3S,4R,5R)-5-(4-amino-2-oxo-pyrimidin-1-yl...",202;203;204,Inner Protein Regulator,Inner Protein,24138583.0,New Paradigm for Allosteric Regulation of Esch...,No,"Chain B:LEU7,GLN8,VAL9,ALA11,ILE12,VAL17,ASP19..."
116,ASD00330000_3,pyrI,Escherichia coli,P0A7F3,4FYX,ASD00150002;ASD00900001;ASD00330030,UTP;MG;DCP,B;B;B,Lig,Inhibitor,"[(2R,3S,4R,5R)-5-(2,4-dioxopyrimidin-1-yl)-3,4...",202;203;204,Inner Protein Regulator,Inner Protein,22906065.0,Metal Ion Involvement in the Allosteric Mechan...,No,"Chain B:LYS6,LEU7,GLN8,VAL9,ALA11,ILE12,VAL17,..."
117,ASD00330000_3,pyrI,Escherichia coli,P0A7F3,4FYY,ASD00150002;ASD00150003;ASD00330030,UTP;MG;CTP,B;B;B,Lig,Inhibitor,"[(2R,3S,4R,5R)-5-(2,4-dioxopyrimidin-1-yl)-3,4...",202;203;204,Inner Protein Regulator,Inner Protein,22906065.0,Metal Ion Involvement in the Allosteric Mechan...,No,"Chain B:LEU7,GLN8,VAL9,ALA11,ILE12,VAL17,ASP19..."
164,ASD00470000_3,CALM1,Homo sapiens,P62158,1CKK,ASD11630001;ASD11630001;ASD11630001;ASD11630001,CA;CA;CA;CA,A;A;A;A,Ion,Regulator,CALCIUM ION,151;152;153;154,Inner Protein Regulator,Inner Protein,10467092.0,A novel target recognition revealed by calmodu...,No,1CKK
165,ASD00470000_3,CALM1,Homo sapiens,P62158,1IQ5,ASD11630001;ASD11630001;ASD11630001;ASD11630001,CA;CA;CA;CA,A;A;A;A,Ion,Regulator,CALCIUM ION,361;362;363;364,Inner Protein Regulator,Inner Protein,11545585.0,Target-induced conformational adaptation of ca...,No,1IQ5


In [33]:
multiple[50:100]

Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
1540,ASD04220000_1,SERPINC1,Homo sapiens,P01008,1TB6,ASD04220055;ASD04220056;ASD04220057;ASD0422005...,GU8;GU5;GU1;GU6;GU2;GU3,I;I;I;I;I;I,Lig,Activator,"(2R,3R,4S,5R,6R)-3,4-dimethoxy-6-(methoxymethy...",443;444;445;446;447;448,Protein-Protein Interaction Regulator,Inner Protein,15311269.0,Structure of the antithrombin-thrombin-heparin...,No,1TB6
1554,ASD04270000_2,Ptgs2,Mus musculus,Q05769,4OTJ,ASD04270003,IXP,"A,B,C,D",Lig,Inhibitor,"(5S,5aS,8aS,9S)-8-oxo-9-(3,4,5-trimethoxypheny...",706,Inner Protein Regulator,Allosteric Position,27588346.0,Antitumor Activity of Cytotoxic Cyclooxygenase...,No,"Chain D:LYS180,LEU183,MET487,ILE442,ARG438,GLU..."
1576,ASD04590000_1,CDK2,Homo sapiens,P24941,3PXF,ASD02210006;ASD02210006,2AN;2AN,A;A,Lig,Inhibitor,8-phenylazanylnaphthalene-1-sulfonic acid,304;305,Inner Protein Regulator,Inner Protein,21291269.0,Discovery of a potential allosteric ligand bin...,No,3PXF
1577,ASD04590000_1,CDK2,Homo sapiens,P24941,3PXQ,ASD02210006;ASD02210006,2AN;2AN,A;A,Lig,Inhibitor,8-phenylazanylnaphthalene-1-sulfonic acid,301;302,Inner Protein Regulator,Inner Protein,21291269.0,Discovery of a potential allosteric ligand bin...,No,3PXQ
1578,ASD04590000_1,CDK2,Homo sapiens,P24941,3PXZ,ASD02210006;ASD02210006,2AN;2AN,A;A,Lig,Inhibitor,8-phenylazanylnaphthalene-1-sulfonic acid,299;300,Inner Protein Regulator,Inner Protein,21291269.0,Discovery of a potential allosteric ligand bin...,No,3PXZ
1579,ASD04590000_1,CDK2,Homo sapiens,P24941,3PY1,ASD02210006;ASD02210006,2AN;2AN,A;A,Lig,Inhibitor,8-phenylazanylnaphthalene-1-sulfonic acid,301;302,Inner Protein Regulator,Inner Protein,21291269.0,Discovery of a potential allosteric ligand bin...,No,3PY1
1580,ASD04590000_1,CDK2,Homo sapiens,P24941,4EZ7,ASD02210006;ASD02210006,2AN;2AN,A;A,Lig,Inhibitor,8-phenylazanylnaphthalene-1-sulfonic acid,302;303,Inner Protein Regulator,Inner Protein,22893598.0,A Novel Approach to the Discovery of Small-Mol...,No,4EZ7
1583,ASD04590000_1,CDK2,Homo sapiens,P24941,6Q4D,,HHT,A,Lig,Inhibitor,2-(4-bromanyl-2-methoxy-phenyl)ethanoic acid,302；303,,,,,,6Q4D
1809,ASD06390000_1,KRAS,Homo sapiens,P01116,7O70,ASD06390050,V4T,"C,G",compound,,"1-((3R,14aS)-11-chloro-9-fluoro-10-(2-fluoro-6...",,,,35471939.0,,,7O70
1810,ASD06390000_1,KRAS,Homo sapiens,P01116,7O83,ASD06390052,V52,"E,I",compound,,1-[(8aS)-6-chloro-5-(5-methyl-1H-indazol-4-yl)...,,,,35471939.0,,,7O83


In [34]:
multiple[100:]

Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
2632,ASD13220000_1,,Pseudomonas syringae pv. syringae,Q6VE93,5KLQ,ASD03710003,IHP,"A,B,C",Lig,Activator,inositol hexakisphosphate,401,Allosteric Function,Allosteric Position,27525589.0,Structure of a pathogen effector reveals the e...,No,"Chain B:ARG49,GLY316,LYS289,ARG311,GLU103,GLU3..."
2633,ASD13230000_1,CASTOR1,Homo sapiens,Q8WTX7,5I2C,,ARG,"A,B,C,D",Lig,Regulator,Arginine,323,Inner Protein Regulator,Allosteric Position,27487210.0,Mechanism of arginine sensing by CASTOR1 upstr...,No,5I2C
2652,ASD14120000_1,FBP2,Homo sapiens,O00757,5ET6,ASD00030001,AMP,"A,B,C,D",Lig,Inhibitor,ADENOSINE MONOPHOSPHATE,401401401401,Inner Protein Regulator,Allosteric Position,27050133.0,"T-to-R switch of muscle fructose-1,6-bisphosph...",No,5ET6
2654,ASD14170000_1,proRS,Plasmodium falciparum,Q8I5R7,4WI1,ASD14170001,3O6,"A,B",Lig,Inhibitor,1-(4-fluorophenyl)-3-[4-(4-fluorophenyl)- 1-me...,801801,Inner Protein Regulator,Allosteric Position,27798837.0,Biochemical and Structural Characterization of...,No,Chain A:GLU404
2655,ASD14170000_1,proRS,Plasmodium falciparum,Q8I5R7,5IFU,ASD14170002,GBM,"A,B",Lig,Inhibitor,5-chloro-N-(2-{4-[(cyclohexylcarbamoyl)sulfamo...,801,Allosteric Function,Allosteric Position,27798837.0,Biochemical and Structural Characterization of...,yes,"Chain B:GLY517,TYR285,ILE270,GLU404,ARG403,SER..."
2667,ASD14570000_1,MAT2A,Homo sapiens,P31153,5UGH,ASD14570001,8AJ,A;C,Lig,Inhibitor,"2-(7-chloro-5-phenyl[1,2,4]triazolo[4,3-a]quin...",401;401,Inner Protein Regulator,Inner Protein,28553945.0,Targeting S-adenosylmethionine biosynthesis wi...,No,5UGH
2683,ASD14640000_1,EED,Homo sapiens,O75530,5GSA,ASD14640003,73K,"A,B",Lig,Inhibitor,N-(furan-2-ylmethyl)-8-(4-methylsulfonylphenyl...,501,Inner Protein Regulator,Allosteric Position,28135235.0,An allosteric PRC2 inhibitor targeting the H3K...,No,5GSA
2685,ASD14640000_1,EED,Homo sapiens,O75530,5H14,,LQB,"A,B",Lig,Inhibitor,"2-[3-(3,5-dimethylpyrazol-1-yl)-4-nitro-phenyl...",502,Inner Protein Regulator,Allosteric Position,28072869.0,Discovery and Molecular Basis of a Diverse Set...,No,5H14
2686,ASD14640000_1,EED,Homo sapiens,O75530,5H15,,LQD,"A,B",Lig,Inhibitor,"(3R,4S)-1-[(2-methoxyphenyl)methyl]-N,N-dimeth...",501,Inner Protein Regulator,Allosteric Position,28072869.0,Discovery and Molecular Basis of a Diverse Set...,No,5H15
2707,ASD15140000_1,Itpr1,Mus musculus,P11881,5GUG,ASD07660002,I3P,"A,B",Lig,Activator,"D-MYO-INOSITOL-1,4,5-TRIPHOSPHATE",30003000,Inner Protein Regulator,Allosteric Position,28416699.0,IP3-mediated gating mechanism of the IP3 recep...,No,Chain A:THR267


#### Special classes

In [35]:
df.query("modulator_class in ['Gas', 'Lon', 'DNA', 'RNA']")

Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
2904,ASD17420000_1,,Pseudomonas aeruginosa (strain ATCC 15692 / DS...,Q9HVB9,5BQ1,ASD17420001,CO2,A,Gas,Regulator,CARBON DIOXIDE,302303.0,Allosteric function,Allosteric position,26457866.0,Carbon Dioxide 'Trapped' in a beta-Carbonic An...,No,"Chain A:GLY103,VAL104,ASN65,ALA45,ARG64,HIS98,..."
3024,ASD21610000_1,RBP2,Homo sapiens,P50120,6ON5,,ZN,A,Lon,Regulator,ZINC ION,201.0,Allosteric function,Allosteric position,,,No,6ON5
3079,ASD22000000_1,,Sulfolobus islandicus,Q54324,6GVT,,,,DNA,Regulator,5'-D(*CP*TP*GP*TP*GP*CP*TP*CP*A)-3',,,,,,,
3080,ASD22010000_1,csm2,Streptococcus thermophilus,A0A0A7HIX1,6NUD,,,,RNA,Activator,target ssRNA,,,,,,,


"Gas" and "Lon" seem like normal small molecule (or ion) modulator entries, although CO2 seems to have two copies annotated with two different residue numbers. "DNA" and "RNA" are entries with all missing modulator ID fields.

### Other columns and errors examples from the first notebooks

# Processing

Updates and changes to the information in the Allosteric Database allosteric site dataset DataFrame must be included in a dictionary with entries that have the following format:

```
<PDB>: {
    'pdb': <PDB>,
    'mods': [
        [{<modulator molecule>}], ...
    ]
}
```

and each `[{<modulator molecule>}]` must be able to be converted in a DataFrame when passed to `pandas.DataFrame( )` and correspond to fields that allow to identify residues from the PDB, e.g.:
```
[
    [{'auth_asym_id': 'b', 'label_entity_id': '2'}],
]

[
    [{'auth_asym_id': 'AAA', 'auth_comp_id': 'LYS'}],
    [{'auth_asym_id': 'BBB', 'auth_comp_id': 'LYS'}],
    [{'auth_asym_id': 'CCC', 'auth_comp_id': 'LYS'}],
    [{'auth_asym_id': 'DDD', 'auth_comp_id': 'LYS'}]
]
```

In [36]:
import numpy as np
from tqdm.notebook import tqdm

import tempfile, requests

In [37]:
processed = [pd.Series(index=df.columns, dtype=object)]

In [38]:
errors = {}
error_entries = []

In [39]:
errors_groups = lambda: {
    val: sorted(
        k for k, v in errors.items()
        if any(map(lambda x: val in x, v))
    )
    for val in set(
        map(
            lambda x: x[0].split(": ")[-1], 
            errors.values()
        )
    )
}

In [40]:
def process_entry(entry, updates={}, auto_site_grouping=True, stringent_site_grouping=True):
    try:
        origpdb = entry.allosteric_pdb.upper()

        # If the PDB is not in the passed updates dictonary, assume it has standard single molecule annotation
        if origpdb not in updates:
            pdb = origpdb
            mods = [[{
                k: v for k,v in {
                    "auth_asym_id": entry["modulator_chain"],
                    "auth_comp_id": entry["modulator_alias"].upper(),
                    "auth_seq_id": entry["modulator_resi"],
                }.items()
                if v is not np.nan
            }],]
        # Else, get the information from the updates dictionary
        else:
            update = updates[origpdb]
            if update is None:
                # Skip the entry if the update is None (error/nonexisting PDB)
                return
            else:
                pdb = update["pdb"].upper()
                mods = update["mods"]
                
        old_sites = None
        # If PDB already exists, save its site IDs
        if PDB.get_or_none(PDB.entry_id == pdb.lower()) is not None:
            old_sites = list(PDB.get(PDB.entry_id == pdb.lower()).sites)

        print(pdb, mods)
        with db.atomic() as txn:
            try:
                sites = create_entry(db, pdb.lower(), mods, auto_site_grouping, stringent_site_grouping)
                
            except NoModulator:
                txn.rollback()
                
                data = utils.MMCIF2Dict().parse(
                    utils.PDBCif._get_cif(
                        pdb.lower(), save=True
                    )['filename']
                )[pdb]
                
                molec_entities = pd.DF(data["_entity"], dtype=str).query("type in ['non-polymer', 'branched']")["id"].to_list()
                mod_names = set(m['auth_comp_id'] for mod in mods for m in mod)

                found_modulators = (
                    pd.DF(data["_atom_site"], dtype=str)
                    .query(f"label_entity_id in {molec_entities} and auth_comp_id in {list(mod_names)}")[
                        ["label_asym_id", "auth_comp_id", "auth_seq_id", "pdbx_PDB_ins_code"]
                    ].drop_duplicates()
                )
                modulators = [[i] for i in found_modulators.to_dict(orient="records")]
                
                if (
                    len(modulators) > 0
                    and set(found_modulators.auth_comp_id.unique()) == mod_names
                ): #, f"{pdb.lower()}, {mods}: couldn't retrieve modulator in pdb with using residue name"
                    sites = create_entry(db, pdb.lower(), modulators, auto_site_grouping, stringent_site_grouping)
                else:
                    txn.rollback()

                    try:
                        new_mod_names = []
                        for mod_name in mod_names:
                            response = requests.get(
                                f"https://www.ebi.ac.uk/pdbe/static/files/pdbechem_v2/{mod_name}.cif"
                            )
                            assert response.status_code != 404
                            
                            with tempfile.NamedTemporaryFile("w+", suffix=".cif") as f:
                                f.write(response.content.decode())
                                mod_data = utils.MMCIF2Dict().parse(f.name)[mod_name]["_chem_comp"]
                                
                            new_names = (
                                [resname for i in mod_data["pdbx_replaced_by"] for resname in i.split(" ")]
                                if mod_data["pdbx_replaced_by"] != ["?"] else (
                                    [resname for i in mod_data["pdbx_subcomponent_list"] for resname in i.split(" ")]
                                    if mod_data["pdbx_subcomponent_list"] != ["?"] else (
                                        None
                                    )
                                )
                            )
                            
                            assert new_names is not None
                            new_mod_names.extend(new_names)

                        newmods = [[{"auth_comp_id": m}] for m in new_mod_names]
                        sites = create_entry(db, pdb.lower(), newmods, auto_site_grouping, stringent_site_grouping)
                        update = {"pdb": pdb, "mods": newmods}
                        updates[origpdb] = {origpdb: update}
                    
                    except Exception as e:
                        print(e)
                        assert False, f"{pdb.lower()}, {mods}: couldn't retrieve modulator in pdb with using residue name"
                    
                
                assert len(sites) == 1, f"{pdb.lower()}, {modulators}: using only residue name retrieves more than one site(group)"

        for site in sites:
            site.info["source"] = {
                "allosteric_database": [{
                    k: v for k, v in {
                        "entry": [entry.to_dict()],
                        "version": '5.1',
                        "date": "202309",
                        "update": update if (origpdb in updates) else None
                    }.items() if v is not None
                }],
            }
            site.save()

    
        if old_sites is not None:
            try:
                combine_sites(db, pdb.lower(), old_sites, sites, auto_site_grouping, stringent_site_grouping)
            except Exception as e:
                assert False, "combine_sites failed; " + str(e.args[0])
                
                    

    
    except AssertionError as error:
        id = pdb.lower()
        errors.setdefault(id, [])
        errors[id].append(error.args[0])
        error_entries.append(entry)
        print(id, errors[id])
    except KeyError as error:
        id = pdb.lower()
        errors.setdefault(id, [])
        errors[id].append(str(error.args[0]))
        error_entries.append(entry)
        print(id, errors[id])
    # except:
    #     raise

In [41]:
def get_error(pdb):
    print("ENTRIES:", len(df.query(f"allosteric_pdb == '{pdb.upper()}'")), "SITES:", [(s, s.modulator) for p in [PDB.get_or_none(PDB.entry_id == pdb)] if p is not None for s in p.sites])
    print("ERROR:", errors[pdb.lower()])
    return pd.DF(error_entries).query(f"allosteric_pdb == '{pdb.upper()}'")

In [42]:
def solve_error(pdb, update, auto_site_grouping=False, stringent_site_grouping=True):
    if pdb in errors:
        errors.pop(pdb)
    entry = pd.DF(globals()["error_entries"]).query(f"allosteric_pdb == '{pdb.upper()}'")#.squeeze()
    assert len(entry.squeeze())>0
    globals()["error_entries"] = [
        row for i, row in (
            pd.DF(globals()["error_entries"])
            .merge(entry, how="outer", indicator=True)
            .query(f"_merge == 'left_only'").drop("_merge", axis=1)
            .iterrows()
        )
    ]
    # error_entries.remove(entry)
    process_entry(entry.squeeze(), updates=update, auto_site_grouping=auto_site_grouping, stringent_site_grouping=stringent_site_grouping)

## Proteins-peptides

In [43]:
df.query(f"allosteric_pdb in {prots.allosteric_pdb.to_list()}").allosteric_pdb.value_counts().sort_values()

allosteric_pdb
5LVP    1
6I53    1
4O3T    1
4O3U    1
1L5G    1
       ..
6EOP    1
6IKM    1
5LUQ    1
5N70    1
6IZW    1
Name: count, Length: 79, dtype: int64

PDBs that have an annotation of an allosteric protein/peptide only have that single entry in the dataset.

In [44]:
protein_chains = {}

Assuming that if `_alias` has a chain name, the whole chain is the annotated modulator:

In [45]:
prots[prots.modulator_alias.str.contains(r'(Chain|chain)', na=False)]

  prots[prots.modulator_alias.str.contains(r'(Chain|chain)', na=False)]


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
27,ASD00180000_1,ADORA2A,Homo sapiens,P29274,3VG9,ASD00180006;ASD00180006,Chain_B;Chain_C,B;C,Pep,Inhibitor,,Chain_B;Chain_C,Inner Protein Regulator,Protein-Protein Interaction,22286059.0,G-protein-coupled receptor inactivation by an ...,No,3VG9
28,ASD00180000_1,ADORA2A,Homo sapiens,P29274,3VGA,ASD00180006;ASD00180006,Chain_B;Chain_C,B;C,Pep,Inhibitor,,Chain_B;Chain_C,Inner Protein Regulator,Protein-Protein Interaction,22286059.0,G-protein-coupled receptor inactivation by an ...,No,3VGA
736,ASD01470000_1,MET,Human immunodeficiency virus type 1,P08581,4O3T,ASD01470017,Chain P,P;P;P;P;P;P;P;P;P,Pep,Activator,IVGGYPWWM,495;496;497;498;499;500;501;502;503,Inner Protein Regulator,Inner Protein,24859116.0,An allosteric switch for pro-HGF/Met signaling...,No,"Chain P:VAL496,TRP501,GLY497,PRO500,TRP502,ILE..."
737,ASD01470000_1,MET,Human immunodeficiency virus type 1,P08581,4O3U,ASD01470059,Chain P,P;P;P;P;P;P;P;P;P;P;P;P;P;P,Pep,Activator,IIGGCPYWMDREEC,495;496;497;498;499;500;501;502;503;504;505;50...,Inner Protein Regulator,Inner Protein,24859116.0,An allosteric switch for pro-HGF/Met signaling...,No,Chain B:THR222
1246,ASD03380000_1,degS,Escherichia coli,P0AEE3,1SOZ,ASD03380004,Chain D,D;D;D;D,Pep,Activator,VYQF,407;408;409;410,Inner Protein Regulator,Inner Protein,15137941.0,Crystal structure of the DegS stress sensor: H...,No,"Chain D:GLN409,PHE410,TYR408,VAL407"
1247,ASD03380000_1,degS,Escherichia coli,P0AEE3,2R3Y,ASD03380009,Chain D,D;D;D;D,Pep,Activator,VYWF,407;408;409;410,Inner Protein Regulator,Inner Protein,17938245.0,Regulation of the sigmaE stress response by De...,No,"Chain D:VAL407,PHE410,TYR408,TRP409"
1248,ASD03380000_1,degS,Escherichia coli,P0AEE3,3GCN,ASD03380005,Chain B,B;B;B,Pep,Activator,YQF,408;409;410,Inner Protein Regulator,Inner Protein,,Mechanisms of allosteric activation of the Deg...,No,"Chain B:GLN409,TYR408,PHE410"
1249,ASD03380000_1,degS,Escherichia coli,P0AEE3,3GCO,ASD03380004,Chain B,B;B;B;B,Pep,Activator,VYQF,407;408;409;410,Inner Protein Regulator,Inner Protein,19836340.0,OMP peptides activate the DegS stress-sensor p...,No,"Chain B:GLN409,TYR408,PHE410,VAL407"
1250,ASD03380000_1,degS,Escherichia coli,P0AEE3,3GDS,ASD03380007,Chain B,B;B;B;B,Pep,Activator,VYYF,407;408;409;410,Inner Protein Regulator,Inner Protein,19836340.0,OMP peptides activate the DegS stress-sensor p...,No,"Chain B:TYR409,TYR408,PHE410,VAL407"
1251,ASD03380000_1,degS,Escherichia coli,P0AEE3,3GDU,ASD03380006,Chain D,D;D;D,Pep,Activator,YRF,408;409;410,Inner Protein Regulator,Inner Protein,19836340.0,OMP peptides activate the DegS stress-sensor p...,No,"Chain D:ARG409,PHE410,TYR408"


In [46]:
for i, pdb in prots[prots.modulator_alias.str.contains(r'(Chain|chain)', na=False)].iterrows():
    protein_chains.update({pdb.allosteric_pdb: pdb.modulator_alias.upper().replace('_', ' ').replace('CHAIN ', '').split(";")})
protein_chains

  for i, pdb in prots[prots.modulator_alias.str.contains(r'(Chain|chain)', na=False)].iterrows():


{'3VG9': ['B', 'C'],
 '3VGA': ['B', 'C'],
 '4O3T': ['P'],
 '4O3U': ['P'],
 '1SOZ': ['D'],
 '2R3Y': ['D'],
 '3GCN': ['B'],
 '3GCO': ['B'],
 '3GDS': ['B'],
 '3GDU': ['D'],
 '3GDV': ['D'],
 '4RQZ': ['D'],
 '2PBK': ['C']}

In [47]:
prots.query(f'allosteric_pdb not in {list(protein_chains.keys())}')[
    ['modulator_alias', 'modulator_chain', 'modulator_name', 'modulator_resi']
][:33]

Unnamed: 0,modulator_alias,modulator_chain,modulator_name,modulator_resi
24,HM-peptide,E,hydrophobic-motif peptide of PKB/Akt,1-15
482,,,Megabody38,
841,RGDF-MVA,C;C;C;C;C;,RGDF-MVA,5001;5002;5003;5004;5005
949,ACE;DTR;LEU;ASP;GLN;ILE;VAL;TRP;PHE;ASN;ALA;PR...,B;B;B;B;B;B;B;B;B;B;B;B;B;B;B;B;B,ethanal;(2R)-2-amino-3-(1H-indol-3-yl)propanoi...,0;1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16
1110,CYS;GLN;LEU;TYR,D;D;D;D,"(2R)-2-amino-3-sulfanyl-propanoic acid;(2S)-2,...",66;67;68;69
1182,TPX2,"B,E",TARGETING PROTEIN FOR XKLP2,1-43
1190,MB1,,MB1 MONOBODY,1-96
1191,Ac-MARVDQTPRIATKETGESLTINCVLRDTACALDSTNWYRTKLG...,,New antigen receptor variable domain,
1192,Ac-MARVDQTPRIATKETGESLTINCVLRDTACALDSTNWYRTKLG...,,New antigen receptor variable domain,
1193,Ac-MARVDQTPRIATKETGESLTINCVLRDTACALDSTNWYRTKLG...,,New antigen receptor variable domain,


In [48]:
prots.query(f'allosteric_pdb not in {list(protein_chains.keys())}')[
    ['modulator_alias', 'modulator_chain', 'modulator_name', 'modulator_resi']
][33:]

Unnamed: 0,modulator_alias,modulator_chain,modulator_name,modulator_resi
1750,ARG;GLY;ARG;TRP;GLN;VAL;TRP;GLY;LEU;ALA;LYS;AR...,C;C;C;C;C;C;C;C;C;C;C;C;C,,4;5;6;7;8;9;10;11;12;13;14;15;16
1751,SER;ARG;VAL;VAL;VAL;TRP;ARG;LEU;PRO;PRO;LEU;HIS,C;C;C;C;C;C;C;C;C;C;C;C,,4;5;6;7;8;9;10;11;12;13;14;15
1752,SER;VAL;LEU;ALA;ALA;ARG;ALA;ARG;MET;TRP;MET;TR...,C;C;C;C;C;C;C;C;C;C;C;C;C;C,,3;4;5;6;7;8;9;10;11;12;13;14;15;16
1771,0G6,B,"D-phenylalanyl-N-[(2S,3S)-6-{[amino(iminio)met...",801
2021,GS-5745,"L,M,N,H,I,J",GS-5745 Fab,
2029,rF10 repebody,B,rF10 repebody,1-266
2094,5D3,"C,D,E,F",5D3-Fab,
2123,Nb7,B,Camelid-Derived Antibody Fragment Nb7,1-247
2124,Nb7,B,Camelid-Derived Antibody Fragment Nb7,1-247
2125,Nb7,B,Camelid-Derived Antibody Fragment Nb7,1-247


Further assuming that if the number of residues annotated in `_resi` coincide with the number of residues that the annotated chain(s) in `_chain` has(ve) in the PDB, the whole chain(s) is(are) the annotated modulator(s):

In [49]:
for i, pdb in prots.query(f'allosteric_pdb not in {list(protein_chains.keys())}').iterrows():
    resi = pdb.modulator_resi
    if isinstance(resi, str) and isinstance(pdb.modulator_chain, str):
        if "-" in resi:
            reslist = tuple(range(int(resi.split("-")[0]), int(resi.split("-")[-1])+1))
        elif ';' in resi or ',' in resi:
            reslist = resi.strip(";,").replace(', ', ',').replace(';', ',').replace(',', ', ').split(", ")
        else:
            continue

        chains = list(set(pdb.modulator_chain.strip(";,").replace(', ', ',').replace(';', ',').replace(',', ', ').split(", ")))
        
        residues = (
            pd.DF(
                utils.MMCIF2Dict().parse(
                    utils.PDBCif._get_cif(
                        pdb.allosteric_pdb.lower(), save=True
                    )['filename']
                )[pdb.allosteric_pdb]["_atom_site"]).drop(
                [
                    'group_PDB', 'id', 'type_symbol',
                    'auth_atom_id', 'label_atom_id', 'label_alt_id',
                    'Cartn_x', 'Cartn_y', 'Cartn_z',
                    'occupancy', 'B_iso_or_equiv', 'pdbx_formal_charge'
                ],
                axis=1
            ).drop_duplicates()
            .query(f"auth_asym_id in {chains} and auth_comp_id != 'HOH'")
        )
        
        if len(reslist) == len(residues):
            protein_chains.update({pdb.allosteric_pdb: chains})
protein_chains

Downloading 5lvp
Downloading 1l5g
Downloading 3wmg
Downloading 4fgt
Downloading 4c3p
Downloading 3av9
Downloading 3ava
Downloading 3avb
Downloading 3avc
Downloading 3avf
Downloading 3avg
Downloading 3avh
Downloading 3avi
Downloading 3avj
Downloading 3avk
Downloading 3avl
Downloading 3avm
Downloading 3avn
Downloading 5oyj
Downloading 4ejf
Downloading 4z6a
Downloading 3alo
Downloading 5e95
Downloading 2ns8
Downloading 3zqf
Downloading 3zqg
Downloading 3zqh
Downloading 3zqi
Downloading 6htf
Downloading 5lhn
Downloading 5lhp
Downloading 5lhq
Downloading 4h36
Downloading 4h39
Downloading 4h3b
Downloading 5ttw
Downloading 5c6d
Downloading 4yga
Downloading 4z61
Downloading 5k1a
Downloading 5k1c
Downloading 5cus


{'3VG9': ['B', 'C'],
 '3VGA': ['B', 'C'],
 '4O3T': ['P'],
 '4O3U': ['P'],
 '1SOZ': ['D'],
 '2R3Y': ['D'],
 '3GCN': ['B'],
 '3GCO': ['B'],
 '3GDS': ['B'],
 '3GDU': ['D'],
 '3GDV': ['D'],
 '4RQZ': ['D'],
 '2PBK': ['C'],
 '1L5G': ['C'],
 '4FGT': ['D'],
 '3AV9': ['X'],
 '3AVA': ['X'],
 '3AVB': ['X'],
 '3AVF': ['D'],
 '3AVG': ['F'],
 '3AVH': ['F'],
 '3AVI': ['F'],
 '3AVJ': ['F'],
 '3AVK': ['F'],
 '3AVL': ['F'],
 '3AVM': ['F'],
 '3AVN': ['H'],
 '4EJF': ['E'],
 '3ALO': ['E'],
 '2NS8': ['E'],
 '3ZQF': ['C'],
 '3ZQG': ['C'],
 '3ZQH': ['C'],
 '3ZQI': ['C'],
 '4H36': ['B'],
 '4H39': ['B'],
 '4H3B': ['B'],
 '5TTW': ['B']}

In [50]:
prots.query(f'allosteric_pdb not in {list(protein_chains.keys())}')[
    ['allosteric_pdb', 'modulator_alias', 'modulator_chain', 'modulator_name', 'modulator_resi']
]

Unnamed: 0,allosteric_pdb,modulator_alias,modulator_chain,modulator_name,modulator_resi
24,5LVP,HM-peptide,E,hydrophobic-motif peptide of PKB/Akt,1-15
482,6I53,,,Megabody38,
949,3WMG,ACE;DTR;LEU;ASP;GLN;ILE;VAL;TRP;PHE;ASN;ALA;PR...,B;B;B;B;B;B;B;B;B;B;B;B;B;B;B;B;B,ethanal;(2R)-2-amino-3-(1H-indol-3-yl)propanoi...,0;1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16
1182,4C3P,TPX2,"B,E",TARGETING PROTEIN FOR XKLP2,1-43
1190,5G15,MB1,,MB1 MONOBODY,1-96
1191,5L8J,Ac-MARVDQTPRIATKETGESLTINCVLRDTACALDSTNWYRTKLG...,,New antigen receptor variable domain,
1192,5L8K,Ac-MARVDQTPRIATKETGESLTINCVLRDTACALDSTNWYRTKLG...,,New antigen receptor variable domain,
1193,5L8L,Ac-MARVDQTPRIATKETGESLTINCVLRDTACALDSTNWYRTKLG...,,New antigen receptor variable domain,
1238,5DC4,VSSVPTKLEVVAATPTSLLISWDAPAVTVDYYVITYGETGGWSGYQ...,B,AS25 monobody,
1270,3AVC,SER;ASP;LYS;ILE;ASP;ASN,F;F;F;F;F;F,(2S)-2-amino-3-hydroxy-propanoic acid;(2S)-2-a...,1;2;3;4;5;6


After manual inspection, every entry has a correct _chain annotated, except:
- 6PX5 and 6IZW annotations are small-molecule modulators
- 5UVG annotation is a domain of the single protein entity that is in the PDB
- The ones that have NaN _chain:

In [51]:
NaNchain = {
    '6I53': ['G'],
    '5G15': ["B"],
    '5L8J': ["B"],
    '5L8K': ["B"],
    '5L8L': ["B"],
    '6I2P': ['D'], # regulatory protein # in the model there is also an 'E' chain that looks like it occupies the same position in another rep. of the protein chain, but these two are not in the assembly; the dataset entry has a 301 _resi but is not in the pdb
    '5BJZ': ["C", "D", "L", "H"],
    '5NIF': ['3', '4'],
    '6IKM': 'a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r'.split(",")
}

In [52]:
df.query("allosteric_pdb in ['6PX5', '6IZW']")

Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
1771,ASD05870000_1,F2,Homo sapiens,P00734,6PX5,,0G6,B,pep,Inhibitor,"D-phenylalanyl-N-[(2S,3S)-6-{[amino(iminio)met...",801,Inner Protein Regulator,Allosteric position,,,Yes,6PX5
2920,ASD17910000_1,mglA,Myxococcus xanthus (strain DK1622),Q1DB04,6IZW,,GSP,A,pep,activator,5'-GUANOSINE-DIPHOSPHATE-MONOTHIOPHOSPHATE,301,Allosteric function,Allosteric position,,,yes,


In [53]:
prots = prots.merge(
    df.query("allosteric_pdb in ['6PX5', '6IZW']"),
    how="outer", indicator=True
).query(f"_merge == 'left_only'").drop("_merge", axis=1)

In [54]:
for i, pdb in prots.query(f'allosteric_pdb not in {list(protein_chains.keys()) + ["5UVG"]}').iterrows():
    if isinstance(pdb.modulator_chain, str):
        chains = list(set(pdb.modulator_chain.strip(";,").replace(', ', ',').replace(';', ',').replace(',', ', ').split(", ")))
        protein_chains.update({pdb.allosteric_pdb: chains})
protein_chains

{'3VG9': ['B', 'C'],
 '3VGA': ['B', 'C'],
 '4O3T': ['P'],
 '4O3U': ['P'],
 '1SOZ': ['D'],
 '2R3Y': ['D'],
 '3GCN': ['B'],
 '3GCO': ['B'],
 '3GDS': ['B'],
 '3GDU': ['D'],
 '3GDV': ['D'],
 '4RQZ': ['D'],
 '2PBK': ['C'],
 '1L5G': ['C'],
 '4FGT': ['D'],
 '3AV9': ['X'],
 '3AVA': ['X'],
 '3AVB': ['X'],
 '3AVF': ['D'],
 '3AVG': ['F'],
 '3AVH': ['F'],
 '3AVI': ['F'],
 '3AVJ': ['F'],
 '3AVK': ['F'],
 '3AVL': ['F'],
 '3AVM': ['F'],
 '3AVN': ['H'],
 '4EJF': ['E'],
 '3ALO': ['E'],
 '2NS8': ['E'],
 '3ZQF': ['C'],
 '3ZQG': ['C'],
 '3ZQH': ['C'],
 '3ZQI': ['C'],
 '4H36': ['B'],
 '4H39': ['B'],
 '4H3B': ['B'],
 '5TTW': ['B'],
 '5LVP': ['E'],
 '3WMG': ['B'],
 '4C3P': ['E', 'B'],
 '5DC4': ['B'],
 '3AVC': ['F'],
 '5OYJ': ['A'],
 '1DVA': ['Y'],
 '4Z6A': ['T'],
 '5E95': ['B'],
 '5TH9': ['J', 'N', 'H', 'L', 'I', 'M'],
 '6HTF': ['B'],
 '5NJ3': ['E', 'F', 'D', 'C'],
 '5LHN': ['B'],
 '5LHP': ['B'],
 '5LHQ': ['B'],
 '5JQH': ['D', 'C'],
 '4QBY': ['2'],
 '5C6D': ['B', 'A'],
 '4RIQ': ['A'],
 '4YGA': ['D', 'F', 'B'

In [55]:
prots.query(f'allosteric_pdb not in {list(protein_chains.keys()) + ["5UVG"]}')

Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
3,ASD01140000_3,GABRA1,Homo sapiens,P14867,6I53,,,,Pep,Regulator,Megabody38,,,,30602789.0,Cryo-EM structure of the human alpha 1 beta 3 ...,,6I53
10,ASD03200000_1,AURKA,Homo sapiens,O14965,5G15,,MB1,,Pep,Activator,MB1 MONOBODY,1-96,,,,"Monobodies as Novel, Highly Potent Allosteric ...",,5G15
11,ASD03200000_1,AURKA,Homo sapiens,O14965,5L8J,,Ac-MARVDQTPRIATKETGESLTINCVLRDTACALDSTNWYRTKLG...,,Pep,Inhibitor,New antigen receptor variable domain,,Allosteric Function,Protein-Protein,27411893.0,Allosteric inhibition of Aurora-A kinase by a ...,No,5L8J
12,ASD03200000_1,AURKA,Homo sapiens,O14965,5L8K,,Ac-MARVDQTPRIATKETGESLTINCVLRDTACALDSTNWYRTKLG...,,Pep,Inhibitor,New antigen receptor variable domain,,Allosteric Function,Protein-Protein,27411893.0,Allosteric inhibition of Aurora-A kinase by a ...,No,5L8K
13,ASD03200000_1,AURKA,Homo sapiens,O14965,5L8L,,Ac-MARVDQTPRIATKETGESLTINCVLRDTACALDSTNWYRTKLG...,,Pep,Inhibitor,New antigen receptor variable domain,,Allosteric Function,Protein-Protein,27411893.0,Allosteric inhibition of Aurora-A kinase by a ...,No,5L8L
37,ASD04020000_2,kgd,Mycobacterium smegmatis,A0R2B1,6I2P,,,,Pep,Inhibitor,Glycogen accumulation regulator GarA,301,Inner Protein Regulator,Protein-Protein Interaction,,,Yes,
66,ASD17270000_1,malE,Escherichia coli O157:H7,P0AEY0,5BJZ,,sAB-11M,,Pep,,Synthetic antibody,,,,,,,
72,ASD17340000_1,SCL1,Saccharomyces cerevisiae (strain ATCC 204508 /...,P21243,5NIF,,Blm-pep,,Pep,Activator,Blm-pep,14,Allosteric Fuction,Allosteric Position,28733623.0,Crystal structure of a low molecular weight ac...,No,
77,ASD17660000_1,spuE,Pseudomonas aeruginosa (strain ATCC 15692 / DS...,Q9I6J0,6IKM,,,,Pep,Inhibitor,Anti-SpuE Antibody,,,,,,,


In [56]:
protein_chains.update(NaNchain)
protein_chains

{'3VG9': ['B', 'C'],
 '3VGA': ['B', 'C'],
 '4O3T': ['P'],
 '4O3U': ['P'],
 '1SOZ': ['D'],
 '2R3Y': ['D'],
 '3GCN': ['B'],
 '3GCO': ['B'],
 '3GDS': ['B'],
 '3GDU': ['D'],
 '3GDV': ['D'],
 '4RQZ': ['D'],
 '2PBK': ['C'],
 '1L5G': ['C'],
 '4FGT': ['D'],
 '3AV9': ['X'],
 '3AVA': ['X'],
 '3AVB': ['X'],
 '3AVF': ['D'],
 '3AVG': ['F'],
 '3AVH': ['F'],
 '3AVI': ['F'],
 '3AVJ': ['F'],
 '3AVK': ['F'],
 '3AVL': ['F'],
 '3AVM': ['F'],
 '3AVN': ['H'],
 '4EJF': ['E'],
 '3ALO': ['E'],
 '2NS8': ['E'],
 '3ZQF': ['C'],
 '3ZQG': ['C'],
 '3ZQH': ['C'],
 '3ZQI': ['C'],
 '4H36': ['B'],
 '4H39': ['B'],
 '4H3B': ['B'],
 '5TTW': ['B'],
 '5LVP': ['E'],
 '3WMG': ['B'],
 '4C3P': ['E', 'B'],
 '5DC4': ['B'],
 '3AVC': ['F'],
 '5OYJ': ['A'],
 '1DVA': ['Y'],
 '4Z6A': ['T'],
 '5E95': ['B'],
 '5TH9': ['J', 'N', 'H', 'L', 'I', 'M'],
 '6HTF': ['B'],
 '5NJ3': ['E', 'F', 'D', 'C'],
 '5LHN': ['B'],
 '5LHP': ['B'],
 '5LHQ': ['B'],
 '5JQH': ['D', 'C'],
 '4QBY': ['2'],
 '5C6D': ['B', 'A'],
 '4RIQ': ['A'],
 '4YGA': ['D', 'F', 'B'

Transform the `protein_chains` dictionary into the format for updates, passing as identifiers of the allosteric modulator the chain ID as `auth_asym_id` and the entity ID of the protein that has the processed chain ID (to leave out e.g. waters and ions) as `label_entity_id`:

In [57]:
protein_updates = {}

for pdb, chains in tqdm(protein_chains.items()):
    data = utils.MMCIF2Dict().parse(
                utils.PDBCif._get_cif(
                    pdb.lower(), save=True
                )['filename']
            )[pdb]

    protein_entities = (
        pd.DF(data["_entity_poly"], dtype=str)
        .query("type == 'polypeptide(L)'")
        .entity_id.to_list()
    )
    
    protein_residues = (
        pd.DF(data["_atom_site"]).drop(
            [
                'group_PDB', 'id', 'type_symbol',
                'auth_atom_id', 'label_atom_id', 'label_alt_id',
                'Cartn_x', 'Cartn_y', 'Cartn_z',
                'occupancy', 'B_iso_or_equiv', 'pdbx_formal_charge'
            ],
            axis=1
        ).drop_duplicates()
        .query(f"label_entity_id in {protein_entities}")
    )

    protein_updates.update({
        pdb: {
            "pdb": pdb, 
            "mods": [
                [{
                    "auth_asym_id": chain, 
                    "label_entity_id": protein_residues.query(f"auth_asym_id == '{chain}'").label_entity_id.unique().item()
                }] for chain in chains
            ]
        }
    })

protein_updates

  0%|          | 0/76 [00:00<?, ?it/s]

Downloading 3vg9
Downloading 3vga
Downloading 4o3t
Downloading 4o3u
Downloading 1soz
Downloading 2r3y
Downloading 3gcn
Downloading 3gco
Downloading 3gds
Downloading 3gdu
Downloading 3gdv
Downloading 4rqz
Downloading 2pbk
Downloading 5dc4
Downloading 1dva
Downloading 5th9
Downloading 5nj3
Downloading 5jqh
Downloading 4qby
Downloading 4riq
Downloading 5fvl
Downloading 5luq
Downloading 5n70
Downloading 5o03
Downloading 6eop
Downloading 6i53
Downloading 5g15
Downloading 5l8j
Downloading 5l8k
Downloading 5l8l
Downloading 6i2p
Downloading 5bjz
Downloading 5nif
Downloading 6ikm


{'3VG9': {'pdb': '3VG9',
  'mods': [[{'auth_asym_id': 'B', 'label_entity_id': '2'}],
   [{'auth_asym_id': 'C', 'label_entity_id': '3'}]]},
 '3VGA': {'pdb': '3VGA',
  'mods': [[{'auth_asym_id': 'B', 'label_entity_id': '2'}],
   [{'auth_asym_id': 'C', 'label_entity_id': '3'}]]},
 '4O3T': {'pdb': '4O3T',
  'mods': [[{'auth_asym_id': 'P', 'label_entity_id': '3'}]]},
 '4O3U': {'pdb': '4O3U',
  'mods': [[{'auth_asym_id': 'P', 'label_entity_id': '3'}]]},
 '1SOZ': {'pdb': '1SOZ',
  'mods': [[{'auth_asym_id': 'D', 'label_entity_id': '2'}]]},
 '2R3Y': {'pdb': '2R3Y',
  'mods': [[{'auth_asym_id': 'D', 'label_entity_id': '2'}]]},
 '3GCN': {'pdb': '3GCN',
  'mods': [[{'auth_asym_id': 'B', 'label_entity_id': '2'}]]},
 '3GCO': {'pdb': '3GCO',
  'mods': [[{'auth_asym_id': 'B', 'label_entity_id': '2'}]]},
 '3GDS': {'pdb': '3GDS',
  'mods': [[{'auth_asym_id': 'B', 'label_entity_id': '2'}]]},
 '3GDU': {'pdb': '3GDU',
  'mods': [[{'auth_asym_id': 'D', 'label_entity_id': '2'}]]},
 '3GDV': {'pdb': '3GDV',
 

Discard the two identified error cases:

In [58]:
errors.update({pdb.lower(): "Wrong annotation" for pdb in ["5UVG"]})
for i, row in prots.query('allosteric_pdb in ["5UVG", "5C6D"]').iterrows():
    error_entries.append(row)

prots = prots.merge(
    prots.query('allosteric_pdb in ["5UVG"]'),
    how="outer", indicator=True
).query(f"_merge == 'left_only'").drop("_merge", axis=1)

### Processing

In [59]:
iterprots = lambda x = (
    prots.merge(pd.concat(processed, axis=1).T, how="outer", indicator=True)
    .query(f"_merge == 'left_only'").drop("_merge", axis=1)
): tqdm(x.iterrows(), total=len(x), smoothing=0)

for i, entry in iterprots():
    process_entry(entry, updates=protein_updates)
    processed.append(entry)

  0%|          | 0/76 [00:00<?, ?it/s]

5LVP [[{'auth_asym_id': 'E', 'label_entity_id': '2'}]]
3VG9 [[{'auth_asym_id': 'B', 'label_entity_id': '2'}], [{'auth_asym_id': 'C', 'label_entity_id': '3'}]]
3vg9 ['Molecules of the annotated modulator(s) bind close together but were not grouped']
3VGA [[{'auth_asym_id': 'B', 'label_entity_id': '2'}], [{'auth_asym_id': 'C', 'label_entity_id': '3'}]]
3vga ['Molecules of the annotated modulator(s) bind close together but were not grouped']
6I53 [[{'auth_asym_id': 'G', 'label_entity_id': '4'}]]
4O3T [[{'auth_asym_id': 'P', 'label_entity_id': '3'}]]
4O3U [[{'auth_asym_id': 'P', 'label_entity_id': '3'}]]
1L5G [[{'auth_asym_id': 'C', 'label_entity_id': '3'}]]
3WMG [[{'auth_asym_id': 'B', 'label_entity_id': '2'}]]
4FGT [[{'auth_asym_id': 'D', 'label_entity_id': '2'}]]
4C3P [[{'auth_asym_id': 'E', 'label_entity_id': '2'}], [{'auth_asym_id': 'B', 'label_entity_id': '2'}]]
5G15 [[{'auth_asym_id': 'B', 'label_entity_id': '2'}]]
5L8J [[{'auth_asym_id': 'B', 'label_entity_id': '2'}]]
5L8K [[{'auth

In [60]:
errors

{'5uvg': 'Wrong annotation',
 '3vg9': ['Molecules of the annotated modulator(s) bind close together but were not grouped'],
 '3vga': ['Molecules of the annotated modulator(s) bind close together but were not grouped'],
 '5th9': ['Molecules of the annotated modulator(s) bind close together but were not grouped'],
 '5nj3': ['Molecules of the annotated modulator(s) bind close together but were not grouped'],
 '4riq': ['Molecules of the annotated modulator(s) bind close together but were not grouped'],
 '5bjz': ['Molecules of the annotated modulator(s) bind close together but were not grouped'],
 '5k1a': ['Molecules of the annotated modulator(s) bind close together but were not grouped']}

#### Error correction

In [61]:
pdb = "3vg9"
get_error(pdb)

ENTRIES: 1 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
1,ASD00180000_1,ADORA2A,Homo sapiens,P29274,3VG9,ASD00180006;ASD00180006,Chain_B;Chain_C,B;C,Pep,Inhibitor,,Chain_B;Chain_C,Inner Protein Regulator,Protein-Protein Interaction,22286059,G-protein-coupled receptor inactivation by an ...,No,3VG9


In [62]:
solve_error(pdb, {pdb.upper(): {"pdb": pdb.upper(), "mods": [
    {"label_asym_id": ["B", "C"]}
]}}, auto_site_grouping=False)

3VG9 [{'label_asym_id': ['B', 'C']}]


In [63]:
Site.get(Site.pdb == pdb).modulator, Site.get(Site.pdb ==  pdb).info

({'label_asym_id': ['B', 'C']},
 {'modulator_info': [{'modulator': [{'label_asym_id': 'B'}],
    'label_entity_id': '2',
    'type': 'polymer',
    'pdbx_description': 'antibody fab fragment light chain',
    'polymer_type': 'polypeptide(L)',
    'length': 212},
   {'modulator': [{'label_asym_id': 'C'}],
    'label_entity_id': '3',
    'type': 'polymer',
    'pdbx_description': 'antibody fab fragment heavy chain',
    'polymer_type': 'polypeptide(L)',
    'length': 224}],
  'interacting_chains_info': [{'label_entity_id': '1',
    'interacting_chains': {'label_asym_id': ['A']},
    'polymer_type': 'polypeptide(L)',
    'Uniprot': ['P29274']}],
  'source': {'allosteric_database': [{'entry': [{'target_id': 'ASD00180000_1',
       'target_gene': 'ADORA2A',
       'organism': 'Homo sapiens',
       'pdb_uniprot': 'P29274',
       'allosteric_pdb': '3VG9',
       'modulator_serial': 'ASD00180006;ASD00180006',
       'modulator_alias': 'Chain_B;Chain_C',
       'modulator_chain': 'B;C',
     

They are the heavy and light chains of an antibody that bind together.

<br>

In [64]:
pdb = "3vga"
get_error(pdb)

ENTRIES: 1 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
3,ASD00180000_1,ADORA2A,Homo sapiens,P29274,3VGA,ASD00180006;ASD00180006,Chain_B;Chain_C,B;C,Pep,Inhibitor,,Chain_B;Chain_C,Inner Protein Regulator,Protein-Protein Interaction,22286059,G-protein-coupled receptor inactivation by an ...,No,3VGA


In [65]:
solve_error(pdb, {pdb.upper(): {"pdb": pdb.upper(), "mods": [
    {"label_asym_id": ["B", "C"]}
]}}, auto_site_grouping=False)

3VGA [{'label_asym_id': ['B', 'C']}]


They are also the heavy and light chains of an antibody that bind together.

<br>

In [66]:
pdb = "5th9"
print(protein_updates[pdb.upper()])
get_error(pdb)

{'pdb': '5TH9', 'mods': [[{'auth_asym_id': 'J', 'label_entity_id': '2'}], [{'auth_asym_id': 'N', 'label_entity_id': '1'}], [{'auth_asym_id': 'H', 'label_entity_id': '2'}], [{'auth_asym_id': 'L', 'label_entity_id': '1'}], [{'auth_asym_id': 'I', 'label_entity_id': '2'}], [{'auth_asym_id': 'M', 'label_entity_id': '1'}]]}
ENTRIES: 1 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
3,ASD08780000_1,MMP9,Homo sapiens,P14780,5TH9,,GS-5745,"L,M,N,H,I,J",antibody,Inhibitor,GS-5745 Fab,,Allosteric function,Allosteric position,28235803,Biochemical characterization and structure det...,No,5TH9


In [67]:
solve_error(pdb, {pdb.upper(): {"pdb": pdb.upper(), "mods": [
    {"label_asym_id": ["A", "B"]},
    {"label_asym_id": ["D", "E"]},
    {"label_asym_id": ["G", "H"]},
]}}, auto_site_grouping=False)

5TH9 [{'label_asym_id': ['A', 'B']}, {'label_asym_id': ['D', 'E']}, {'label_asym_id': ['G', 'H']}]


In [68]:
Site.get(Site.pdb == pdb).modulator, Site.get(Site.pdb == pdb).related_sites

({'label_asym_id': ['A', 'B']},
 {'equivalent': [{'other_site': {'label_asym_id': ['D', 'E']},
    'res_of_other_in_site': 1.0,
    'res_of_site_in_other': 0.9722222222222222},
   {'other_site': {'label_asym_id': ['G', 'H']},
    'res_of_other_in_site': 0.9722222222222222,
    'res_of_site_in_other': 0.9722222222222222}],
  'nonequivalent': []})

They are pairs of heavy-light chains of an antibody that bind together, but the pairs are also close to each other but shouldn't be grouped.

<br>

In [69]:
pdb = "5nj3"
print(protein_updates[pdb.upper()])
get_error(pdb)

{'pdb': '5NJ3', 'mods': [[{'auth_asym_id': 'E', 'label_entity_id': '2'}], [{'auth_asym_id': 'F', 'label_entity_id': '3'}], [{'auth_asym_id': 'D', 'label_entity_id': '3'}], [{'auth_asym_id': 'C', 'label_entity_id': '2'}]]}
ENTRIES: 1 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
3,ASD09650000_1,ABCG2,Homo sapiens,Q9UNQ0,5NJ3,,5D3,"C,D,E,F",Pep,Inhibitor,5D3-Fab,,Allosteric function,Allosteric position,28554189,Structure of the human multidrug transporter A...,No,5NJ3


In [70]:
solve_error(pdb, {pdb.upper(): {"pdb": pdb.upper(), "mods": [
    {"label_asym_id": ["C", "D"]},
    {"label_asym_id": ["E", "F"]}
]}}, auto_site_grouping=False)

5NJ3 [{'label_asym_id': ['C', 'D']}, {'label_asym_id': ['E', 'F']}]


In [71]:
Site.get(Site.pdb == pdb).modulator, Site.get(Site.pdb == pdb).related_sites

({'label_asym_id': ['C', 'D']},
 {'equivalent': [{'other_site': {'label_asym_id': ['E', 'F']},
    'res_of_other_in_site': 1.1666666666666667,
    'res_of_site_in_other': 1.135135135135135}],
  'nonequivalent': []})

In [72]:
Site.get(Site.pdb == pdb).protein_residues.label_asym_id.unique(), Site.get(Site.pdb == pdb).nonredundant_site.protein_residues.label_asym_id.unique()

(array(['A', 'B'], dtype=object), array(['A', 'B'], dtype=object))

They are two pairs of heavy-light chains of an antibody that bind together, and each pair binds to a monomer of a receptor dimer in what is indeed identified as equivalent sites.

<br>

In [73]:
pdb = "4riq"
print(protein_updates[pdb.upper()])
get_error(pdb)

{'pdb': '4RIQ', 'mods': [[{'auth_asym_id': 'A', 'label_entity_id': '1'}]]}
ENTRIES: 1 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
3,ASD17200000_1,ASH2L,Homo sapiens,Q9UBL3,4RIQ,,DPY30,A,Pep,Regulator,Protein dpy-30 homolog,,Protein-Protein Interaction,Protein-Protein Interaction,25456412,Molecular Basis for DPY-30 Association to COMP...,No,4RIQ


In [74]:
solve_error(pdb, {pdb.upper(): {"pdb": pdb.upper(), "mods": [
    {"label_asym_id": ["A", "B"]},
    {"label_asym_id": ["D", "E"]},
    {"label_asym_id": ["G", "H"]},
    {"label_asym_id": ["J", "K"]},
    {"label_asym_id": ["M", "N"]},
    {"label_asym_id": ["P", "Q"]},
    {"label_asym_id": ["S", "T"]},
    {"label_asym_id": ["V", "X"]},
]}}, auto_site_grouping=False)

4RIQ [{'label_asym_id': ['A', 'B']}, {'label_asym_id': ['D', 'E']}, {'label_asym_id': ['G', 'H']}, {'label_asym_id': ['J', 'K']}, {'label_asym_id': ['M', 'N']}, {'label_asym_id': ['P', 'Q']}, {'label_asym_id': ['S', 'T']}, {'label_asym_id': ['V', 'X']}]


In [75]:
Site.get(Site.pdb == pdb).modulator, Site.get(Site.pdb == pdb).related_sites

({'label_asym_id': ['A', 'B']},
 {'equivalent': [{'other_site': {'label_asym_id': ['D', 'E']},
    'res_of_other_in_site': 1.105263157894737,
    'res_of_site_in_other': 0.9545454545454546},
   {'other_site': {'label_asym_id': ['G', 'H']},
    'res_of_other_in_site': 1.105263157894737,
    'res_of_site_in_other': 0.9545454545454546},
   {'other_site': {'label_asym_id': ['J', 'K']},
    'res_of_other_in_site': 1.037037037037037,
    'res_of_site_in_other': 1.2727272727272727},
   {'other_site': {'label_asym_id': ['M', 'N']},
    'res_of_other_in_site': 1.1818181818181819,
    'res_of_site_in_other': 1.1818181818181819},
   {'other_site': {'label_asym_id': ['P', 'Q']},
    'res_of_other_in_site': 1.0,
    'res_of_site_in_other': 1.0},
   {'other_site': {'label_asym_id': ['S', 'T']},
    'res_of_other_in_site': 1.08,
    'res_of_site_in_other': 1.2272727272727273},
   {'other_site': {'label_asym_id': ['V', 'X']},
    'res_of_other_in_site': 1.0,
    'res_of_site_in_other': 1.0454545454545

There are multiple copies of the annotated allosteric modulator protein (although it is bigger than the target protein), and they seem to bind in dimers to symmetrical dimers of the target protein, and will be added as such.

<br>

In [76]:
pdb = "5bjz"
print(protein_updates[pdb.upper()])
get_error(pdb)

{'pdb': '5BJZ', 'mods': [[{'auth_asym_id': 'C', 'label_entity_id': '2'}], [{'auth_asym_id': 'D', 'label_entity_id': '2'}], [{'auth_asym_id': 'L', 'label_entity_id': '3'}], [{'auth_asym_id': 'H', 'label_entity_id': '3'}]]}
ENTRIES: 1 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
3,ASD17270000_1,malE,Escherichia coli O157:H7,P0AEY0,5BJZ,,sAB-11M,,Pep,,Synthetic antibody,,,,,,,


In [77]:
solve_error(pdb, {pdb.upper(): {"pdb": pdb.upper(), "mods": [
    {"label_asym_id": ["B", "C"]},
    {"label_asym_id": ["E", "F"]},
]}}, auto_site_grouping=False)

5BJZ [{'label_asym_id': ['B', 'C']}, {'label_asym_id': ['E', 'F']}]


In [78]:
Site.get(Site.pdb == pdb).modulator, Site.get(Site.pdb == pdb).related_sites

({'label_asym_id': ['E', 'F']},
 {'equivalent': [{'other_site': {'label_asym_id': ['B', 'C']},
    'res_of_other_in_site': 0.9827586206896551,
    'res_of_site_in_other': 0.890625}],
  'nonequivalent': []})

Also two pairs of light-heavy antibody chains.

<br>

In [79]:
pdb = "5k1a"
print(protein_updates[pdb.upper()])
get_error(pdb)

{'pdb': '5K1A', 'mods': [[{'auth_asym_id': 'D', 'label_entity_id': '2'}], [{'auth_asym_id': 'F', 'label_entity_id': '2'}], [{'auth_asym_id': 'B', 'label_entity_id': '2'}], [{'auth_asym_id': 'H', 'label_entity_id': '2'}]]}
ENTRIES: 1 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
3,ASD17310000_1,USP12,Homo sapiens,O75317,5K1A,,UAF1,"B,D,F,H",Pep,Activator,WD repeat-containing protein 48,1-677,Protein-Protein Interaction,Protein-Protein Interaction,27373336,Allosteric Activation of Ubiquitin-Specific Pr...,No,5K1A


In [80]:
solve_error(pdb, {pdb.upper(): {"pdb": pdb.upper(), "mods": [
    {"label_asym_id": ["B"]},
    {"label_asym_id": ["D"]},
    {"label_asym_id": ["F"]},
    {"label_asym_id": ["H"]},
]}}, auto_site_grouping=False)

5K1A [{'label_asym_id': ['B']}, {'label_asym_id': ['D']}, {'label_asym_id': ['F']}, {'label_asym_id': ['H']}]


In [81]:
Site.get(Site.pdb == pdb).modulator, Site.get(Site.pdb == pdb).related_sites

({'label_asym_id': ['D']},
 {'equivalent': [],
  'nonequivalent': [{'other_site': {'label_asym_id': ['B']},
    'res_of_other_in_site': 0.7222222222222222,
    'res_of_site_in_other': 0.7222222222222222},
   {'other_site': {'label_asym_id': ['F']},
    'res_of_other_in_site': 0.5909090909090909,
    'res_of_site_in_other': 0.7222222222222222},
   {'other_site': {'label_asym_id': ['H']},
    'res_of_other_in_site': 0.5510204081632653,
    'res_of_site_in_other': 0.75}]})

In [82]:
[s.delete_instance() for s in PDB.get(PDB.entry_id == pdb).sites if s.modulator != {'label_asym_id': ['D']}]

[1]

The annotated allosteric modulator protein looks like it has a beta-barrel domain which can interact with the target protein with the "center" of the barrel, but also with a side of the barrel (in the model, in the assembly only the "center" interacts). Thus, each unit is annotated separately. "D" is recognized as nonequivalent because the laterally-interactin target protein chain has more missing residues in the interacting part than the rest and affects the res_of_X_in_Y calculation.

<br>

In [83]:
errors

{'5uvg': 'Wrong annotation'}

#### Exploration

In [84]:
set(k for site in Site.select() for k in site.modulator.keys())

{'label_asym_id'}

In [85]:
set(len(site.modulator["label_asym_id"]) for site in Site.select())

{1, 2}

In [86]:
[site.pdb for site in Site.select() if len(site.modulator["label_asym_id"]) == 2]

[<PDB: 5lvp>,
 <PDB: 4c3p>,
 <PDB: 4ejf>,
 <PDB: 5c6d>,
 <PDB: 5fvl>,
 <PDB: 3vg9>,
 <PDB: 3vga>,
 <PDB: 5th9>,
 <PDB: 5nj3>,
 <PDB: 4riq>,
 <PDB: 5bjz>]

The PDBs which we did not fix manually due to the modulator being pairs of (antibody) chains are 4c3p, 4ejf, 5fvl and 6ikm.

In [87]:
pdb = "4c3p"
site = Site.get(Site.pdb == pdb)
print(site.modulator, site.related_sites, Site.get(Site.pdb == pdb).info["source"]["allosteric_database"][0]["update"])
prots.query(f"allosteric_pdb == '{pdb.upper()}'")

{'label_asym_id': ['B', 'D']} {'equivalent': [], 'nonequivalent': []} {'pdb': '4C3P', 'mods': [[{'auth_asym_id': 'E', 'label_entity_id': '2'}], [{'auth_asym_id': 'B', 'label_entity_id': '2'}]]}


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
9,ASD03200000_1,AURKA,Homo sapiens,O14965,4C3P,,TPX2,"B,E",Pep,Activator,TARGETING PROTEIN FOR XKLP2,1-43,Allosteric function,Allosteric position,24867643,Molecular Mechanism of Aurora a Kinase Autopho...,No,4C3P


The entry is correct.

In [88]:
pdb = "4ejf"
site = Site.get(Site.pdb == pdb)
print(site.modulator, site.related_sites, Site.get(Site.pdb == pdb).info["source"]["allosteric_database"][0]["update"])
prots.query(f"allosteric_pdb == '{pdb.upper()}'")

{'label_asym_id': ['E', 'F']} {'equivalent': [{'other_site': {'label_asym_id': ['G', 'H']}, 'res_of_other_in_site': 0.9, 'res_of_site_in_other': 0.9473684210526315}], 'nonequivalent': []} {'pdb': '4EJF', 'mods': [[{'auth_asym_id': 'E', 'label_entity_id': '2'}]]}


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
38,ASD04390000_1,CASP6,Homo sapiens,P55212,4EJF,ASD04390003;ASD04390003;ASD04390003;ASD0439000...,ARG;LEU;HIS;CYS;VAL;GLU;TRP;THR;ILE;LEU;GLU;ARG,E;E;E;E;E;E;E;E;E;E;E;E,Pep,Inhibitor,[amino-[[(4S)-4-amino-5-hydroxy-5-oxo-pentyl]a...,39;40;41;42;43;44;45;46;47;48;49;50,Inner Protein Regulator,Protein-Protein Interaction,22683611,Allosteric peptides bind a caspase zymogen and...,No,4EJF


The entry is correct.

In [89]:
pdb = "5fvl"
site = Site.get(Site.pdb == pdb)
print(site.modulator, site.related_sites, Site.get(Site.pdb == pdb).info["source"]["allosteric_database"][0]["update"])
prots.query(f"allosteric_pdb == '{pdb.upper()}'")

{'label_asym_id': ['C', 'D']} {'equivalent': [], 'nonequivalent': []} {'pdb': '5FVL', 'mods': [[{'auth_asym_id': 'D', 'label_entity_id': '2'}], [{'auth_asym_id': 'C', 'label_entity_id': '2'}]]}


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
66,ASD17280000_1,VPS4,Saccharomyces cerevisiae (strain ATCC 204508 /...,P52917,5FVL,,VPS20,"C,D",Pep,Inhibitor,27 peptide,27,Protein-Protein Interaction,Protein-Protein Interaction,27075672,Structural Fine-Tuning of Mit Interacting Moti...,No,


The entry is correct.

In [90]:
pdb = "6ikm"
site = Site.get(Site.pdb == pdb)
print(site.modulator, site.related_sites, Site.get(Site.pdb == pdb).info["source"]["allosteric_database"][0]["update"])
prots.query(f"allosteric_pdb == '{pdb.upper()}'")

{'label_asym_id': ['BA']} {'equivalent': [{'other_site': {'label_asym_id': ['B', 'L']}, 'res_of_other_in_site': 0.9565217391304348, 'res_of_site_in_other': 0.3793103448275862}, {'other_site': {'label_asym_id': ['D', 'F', 'X']}, 'res_of_other_in_site': 0.8461538461538461, 'res_of_site_in_other': 0.7586206896551724}, {'other_site': {'label_asym_id': ['N', 'R', 'DA']}, 'res_of_other_in_site': 0.84, 'res_of_site_in_other': 0.7241379310344828}, {'other_site': {'label_asym_id': ['V', 'Z', 'FA']}, 'res_of_other_in_site': 0.8490566037735849, 'res_of_site_in_other': 0.7758620689655172}, {'other_site': {'label_asym_id': ['H']}, 'res_of_other_in_site': 1.0, 'res_of_site_in_other': 0.39655172413793105}, {'other_site': {'label_asym_id': ['HA']}, 'res_of_other_in_site': 0.94, 'res_of_site_in_other': 0.8103448275862069}, {'other_site': {'label_asym_id': ['J']}, 'res_of_other_in_site': 0.8979591836734694, 'res_of_site_in_other': 0.7586206896551724}, {'other_site': {'label_asym_id': ['JA']}, 'res_of_ot

Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
76,ASD17660000_1,spuE,Pseudomonas aeruginosa (strain ATCC 15692 / DS...,Q9I6J0,6IKM,,,,Pep,Inhibitor,Anti-SpuE Antibody,,,,,,,


In [91]:
chains6ikm = site.pdb.residues.query("label_entity_id == '2'").label_asym_id.unique().tolist()
chains6ikm

['B',
 'D',
 'F',
 'H',
 'J',
 'L',
 'N',
 'P',
 'R',
 'T',
 'V',
 'X',
 'Z',
 'BA',
 'DA',
 'FA',
 'HA',
 'JA']

In [92]:
site.delete_instance()

1

In [93]:
error_entries.append(prots.query(f"allosteric_pdb == '{pdb.upper()}'").squeeze())

In [94]:
solve_error(pdb, {pdb.upper(): {"pdb": pdb.upper(), "mods": [
    {"label_asym_id": [chain]} for chain in chains6ikm
]}}, auto_site_grouping=False)

6IKM [{'label_asym_id': ['B']}, {'label_asym_id': ['D']}, {'label_asym_id': ['F']}, {'label_asym_id': ['H']}, {'label_asym_id': ['J']}, {'label_asym_id': ['L']}, {'label_asym_id': ['N']}, {'label_asym_id': ['P']}, {'label_asym_id': ['R']}, {'label_asym_id': ['T']}, {'label_asym_id': ['V']}, {'label_asym_id': ['X']}, {'label_asym_id': ['Z']}, {'label_asym_id': ['BA']}, {'label_asym_id': ['DA']}, {'label_asym_id': ['FA']}, {'label_asym_id': ['HA']}, {'label_asym_id': ['JA']}]


In [95]:
Site.get(Site.pdb == pdb).modulator, Site.get(Site.pdb == pdb).related_sites

({'label_asym_id': ['BA']},
 {'equivalent': [{'other_site': {'label_asym_id': ['B']},
    'res_of_other_in_site': 1.0,
    'res_of_site_in_other': 0.3793103448275862},
   {'other_site': {'label_asym_id': ['D']},
    'res_of_other_in_site': 0.8363636363636363,
    'res_of_site_in_other': 0.7931034482758621},
   {'other_site': {'label_asym_id': ['F']},
    'res_of_other_in_site': 1.0,
    'res_of_site_in_other': 0.39655172413793105},
   {'other_site': {'label_asym_id': ['H']},
    'res_of_other_in_site': 1.0,
    'res_of_site_in_other': 0.39655172413793105},
   {'other_site': {'label_asym_id': ['J']},
    'res_of_other_in_site': 0.8979591836734694,
    'res_of_site_in_other': 0.7586206896551724},
   {'other_site': {'label_asym_id': ['L']},
    'res_of_other_in_site': 0.96,
    'res_of_site_in_other': 0.41379310344827586},
   {'other_site': {'label_asym_id': ['N']},
    'res_of_other_in_site': 0.9318181818181818,
    'res_of_site_in_other': 0.7068965517241379},
   {'other_site': {'label_a

The asymmetric unit is very big and contains many copies of all the entities. However, in the assembly it is seen that the functional form is single heterodimers of the annotated allosteric modulator protein and its target protein, thus the correct annotation should be a single chain.

In [96]:
errors

{'5uvg': 'Wrong annotation'}

In [97]:
pd.DF(error_entries)

Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
0,ASD15440000_1,UHRF1,Homo sapiens,Q96T88,5C6D,,USP7,"A,B",Pep,modulator,Ubiquitin carboxyl-terminal hydrolase 7,1-322,Protein-Protein Interaction,Protein-Protein Interaction,26299963,An Allosteric Interaction Links USP7 to Deubiq...,No,5C6D
1,ASD17360000_1,SMPD3,Homo sapiens,Q9NY59,5UVG,,CAT,A,Pep,Activator,soluble catalytic domain of nSMase2,,Inner Protein regulator,Inner Protein,28652336,Structure of human nSMase2 reveals an interdom...,No,5UVG


In [98]:
# response = requests.get(f"https://rest.uniprot.org/uniprotkb/{uniprot_id}.json?include=yes")
# assert response.status_code != 404, f"Uniprot not found: {response.status_code}"
# try:
#     sequence = list(json.loads(response.text)["sequence"]["value"])


# "https://www.ebi.ac.uk/pdbe/static/files/pdbechem_v2/CHH.cif"

## Multiple modulators

In [99]:
df.query(f"allosteric_pdb in {multiple.allosteric_pdb.to_list()}").allosteric_pdb.value_counts().sort_values()

allosteric_pdb
4YPQ    1
4FYY    1
1CKK    1
1IQ5    1
1NWD    1
       ..
3ETE    2
3FYH    2
1YP4    2
2DEY    2
6Q4D    3
Name: count, Length: 131, dtype: int64

In [100]:
df.query(f'allosteric_pdb in {df.query(f"allosteric_pdb in {multiple.allosteric_pdb.to_list()}").allosteric_pdb.value_counts().loc[lambda x: x>1].index.to_list()}').sort_values("allosteric_pdb")

Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
1095,ASD02670000_1,ALB,Homo sapiens,P02768,1E7C,ASD02670007;ASD02670007,HLT;HLT,A;A,Lig,Inhibitor,"(2R)-2-bromo-2-chloro-1,1,1-trifluoro-ethane",4001;4002,Inner Protein Regulator,Inner Protein,10940306.0,Binding of the general anesthetics propofol an...,No,1E7C
1094,ASD02670000_1,ALB,Homo sapiens,P02768,1E7C,ASD02670007,HLT,A,Lig,Inhibitor,"(2R)-2-bromo-2-chloro-1,1,1-trifluoro-ethane",4005,Inner Protein Regulator,Inner Protein,10940306.0,Binding of the general anesthetics propofol an...,No,1E7C
2359,ASD11630000_1,,Homo sapiens,Q9UM07,1WD9,ASD11630001;ASD11630001;ASD11630001,CA;CA;CA,A;A;A,Ion,Activator,CALCIUM ION;CALCIUM ION;CALCIUM ION,901;902;904,Inner Protein Regulator,Inner Protein,15247907.0,Structural basis for Ca(2+)-induced activation...,No,1WD9
2358,ASD11630000_1,,Homo sapiens,Q9UM07,1WD9,ASD11630001,CA,A,Ion,Activator,CALCIUM ION,903,Inner Protein Regulator,Inner Protein,15247907.0,Structural basis for Ca(2+)-induced activation...,No,1WD9
2361,ASD11630000_1,,Homo sapiens,Q9UM07,1WDA,ASD11630001;ASD11630001;ASD11630001,CA;CA;CA,A;A;A,Ion,Activator,CALCIUM ION;CALCIUM ION;CALCIUM ION,901;902;904,Inner Protein Regulator,Inner Protein,15247907.0,Structural basis for Ca(2+)-induced activation...,No,1WDA
2360,ASD11630000_1,,Homo sapiens,Q9UM07,1WDA,ASD11630001,CA,A,Ion,Activator,CALCIUM ION,903,Inner Protein Regulator,Inner Protein,15247907.0,Structural basis for Ca(2+)-induced activation...,No,1WDA
56,ASD00250000_6,,Solanum tuberosum,P23509,1YP2,ASD02880002;ASD02880002,SO4;SO4,A;A,Ion,Activator,sulfate;sulfate,2000;2008,Inner Protein Regulator,Protein-Protein Interaction,15692569.0,Crystal structure of potato tuber ADP-glucose ...,No,"Chain A:ARG41,PRO47,ARG53,HIS84,GLN314,ARG316,..."
57,ASD00250000_6,,Solanum tuberosum,P23509,1YP2,ASD02880002,SO4,A,Ion,Activator,sulfate,2007,Inner Protein Regulator,Inner Protein,15692569.0,Crystal structure of potato tuber ADP-glucose ...,No,"Chain A:LYS69,GLU133,HIS134,THR135; Chain D:ARG83"
58,ASD00250000_6,,Solanum tuberosum,P23509,1YP3,ASD02880002;ASD02880002,SO4;SO4,A;A,Ion,Activator,sulfate;sulfate,1000;1008,Inner Protein Regulator,Protein-Protein Interaction,15692569.0,Crystal structure of potato tuber ADP-glucose ...,No,"Chain A:ARG41,PRO47,ARG53,SER80,HIS84,GLN314,A..."
59,ASD00250000_6,,Solanum tuberosum,P23509,1YP3,ASD02880002,SO4,A,Ion,Activator,sulfate,1007,Inner Protein Regulator,Inner Protein,15692569.0,Crystal structure of potato tuber ADP-glucose ...,No,"Chain A:LYS69,GLU133,HIS134,THR135; Chain D:ARG83"


Some PDBs that have an annotation of multiple modulators have other entries in the dataset with standard modulator ID fields, so these will simply be processed later.

In [101]:
import warnings

In [102]:
multiple_updates = {}
specchar = []

elelist = ["chain", "alias", "resi"]
elecols = ["modulator_chain", "modulator_alias", "modulator_resi"]


with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    for i, row in tqdm(multiple.iterrows(), total=len(multiple)):
        pdb = row.allosteric_pdb
        
        if (
            row[elecols].str.contains(r"(@|-|/|；)").sum()
            or row.str.contains(r'[a-zA-Z]', na=False)["modulator_resi"]
            or row.str.contains(r'[0-9]', na=False)["modulator_chain"]
        ):
            print(pdb, "Special characters in multiple modulator annotation")
            # errors[pdb.lower()] = "Special characters in multiple modulator annotation"
            # error_entries.append(row)
            specchar.append(row)
            continue
        else:
            if pdb in multiple_updates:
                pdbd = multiple_updates[pdb]
            else:
                pdbd = {"pdb": pdb, "mods": []}
                
            for element in elelist:
                if row[f'modulator_{element}'] is not np.nan:
                    exec(f"{element}s = row['modulator_{element}'].replace(' ', '').replace(',', ';').split(';')")
                else:
                    exec(f"{element}s = None")
            lens = [len(eval(f"{element}s")) for element in elelist if eval(f"{element}s") is not None]
            if len(set(lens) - {1,}) <= 1:
                if len(set(lens)) == 1 and all(eval(f"{element}s") is not None for element in elelist):
                    for chain, alias, resi in zip(chains, aliass, resis):
                        pdbd["mods"].append([{"auth_asym_id": chain, "auth_comp_id": alias, "auth_seq_id": resi}])
                else:
                    for i in range(max(lens)):
                        for element in elelist:
                            exec(f"{element} = {element}s[i] if {element}s is not None and len({element}s) > 1 else {element}s[0] if {element}s is not None else None")
                        pdbd["mods"].append([{k: v for k,v in [("auth_asym_id", chain), ("auth_comp_id", alias), ("auth_seq_id", resi)] if v is not None}])
                multiple_updates.update({pdb: pdbd})
            else:
                raise Exception(pdb, f"Mismatches in annotated multiple modulators: {lens}")
                break
        
multiple_updates

  0%|          | 0/131 [00:00<?, ?it/s]

6ZYU Special characters in multiple modulator annotation
4WX2 Special characters in multiple modulator annotation
6Q4D Special characters in multiple modulator annotation
5AO4 Special characters in multiple modulator annotation
4TME Special characters in multiple modulator annotation
5UE5 Special characters in multiple modulator annotation
5HUD Special characters in multiple modulator annotation
7LW1 Special characters in multiple modulator annotation
5XZA Special characters in multiple modulator annotation


{'1YP2': {'pdb': '1YP2',
  'mods': [[{'auth_asym_id': 'A',
     'auth_comp_id': 'SO4',
     'auth_seq_id': '2000'}],
   [{'auth_asym_id': 'A', 'auth_comp_id': 'SO4', 'auth_seq_id': '2008'}]]},
 '1YP3': {'pdb': '1YP3',
  'mods': [[{'auth_asym_id': 'A',
     'auth_comp_id': 'SO4',
     'auth_seq_id': '1000'}],
   [{'auth_asym_id': 'A', 'auth_comp_id': 'SO4', 'auth_seq_id': '1008'}]]},
 '1YP4': {'pdb': '1YP4',
  'mods': [[{'auth_asym_id': 'A',
     'auth_comp_id': 'SO4',
     'auth_seq_id': '1000'}],
   [{'auth_asym_id': 'A', 'auth_comp_id': 'SO4', 'auth_seq_id': '1008'}]]},
 '4ZHX': {'pdb': '4ZHX',
  'mods': [[{'auth_asym_id': 'E',
     'auth_comp_id': 'C2Z',
     'auth_seq_id': '401'}],
   [{'auth_asym_id': 'F', 'auth_comp_id': 'C2Z', 'auth_seq_id': '402'}]]},
 '4KH0': {'pdb': '4KH0',
  'mods': [[{'auth_asym_id': 'B',
     'auth_comp_id': 'ATP',
     'auth_seq_id': '202'}],
   [{'auth_asym_id': 'B', 'auth_comp_id': 'MG', 'auth_seq_id': '203'}],
   [{'auth_asym_id': 'B', 'auth_comp_id': 

#### Special characters

In [103]:
multiple.merge(pd.concat(specchar, axis=1).T)

Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
0,ASD01230000_2,Gria2,Rattus norvegicus,P19491,6ZYU,ASD01230694_2_ZCH@@ASD01230695_2_ZCH,QSZ@@QSW,"A,B,C@@ABC",,,,,,,34242002.0,,,
1,ASD02900000_2,trpB,Salmonella typhimurium,P0A2K1,4WX2,ASD02908001,F6F,"A,B",Lig,,2-{[4-(TRIFLUOROMETHOXY)BENZOYL]AMINO}ETHYL DI...,"301A,302A,404B",Protein-Protein Interaction,Protein-Protein Interaction,26708480.0,Visualizing the tunnel in tryptophan synthase ...,Yes,"Chain B:ASP176,ASN171,GLY193,SER178,CYS170,ALA..."
2,ASD04590000_1,CDK2,Homo sapiens,P24941,6Q4D,,HHT,A,Lig,Inhibitor,2-(4-bromanyl-2-methoxy-phenyl)ethanoic acid,302；303,,,,,,6Q4D
3,ASD09690000_1,SAMHD1,Homo sapiens,Q9Y3Z3,5AO4,ASD00290006,GTP,"A,C,D",Lig,Regulator,GUANOSINE-5'-TRIPHOSPHATE,1585/1586,Inner Protein Regulator,Allosteric Position,26431200.0,Phospho-Dependent Regulation of Samhd1 Oligome...,No,5AO4
4,ASD13000000_1,eutL,Clostridium perfringens,B1BQ33,4TME,ASD13000001,ETA,"A,B,C",Lig,Inhibitor,ETHANOLAMINE,219/220,Inner Protein Regulator,Allosteric Position,25752492.0,An allosteric model for control of pore openin...,No,"Chain B:HIS220,LEU218,GLU219,HIS221"
5,ASD15190000_1,MMP7,Homo sapiens,P09237,5UE5,ASD15190001;ASD15190002,IDS;SGN,A,Lig,Regulator,"N,O6-DISULFO-GLUCOSAMINE, 2-O-sulfo-alpha-L-id...",305-312,Inner Protein Regulator,Inner Protein,28648610.0,Glycan Activation of a Sheddase: Electrostatic...,No,5UE5
6,ASD16860000_1,,Corynebacterium glutamicum (strain ATCC 13032 ...,Q8NNL5,5HUD,ASD05860010,TRP,"A,B,C,D",Lig,Activator,TRYPTOPHAN,"A:509,B:515,C:532,D:528",Allosteric Function,Allosteric Position,29178787.0,Inter-Enzyme Allosteric Regulation of Chorisma...,No,"Chain D:ALA251,GLY242,LEU117,ALA202,ARG249,ALA..."
7,ASD18500000_1,PFKL,Homo sapiens,P17858,7LW1,ASD18500006,YG1,A@@D@@E@@F,compound,,"N-(2-(2-(5-hydroxypent-1-yn-1-yl)phenyl)-4H,10...",,,,34320407.0,,,7LW1
8,ASD21400000_1,pfkA,Staphylococcus aureus,P99165,5XZA,,ADP-Mg,A,lig,activator,ADENOSINE-5'-DIPHOSPHATE-MG,323,Protein-Protein Interaction,Allosteric position,,,No,


In [104]:
specchard = {
    "6ZYU": {
        "pdb": "6ZYU",
        "mods": [
            [{"auth_asym_id": "A", "auth_comp_id": "QSZ"}], 
            [{"auth_asym_id": "B", "auth_comp_id": "QSZ"}], 
            [{"auth_asym_id": "C", "auth_comp_id": "QSZ"}],
            [{"auth_asym_id": "A", "auth_comp_id": "QSW"}],
            [{"auth_asym_id": "B", "auth_comp_id": "QSW"}],
            [{"auth_asym_id": "A", "auth_comp_id": "QSW"}]
        ]
    },
    "4WX2": {
        "pdb": "4WX2",
        "mods": [
            [{"auth_asym_id": "A", "auth_comp_id": "F6F", "auth_seq_id": "301", "pdbx_PDB_ins_code": "A"}],
            [{"auth_asym_id": "A", "auth_comp_id": "F6F", "auth_seq_id": "302", "pdbx_PDB_ins_code": "A"}],
            [{"auth_asym_id": "A", "auth_comp_id": "F6F", "auth_seq_id": "404", "pdbx_PDB_ins_code": "B"}],
            [{"auth_asym_id": "B", "auth_comp_id": "F6F", "auth_seq_id": "301", "pdbx_PDB_ins_code": "A"}],
            [{"auth_asym_id": "B", "auth_comp_id": "F6F", "auth_seq_id": "302", "pdbx_PDB_ins_code": "A"}],
            [{"auth_asym_id": "B", "auth_comp_id": "F6F", "auth_seq_id": "404", "pdbx_PDB_ins_code": "B"}],
        ]
    },
    "6Q4D": {
        "pdb": "6Q4D",
        "mods": [
            [{"auth_asym_id": "A", "auth_comp_id": "HHT", "auth_seq_id": "302"}],
            [{"auth_asym_id": "A", "auth_comp_id": "HHT", "auth_seq_id": "303"}],
        ]
    },
    "5AO4": {
        "pdb": "5AO4",
        "mods": [
            [{"auth_asym_id": "A", "auth_comp_id": "GTP", "auth_seq_id": "1585"}],
            [{"auth_asym_id": "A", "auth_comp_id": "GTP", "auth_seq_id": "1586"}],
            [{"auth_asym_id": "C", "auth_comp_id": "GTP", "auth_seq_id": "1585"}],
            [{"auth_asym_id": "C", "auth_comp_id": "GTP", "auth_seq_id": "1586"}],
            [{"auth_asym_id": "D", "auth_comp_id": "GTP", "auth_seq_id": "1585"}],
            [{"auth_asym_id": "D", "auth_comp_id": "GTP", "auth_seq_id": "1586"}],
        ]
    },
    "4TME": {
        "pdb": "4TME",
        "mods": [
            [{"auth_asym_id": "A", "auth_comp_id": "ETA", "auth_seq_id": "219"}],
            [{"auth_asym_id": "A", "auth_comp_id": "ETA", "auth_seq_id": "220"}],
            [{"auth_asym_id": "B", "auth_comp_id": "ETA", "auth_seq_id": "219"}],
            [{"auth_asym_id": "B", "auth_comp_id": "ETA", "auth_seq_id": "220"}],
            [{"auth_asym_id": "C", "auth_comp_id": "ETA", "auth_seq_id": "219"}],
            [{"auth_asym_id": "C", "auth_comp_id": "ETA", "auth_seq_id": "220"}],
        ]
    },
    "5UE5": {
        "pdb": "5UE5",
        "mods": [
            [{"auth_asym_id": "A", "auth_comp_id": "IDS", "auth_seq_id": "305"}],
            [{"auth_asym_id": "A", "auth_comp_id": "SGN", "auth_seq_id": "312"}],
        ]
    },
    "5HUD": {
        "pdb": "5HUD",
        "mods": [
            [{"auth_asym_id": "A", "auth_comp_id": "TRP", "auth_seq_id": "509"}],
            [{"auth_asym_id": "B", "auth_comp_id": "TRP", "auth_seq_id": "515"}],
            [{"auth_asym_id": "C", "auth_comp_id": "TRP", "auth_seq_id": "532"}],
            [{"auth_asym_id": "D", "auth_comp_id": "TRP", "auth_seq_id": "528"}],
        ]
    },
    "7LW1": {
        "pdb": "7LW1",
        "mods": [
            [{"auth_asym_id": "A", "auth_comp_id": "YG1"}],
            [{"auth_asym_id": "D", "auth_comp_id": "YG1"}],
            [{"auth_asym_id": "E", "auth_comp_id": "YG1"}],
            [{"auth_asym_id": "F", "auth_comp_id": "YG1"}],
        ]
    },
    "5XZA": {
        "pdb": "5XZA",
        "mods": [
            [{"auth_asym_id": "A", "auth_comp_id": "ADP", "auth_seq_id": "323"}],
        ]
    },
}
specchard

{'6ZYU': {'pdb': '6ZYU',
  'mods': [[{'auth_asym_id': 'A', 'auth_comp_id': 'QSZ'}],
   [{'auth_asym_id': 'B', 'auth_comp_id': 'QSZ'}],
   [{'auth_asym_id': 'C', 'auth_comp_id': 'QSZ'}],
   [{'auth_asym_id': 'A', 'auth_comp_id': 'QSW'}],
   [{'auth_asym_id': 'B', 'auth_comp_id': 'QSW'}],
   [{'auth_asym_id': 'A', 'auth_comp_id': 'QSW'}]]},
 '4WX2': {'pdb': '4WX2',
  'mods': [[{'auth_asym_id': 'A',
     'auth_comp_id': 'F6F',
     'auth_seq_id': '301',
     'pdbx_PDB_ins_code': 'A'}],
   [{'auth_asym_id': 'A',
     'auth_comp_id': 'F6F',
     'auth_seq_id': '302',
     'pdbx_PDB_ins_code': 'A'}],
   [{'auth_asym_id': 'A',
     'auth_comp_id': 'F6F',
     'auth_seq_id': '404',
     'pdbx_PDB_ins_code': 'B'}],
   [{'auth_asym_id': 'B',
     'auth_comp_id': 'F6F',
     'auth_seq_id': '301',
     'pdbx_PDB_ins_code': 'A'}],
   [{'auth_asym_id': 'B',
     'auth_comp_id': 'F6F',
     'auth_seq_id': '302',
     'pdbx_PDB_ins_code': 'A'}],
   [{'auth_asym_id': 'B',
     'auth_comp_id': 'F6F',
  

In [105]:
multiple_updates.update(specchard)

In [106]:
import re

In [107]:
for pdb in multiple_updates.values():
    for m in pdb["mods"]:
        for d in m:
            try:
                assert all(not re.search(r"\W", v) for v in d.values()), (pdb, m)
            except Exception as e:
                print(e)

({'pdb': '5J8V', 'mods': [[{'auth_asym_id': 'A', 'auth_comp_id': 'Ca2+'}]]}, [{'auth_asym_id': 'A', 'auth_comp_id': 'Ca2+'}])


In [108]:
multiple_updates.update({
    "5J8V": {
        'pdb': '5J8V', 
        'mods': [ [{'auth_asym_id': 'A', 'auth_comp_id': 'CA'}] ],
    }
})

### Processing

In [109]:
itermultiple = lambda x = (
    multiple.merge(pd.DF(processed + error_entries), how="outer", indicator=True)
    .query(f"_merge == 'left_only'").drop("_merge", axis=1)
): tqdm(x.iterrows(), total=len(x), smoothing=0)

for i, entry in itermultiple():
    process_entry(entry, updates=multiple_updates)
    processed.append(entry)

  0%|          | 0/131 [00:00<?, ?it/s]

1YP2 [[{'auth_asym_id': 'A', 'auth_comp_id': 'SO4', 'auth_seq_id': '2000'}], [{'auth_asym_id': 'A', 'auth_comp_id': 'SO4', 'auth_seq_id': '2008'}]]
Downloading 1yp2
1yp2 ['Molecules of the annotated modulator(s) bind close together but were not grouped']
1YP3 [[{'auth_asym_id': 'A', 'auth_comp_id': 'SO4', 'auth_seq_id': '1000'}], [{'auth_asym_id': 'A', 'auth_comp_id': 'SO4', 'auth_seq_id': '1008'}]]
Downloading 1yp3
1yp3 ['Molecules of the annotated modulator(s) bind close together but were not grouped']
1YP4 [[{'auth_asym_id': 'A', 'auth_comp_id': 'SO4', 'auth_seq_id': '1000'}], [{'auth_asym_id': 'A', 'auth_comp_id': 'SO4', 'auth_seq_id': '1008'}]]
Downloading 1yp4
1yp4 ['Molecules of the annotated modulator(s) bind close together but were not grouped']
4ZHX [[{'auth_asym_id': 'E', 'auth_comp_id': 'C2Z', 'auth_seq_id': '401'}], [{'auth_asym_id': 'F', 'auth_comp_id': 'C2Z', 'auth_seq_id': '402'}]]
Downloading 4zhx
4zhx ['Molecules of the annotated modulator(s) bind close together but w

In [110]:
errors

{'5uvg': 'Wrong annotation',
 '1yp2': ['Molecules of the annotated modulator(s) bind close together but were not grouped'],
 '1yp3': ['Molecules of the annotated modulator(s) bind close together but were not grouped'],
 '1yp4': ['Molecules of the annotated modulator(s) bind close together but were not grouped'],
 '4zhx': ['Molecules of the annotated modulator(s) bind close together but were not grouped'],
 '4kh0': ['Molecules of the annotated modulator(s) bind close together but were not grouped'],
 '4kh1': ['Molecules of the annotated modulator(s) bind close together but were not grouped'],
 '4fyx': ['Molecules of the annotated modulator(s) bind close together but were not grouped'],
 '4fyy': ['Molecules of the annotated modulator(s) bind close together but were not grouped'],
 '4dqw': ['Molecules of the annotated modulator(s) bind close together but were not grouped'],
 '1po0': ['Molecules of the annotated modulator(s) bind close together but were not grouped'],
 '1po3': ['Molecules 

#### Error correction

In [111]:
errors_groups()

{'using only residue name retrieves more than one site(group)': ['4p86',
  '5tq2',
  '8dd3'],
 "couldn't retrieve modulator in pdb with using residue name": ['5j8v'],
 'Molecules of the annotated modulator(s) bind close together but were not grouped': ['1e7b',
  '1e7c',
  '1h9g',
  '1pcq',
  '1po0',
  '1po3',
  '1pzo',
  '1svt',
  '1w25',
  '1wda',
  '1yp2',
  '1yp3',
  '1yp4',
  '1z62',
  '2c2b',
  '2dew',
  '2dex',
  '2dw5',
  '2v0n',
  '3ete',
  '3fyh',
  '3pxf',
  '3pxq',
  '3pxz',
  '3py1',
  '4bzb',
  '4bzc',
  '4dkt',
  '4dn0',
  '4dqw',
  '4ez7',
  '4fyx',
  '4fyy',
  '4kh0',
  '4kh1',
  '4mz7',
  '4p02',
  '4pkn',
  '4pko',
  '4tme',
  '4wx2',
  '4zhx',
  '5im3',
  '5jyo',
  '5s4x',
  '6q4d',
  '6vvh'],
 'W': ['5uvg']}

##### Molecules of the annotated modulator(s) bind close together but were not grouped

In [112]:
pdb = "1e7b"
print(multiple_updates[pdb.upper()])
get_error(pdb)

{'pdb': '1E7B', 'mods': [[{'auth_asym_id': 'A', 'auth_comp_id': 'HLT', 'auth_seq_id': '4001'}], [{'auth_asym_id': 'A', 'auth_comp_id': 'HLT', 'auth_seq_id': '4002'}]]}
ENTRIES: 1 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
39,ASD02670000_1,ALB,Homo sapiens,P02768,1E7B,ASD02670007;ASD02670007,HLT;HLT,A;A,Lig,Inhibitor,"(2R)-2-bromo-2-chloro-1,1,1-trifluoro-ethane",4001;4002,Inner Protein Regulator,Inner Protein,10940305,Binding of the general anesthetics propofol an...,No,1E7B


In [113]:
df.query("allosteric_pdb == '1E7B'")

Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
1093,ASD02670000_1,ALB,Homo sapiens,P02768,1E7B,ASD02670007;ASD02670007,HLT;HLT,A;A,Lig,Inhibitor,"(2R)-2-bromo-2-chloro-1,1,1-trifluoro-ethane",4001;4002,Inner Protein Regulator,Inner Protein,10940305,Binding of the general anesthetics propofol an...,No,1E7B


In [114]:
solve_error(pdb, {pdb.upper(): 
                  multiple_updates[pdb.upper()]
}, auto_site_grouping=True, stringent_site_grouping=False)

1E7B [[{'auth_asym_id': 'A', 'auth_comp_id': 'HLT', 'auth_seq_id': '4001'}], [{'auth_asym_id': 'A', 'auth_comp_id': 'HLT', 'auth_seq_id': '4002'}]]


In [115]:
Site.get(Site.pdb == pdb).modulator, Site.get(Site.pdb == pdb).related_sites

({'label_asym_id': ['C', 'D']},
 {'equivalent': [{'other_site': {'label_asym_id': ['F', 'G']},
    'res_of_other_in_site': 0.9523809523809523,
    'res_of_site_in_other': 0.9523809523809523}],
  'nonequivalent': [{'other_site': {'label_asym_id': ['E']},
    'res_of_other_in_site': 0.0,
    'res_of_site_in_other': 0.0},
   {'other_site': {'label_asym_id': ['H']},
    'res_of_other_in_site': 0.0,
    'res_of_site_in_other': 0.0}]})

There is another molecule of the modulator in the structure that binds in a different site but that is not annotated in the dataset. These two definitely bind together.

<br>

In [116]:
pdb = "1e7c"
print(multiple_updates[pdb.upper()])
get_error(pdb)

{'pdb': '1E7C', 'mods': [[{'auth_asym_id': 'A', 'auth_comp_id': 'HLT', 'auth_seq_id': '4001'}], [{'auth_asym_id': 'A', 'auth_comp_id': 'HLT', 'auth_seq_id': '4002'}]]}
ENTRIES: 2 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
21,ASD02670000_1,ALB,Homo sapiens,P02768,1E7C,ASD02670007;ASD02670007,HLT;HLT,A;A,Lig,Inhibitor,"(2R)-2-bromo-2-chloro-1,1,1-trifluoro-ethane",4001;4002,Inner Protein Regulator,Inner Protein,10940306,Binding of the general anesthetics propofol an...,No,1E7C


In [117]:
df.query("allosteric_pdb == '1E7C'")

Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
1094,ASD02670000_1,ALB,Homo sapiens,P02768,1E7C,ASD02670007,HLT,A,Lig,Inhibitor,"(2R)-2-bromo-2-chloro-1,1,1-trifluoro-ethane",4005,Inner Protein Regulator,Inner Protein,10940306,Binding of the general anesthetics propofol an...,No,1E7C
1095,ASD02670000_1,ALB,Homo sapiens,P02768,1E7C,ASD02670007;ASD02670007,HLT;HLT,A;A,Lig,Inhibitor,"(2R)-2-bromo-2-chloro-1,1,1-trifluoro-ethane",4001;4002,Inner Protein Regulator,Inner Protein,10940306,Binding of the general anesthetics propofol an...,No,1E7C


In [118]:
solve_error(pdb, {pdb.upper(): 
                  multiple_updates[pdb.upper()]
}, auto_site_grouping=True, stringent_site_grouping=False)

1E7C [[{'auth_asym_id': 'A', 'auth_comp_id': 'HLT', 'auth_seq_id': '4001'}], [{'auth_asym_id': 'A', 'auth_comp_id': 'HLT', 'auth_seq_id': '4002'}]]


In [119]:
Site.get(Site.pdb == pdb).modulator, Site.get(Site.pdb == pdb).related_sites

({'label_asym_id': ['G', 'H', 'I']},
 {'equivalent': [],
  'nonequivalent': [{'other_site': {'label_asym_id': ['J', 'K']},
    'res_of_other_in_site': 0.0,
    'res_of_site_in_other': 0.0},
   {'other_site': {'label_asym_id': ['L']},
    'res_of_other_in_site': 0.0,
    'res_of_site_in_other': 0.0},
   {'other_site': {'label_asym_id': ['M']},
    'res_of_other_in_site': 0.0,
    'res_of_site_in_other': 0.0}]})

There are many molecules of the modulator in the structure, and besides the two that have provoked the error here there is an additional "simple" annotation of another (single chain id, res name and res id) that will be added later. These two definitely bind together and moreover bind together with a third copy of the molecule that is not in the dataset annotation.

<br>

In [120]:
pdb = "1h9g"
print(multiple_updates[pdb.upper()])
get_error(pdb)

{'pdb': '1H9G', 'mods': [[{'auth_asym_id': 'A', 'auth_comp_id': 'COA', 'auth_seq_id': '1228'}], [{'auth_asym_id': 'A', 'auth_comp_id': 'MYR', 'auth_seq_id': '1229'}]]}
ENTRIES: 1 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
41,ASD11520000_1,fadR,Escherichia coli,P0A8V6,1H9G,ASD11520001;ASD03320088,COA;MYR,A;A,Lig,Inhibitor,"[(2R,3S,4R,5R)-5-(6-aminopurin-9-yl)-4-hydroxy...",1228;1229,Protein-DNA/RNA Interaction Regulator,Inner Protein,11296236,The Structural Basis of Acyl Coenzyme A-Depend...,No,"Chain A:ASP99,LEU101,LEU102,SER103,VAL104,ARG1..."


In [121]:
solve_error(pdb, {pdb.upper(): 
                  multiple_updates[pdb.upper()]
}, auto_site_grouping=True, stringent_site_grouping=False)

1H9G [[{'auth_asym_id': 'A', 'auth_comp_id': 'COA', 'auth_seq_id': '1228'}], [{'auth_asym_id': 'A', 'auth_comp_id': 'MYR', 'auth_seq_id': '1229'}]]


In [122]:
Site.get(Site.pdb == pdb).modulator, Site.get(Site.pdb == pdb).related_sites

({'label_asym_id': ['B', 'C']}, {'equivalent': [], 'nonequivalent': []})

These two modulators bind together although they extend to very different parts of the structure, but they are covalently bound.

<br>

In [123]:
pdb = "1pcq"
print(multiple_updates[pdb.upper()])
get_error(pdb)

{'pdb': '1PCQ', 'mods': [[{'auth_asym_id': 'A', 'auth_comp_id': 'ADP', 'auth_seq_id': '600'}], [{'auth_asym_id': 'A', 'auth_comp_id': 'AF3', 'auth_seq_id': '602'}]]}
ENTRIES: 1 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
16,ASD01360000_1,groL,Escherichia coli,P0A6F5,1PCQ,ASD01720027;ASD01360010,ADP;AF3,A;A,Lig;Ion,Regulator,"[(2R,3S,4R,5R)-5-(6-aminopurin-9-yl)-3,4-dihyd...",600;602,Inner Protein Regulator,Inner Protein,14517228,Role of the gamma-phosphate of ATP in triggeri...,No,"Chain A:THR30,LEU31,GLY32,PRO33,LYS51,ASP52,GL..."


In [124]:
multiple_updates[pdb.upper()]

{'pdb': '1PCQ',
 'mods': [[{'auth_asym_id': 'A', 'auth_comp_id': 'ADP', 'auth_seq_id': '600'}],
  [{'auth_asym_id': 'A', 'auth_comp_id': 'AF3', 'auth_seq_id': '602'}]]}

In [125]:
solve_error(pdb, {pdb.upper(): {
    'pdb': '1PCQ',
    'mods': [
        [{'auth_asym_id': 'A', 'auth_comp_id': 'ADP', 'auth_seq_id': '600'}],
        [{'auth_asym_id': 'A', 'auth_comp_id': 'AF3', 'auth_seq_id': '602'}],
        [{'auth_asym_id': 'A', 'auth_comp_id': 'MG', 'auth_seq_id': '601'}],
        [{'auth_asym_id': 'A', 'auth_comp_id': 'K', 'auth_seq_id': '603'}],
    ]
    }
}, auto_site_grouping=True, stringent_site_grouping=False)

1PCQ [[{'auth_asym_id': 'A', 'auth_comp_id': 'ADP', 'auth_seq_id': '600'}], [{'auth_asym_id': 'A', 'auth_comp_id': 'AF3', 'auth_seq_id': '602'}], [{'auth_asym_id': 'A', 'auth_comp_id': 'MG', 'auth_seq_id': '601'}], [{'auth_asym_id': 'A', 'auth_comp_id': 'K', 'auth_seq_id': '603'}]]


In [126]:
Site.get(Site.pdb == pdb).modulator, Site.get(Site.pdb == pdb).related_sites

({'label_asym_id': ['Z', 'AA', 'BA', 'CA']},
 {'equivalent': [{'other_site': {'label_asym_id': ['DA', 'EA', 'FA', 'GA']},
    'res_of_other_in_site': 0.9736842105263158,
    'res_of_site_in_other': 1.0},
   {'other_site': {'label_asym_id': ['HA', 'IA', 'JA', 'KA']},
    'res_of_other_in_site': 1.0,
    'res_of_site_in_other': 1.0},
   {'other_site': {'label_asym_id': ['LA', 'MA', 'NA', 'OA']},
    'res_of_other_in_site': 1.0,
    'res_of_site_in_other': 1.0},
   {'other_site': {'label_asym_id': ['PA', 'QA', 'RA', 'SA']},
    'res_of_other_in_site': 1.0,
    'res_of_site_in_other': 1.0},
   {'other_site': {'label_asym_id': ['TA', 'UA', 'VA', 'WA']},
    'res_of_other_in_site': 1.0,
    'res_of_site_in_other': 1.0},
   {'other_site': {'label_asym_id': ['V', 'W', 'X', 'Y']},
    'res_of_other_in_site': 1.0,
    'res_of_site_in_other': 1.0}],
  'nonequivalent': []})

This ADP definitely bind together with the Aluminum Fluoride and, moreover, with a Mg and a K ions.

<br>

In [127]:
pdb = "1po0"
print(multiple_updates[pdb.upper()])
get_error(pdb)

{'pdb': '1PO0', 'mods': [[{'auth_asym_id': 'A', 'auth_comp_id': 'FLC', 'auth_seq_id': '742'}], [{'auth_asym_id': 'A', 'auth_comp_id': 'FLC', 'auth_seq_id': '743'}]]}
ENTRIES: 1 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
11,ASD01070000_1,fecA,Escherichia coli,P13036,1PO0,ASD00360007;ASD00360007,FLC;FLC,A;A,Ion,Regulator,"2-hydroxypropane-1,2,3-tricarboxylate;2-hydrox...",742;743,Inner Protein Regulator,Inner Protein,12948487,Structural evidence for iron-free citrate and ...,No,"Chain A:THR138,ARG155,LEU156,GLN176,GLN178,ARG..."


In [128]:
solve_error(pdb, {pdb.upper(): 
                  multiple_updates[pdb.upper()]
}, auto_site_grouping=True, stringent_site_grouping=False)

1PO0 [[{'auth_asym_id': 'A', 'auth_comp_id': 'FLC', 'auth_seq_id': '742'}], [{'auth_asym_id': 'A', 'auth_comp_id': 'FLC', 'auth_seq_id': '743'}]]


In [129]:
Site.get(Site.pdb == pdb).modulator, Site.get(Site.pdb == pdb).related_sites

({'label_asym_id': ['B', 'C']}, {'equivalent': [], 'nonequivalent': []})

The two annotated molecules definitely bind together in the same site of a single chain.

<br>

In [130]:
pdb = "1po3"
print(multiple_updates[pdb.upper()])
get_error(pdb)

{'pdb': '1PO3', 'mods': [[{'auth_asym_id': 'A', 'auth_comp_id': 'FLC', 'auth_seq_id': '742'}], [{'auth_asym_id': 'A', 'auth_comp_id': 'FLC', 'auth_seq_id': '743'}], [{'auth_asym_id': 'A', 'auth_comp_id': 'FE', 'auth_seq_id': '744'}], [{'auth_asym_id': 'A', 'auth_comp_id': 'FE', 'auth_seq_id': '745'}]]}
ENTRIES: 1 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
12,ASD01070000_1,fecA,Escherichia coli,P13036,1PO3,ASD00360007;ASD00360007;ASD01070006;ASD01070006,FLC;FLC;FE;FE,A;A;A;A,Ion,Regulator,"2-hydroxypropane-1,2,3-tricarboxylate;2-hydrox...",742;743;744;745,Inner Protein Regulator,Inner Protein,12948487,Structural evidence for iron-free citrate and ...,No,"Chain A:THR138,GLN176,GLN178,SER180,ARG365,GLN..."


In [131]:
solve_error(pdb, {pdb.upper(): 
                  multiple_updates[pdb.upper()]
}, auto_site_grouping=True, stringent_site_grouping=False)

1PO3 [[{'auth_asym_id': 'A', 'auth_comp_id': 'FLC', 'auth_seq_id': '742'}], [{'auth_asym_id': 'A', 'auth_comp_id': 'FLC', 'auth_seq_id': '743'}], [{'auth_asym_id': 'A', 'auth_comp_id': 'FE', 'auth_seq_id': '744'}], [{'auth_asym_id': 'A', 'auth_comp_id': 'FE', 'auth_seq_id': '745'}]]


In [132]:
Site.get(Site.pdb == pdb).modulator, Site.get(Site.pdb == pdb).related_sites

({'label_asym_id': ['C', 'D', 'E', 'F']},
 {'equivalent': [{'other_site': {'label_asym_id': ['G', 'H', 'I', 'J']},
    'res_of_other_in_site': 0.9142857142857143,
    'res_of_site_in_other': 1.0}],
  'nonequivalent': []})

Same as before, now together with Iron ions.

<br>

In [133]:
pdb = "1pzo"
print(multiple_updates[pdb.upper()])
get_error(pdb)

{'pdb': '1PZO', 'mods': [[{'auth_asym_id': 'A', 'auth_comp_id': 'CBT', 'auth_seq_id': '300'}], [{'auth_asym_id': 'A', 'auth_comp_id': 'CBT', 'auth_seq_id': '301'}]]}
ENTRIES: 1 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
20,ASD03140000_1,bla,Escherichia coli,P62593,1PZO,ASD03140002;ASD03140002,CBT;CBT,A;A,Lig,Inhibitor,"N,N-bis[(4-chlorophenyl)methyl]-2H-1,2,3,4-tet...",300;301,Inner Protein Regulator,Inner Protein,15037085,Allosteric inhibition through core disruption.,Yes,"Chain A:VAL216,ALA217,LEU220,LEU221,ALA224,SER..."


In [134]:
solve_error(pdb, {pdb.upper(): 
                  multiple_updates[pdb.upper()]
}, auto_site_grouping=True, stringent_site_grouping=False)

1PZO [[{'auth_asym_id': 'A', 'auth_comp_id': 'CBT', 'auth_seq_id': '300'}], [{'auth_asym_id': 'A', 'auth_comp_id': 'CBT', 'auth_seq_id': '301'}]]


In [135]:
Site.get(Site.pdb == pdb).modulator, Site.get(Site.pdb == pdb).related_sites

({'label_asym_id': ['B', 'C']}, {'equivalent': [], 'nonequivalent': []})

In [136]:
df.query("allosteric_pdb == '1PZP'")

Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
1173,ASD03140000_1,bla,Escherichia coli,P62593,1PZP,ASD03140005,FTA,A,Lig,Inhibitor,"(Z)-3-[(4-phenylazanylphenyl)amino]-2-(2H-1,2,...",300,Inner Protein Regulator,Inner Protein,15037085,Allosteric inhibition through core disruption.,Yes,"Chain A:LEU221,ALA224,LEU225,LEU250,VAL261,ILE..."


The primary citation of the PDB states that both copies of the compound bind in a site different than the active site of the enzyme, although one of the two copies that bind together partially overlaps some of the active site residues. Importantly, in the primary citation an additional PDB structure was solved (1PZP) with a different allosteric modulator for which two binding site were observed, although one is attributed to crystal packing contacts and it was confirmed that in the ASD the correct one is annotated and will be processed later with the "simple" annotations (single chain id, res name, res id).
<br>

In [137]:
pdb = "1svt"
print(multiple_updates[pdb.upper()])
get_error(pdb)

{'pdb': '1SVT', 'mods': [[{'auth_asym_id': 'A', 'auth_comp_id': 'ADP', 'auth_seq_id': '600'}], [{'auth_asym_id': 'A', 'auth_comp_id': 'AF3', 'auth_seq_id': '602'}]]}
ENTRIES: 1 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
14,ASD01360000_1,groL,Escherichia coli,P0A6F5,1SVT,ASD01720027;ASD01360010,ADP;AF3,A;A,Lig;Ion,Regulator,"[(2R,3S,4R,5R)-5-(6-aminopurin-9-yl)-3,4-dihyd...",600;602,Inner Protein Regulator,Inner Protein,15313620,Exploring the structural dynamics of the E.col...,No,"Chain A:THR30,LEU31,GLY32,PRO33,LYS51,ASP52,GL..."


In [138]:
multiple_updates[pdb.upper()]

{'pdb': '1SVT',
 'mods': [[{'auth_asym_id': 'A', 'auth_comp_id': 'ADP', 'auth_seq_id': '600'}],
  [{'auth_asym_id': 'A', 'auth_comp_id': 'AF3', 'auth_seq_id': '602'}]]}

In [139]:
solve_error(pdb, {pdb.upper(): {
    'pdb': '1SVT',
    'mods': [
        [{'auth_asym_id': 'A', 'auth_comp_id': 'ADP', 'auth_seq_id': '600'}],
        [{'auth_asym_id': 'A', 'auth_comp_id': 'AF3', 'auth_seq_id': '602'}],
        [{'auth_asym_id': 'A', 'auth_comp_id': 'MG', 'auth_seq_id': '601'}],
        [{'auth_asym_id': 'A', 'auth_comp_id': 'K', 'auth_seq_id': '603'}],
    ]
    }
}, auto_site_grouping=True, stringent_site_grouping=False)

1SVT [[{'auth_asym_id': 'A', 'auth_comp_id': 'ADP', 'auth_seq_id': '600'}], [{'auth_asym_id': 'A', 'auth_comp_id': 'AF3', 'auth_seq_id': '602'}], [{'auth_asym_id': 'A', 'auth_comp_id': 'MG', 'auth_seq_id': '601'}], [{'auth_asym_id': 'A', 'auth_comp_id': 'K', 'auth_seq_id': '603'}]]


In [140]:
Site.get(Site.pdb == pdb).modulator, Site.get(Site.pdb == pdb).related_sites

({'label_asym_id': ['Z', 'AA', 'BA', 'CA']},
 {'equivalent': [{'other_site': {'label_asym_id': ['DA', 'EA', 'FA', 'GA']},
    'res_of_other_in_site': 0.95,
    'res_of_site_in_other': 1.0},
   {'other_site': {'label_asym_id': ['HA', 'IA', 'JA', 'KA']},
    'res_of_other_in_site': 1.0,
    'res_of_site_in_other': 1.0},
   {'other_site': {'label_asym_id': ['LA', 'MA', 'NA', 'OA']},
    'res_of_other_in_site': 1.0,
    'res_of_site_in_other': 1.0},
   {'other_site': {'label_asym_id': ['PA', 'QA', 'RA', 'SA']},
    'res_of_other_in_site': 1.0,
    'res_of_site_in_other': 1.0},
   {'other_site': {'label_asym_id': ['TA', 'UA', 'VA', 'WA']},
    'res_of_other_in_site': 1.0,
    'res_of_site_in_other': 1.0},
   {'other_site': {'label_asym_id': ['V', 'W', 'X', 'Y']},
    'res_of_other_in_site': 1.0,
    'res_of_site_in_other': 1.0}],
  'nonequivalent': []})

Same as 1pcq.

<br>

In [141]:
pdb = "1w25"
print(multiple_updates[pdb.upper()])
get_error(pdb)

{'pdb': '1W25', 'mods': [[{'auth_asym_id': 'A', 'auth_comp_id': 'C2E', 'auth_seq_id': '503'}], [{'auth_asym_id': 'A', 'auth_comp_id': 'C2E', 'auth_seq_id': '505'}]]}
ENTRIES: 1 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
27,ASD07450000_1,pleD,Caulobacter crescentus,B8GZM2,1W25,ASD06980001;ASD06980001,C2E;C2E,A;A,Lig;Lig,Inhibitor,"9,9'-[(2R,3R,3aS,5S,7aR,9R,10R,10aS,12S,14aR)-...",503;505,Inner Protein Regulator,Inner Protein,15569936,Structural Basis of Activity and Allosteric Co...,No,"Chain A:ARG148,GLY153,GLY174,VAL175,HIS177,ARG..."


In [142]:
df.query("allosteric_pdb == '1W25'")

Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
1886,ASD07450000_1,pleD,Caulobacter crescentus,B8GZM2,1W25,ASD06980001;ASD06980001,C2E;C2E,A;A,Lig;Lig,Inhibitor,"9,9'-[(2R,3R,3aS,5S,7aR,9R,10R,10aS,12S,14aR)-...",503;505,Inner Protein Regulator,Inner Protein,15569936,Structural Basis of Activity and Allosteric Co...,No,"Chain A:ARG148,GLY153,GLY174,VAL175,HIS177,ARG..."


In [143]:
solve_error(pdb, {pdb.upper(): 
                  multiple_updates[pdb.upper()]
}, auto_site_grouping=True, stringent_site_grouping=False)

1W25 [[{'auth_asym_id': 'A', 'auth_comp_id': 'C2E', 'auth_seq_id': '503'}], [{'auth_asym_id': 'A', 'auth_comp_id': 'C2E', 'auth_seq_id': '505'}]]


In [144]:
Site.get(Site.pdb == pdb).modulator, Site.get(Site.pdb == pdb).related_sites

({'label_asym_id': ['F', 'G']},
 {'equivalent': [{'other_site': {'label_asym_id': ['I', 'J']},
    'res_of_other_in_site': 0.9629629629629629,
    'res_of_site_in_other': 1.0}],
  'nonequivalent': [{'other_site': {'label_asym_id': ['E']},
    'res_of_other_in_site': 0.0,
    'res_of_site_in_other': 0.0}]})

There are multiple molecules of the modulator in the structure but only these two are annotated in the dataset, and definitely bind together and tightly interacting and packing.

<br>

In [145]:
pdb = "1wda"
print(multiple_updates[pdb.upper()])
get_error(pdb)

{'pdb': '1WDA', 'mods': [[{'auth_asym_id': 'A', 'auth_comp_id': 'CA', 'auth_seq_id': '901'}], [{'auth_asym_id': 'A', 'auth_comp_id': 'CA', 'auth_seq_id': '902'}], [{'auth_asym_id': 'A', 'auth_comp_id': 'CA', 'auth_seq_id': '904'}]]}
ENTRIES: 2 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
35,ASD11630000_1,,Homo sapiens,Q9UM07,1WDA,ASD11630001;ASD11630001;ASD11630001,CA;CA;CA,A;A;A,Ion,Activator,CALCIUM ION;CALCIUM ION;CALCIUM ION,901;902;904,Inner Protein Regulator,Inner Protein,15247907,Structural basis for Ca(2+)-induced activation...,No,1WDA


In [146]:
df.query("allosteric_pdb == '1WDA'")

Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
2360,ASD11630000_1,,Homo sapiens,Q9UM07,1WDA,ASD11630001,CA,A,Ion,Activator,CALCIUM ION,903,Inner Protein Regulator,Inner Protein,15247907,Structural basis for Ca(2+)-induced activation...,No,1WDA
2361,ASD11630000_1,,Homo sapiens,Q9UM07,1WDA,ASD11630001;ASD11630001;ASD11630001,CA;CA;CA,A;A;A,Ion,Activator,CALCIUM ION;CALCIUM ION;CALCIUM ION,901;902;904,Inner Protein Regulator,Inner Protein,15247907,Structural basis for Ca(2+)-induced activation...,No,1WDA


In [147]:
solve_error(pdb, {pdb.upper(): 
                  multiple_updates[pdb.upper()]
}, auto_site_grouping=True, stringent_site_grouping=False)

1WDA [[{'auth_asym_id': 'A', 'auth_comp_id': 'CA', 'auth_seq_id': '901'}], [{'auth_asym_id': 'A', 'auth_comp_id': 'CA', 'auth_seq_id': '902'}], [{'auth_asym_id': 'A', 'auth_comp_id': 'CA', 'auth_seq_id': '904'}]]


In [148]:
Site.get(Site.pdb == pdb).modulator, Site.get(Site.pdb == pdb).related_sites

({'label_asym_id': ['F']},
 {'equivalent': [],
  'nonequivalent': [{'other_site': {'label_asym_id': ['B']},
    'res_of_other_in_site': 0.0,
    'res_of_site_in_other': 0.0},
   {'other_site': {'label_asym_id': ['C', 'D']},
    'res_of_other_in_site': 0.17647058823529413,
    'res_of_site_in_other': 0.2},
   {'other_site': {'label_asym_id': ['E']},
    'res_of_other_in_site': 0.0,
    'res_of_site_in_other': 0.0}]})

In [149]:
[s.modulator for s in PDB.get(PDB.entry_id == "1wda").sites]

[{'label_asym_id': ['F']}, {'label_asym_id': ['C', 'D']}]

In [150]:
[s.delete_instance() for s in PDB.get(PDB.entry_id == "1wda").sites]

[1, 1]

In [151]:
multiple_updates[pdb.upper()]

{'pdb': '1WDA',
 'mods': [[{'auth_asym_id': 'A', 'auth_comp_id': 'CA', 'auth_seq_id': '901'}],
  [{'auth_asym_id': 'A', 'auth_comp_id': 'CA', 'auth_seq_id': '902'}],
  [{'auth_asym_id': 'A', 'auth_comp_id': 'CA', 'auth_seq_id': '904'}]]}

In [152]:
error_entries.append(
    df.query("allosteric_pdb == '1WDA' and modulator_resi == '901;902;904'").squeeze()
)
errors["1wda"] = ""

In [153]:
solve_error(pdb, {pdb.upper(): {
    'pdb': '1WDA',
    'mods': [
        [{'auth_asym_id': 'A', 'auth_comp_id': 'CA', 'auth_seq_id': '901'},
         {'auth_asym_id': 'A', 'auth_comp_id': 'CA', 'auth_seq_id': '902'},
         {'auth_asym_id': 'A', 'auth_comp_id': 'CA', 'auth_seq_id': '904'}],
        [{"label_asym_id": "B"}],
        [{"label_asym_id": "E"}]
        ]}
}, auto_site_grouping=False, stringent_site_grouping=False)

1WDA [[{'auth_asym_id': 'A', 'auth_comp_id': 'CA', 'auth_seq_id': '901'}, {'auth_asym_id': 'A', 'auth_comp_id': 'CA', 'auth_seq_id': '902'}, {'auth_asym_id': 'A', 'auth_comp_id': 'CA', 'auth_seq_id': '904'}], [{'label_asym_id': 'B'}], [{'label_asym_id': 'E'}]]


In [154]:
[s.modulator for s in PDB.get(PDB.entry_id == pdb).sites]

[{'label_asym_id': ['C', 'D', 'F']},
 {'label_asym_id': ['B']},
 {'label_asym_id': ['E']}]

In [155]:
[s.delete_instance() for s in PDB.get(PDB.entry_id == pdb).sites if s.modulator != {'label_asym_id': ['C', 'D', 'F']}]

[1, 1]

In [156]:
[s.modulator for s in PDB.get(PDB.entry_id == pdb).sites]

[{'label_asym_id': ['C', 'D', 'F']}]

There are multiple Calcium ions on the structure but only these three (and another simple annotation) are included, and indeed these three bind together. However, with automatic grouping they form 2 different sites because they are presumably more than 6.1 Angstroms apart from the 904, so they have to be manually passed without automatic grouping.

<br>

In [157]:
pdb = "1yp2"
print(multiple_updates[pdb.upper()])
get_error(pdb)

{'pdb': '1YP2', 'mods': [[{'auth_asym_id': 'A', 'auth_comp_id': 'SO4', 'auth_seq_id': '2000'}], [{'auth_asym_id': 'A', 'auth_comp_id': 'SO4', 'auth_seq_id': '2008'}]]}
ENTRIES: 2 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
2,ASD00250000_6,,Solanum tuberosum,P23509,1YP2,ASD02880002;ASD02880002,SO4;SO4,A;A,Ion,Activator,sulfate;sulfate,2000;2008,Inner Protein Regulator,Protein-Protein Interaction,15692569,Crystal structure of potato tuber ADP-glucose ...,No,"Chain A:ARG41,PRO47,ARG53,HIS84,GLN314,ARG316,..."


In [158]:
solve_error(pdb, {pdb.upper(): 
                  multiple_updates[pdb.upper()]
}, auto_site_grouping=True, stringent_site_grouping=False)

1YP2 [[{'auth_asym_id': 'A', 'auth_comp_id': 'SO4', 'auth_seq_id': '2000'}], [{'auth_asym_id': 'A', 'auth_comp_id': 'SO4', 'auth_seq_id': '2008'}]]


In [159]:
Site.get(Site.pdb == pdb).modulator, Site.get(Site.pdb == pdb).modulator_residues, Site.get(Site.pdb == pdb).related_sites

({'label_asym_id': ['E', 'G']},
   label_comp_id label_asym_id label_entity_id label_seq_id pdbx_PDB_ins_code  \
 0           SO4             E               2            .                 ?   
 1           SO4             G               2            .                 ?   
 
   auth_seq_id auth_comp_id auth_asym_id pdbx_PDB_model_num pdbx_label_index  \
 0        2000          SO4            A                  1             2000   
 1        2008          SO4            A                  1             2008   
 
   pdbx_sifts_xref_db_name pdbx_sifts_xref_db_acc pdbx_sifts_xref_db_num  \
 0                       ?                      ?                      ?   
 1                       ?                      ?                      ?   
 
   pdbx_sifts_xref_db_res  
 0                      ?  
 1                      ?  ,
 {'equivalent': [{'other_site': {'label_asym_id': ['I', 'K']},
    'res_of_other_in_site': 1.0,
    'res_of_site_in_other': 0.9545454545454546},
   {'other_site': {'l

They are sulfate ions and the two annotated ones bind together in the structure.

<br>

In [160]:
pdb = "1yp3"
print(multiple_updates[pdb.upper()])
get_error(pdb)

{'pdb': '1YP3', 'mods': [[{'auth_asym_id': 'A', 'auth_comp_id': 'SO4', 'auth_seq_id': '1000'}], [{'auth_asym_id': 'A', 'auth_comp_id': 'SO4', 'auth_seq_id': '1008'}]]}
ENTRIES: 2 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
3,ASD00250000_6,,Solanum tuberosum,P23509,1YP3,ASD02880002;ASD02880002,SO4;SO4,A;A,Ion,Activator,sulfate;sulfate,1000;1008,Inner Protein Regulator,Protein-Protein Interaction,15692569,Crystal structure of potato tuber ADP-glucose ...,No,"Chain A:ARG41,PRO47,ARG53,SER80,HIS84,GLN314,A..."


In [161]:
solve_error(pdb, {pdb.upper(): 
                  multiple_updates[pdb.upper()]
}, auto_site_grouping=True, stringent_site_grouping=False)

1YP3 [[{'auth_asym_id': 'A', 'auth_comp_id': 'SO4', 'auth_seq_id': '1000'}], [{'auth_asym_id': 'A', 'auth_comp_id': 'SO4', 'auth_seq_id': '1008'}]]


In [162]:
Site.get(Site.pdb == pdb).modulator, Site.get(Site.pdb == pdb).modulator_residues, Site.get(Site.pdb == pdb).related_sites

({'label_asym_id': ['E', 'G']},
   label_comp_id label_asym_id label_entity_id label_seq_id pdbx_PDB_ins_code  \
 0           SO4             E               2            .                 ?   
 1           SO4             G               2            .                 ?   
 
   auth_seq_id auth_comp_id auth_asym_id pdbx_PDB_model_num pdbx_label_index  \
 0        1000          SO4            A                  1             1000   
 1        1008          SO4            A                  1             1008   
 
   pdbx_sifts_xref_db_name pdbx_sifts_xref_db_acc pdbx_sifts_xref_db_num  \
 0                       ?                      ?                      ?   
 1                       ?                      ?                      ?   
 
   pdbx_sifts_xref_db_res  
 0                      ?  
 1                      ?  ,
 {'equivalent': [{'other_site': {'label_asym_id': ['J', 'L']},
    'res_of_other_in_site': 0.875,
    'res_of_site_in_other': 1.0},
   {'other_site': {'label_asym_id'

Same case.

<br>

In [163]:
pdb = "1yp4"
print(multiple_updates[pdb.upper()])
get_error(pdb)

{'pdb': '1YP4', 'mods': [[{'auth_asym_id': 'A', 'auth_comp_id': 'SO4', 'auth_seq_id': '1000'}], [{'auth_asym_id': 'A', 'auth_comp_id': 'SO4', 'auth_seq_id': '1008'}]]}
ENTRIES: 2 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
3,ASD00250000_6,,Solanum tuberosum,P23509,1YP4,ASD02880002;ASD02880002,SO4;SO4,A;A,Ion,Activator,sulfate;sulfate,1000;1008,Inner Protein Regulator,Protein-Protein Interaction,15692569,Crystal structure of potato tuber ADP-glucose ...,No,"Chain A:ARG41,PRO47,ARG53,SER80,ARG83,HIS84,GL..."


In [164]:
solve_error(pdb, {pdb.upper(): 
                  multiple_updates[pdb.upper()]
}, auto_site_grouping=True, stringent_site_grouping=False)

1YP4 [[{'auth_asym_id': 'A', 'auth_comp_id': 'SO4', 'auth_seq_id': '1000'}], [{'auth_asym_id': 'A', 'auth_comp_id': 'SO4', 'auth_seq_id': '1008'}]]


In [165]:
Site.get(Site.pdb == pdb).modulator, Site.get(Site.pdb == pdb).modulator_residues, Site.get(Site.pdb == pdb).related_sites

({'label_asym_id': ['E', 'G']},
   label_comp_id label_asym_id label_entity_id label_seq_id pdbx_PDB_ins_code  \
 0           SO4             E               2            .                 ?   
 1           SO4             G               2            .                 ?   
 
   auth_seq_id auth_comp_id auth_asym_id pdbx_PDB_model_num pdbx_label_index  \
 0        1000          SO4            A                  1             1000   
 1        1008          SO4            A                  1             1008   
 
   pdbx_sifts_xref_db_name pdbx_sifts_xref_db_acc pdbx_sifts_xref_db_num  \
 0                       ?                      ?                      ?   
 1                       ?                      ?                      ?   
 
   pdbx_sifts_xref_db_res  
 0                      ?  
 1                      ?  ,
 {'equivalent': [{'other_site': {'label_asym_id': ['J', 'L']},
    'res_of_other_in_site': 0.875,
    'res_of_site_in_other': 0.875},
   {'other_site': {'label_asym_i

They are sulfate ions and the two annotated ones bind together in the structure.

<br>

In [166]:
pdb = "1z62"
print(multiple_updates[pdb.upper()])
get_error(pdb)

{'pdb': '1Z62', 'mods': [[{'auth_asym_id': 'A', 'auth_comp_id': 'IAA', 'auth_seq_id': '990'}], [{'auth_asym_id': 'A', 'auth_comp_id': 'IAA', 'auth_seq_id': '991'}]]}
ENTRIES: 2 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
11,ASD01320000_3,PYGM,Oryctolagus cuniculus,P00489,1Z62,ASD01320116;ASD01320116,IAA;IAA,A;A,Lig;Lig,Inhibitor,2-[[(2Z)-2-(2-oxo-7H-indol-3-ylidene)-7H-indol...,990;991,Inner Protein Regulator,Inner Protein,,Indirubin-3'-Aminooxy-Acetate Inhibits Glycoge...,No,"Chain A:TRP67,ILE68,GLN71,GLN72,TYR75,ARG193,P..."


In [167]:
solve_error(pdb, {pdb.upper(): 
                  multiple_updates[pdb.upper()]
}, auto_site_grouping=True, stringent_site_grouping=False)

1Z62 [[{'auth_asym_id': 'A', 'auth_comp_id': 'IAA', 'auth_seq_id': '990'}], [{'auth_asym_id': 'A', 'auth_comp_id': 'IAA', 'auth_seq_id': '991'}]]


In [168]:
Site.get(Site.pdb == pdb).modulator, Site.get(Site.pdb == pdb).related_sites

({'label_asym_id': ['C', 'D']},
 {'equivalent': [],
  'nonequivalent': [{'other_site': {'label_asym_id': ['E']},
    'res_of_other_in_site': 0.0,
    'res_of_site_in_other': 0.0}]})

The two annotated molecules definitely bound together stacked.

<br>

In [169]:
pdb = "2c2b"
print(multiple_updates[pdb.upper()])
get_error(pdb)

{'pdb': '2C2B', 'mods': [[{'auth_asym_id': 'A', 'auth_comp_id': 'SAM', 'auth_seq_id': '500'}], [{'auth_asym_id': 'A', 'auth_comp_id': 'SAM', 'auth_seq_id': '501'}]]}
ENTRIES: 1 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
13,ASD02790000_1,,Arabidopsis thaliana,Q9S7B5,2C2B,ASD00350007;ASD00350007,SAM;SAM,A;A,Lig,Activator,S-ADENOSYLMETHIONINE;S-ADENOSYLMETHIONINE,500;501,Inner Protein Regulator,Inner Protein,16319072,Allosteric threonine synthase. Reorganization ...,No,"Chain B:ASP148,GLU137,LYS152,ARG426,PHE134,ASN..."


In [170]:
solve_error(pdb, {pdb.upper(): 
                  multiple_updates[pdb.upper()]
}, auto_site_grouping=True, stringent_site_grouping=False)

2C2B [[{'auth_asym_id': 'A', 'auth_comp_id': 'SAM', 'auth_seq_id': '500'}], [{'auth_asym_id': 'A', 'auth_comp_id': 'SAM', 'auth_seq_id': '501'}]]


In [171]:
Site.get(Site.pdb == pdb).modulator, Site.get(Site.pdb == pdb).related_sites

({'label_asym_id': ['G', 'H']},
 {'equivalent': [{'other_site': {'label_asym_id': ['K', 'L']},
    'res_of_other_in_site': 0.9090909090909091,
    'res_of_site_in_other': 0.9375},
   {'other_site': {'label_asym_id': ['N', 'O']},
    'res_of_other_in_site': 1.0,
    'res_of_site_in_other': 0.96875},
   {'other_site': {'label_asym_id': ['R', 'S']},
    'res_of_other_in_site': 0.9142857142857143,
    'res_of_site_in_other': 1.0},
   {'other_site': {'label_asym_id': ['U', 'V']},
    'res_of_other_in_site': 0.9090909090909091,
    'res_of_site_in_other': 0.9375},
   {'other_site': {'label_asym_id': ['X', 'Y']},
    'res_of_other_in_site': 0.96875,
    'res_of_site_in_other': 0.96875}],
  'nonequivalent': []})

The two annotated molecules bind together forming a unique site, as confirmed by the primary citation of the PDB. There are more pairs of the modulator that bind in equivalent sites throughout the Model and these will be identified automatically.

<br>

In [172]:
pdb = "2dew"
print(multiple_updates[pdb.upper()])
get_error(pdb)

{'pdb': '2DEW', 'mods': [[{'auth_asym_id': 'X', 'auth_comp_id': 'CA', 'auth_seq_id': '901'}], [{'auth_asym_id': 'X', 'auth_comp_id': 'CA', 'auth_seq_id': '902'}], [{'auth_asym_id': 'X', 'auth_comp_id': 'CA', 'auth_seq_id': '904'}]]}
ENTRIES: 2 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
30,ASD11630000_1,,Homo sapiens,Q9UM07,2DEW,ASD11630001;ASD11630001;ASD11630001,CA;CA;CA,X;X;X,Ion,Activator,CALCIUM ION;CALCIUM ION;CALCIUM ION,901;902;904,Inner Protein Regulator,Inner Protein,16567635,Structural basis for histone N-terminal recogn...,No,2DEW


In [173]:
multiple_updates[pdb.upper()]

{'pdb': '2DEW',
 'mods': [[{'auth_asym_id': 'X', 'auth_comp_id': 'CA', 'auth_seq_id': '901'}],
  [{'auth_asym_id': 'X', 'auth_comp_id': 'CA', 'auth_seq_id': '902'}],
  [{'auth_asym_id': 'X', 'auth_comp_id': 'CA', 'auth_seq_id': '904'}]]}

In [174]:
solve_error(pdb, {pdb.upper(): {
    'pdb': '2DEW',
    'mods': [
        [{'auth_asym_id': 'X', 'auth_comp_id': 'CA', 'auth_seq_id': '901'},
         {'auth_asym_id': 'X', 'auth_comp_id': 'CA', 'auth_seq_id': '902'},
         {'auth_asym_id': 'X', 'auth_comp_id': 'CA', 'auth_seq_id': '904'}],
        [{"label_asym_id": "C"}],
        [{"label_asym_id": "F"}]
    ]}
}, auto_site_grouping=False, stringent_site_grouping=False)

2DEW [[{'auth_asym_id': 'X', 'auth_comp_id': 'CA', 'auth_seq_id': '901'}, {'auth_asym_id': 'X', 'auth_comp_id': 'CA', 'auth_seq_id': '902'}, {'auth_asym_id': 'X', 'auth_comp_id': 'CA', 'auth_seq_id': '904'}], [{'label_asym_id': 'C'}], [{'label_asym_id': 'F'}]]


In [175]:
Site.get(Site.pdb == pdb).modulator, Site.get(Site.pdb == pdb).related_sites

({'label_asym_id': ['D', 'E', 'G']},
 {'equivalent': [],
  'nonequivalent': [{'other_site': {'label_asym_id': ['C']},
    'res_of_other_in_site': 0.0,
    'res_of_site_in_other': 0.0},
   {'other_site': {'label_asym_id': ['F']},
    'res_of_other_in_site': 0.0,
    'res_of_site_in_other': 0.0}]})

In [176]:
[s.modulator for s in PDB.get(PDB.entry_id == pdb).sites]

[{'label_asym_id': ['D', 'E', 'G']},
 {'label_asym_id': ['C']},
 {'label_asym_id': ['F']}]

In [177]:
[s.delete_instance() for s in PDB.get(PDB.entry_id == pdb).sites if s.modulator != {'label_asym_id': ['D', 'E', 'G']}]

[1, 1]

Similarly to 1wda, the annotated calciums do bind together although they are not close enough to be grouped automatically.

<br>

In [178]:
pdb = "2dex"
print(multiple_updates[pdb.upper()])
get_error(pdb)

{'pdb': '2DEX', 'mods': [[{'auth_asym_id': 'X', 'auth_comp_id': 'CA', 'auth_seq_id': '901'}], [{'auth_asym_id': 'X', 'auth_comp_id': 'CA', 'auth_seq_id': '902'}], [{'auth_asym_id': 'X', 'auth_comp_id': 'CA', 'auth_seq_id': '904'}]]}
ENTRIES: 2 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
30,ASD11630000_1,,Homo sapiens,Q9UM07,2DEX,ASD11630001;ASD11630001;ASD11630001,CA;CA;CA,X;X;X,Ion,Activator,CALCIUM ION;CALCIUM ION;CALCIUM ION,901;902;904,Inner Protein Regulator,Inner Protein,16567635,Structural basis for histone N-terminal recogn...,No,2DEX


In [179]:
multiple_updates[pdb.upper()]

{'pdb': '2DEX',
 'mods': [[{'auth_asym_id': 'X', 'auth_comp_id': 'CA', 'auth_seq_id': '901'}],
  [{'auth_asym_id': 'X', 'auth_comp_id': 'CA', 'auth_seq_id': '902'}],
  [{'auth_asym_id': 'X', 'auth_comp_id': 'CA', 'auth_seq_id': '904'}]]}

In [180]:
solve_error(pdb, {pdb.upper(): {
    'pdb': '2DEX',
    'mods': [
        [{'auth_asym_id': 'X', 'auth_comp_id': 'CA', 'auth_seq_id': '901'},
         {'auth_asym_id': 'X', 'auth_comp_id': 'CA', 'auth_seq_id': '902'},
         {'auth_asym_id': 'X', 'auth_comp_id': 'CA', 'auth_seq_id': '904'}],
        [{"label_asym_id": "C"}],
        [{"label_asym_id": "F"}]
    ]}
}, auto_site_grouping=False, stringent_site_grouping=False)

2DEX [[{'auth_asym_id': 'X', 'auth_comp_id': 'CA', 'auth_seq_id': '901'}, {'auth_asym_id': 'X', 'auth_comp_id': 'CA', 'auth_seq_id': '902'}, {'auth_asym_id': 'X', 'auth_comp_id': 'CA', 'auth_seq_id': '904'}], [{'label_asym_id': 'C'}], [{'label_asym_id': 'F'}]]


In [181]:
Site.get(Site.pdb == pdb).modulator, Site.get(Site.pdb == pdb).related_sites

({'label_asym_id': ['D', 'E', 'G']},
 {'equivalent': [],
  'nonequivalent': [{'other_site': {'label_asym_id': ['C']},
    'res_of_other_in_site': 0.0,
    'res_of_site_in_other': 0.0},
   {'other_site': {'label_asym_id': ['F']},
    'res_of_other_in_site': 0.0,
    'res_of_site_in_other': 0.0}]})

In [182]:
[s.modulator for s in PDB.get(PDB.entry_id == pdb).sites]

[{'label_asym_id': ['D', 'E', 'G']},
 {'label_asym_id': ['C']},
 {'label_asym_id': ['F']}]

In [183]:
[s.delete_instance() for s in PDB.get(PDB.entry_id == pdb).sites if s.modulator != {'label_asym_id': ['D', 'E', 'G']}]

[1, 1]

Same as before.

<br>

In [184]:
pdb = "2dw5"
print(multiple_updates[pdb.upper()])
get_error(pdb)

{'pdb': '2DW5', 'mods': [[{'auth_asym_id': 'A', 'auth_comp_id': 'CA', 'auth_seq_id': '901'}], [{'auth_asym_id': 'A', 'auth_comp_id': 'CA', 'auth_seq_id': '902'}], [{'auth_asym_id': 'A', 'auth_comp_id': 'CA', 'auth_seq_id': '904'}]]}
ENTRIES: 2 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
30,ASD11630000_1,,Homo sapiens,Q9UM07,2DW5,ASD11630001;ASD11630001;ASD11630001,CA;CA;CA,A;A;A,Ion,Activator,CALCIUM ION;CALCIUM ION;CALCIUM ION,901;902;904,Inner Protein Regulator,Inner Protein,17002273,Inhibitors and Inactivators of Protein Arginin...,No,2DW5


In [185]:
multiple_updates[pdb.upper()]

{'pdb': '2DW5',
 'mods': [[{'auth_asym_id': 'A', 'auth_comp_id': 'CA', 'auth_seq_id': '901'}],
  [{'auth_asym_id': 'A', 'auth_comp_id': 'CA', 'auth_seq_id': '902'}],
  [{'auth_asym_id': 'A', 'auth_comp_id': 'CA', 'auth_seq_id': '904'}]]}

In [186]:
solve_error(pdb, {pdb.upper(): {
    'pdb': '2DW5',
    'mods': [
        [{'auth_asym_id': 'A', 'auth_comp_id': 'CA', 'auth_seq_id': '901'},
         {'auth_asym_id': 'A', 'auth_comp_id': 'CA', 'auth_seq_id': '902'},
         {'auth_asym_id': 'A', 'auth_comp_id': 'CA', 'auth_seq_id': '904'}],
        [{"label_asym_id": "B"}],
        [{"label_asym_id": "E"}]
    ]}
}, auto_site_grouping=False, stringent_site_grouping=False)

2DW5 [[{'auth_asym_id': 'A', 'auth_comp_id': 'CA', 'auth_seq_id': '901'}, {'auth_asym_id': 'A', 'auth_comp_id': 'CA', 'auth_seq_id': '902'}, {'auth_asym_id': 'A', 'auth_comp_id': 'CA', 'auth_seq_id': '904'}], [{'label_asym_id': 'B'}], [{'label_asym_id': 'E'}]]


In [187]:
Site.get(Site.pdb == pdb).modulator, Site.get(Site.pdb == pdb).related_sites

({'label_asym_id': ['C', 'D', 'F']},
 {'equivalent': [],
  'nonequivalent': [{'other_site': {'label_asym_id': ['B']},
    'res_of_other_in_site': 0.0,
    'res_of_site_in_other': 0.0},
   {'other_site': {'label_asym_id': ['E']},
    'res_of_other_in_site': 0.0,
    'res_of_site_in_other': 0.0}]})

In [188]:
[s.modulator for s in PDB.get(PDB.entry_id == pdb).sites]

[{'label_asym_id': ['C', 'D', 'F']},
 {'label_asym_id': ['B']},
 {'label_asym_id': ['E']}]

In [189]:
[s.delete_instance() for s in PDB.get(PDB.entry_id == pdb).sites if s.modulator != {'label_asym_id': ['C', 'D', 'F']}]

[1, 1]

Same as before.

<br>

In [190]:
pdb = "2v0n"
print(multiple_updates[pdb.upper()])
get_error(pdb)

{'pdb': '2V0N', 'mods': [[{'auth_asym_id': 'A', 'auth_comp_id': 'C2E', 'auth_seq_id': '503'}], [{'auth_asym_id': 'A', 'auth_comp_id': 'C2E', 'auth_seq_id': '505'}]]}
ENTRIES: 1 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
21,ASD07450000_1,pleD,Caulobacter crescentus,B8GZM2,2V0N,ASD06980001;ASD06980001,C2E;C2E,A;A,Lig;Lig,Inhibitor,"9,9'-[(2R,3R,3aS,5S,7aR,9R,10R,10aS,12S,14aR)-...",503;505,Inner Protein Regulator,Protein-Protein Interaction,17697997,Structure of Bef3--Modified Response Regulator...,No,"Chain A:SER356,ASN357,VAL358,ARG359,ALA360,ASP..."


In [191]:
solve_error(pdb, {pdb.upper(): 
                  multiple_updates[pdb.upper()]
}, auto_site_grouping=True, stringent_site_grouping=False)

2V0N [[{'auth_asym_id': 'A', 'auth_comp_id': 'C2E', 'auth_seq_id': '503'}], [{'auth_asym_id': 'A', 'auth_comp_id': 'C2E', 'auth_seq_id': '505'}]]


In [192]:
Site.get(Site.pdb == pdb).modulator, Site.get(Site.pdb == pdb).related_sites

({'label_asym_id': ['E', 'F']},
 {'equivalent': [{'other_site': {'label_asym_id': ['O', 'P']},
    'res_of_other_in_site': 0.9615384615384616,
    'res_of_site_in_other': 0.9259259259259259}],
  'nonequivalent': []})

The two annotated molecules bind together stacked, and there is an additional pair of the molecules in another chain of the protein forming the same site that will be automatically detected.

<br>

In [193]:
pdb = "3ete"
print(multiple_updates[pdb.upper()])
get_error(pdb)

{'pdb': '3ETE', 'mods': [[{'auth_asym_id': 'A', 'auth_comp_id': 'H3P', 'auth_seq_id': '552'}], [{'auth_asym_id': 'B', 'auth_comp_id': 'H3P', 'auth_seq_id': '552'}], [{'auth_asym_id': 'C', 'auth_comp_id': 'H3P', 'auth_seq_id': '552'}], [{'auth_asym_id': 'C', 'auth_comp_id': 'H3P', 'auth_seq_id': '554'}], [{'auth_asym_id': 'D', 'auth_comp_id': 'H3P', 'auth_seq_id': '552'}], [{'auth_asym_id': 'F', 'auth_comp_id': 'H3P', 'auth_seq_id': '552'}]]}
ENTRIES: 2 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
9,ASD01240000_1,GLUD1,Bos taurus,P00366,3ETE,ASD01240021;ASD01240021;ASD01240021;ASD0124002...,H3P;H3P;H3P;H3P;H3P;H3P,A;B;C;C;D;F,Lig,Inhibitor,"3,4,6-trichloro-2-[(2,3,5-trichloro-6-hydroxy-...",552;552;552;554;552;552,Inner Protein Regulator,Protein-Protein Interaction,19531491,Novel Inhibitors Complexed with Glutamate Dehy...,No,"Chain A:THR186,ILE187,TYR190; Chain E:MET150,L..."


In [194]:
multiple_updates[pdb.upper()]

{'pdb': '3ETE',
 'mods': [[{'auth_asym_id': 'A', 'auth_comp_id': 'H3P', 'auth_seq_id': '552'}],
  [{'auth_asym_id': 'B', 'auth_comp_id': 'H3P', 'auth_seq_id': '552'}],
  [{'auth_asym_id': 'C', 'auth_comp_id': 'H3P', 'auth_seq_id': '552'}],
  [{'auth_asym_id': 'C', 'auth_comp_id': 'H3P', 'auth_seq_id': '554'}],
  [{'auth_asym_id': 'D', 'auth_comp_id': 'H3P', 'auth_seq_id': '552'}],
  [{'auth_asym_id': 'F', 'auth_comp_id': 'H3P', 'auth_seq_id': '552'}]]}

In [195]:
solve_error(pdb, {pdb.upper(): {
    'pdb': '3ETE',
    'mods': [
        {"label_asym_id": ["J", "W"]},
        {"label_asym_id": ["S", "N"]},
        {"label_asym_id": ["R", "DA"]},
    ]
}
}, auto_site_grouping=False, stringent_site_grouping=False)

3ETE [{'label_asym_id': ['J', 'W']}, {'label_asym_id': ['S', 'N']}, {'label_asym_id': ['R', 'DA']}]


In [196]:
[s.modulator for s in PDB.get(PDB.entry_id == pdb).sites]

[{'label_asym_id': ['R', 'DA']}]

In [197]:
Site.get(Site.pdb == pdb).modulator, Site.get(Site.pdb == pdb).related_sites

({'label_asym_id': ['R', 'DA']},
 {'equivalent': [{'other_site': {'label_asym_id': ['J', 'W']},
    'res_of_other_in_site': 1.0,
    'res_of_site_in_other': 1.0},
   {'other_site': {'label_asym_id': ['N', 'S']},
    'res_of_other_in_site': 0.8888888888888888,
    'res_of_site_in_other': 0.9411764705882353}],
  'nonequivalent': []})

In [198]:
Site.get(Site.pdb == pdb).protein_residues.label_asym_id.unique(), Site.get(Site.pdb == pdb).nonredundant_site.protein_residues.label_asym_id.unique(), 

(array(['B', 'C', 'D', 'F'], dtype=object), array(['F'], dtype=object))

The annotated molecules bind together in the interface of a homomer of many chains. However, the interface possesses a certain symmetry that allows to group the modulator molecules in pairs that originate equivalent sites.

<br>

In [199]:
pdb = "3fyh"
print(multiple_updates[pdb.upper()])
get_error(pdb)

{'pdb': '3FYH', 'mods': [[{'auth_asym_id': 'A', 'auth_comp_id': 'W', 'auth_seq_id': '534'}], [{'auth_asym_id': 'A', 'auth_comp_id': 'W', 'auth_seq_id': '511'}], [{'auth_asym_id': 'A', 'auth_comp_id': 'W', 'auth_seq_id': '512'}], [{'auth_asym_id': 'A', 'auth_comp_id': 'W', 'auth_seq_id': '513'}], [{'auth_asym_id': 'A', 'auth_comp_id': 'W', 'auth_seq_id': '514'}], [{'auth_asym_id': 'A', 'auth_comp_id': 'W', 'auth_seq_id': '515'}], [{'auth_asym_id': 'A', 'auth_comp_id': 'W', 'auth_seq_id': '516'}], [{'auth_asym_id': 'A', 'auth_comp_id': 'W', 'auth_seq_id': '517'}], [{'auth_asym_id': 'A', 'auth_comp_id': 'W', 'auth_seq_id': '518'}], [{'auth_asym_id': 'A', 'auth_comp_id': 'W', 'auth_seq_id': '519'}], [{'auth_asym_id': 'A', 'auth_comp_id': 'W', 'auth_seq_id': '520'}], [{'auth_asym_id': 'A', 'auth_comp_id': 'W', 'auth_seq_id': '521'}], [{'auth_asym_id': 'A', 'auth_comp_id': 'W', 'auth_seq_id': '522'}], [{'auth_asym_id': 'A', 'auth_comp_id': 'W', 'auth_seq_id': '523'}], [{'auth_asym_id': 'A', 

Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
23,ASD08840000_1,radA,Methanococcus voltae,O73948,3FYH,ASD08840007;ASD08840007;ASD08840007;ASD0884000...,W;W;W;W;W;W;W;W;W;W;W;W;W;W;W;W;W;W;W;W;W;W;W;W,A;A;A;A;A;A;A;A;A;A;A;A;A;A;A;A;A;A;A;A;A;A;A;A,Ion,Regulator,tungsten(6+),534;511;512;513;514;515;516;517;518;519;520;52...,Protein-DNA/RNA Interaction Regulator,Inner Protein,19555119,Crystal structure of an archaeal Rad51 homolog...,No,"Chain A:ARG218,ARG230"


In [200]:
solve_error(pdb, {pdb.upper(): 
                  multiple_updates[pdb.upper()]
}, auto_site_grouping=True, stringent_site_grouping=False)

3FYH [[{'auth_asym_id': 'A', 'auth_comp_id': 'W', 'auth_seq_id': '534'}], [{'auth_asym_id': 'A', 'auth_comp_id': 'W', 'auth_seq_id': '511'}], [{'auth_asym_id': 'A', 'auth_comp_id': 'W', 'auth_seq_id': '512'}], [{'auth_asym_id': 'A', 'auth_comp_id': 'W', 'auth_seq_id': '513'}], [{'auth_asym_id': 'A', 'auth_comp_id': 'W', 'auth_seq_id': '514'}], [{'auth_asym_id': 'A', 'auth_comp_id': 'W', 'auth_seq_id': '515'}], [{'auth_asym_id': 'A', 'auth_comp_id': 'W', 'auth_seq_id': '516'}], [{'auth_asym_id': 'A', 'auth_comp_id': 'W', 'auth_seq_id': '517'}], [{'auth_asym_id': 'A', 'auth_comp_id': 'W', 'auth_seq_id': '518'}], [{'auth_asym_id': 'A', 'auth_comp_id': 'W', 'auth_seq_id': '519'}], [{'auth_asym_id': 'A', 'auth_comp_id': 'W', 'auth_seq_id': '520'}], [{'auth_asym_id': 'A', 'auth_comp_id': 'W', 'auth_seq_id': '521'}], [{'auth_asym_id': 'A', 'auth_comp_id': 'W', 'auth_seq_id': '522'}], [{'auth_asym_id': 'A', 'auth_comp_id': 'W', 'auth_seq_id': '523'}], [{'auth_asym_id': 'A', 'auth_comp_id': 'W'

In [201]:
Site.get(Site.pdb == pdb).modulator, Site.get(Site.pdb == pdb).related_sites

({'label_asym_id': ['F',
   'G',
   'H',
   'I',
   'J',
   'K',
   'L',
   'M',
   'N',
   'O',
   'P',
   'Q',
   'R',
   'S',
   'T',
   'U',
   'V',
   'W',
   'X',
   'Y',
   'Z',
   'AA',
   'BA',
   'CA']},
 {'equivalent': [], 'nonequivalent': []})

The annotated modulator is a cluster of Tungsten ions that all bind together.

<br>

In [202]:
pdb = "3pxf"
print(multiple_updates[pdb.upper()])
get_error(pdb)

{'pdb': '3PXF', 'mods': [[{'auth_asym_id': 'A', 'auth_comp_id': '2AN', 'auth_seq_id': '304'}], [{'auth_asym_id': 'A', 'auth_comp_id': '2AN', 'auth_seq_id': '305'}]]}
ENTRIES: 1 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
13,ASD04590000_1,CDK2,Homo sapiens,P24941,3PXF,ASD02210006;ASD02210006,2AN;2AN,A;A,Lig,Inhibitor,8-phenylazanylnaphthalene-1-sulfonic acid,304;305,Inner Protein Regulator,Inner Protein,21291269,Discovery of a potential allosteric ligand bin...,No,3PXF


In [203]:
df.query("allosteric_pdb in ['3PXF', '3PXQ', '3PXR', '3PXY', '3PXZ', '3PY0', '3PY1']")

Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
1575,ASD04590000_1,CDK2,Homo sapiens,P24941,3PXF,ASD02210006;ASD02210006,2AN;2AN,A;A,Lig,Inhibitor,8-phenylazanylnaphthalene-1-sulfonic acid,304;305,Inner Protein Regulator,Inner Protein,21291269,Discovery of a potential allosteric ligand bin...,No,3PXF
1576,ASD04590000_1,CDK2,Homo sapiens,P24941,3PXQ,ASD02210006;ASD02210006,2AN;2AN,A;A,Lig,Inhibitor,8-phenylazanylnaphthalene-1-sulfonic acid,301;302,Inner Protein Regulator,Inner Protein,21291269,Discovery of a potential allosteric ligand bin...,No,3PXQ
1577,ASD04590000_1,CDK2,Homo sapiens,P24941,3PXZ,ASD02210006;ASD02210006,2AN;2AN,A;A,Lig,Inhibitor,8-phenylazanylnaphthalene-1-sulfonic acid,299;300,Inner Protein Regulator,Inner Protein,21291269,Discovery of a potential allosteric ligand bin...,No,3PXZ
1578,ASD04590000_1,CDK2,Homo sapiens,P24941,3PY1,ASD02210006;ASD02210006,2AN;2AN,A;A,Lig,Inhibitor,8-phenylazanylnaphthalene-1-sulfonic acid,301;302,Inner Protein Regulator,Inner Protein,21291269,Discovery of a potential allosteric ligand bin...,No,3PY1


In [204]:
[p for p in ['3PXF', '3PXQ', '3PXR', '3PXY', '3PXZ', '3PY0', '3PY1'] if p.lower() in errors]

['3PXF', '3PXQ', '3PXZ', '3PY1']

In [205]:
solve_error(pdb, {pdb.upper(): 
                  multiple_updates[pdb.upper()]
}, auto_site_grouping=True, stringent_site_grouping=False)
solve_error('3PXZ'.lower(), {'3PXZ': 
                  multiple_updates['3PXZ']
}, auto_site_grouping=True, stringent_site_grouping=False)
solve_error('3PY1'.lower(), {'3PY1': 
                  multiple_updates['3PY1']
}, auto_site_grouping=True, stringent_site_grouping=False)

3PXF [[{'auth_asym_id': 'A', 'auth_comp_id': '2AN', 'auth_seq_id': '304'}], [{'auth_asym_id': 'A', 'auth_comp_id': '2AN', 'auth_seq_id': '305'}]]
3PXZ [[{'auth_asym_id': 'A', 'auth_comp_id': '2AN', 'auth_seq_id': '299'}], [{'auth_asym_id': 'A', 'auth_comp_id': '2AN', 'auth_seq_id': '300'}]]
3PY1 [[{'auth_asym_id': 'A', 'auth_comp_id': '2AN', 'auth_seq_id': '301'}], [{'auth_asym_id': 'A', 'auth_comp_id': '2AN', 'auth_seq_id': '302'}]]


In [206]:
multiple_updates['3PXQ']

{'pdb': '3PXQ',
 'mods': [[{'auth_asym_id': 'A', 'auth_comp_id': '2AN', 'auth_seq_id': '301'}],
  [{'auth_asym_id': 'A', 'auth_comp_id': '2AN', 'auth_seq_id': '302'}]]}

In [207]:
solve_error('3PXQ'.lower(), {'3PXQ': {
    'pdb': '3PXQ',
    'mods': [
        [{'auth_asym_id': 'A', 'auth_comp_id': '2AN', 'auth_seq_id': '301'},
         {'auth_asym_id': 'A', 'auth_comp_id': '2AN', 'auth_seq_id': '302'}],
        [{"label_asym_id": "C"}]
    ]}
}, auto_site_grouping=False, stringent_site_grouping=False)

3PXQ [[{'auth_asym_id': 'A', 'auth_comp_id': '2AN', 'auth_seq_id': '301'}, {'auth_asym_id': 'A', 'auth_comp_id': '2AN', 'auth_seq_id': '302'}], [{'label_asym_id': 'C'}]]


In [208]:
[s.modulator for s in PDB.get(PDB.entry_id == '3pxq').sites]

[{'label_asym_id': ['D', 'E']}, {'label_asym_id': ['C']}]

In [209]:
[s.delete_instance() for s in PDB.get(PDB.entry_id == '3pxq').sites if s.modulator != {'label_asym_id': ['D', 'E']}]

[1]

In [210]:
[
    (s.pdb, s.modulator, s.related_sites)
    for p in (
        PDB.select()
        .where(
            PDB.entry_id.in_(
                [
                    p.lower() 
                    for p in ['3PXF', '3PXQ', '3PXZ', '3PY1']
                ]
            )
        )
    )
    for s in p.sites
]

[(<PDB: 3pxf>,
  {'label_asym_id': ['G', 'H']},
  {'equivalent': [], 'nonequivalent': []}),
 (<PDB: 3pxq>,
  {'label_asym_id': ['D', 'E']},
  {'equivalent': [],
   'nonequivalent': [{'other_site': {'label_asym_id': ['C']},
     'res_of_other_in_site': 0.24,
     'res_of_site_in_other': 0.2222222222222222}]}),
 (<PDB: 3pxz>,
  {'label_asym_id': ['B', 'C']},
  {'equivalent': [], 'nonequivalent': []}),
 (<PDB: 3py1>,
  {'label_asym_id': ['D', 'E']},
  {'equivalent': [], 'nonequivalent': []})]

The annotated modulators bind together and adjacent but separate from the ATP binding site, as confirmed by the primary citation and the structures therein 3PXZ and 3PY1, in which the same two molecules bind in addition to ATP-competitive inhibitors JWS648 and SU9516, respectively. In the structure 3PXQ, three molecules of the annotated modulator bind, with the third binding in the active site, and thus must be appropriately annotated. All original annotations in the dataset are correct and should be added like that.

<br>

In [211]:
pdb = "4bzb"
print(multiple_updates[pdb.upper()])
get_error(pdb)

{'pdb': '4BZB', 'mods': [[{'auth_asym_id': 'A', 'auth_comp_id': 'DGT', 'auth_seq_id': '800'}], [{'auth_asym_id': 'A', 'auth_comp_id': 'DGT', 'auth_seq_id': '900'}]]}
ENTRIES: 1 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
20,ASD09690000_1,SAMHD1,Homo sapiens,Q9Y3Z3,4BZB,ASD09690001;ASD09690001,DGT;DGT,A;A,Lig,Activator,"[[(2R,3S,5R)-5-(2-amino-6-oxo-1H-purin-9-yl)-3...",800;900,Inner Protein Regulator,Protein-Protein Interaction,24141705,Mechanism of Allosteric Activation of Samhd1 b...,No,4BZB


In [212]:
df.query("allosteric_pdb == '4BZC'")

Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
2096,ASD09690000_1,,Homo sapiens,Q9Y3Z3,4BZC,ASD09690012;ASD09690012,T8T;T8T,A;A,Lig,Activator,2'-deoxyguanosine-5'-O-(1-thiotriphosphate);2'...,800;900,Inner Protein Regulator,Protein-Protein Interaction,24141705,Mechanism of Allosteric Activation of Samhd1 b...,No,4BZC


In [213]:
multiple_updates[pdb.upper()]

{'pdb': '4BZB',
 'mods': [[{'auth_asym_id': 'A', 'auth_comp_id': 'DGT', 'auth_seq_id': '800'}],
  [{'auth_asym_id': 'A', 'auth_comp_id': 'DGT', 'auth_seq_id': '900'}]]}

In [214]:
solve_error(pdb, {pdb.upper(): {
    'pdb': '4BZB',
    'mods': [
        {'label_asym_id': ["G", "I", "H"]}
    ]
}
}, auto_site_grouping=True, stringent_site_grouping=False)

4BZB [{'label_asym_id': ['G', 'I', 'H']}]


In [215]:
Site.get(Site.pdb == pdb).modulator, Site.get(Site.pdb == pdb).related_sites

({'label_asym_id': ['G', 'H', 'I']},
 {'equivalent': [{'other_site': {'label_asym_id': ['L', 'M', 'N']},
    'res_of_other_in_site': 1.0,
    'res_of_site_in_other': 1.0},
   {'other_site': {'label_asym_id': ['Q', 'R', 'S']},
    'res_of_other_in_site': 1.0,
    'res_of_site_in_other': 1.0},
   {'other_site': {'label_asym_id': ['V', 'W', 'X']},
    'res_of_other_in_site': 1.0,
    'res_of_site_in_other': 1.0}],
  'nonequivalent': [{'other_site': {'label_asym_id': ['E', 'F']},
    'res_of_other_in_site': 0.02857142857142857,
    'res_of_site_in_other': 0.024390243902439025},
   {'other_site': {'label_asym_id': ['J', 'K']},
    'res_of_other_in_site': 0.02857142857142857,
    'res_of_site_in_other': 0.024390243902439025},
   {'other_site': {'label_asym_id': ['O', 'P']},
    'res_of_other_in_site': 0.029411764705882353,
    'res_of_site_in_other': 0.024390243902439025},
   {'other_site': {'label_asym_id': ['T', 'U']},
    'res_of_other_in_site': 0.029411764705882353,
    'res_of_site_in_o

In [216]:
solve_error('4BZC'.lower(), {'4BZC': {
    'pdb': '4BZC',
    'mods': [
        {'label_asym_id': ["G", "I", "H"]}
    ]
}
}, auto_site_grouping=True, stringent_site_grouping=False)

4BZC [{'label_asym_id': ['G', 'I', 'H']}]


In [217]:
Site.get(Site.pdb == '4BZC'.lower()).modulator, Site.get(Site.pdb == '4BZC'.lower()).related_sites

({'label_asym_id': ['G', 'H', 'I']},
 {'equivalent': [{'other_site': {'label_asym_id': ['L', 'M', 'N']},
    'res_of_other_in_site': 0.975609756097561,
    'res_of_site_in_other': 0.975609756097561},
   {'other_site': {'label_asym_id': ['Q', 'R', 'S']},
    'res_of_other_in_site': 1.0,
    'res_of_site_in_other': 0.9512195121951219},
   {'other_site': {'label_asym_id': ['V', 'W', 'X']},
    'res_of_other_in_site': 1.0,
    'res_of_site_in_other': 0.9512195121951219}],
  'nonequivalent': [{'other_site': {'label_asym_id': ['E']},
    'res_of_other_in_site': 0.03125,
    'res_of_site_in_other': 0.024390243902439025},
   {'other_site': {'label_asym_id': ['J']},
    'res_of_other_in_site': 0.03225806451612903,
    'res_of_site_in_other': 0.024390243902439025},
   {'other_site': {'label_asym_id': ['O']},
    'res_of_other_in_site': 0.03225806451612903,
    'res_of_site_in_other': 0.024390243902439025},
   {'other_site': {'label_asym_id': ['T']},
    'res_of_other_in_site': 0.0303030303030303

The annotated modulators indeed bind together as confirmed by the primary citation, and the same correction is applied to 4BZC. There are 4 total sites in the multimer that are expected to be correctly recognized as equivalent.

<br>

In [218]:
pdb = "4dkt"
print(multiple_updates[pdb.upper()])
get_error(pdb)

{'pdb': '4DKT', 'mods': [[{'auth_asym_id': 'A', 'auth_comp_id': 'CA', 'auth_seq_id': '702'}], [{'auth_asym_id': 'A', 'auth_comp_id': 'CA', 'auth_seq_id': '703'}], [{'auth_asym_id': 'A', 'auth_comp_id': 'CA', 'auth_seq_id': '705'}]]}
ENTRIES: 2 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
21,ASD11630000_1,,Homo sapiens,Q9UM07,4DKT,ASD11630001;ASD11630001;ASD11630001,CA;CA;CA,A;A;A,Ion,Activator,CALCIUM ION;CALCIUM ION;CALCIUM ION,702;703;705,Inner Protein Regulator,Inner Protein,22004374,Synthesis and Screening of a Haloacetamidine C...,No,4DKT


In [219]:
multiple_updates[pdb.upper()]

{'pdb': '4DKT',
 'mods': [[{'auth_asym_id': 'A', 'auth_comp_id': 'CA', 'auth_seq_id': '702'}],
  [{'auth_asym_id': 'A', 'auth_comp_id': 'CA', 'auth_seq_id': '703'}],
  [{'auth_asym_id': 'A', 'auth_comp_id': 'CA', 'auth_seq_id': '705'}]]}

In [220]:
solve_error(pdb, {pdb.upper(): {
    'pdb': '4DKT',
    'mods': [
        [{'auth_asym_id': 'A', 'auth_comp_id': 'CA', 'auth_seq_id': '702'},
         {'auth_asym_id': 'A', 'auth_comp_id': 'CA', 'auth_seq_id': '703'},
         {'auth_asym_id': 'A', 'auth_comp_id': 'CA', 'auth_seq_id': '705'}],
        [{"label_asym_id": "C"}],
        [{"label_asym_id": "F"}]
    ]}
}, auto_site_grouping=False, stringent_site_grouping=False)

4DKT [[{'auth_asym_id': 'A', 'auth_comp_id': 'CA', 'auth_seq_id': '702'}, {'auth_asym_id': 'A', 'auth_comp_id': 'CA', 'auth_seq_id': '703'}, {'auth_asym_id': 'A', 'auth_comp_id': 'CA', 'auth_seq_id': '705'}], [{'label_asym_id': 'C'}], [{'label_asym_id': 'F'}]]


In [221]:
Site.get(Site.pdb == pdb).modulator, Site.get(Site.pdb == pdb).related_sites

({'label_asym_id': ['D', 'E', 'G']},
 {'equivalent': [],
  'nonequivalent': [{'other_site': {'label_asym_id': ['C']},
    'res_of_other_in_site': 0.0,
    'res_of_site_in_other': 0.0},
   {'other_site': {'label_asym_id': ['F']},
    'res_of_other_in_site': 0.0,
    'res_of_site_in_other': 0.0}]})

In [222]:
[s.modulator for s in PDB.get(PDB.entry_id == pdb).sites]

[{'label_asym_id': ['D', 'E', 'G']},
 {'label_asym_id': ['C']},
 {'label_asym_id': ['F']}]

In [223]:
[s.delete_instance() for s in PDB.get(PDB.entry_id == pdb).sites if s.modulator != {'label_asym_id': ['D', 'E', 'G']}]

[1, 1]

Similar to 1WDA and others.

<br>

In [224]:
pdb = "4dn0"
print(multiple_updates[pdb.upper()])
get_error(pdb)

{'pdb': '4DN0', 'mods': [[{'auth_asym_id': 'A', 'auth_comp_id': 'C2E', 'auth_seq_id': '501'}], [{'auth_asym_id': 'A', 'auth_comp_id': 'C2E', 'auth_seq_id': '502'}]]}
ENTRIES: 1 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
18,ASD09460000_1,pelD,Pseudomonas aeruginosa,Q02PM6,4DN0,ASD06980001;ASD06980001,C2E;C2E,A;A,Lig,Inhibitor,"9,9'-[(2R,3R,3aS,5S,7aR,9R,10R,10aS,12S,14aR)-...",501;502,Inner Protein Regulator,Inner Protein,22605337,"Structure of the Cytoplasmic Region of PelD, a...",No,"Chain A:ARG161,LEU165,GLN366,ARG367,GLY368,ASP..."


In [225]:
solve_error(pdb, {pdb.upper(): 
                  multiple_updates[pdb.upper()]
}, auto_site_grouping=True, stringent_site_grouping=False)

4DN0 [[{'auth_asym_id': 'A', 'auth_comp_id': 'C2E', 'auth_seq_id': '501'}], [{'auth_asym_id': 'A', 'auth_comp_id': 'C2E', 'auth_seq_id': '502'}]]


In [226]:
Site.get(Site.pdb == pdb).modulator, Site.get(Site.pdb == pdb).related_sites

({'label_asym_id': ['B', 'C']}, {'equivalent': [], 'nonequivalent': []})

Similarly to 2V0N, the two annotated molecules bind together stacked.

<br>

In [227]:
pdb = "4dqw"
print(multiple_updates[pdb.upper()])
get_error(pdb)

{'pdb': '4DQW', 'mods': [[{'auth_asym_id': 'A', 'auth_comp_id': 'ATP', 'auth_seq_id': '501'}], [{'auth_asym_id': 'A', 'auth_comp_id': 'MN', 'auth_seq_id': '502'}], [{'auth_asym_id': 'A', 'auth_comp_id': 'ATP', 'auth_seq_id': '503'}], [{'auth_asym_id': 'A', 'auth_comp_id': 'MN', 'auth_seq_id': '504'}]]}
ENTRIES: 1 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
7,ASD00890000_2,,Hepatitis C virus,Q9HXM5,4DQW,ASD01140145;ASD00890024;ASD01140145;ASD00890024,ATP;MN;ATP;MN,A;A;A;A,Lig,Regulator,ADENOSINE-5'-TRIPHOSPHATE;MANGANESE (II) ION;A...,501;502;503;504,Inner Protein Regulator,Inner Protein,23643948,MgATP Regulates Allostery and Fiber Formation ...,No,"Chain B:GLU180,ARG136,ARG178"


In [228]:
solve_error(pdb, {pdb.upper(): 
                  multiple_updates[pdb.upper()]
}, auto_site_grouping=True, stringent_site_grouping=False)

4DQW [[{'auth_asym_id': 'A', 'auth_comp_id': 'ATP', 'auth_seq_id': '501'}], [{'auth_asym_id': 'A', 'auth_comp_id': 'MN', 'auth_seq_id': '502'}], [{'auth_asym_id': 'A', 'auth_comp_id': 'ATP', 'auth_seq_id': '503'}], [{'auth_asym_id': 'A', 'auth_comp_id': 'MN', 'auth_seq_id': '504'}]]


In [229]:
Site.get(Site.pdb == pdb).modulator, Site.get(Site.pdb == pdb).related_sites

({'label_asym_id': ['C', 'D', 'E', 'F', 'I', 'J', 'K', 'L']},
 {'equivalent': [], 'nonequivalent': []})

In [230]:
Site.get(Site.pdb == pdb).protein_residues.label_asym_id.unique(), Site.get(Site.pdb == pdb).nonredundant_site.protein_residues.label_asym_id.unique()

(array(['A', 'B'], dtype=object), array(['A'], dtype=object))

Two quartets of 2ATP-2Mg bind in the same site in two chains that form a symmetric interface and thus should all be grouped.

<br>

In [231]:
pdb = "4ez7"
print(multiple_updates[pdb.upper()])
get_error(pdb)

{'pdb': '4EZ7', 'mods': [[{'auth_asym_id': 'A', 'auth_comp_id': '2AN', 'auth_seq_id': '302'}], [{'auth_asym_id': 'A', 'auth_comp_id': '2AN', 'auth_seq_id': '303'}]]}
ENTRIES: 1 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
13,ASD04590000_1,CDK2,Homo sapiens,P24941,4EZ7,ASD02210006;ASD02210006,2AN;2AN,A;A,Lig,Inhibitor,8-phenylazanylnaphthalene-1-sulfonic acid,302;303,Inner Protein Regulator,Inner Protein,22893598,A Novel Approach to the Discovery of Small-Mol...,No,4EZ7


In [232]:
df.query("allosteric_pdb in ['3TI1', '3TIY', '3TIZ', '4ERW', '4EZ3', '4EZ7']")

Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
1579,ASD04590000_1,CDK2,Homo sapiens,P24941,4EZ7,ASD02210006;ASD02210006,2AN;2AN,A;A,Lig,Inhibitor,8-phenylazanylnaphthalene-1-sulfonic acid,302;303,Inner Protein Regulator,Inner Protein,22893598,A Novel Approach to the Discovery of Small-Mol...,No,4EZ7


In [233]:
solve_error(pdb, {pdb.upper(): 
                  multiple_updates[pdb.upper()]
}, auto_site_grouping=True, stringent_site_grouping=False)

4EZ7 [[{'auth_asym_id': 'A', 'auth_comp_id': '2AN', 'auth_seq_id': '302'}], [{'auth_asym_id': 'A', 'auth_comp_id': '2AN', 'auth_seq_id': '303'}]]


In [234]:
Site.get(Site.pdb == pdb).modulator, Site.get(Site.pdb == pdb).related_sites

({'label_asym_id': ['C', 'D']}, {'equivalent': [], 'nonequivalent': []})

Similarly to 3PXF and related, it is a kinase with ANS as the allosteric modulator that binds close (and 2 together) but not on the active site; and there are multiple structues associated to the primary citation but none of them have annotated allosteric modulators.

<br>

In [235]:
pdb = "4fyx"
print(multiple_updates[pdb.upper()])
get_error(pdb)

{'pdb': '4FYX', 'mods': [[{'auth_asym_id': 'B', 'auth_comp_id': 'UTP', 'auth_seq_id': '202'}], [{'auth_asym_id': 'B', 'auth_comp_id': 'MG', 'auth_seq_id': '203'}], [{'auth_asym_id': 'B', 'auth_comp_id': 'DCP', 'auth_seq_id': '204'}]]}
ENTRIES: 1 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
5,ASD00330000_3,pyrI,Escherichia coli,P0A7F3,4FYX,ASD00150002;ASD00900001;ASD00330030,UTP;MG;DCP,B;B;B,Lig,Inhibitor,"[(2R,3S,4R,5R)-5-(2,4-dioxopyrimidin-1-yl)-3,4...",202;203;204,Inner Protein Regulator,Inner Protein,22906065,Metal Ion Involvement in the Allosteric Mechan...,No,"Chain B:LYS6,LEU7,GLN8,VAL9,ALA11,ILE12,VAL17,..."


In [236]:
solve_error(pdb, {pdb.upper(): 
                  multiple_updates[pdb.upper()]
}, auto_site_grouping=True, stringent_site_grouping=False)

4FYX [[{'auth_asym_id': 'B', 'auth_comp_id': 'UTP', 'auth_seq_id': '202'}], [{'auth_asym_id': 'B', 'auth_comp_id': 'MG', 'auth_seq_id': '203'}], [{'auth_asym_id': 'B', 'auth_comp_id': 'DCP', 'auth_seq_id': '204'}]]


In [237]:
Site.get(Site.pdb == pdb).modulator, Site.get(Site.pdb == pdb).related_sites

({'label_asym_id': ['F', 'G', 'H']},
 {'equivalent': [{'other_site': {'label_asym_id': ['J', 'K', 'L']},
    'res_of_other_in_site': 1.0,
    'res_of_site_in_other': 1.0}],
  'nonequivalent': []})

In [238]:
Site.get(Site.pdb == pdb).protein_residues.label_asym_id.unique(), Site.get(Site.pdb == pdb).nonredundant_site.protein_residues.label_asym_id.unique()

(array(['B', 'D'], dtype=object), array(['B', 'D'], dtype=object))

In [239]:
Site.get(Site.pdb == pdb).protein_residues

Unnamed: 0,label_comp_id,label_asym_id,label_entity_id,label_seq_id,pdbx_PDB_ins_code,auth_seq_id,auth_comp_id,auth_asym_id,pdbx_PDB_model_num,pdbx_label_index,pdbx_sifts_xref_db_name,pdbx_sifts_xref_db_acc,pdbx_sifts_xref_db_num,pdbx_sifts_xref_db_res
0,LYS,B,2,6,?,6,LYS,B,1,6,UNP,P0A7F3,6,K
1,LEU,B,2,7,?,7,LEU,B,1,7,UNP,P0A7F3,7,L
2,GLN,B,2,8,?,8,GLN,B,1,8,UNP,P0A7F3,8,Q
3,VAL,B,2,9,?,9,VAL,B,1,9,UNP,P0A7F3,9,V
4,GLU,B,2,10,?,10,GLU,B,1,10,UNP,P0A7F3,10,E
5,ALA,B,2,11,?,11,ALA,B,1,11,UNP,P0A7F3,11,A
6,ILE,B,2,12,?,12,ILE,B,1,12,UNP,P0A7F3,12,I
7,LYS,B,2,13,?,13,LYS,B,1,13,UNP,P0A7F3,13,K
8,VAL,B,2,17,?,17,VAL,B,1,17,UNP,P0A7F3,17,V
9,ASP,B,2,19,?,19,ASP,B,1,19,UNP,P0A7F3,19,D


The trio of UTP-DTP-Mg indeed binds together, and there is also another copy of the trio in another copy of the protein chain that should be automatically recognized as equivalent.

<br>

In [240]:
pdb = "4fyy"
print(multiple_updates[pdb.upper()])
get_error(pdb)

{'pdb': '4FYY', 'mods': [[{'auth_asym_id': 'B', 'auth_comp_id': 'UTP', 'auth_seq_id': '202'}], [{'auth_asym_id': 'B', 'auth_comp_id': 'MG', 'auth_seq_id': '203'}], [{'auth_asym_id': 'B', 'auth_comp_id': 'CTP', 'auth_seq_id': '204'}]]}
ENTRIES: 1 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
6,ASD00330000_3,pyrI,Escherichia coli,P0A7F3,4FYY,ASD00150002;ASD00150003;ASD00330030,UTP;MG;CTP,B;B;B,Lig,Inhibitor,"[(2R,3S,4R,5R)-5-(2,4-dioxopyrimidin-1-yl)-3,4...",202;203;204,Inner Protein Regulator,Inner Protein,22906065,Metal Ion Involvement in the Allosteric Mechan...,No,"Chain B:LEU7,GLN8,VAL9,ALA11,ILE12,VAL17,ASP19..."


In [241]:
solve_error(pdb, {pdb.upper(): 
                  multiple_updates[pdb.upper()]
}, auto_site_grouping=True, stringent_site_grouping=False)

4FYY [[{'auth_asym_id': 'B', 'auth_comp_id': 'UTP', 'auth_seq_id': '202'}], [{'auth_asym_id': 'B', 'auth_comp_id': 'MG', 'auth_seq_id': '203'}], [{'auth_asym_id': 'B', 'auth_comp_id': 'CTP', 'auth_seq_id': '204'}]]


In [242]:
Site.get(Site.pdb == pdb).modulator, Site.get(Site.pdb == pdb).related_sites

({'label_asym_id': ['F', 'G', 'H']},
 {'equivalent': [{'other_site': {'label_asym_id': ['J', 'K', 'L']},
    'res_of_other_in_site': 0.9032258064516129,
    'res_of_site_in_other': 1.0}],
  'nonequivalent': []})

In [243]:
Site.get(Site.pdb == pdb).protein_residues.label_asym_id.unique(), Site.get(Site.pdb == pdb).nonredundant_site.protein_residues.label_asym_id.unique()

(array(['B', 'D'], dtype=object), array(['B', 'D'], dtype=object))

In [244]:
Site.get(Site.pdb == pdb).protein_residues

Unnamed: 0,label_comp_id,label_asym_id,label_entity_id,label_seq_id,pdbx_PDB_ins_code,auth_seq_id,auth_comp_id,auth_asym_id,pdbx_PDB_model_num,pdbx_label_index,pdbx_sifts_xref_db_name,pdbx_sifts_xref_db_acc,pdbx_sifts_xref_db_num,pdbx_sifts_xref_db_res
0,LEU,B,2,7,?,7,LEU,B,1,7,UNP,P0A7F3,7,L
1,GLN,B,2,8,?,8,GLN,B,1,8,UNP,P0A7F3,8,Q
2,VAL,B,2,9,?,9,VAL,B,1,9,UNP,P0A7F3,9,V
3,GLU,B,2,10,?,10,GLU,B,1,10,UNP,P0A7F3,10,E
4,ALA,B,2,11,?,11,ALA,B,1,11,UNP,P0A7F3,11,A
5,ILE,B,2,12,?,12,ILE,B,1,12,UNP,P0A7F3,12,I
6,LYS,B,2,13,?,13,LYS,B,1,13,UNP,P0A7F3,13,K
7,VAL,B,2,17,?,17,VAL,B,1,17,UNP,P0A7F3,17,V
8,ASP,B,2,19,?,19,ASP,B,1,19,UNP,P0A7F3,19,D
9,HIS,B,2,20,?,20,HIS,B,1,20,UNP,P0A7F3,20,H


Same as before.

<br>

In [245]:
pdb = "4kh0"
print(multiple_updates[pdb.upper()])
get_error(pdb)

{'pdb': '4KH0', 'mods': [[{'auth_asym_id': 'B', 'auth_comp_id': 'ATP', 'auth_seq_id': '202'}], [{'auth_asym_id': 'B', 'auth_comp_id': 'MG', 'auth_seq_id': '203'}], [{'auth_asym_id': 'B', 'auth_comp_id': 'ATP', 'auth_seq_id': '204'}]]}
ENTRIES: 1 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
3,ASD00330000_2,pyrI,Escherichia coli,E8Y329,4KH0,ASD00330030;ASD01140145;ASD00330030,ATP;MG;ATP,B;B;B,Lig,Activator,"[[(2R,3S,4R,5R)-5-(6-aminopurin-9-yl)-3,4-dihy...",202;203;204,Inner Protein Regulator,Inner Protein,24138583,New Paradigm for Allosteric Regulation of Esch...,No,"Chain B:ALA11,ILE12,VAL17,ASP19,HIS20,THR43,GL..."


In [246]:
solve_error(pdb, {pdb.upper(): 
                  multiple_updates[pdb.upper()]
}, auto_site_grouping=True, stringent_site_grouping=False)

4KH0 [[{'auth_asym_id': 'B', 'auth_comp_id': 'ATP', 'auth_seq_id': '202'}], [{'auth_asym_id': 'B', 'auth_comp_id': 'MG', 'auth_seq_id': '203'}], [{'auth_asym_id': 'B', 'auth_comp_id': 'ATP', 'auth_seq_id': '204'}]]


In [247]:
Site.get(Site.pdb == pdb).modulator, Site.get(Site.pdb == pdb).related_sites

({'label_asym_id': ['G', 'H', 'I', 'L', 'M', 'N']},
 {'equivalent': [], 'nonequivalent': []})

In [248]:
Site.get(Site.pdb == pdb).protein_residues.label_asym_id.unique(), Site.get(Site.pdb == pdb).nonredundant_site.protein_residues.label_asym_id.unique()

(array(['B', 'D'], dtype=object), array(['B'], dtype=object))

In [249]:
Site.get(Site.pdb == pdb).protein_residues

Unnamed: 0,label_comp_id,label_asym_id,label_entity_id,label_seq_id,pdbx_PDB_ins_code,auth_seq_id,auth_comp_id,auth_asym_id,pdbx_PDB_model_num,pdbx_label_index
0,ASN,B,2,5,?,5,ASN,B,1,5
1,LYS,B,2,6,?,6,LYS,B,1,6
2,LEU,B,2,7,?,7,LEU,B,1,7
3,GLN,B,2,8,?,8,GLN,B,1,8
4,VAL,B,2,9,?,9,VAL,B,1,9
...,...,...,...,...,...,...,...,...,...,...
62,ILE,D,2,86,?,86,ILE,D,1,86
63,TYR,D,2,89,?,89,TYR,D,1,89
64,GLU,D,2,90,?,90,GLU,D,1,90
65,VAL,D,2,91,?,91,VAL,D,1,91


Same as before, but now the two trios bind more close together and all 6 are going to be grouped in the same site, and now the nonredundant_site is useful.

<br>

In [250]:
pdb = "4kh1"
print(multiple_updates[pdb.upper()])
get_error(pdb)

{'pdb': '4KH1', 'mods': [[{'auth_asym_id': 'B', 'auth_comp_id': 'CTP', 'auth_seq_id': '202'}], [{'auth_asym_id': 'B', 'auth_comp_id': 'MG', 'auth_seq_id': '203'}], [{'auth_asym_id': 'B', 'auth_comp_id': 'UTP', 'auth_seq_id': '204'}]]}
ENTRIES: 1 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
4,ASD00330000_2,pyrI,Escherichia coli,E8Y329,4KH1,ASD00150003;ASD00150002;ASD00330030,CTP;MG;UTP,B;B;B,Lig,Inhibitor,"[(2R,3S,4R,5R)-5-(4-amino-2-oxo-pyrimidin-1-yl...",202;203;204,Inner Protein Regulator,Inner Protein,24138583,New Paradigm for Allosteric Regulation of Esch...,No,"Chain B:LEU7,GLN8,VAL9,ALA11,ILE12,VAL17,ASP19..."


In [251]:
solve_error(pdb, {pdb.upper(): 
                  multiple_updates[pdb.upper()]
}, auto_site_grouping=True, stringent_site_grouping=False)

4KH1 [[{'auth_asym_id': 'B', 'auth_comp_id': 'CTP', 'auth_seq_id': '202'}], [{'auth_asym_id': 'B', 'auth_comp_id': 'MG', 'auth_seq_id': '203'}], [{'auth_asym_id': 'B', 'auth_comp_id': 'UTP', 'auth_seq_id': '204'}]]


In [252]:
Site.get(Site.pdb == pdb).modulator, Site.get(Site.pdb == pdb).related_sites

({'label_asym_id': ['G', 'H', 'I']},
 {'equivalent': [{'other_site': {'label_asym_id': ['M', 'N', 'O']},
    'res_of_other_in_site': 1.0,
    'res_of_site_in_other': 1.0}],
  'nonequivalent': []})

In [253]:
Site.get(Site.pdb == pdb).protein_residues.label_asym_id.unique(), Site.get(Site.pdb == pdb).nonredundant_site.protein_residues.label_asym_id.unique()

(array(['B', 'D'], dtype=object), array(['B', 'D'], dtype=object))

Same as before, but not grouping the 6 again.

<br>

In [254]:
pdb = "4mz7"
print(multiple_updates[pdb.upper()])
get_error(pdb)

{'pdb': '4MZ7', 'mods': [[{'auth_asym_id': 'A', 'auth_comp_id': 'DGT', 'auth_seq_id': '703'}], [{'auth_asym_id': 'A', 'auth_comp_id': 'DGT', 'auth_seq_id': '704'}]]}
ENTRIES: 1 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
13,ASD09690000_1,SAMHD1,Homo sapiens,Q9Y3Z3,4MZ7,ASD09690001;ASD09690001,DGT;DGT,A;A,Lig,Activator,"[[(2R,3S,5R)-5-(2-amino-6-oxo-1H-purin-9-yl)-3...",703;704,Inner Protein Regulator,Protein-Protein Interaction,24217394,Structural insight into dGTP-dependent activat...,No,4MZ7


In [255]:
multiple_updates[pdb.upper()]

{'pdb': '4MZ7',
 'mods': [[{'auth_asym_id': 'A', 'auth_comp_id': 'DGT', 'auth_seq_id': '703'}],
  [{'auth_asym_id': 'A', 'auth_comp_id': 'DGT', 'auth_seq_id': '704'}]]}

In [256]:
solve_error(pdb, {pdb.upper(): {
    'pdb': '4MZ7',
    'mods': [
        {"label_asym_id": ["F", "G", "L"]}
        ]
}
}, auto_site_grouping=True, stringent_site_grouping=False)

4MZ7 [{'label_asym_id': ['F', 'G', 'L']}]


In [257]:
Site.get(Site.pdb == pdb).modulator, Site.get(Site.pdb == pdb).related_sites

({'label_asym_id': ['E', 'H', 'I']},
 {'equivalent': [{'other_site': {'label_asym_id': ['F', 'G', 'L']},
    'res_of_other_in_site': 0.9428571428571428,
    'res_of_site_in_other': 1.0}],
  'nonequivalent': [{'other_site': {'label_asym_id': ['D']},
    'res_of_other_in_site': 0.03125,
    'res_of_site_in_other': 0.030303030303030304},
   {'other_site': {'label_asym_id': ['K']},
    'res_of_other_in_site': 0.02857142857142857,
    'res_of_site_in_other': 0.030303030303030304}]})

The annotated molecules bind together with other DGTs but the annotated ones do not bind together; acc. to the primary reference, the duos of DGT or DGT with ATP form the allosteric sites and should all be annotated.

<br>

In [258]:
pdb = "4p02"
print(multiple_updates[pdb.upper()])
get_error(pdb)

{'pdb': '4P02', 'mods': [[{'auth_asym_id': 'A', 'auth_comp_id': 'C2E', 'auth_seq_id': '918'}], [{'auth_asym_id': 'A', 'auth_comp_id': 'C2E', 'auth_seq_id': '919'}]]}
ENTRIES: 1 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
10,ASD07500000_2,RSP_0333,Rhodobacter sphaeroides,Q3J125,4P02,ASD06980001;ASD06980001,C2E;C2E,A;A,Lig,Activator,"9,9'-[(2R,3R,3aS,5S,7aR,9R,10R,10aS,12S,14aR)-...",918;919,Inner Protein Regulator,Inner Protein,24704788,Mechanism of activation of bacterial cellulose...,No,"Chain A:VAL615,GLY614,ILE672,GLN577,VAL671,ARG..."


In [259]:
solve_error(pdb, {pdb.upper(): 
                  multiple_updates[pdb.upper()]
}, auto_site_grouping=True, stringent_site_grouping=False)

4P02 [[{'auth_asym_id': 'A', 'auth_comp_id': 'C2E', 'auth_seq_id': '918'}], [{'auth_asym_id': 'A', 'auth_comp_id': 'C2E', 'auth_seq_id': '919'}]]


In [260]:
Site.get(Site.pdb == pdb).modulator, Site.get(Site.pdb == pdb).related_sites

({'label_asym_id': ['E', 'F']}, {'equivalent': [], 'nonequivalent': []})

Similarly to 2V0N and 4DN0, the two annotated molecules bind together stacked.

<br>

In [261]:
pdb = "4pkn"
print(multiple_updates[pdb.upper()])
get_error(pdb)

{'pdb': '4PKN', 'mods': [[{'auth_asym_id': 'A', 'auth_comp_id': 'ADP', 'auth_seq_id': '601'}], [{'auth_asym_id': 'A', 'auth_comp_id': 'BEF', 'auth_seq_id': '602'}]]}
ENTRIES: 1 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
4,ASD01360000_1,groEL,Escherichia coli,Q548M1,4PKN,ASD01720027;ASD01360014,ADP;BEF,A;A,Lig;Ion,Regulator,"[(2R,3S,4R,5R)-5-(6-aminopurin-9-yl)-3,4-dihyd...",601;602,Inner Protein Regulator,Inner Protein,25136110,Formation and structures of GroEL:GroES2 chape...,No,"Chain A:THR30,LEU31,GLY32,PRO33,LYS51,ASP52,GL..."


In [262]:
multiple_updates[pdb.upper()]

{'pdb': '4PKN',
 'mods': [[{'auth_asym_id': 'A', 'auth_comp_id': 'ADP', 'auth_seq_id': '601'}],
  [{'auth_asym_id': 'A', 'auth_comp_id': 'BEF', 'auth_seq_id': '602'}]]}

In [263]:
solve_error(pdb, {pdb.upper(): {
    'pdb': '4PKN',
    'mods': [{"label_asym_id": ["CA", "DA", "EA", "FA"]}]
}
}, auto_site_grouping=True, stringent_site_grouping=False)

4PKN [{'label_asym_id': ['CA', 'DA', 'EA', 'FA']}]


In [264]:
Site.get(Site.pdb == pdb).modulator, Site.get(Site.pdb == pdb).related_sites

({'label_asym_id': ['AB', 'BB', 'CB', 'DB']},
 {'equivalent': [{'other_site': {'label_asym_id': ['YB', 'ZB', 'AC', 'BC']},
    'res_of_other_in_site': 0.9473684210526315,
    'res_of_site_in_other': 0.9473684210526315},
   {'other_site': {'label_asym_id': ['CA', 'DA', 'EA', 'FA']},
    'res_of_other_in_site': 0.9736842105263158,
    'res_of_site_in_other': 0.9736842105263158},
   {'other_site': {'label_asym_id': ['CC', 'DC', 'EC', 'FC']},
    'res_of_other_in_site': 1.0,
    'res_of_site_in_other': 0.9473684210526315},
   {'other_site': {'label_asym_id': ['EB', 'FB', 'GB', 'HB']},
    'res_of_other_in_site': 0.972972972972973,
    'res_of_site_in_other': 0.9473684210526315},
   {'other_site': {'label_asym_id': ['GA', 'HA', 'IA', 'JA']},
    'res_of_other_in_site': 0.9743589743589743,
    'res_of_site_in_other': 1.0},
   {'other_site': {'label_asym_id': ['IB', 'JB', 'KB', 'LB']},
    'res_of_other_in_site': 0.9722222222222222,
    'res_of_site_in_other': 0.9210526315789473},
   {'other_

The annotated modulators indeed bind together, and moreover are together with a Mg and a K ions.

<br>

In [265]:
pdb = "4pko"
print(multiple_updates[pdb.upper()])
get_error(pdb)

{'pdb': '4PKO', 'mods': [[{'auth_asym_id': 'A', 'auth_comp_id': 'ADP', 'auth_seq_id': '601'}], [{'auth_asym_id': 'A', 'auth_comp_id': 'BEF', 'auth_seq_id': '602'}]]}
ENTRIES: 1 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
5,ASD01360000_1,groEL,Escherichia coli,Q548M1,4PKO,ASD01720027;ASD01360014,ADP;BEF,A;A,Lig;Ion,Regulator,"[(2R,3S,4R,5R)-5-(6-aminopurin-9-yl)-3,4-dihyd...",601;602,Inner Protein Regulator,Inner Protein,25136110,Formation and structures of GroEL:GroES2 chape...,No,"Chain A:THR30,LEU31,GLY32,PRO33,LYS51,ASP52,GL..."


In [266]:
multiple_updates[pdb.upper()]

{'pdb': '4PKO',
 'mods': [[{'auth_asym_id': 'A', 'auth_comp_id': 'ADP', 'auth_seq_id': '601'}],
  [{'auth_asym_id': 'A', 'auth_comp_id': 'BEF', 'auth_seq_id': '602'}]]}

In [267]:
solve_error(pdb, {pdb.upper(): {
    'pdb': '4PKO',
    'mods': [{"label_asym_id": ["CA", "DA", "EA", "FA"]}]
}
}, auto_site_grouping=True, stringent_site_grouping=False)

4PKO [{'label_asym_id': ['CA', 'DA', 'EA', 'FA']}]


In [268]:
Site.get(Site.pdb == pdb).modulator, Site.get(Site.pdb == pdb).related_sites

({'label_asym_id': ['AB', 'BB', 'CB', 'DB']},
 {'equivalent': [{'other_site': {'label_asym_id': ['YB', 'ZB', 'AC', 'BC']},
    'res_of_other_in_site': 0.9736842105263158,
    'res_of_site_in_other': 0.9736842105263158},
   {'other_site': {'label_asym_id': ['CA', 'DA', 'EA', 'FA']},
    'res_of_other_in_site': 0.9,
    'res_of_site_in_other': 0.9473684210526315},
   {'other_site': {'label_asym_id': ['CC', 'DC', 'EC', 'FC']},
    'res_of_other_in_site': 1.0,
    'res_of_site_in_other': 0.9210526315789473},
   {'other_site': {'label_asym_id': ['EB', 'FB', 'GB', 'HB']},
    'res_of_other_in_site': 1.0,
    'res_of_site_in_other': 0.9736842105263158},
   {'other_site': {'label_asym_id': ['GA', 'HA', 'IA', 'JA']},
    'res_of_other_in_site': 0.9024390243902439,
    'res_of_site_in_other': 0.9736842105263158},
   {'other_site': {'label_asym_id': ['IB', 'JB', 'KB', 'LB']},
    'res_of_other_in_site': 0.95,
    'res_of_site_in_other': 1.0},
   {'other_site': {'label_asym_id': ['KA', 'LA', 'MA',

Same as before.

<br>

In [269]:
pdb = "4tme"
print(multiple_updates[pdb.upper()])
get_error(pdb)

{'pdb': '4TME', 'mods': [[{'auth_asym_id': 'A', 'auth_comp_id': 'ETA', 'auth_seq_id': '219'}], [{'auth_asym_id': 'A', 'auth_comp_id': 'ETA', 'auth_seq_id': '220'}], [{'auth_asym_id': 'B', 'auth_comp_id': 'ETA', 'auth_seq_id': '219'}], [{'auth_asym_id': 'B', 'auth_comp_id': 'ETA', 'auth_seq_id': '220'}], [{'auth_asym_id': 'C', 'auth_comp_id': 'ETA', 'auth_seq_id': '219'}], [{'auth_asym_id': 'C', 'auth_comp_id': 'ETA', 'auth_seq_id': '220'}]]}
ENTRIES: 1 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
10,ASD13000000_1,eutL,Clostridium perfringens,B1BQ33,4TME,ASD13000001,ETA,"A,B,C",Lig,Inhibitor,ETHANOLAMINE,219/220,Inner Protein Regulator,Allosteric Position,25752492,An allosteric model for control of pore openin...,No,"Chain B:HIS220,LEU218,GLU219,HIS221"


In [270]:
multiple_updates[pdb.upper()]

{'pdb': '4TME',
 'mods': [[{'auth_asym_id': 'A', 'auth_comp_id': 'ETA', 'auth_seq_id': '219'}],
  [{'auth_asym_id': 'A', 'auth_comp_id': 'ETA', 'auth_seq_id': '220'}],
  [{'auth_asym_id': 'B', 'auth_comp_id': 'ETA', 'auth_seq_id': '219'}],
  [{'auth_asym_id': 'B', 'auth_comp_id': 'ETA', 'auth_seq_id': '220'}],
  [{'auth_asym_id': 'C', 'auth_comp_id': 'ETA', 'auth_seq_id': '219'}],
  [{'auth_asym_id': 'C', 'auth_comp_id': 'ETA', 'auth_seq_id': '220'}]]}

In [271]:
solve_error(pdb, {pdb.upper(): {
    'pdb': '4TME',
    'mods': [{"label_asym_id": ["E", "F"]}]
}
}, auto_site_grouping=True, stringent_site_grouping=False)

4TME [{'label_asym_id': ['E', 'F']}]


In [272]:
Site.get(Site.pdb == pdb).modulator, Site.get(Site.pdb == pdb).related_sites

({'label_asym_id': ['E', 'F']},
 {'equivalent': [{'other_site': {'label_asym_id': ['H', 'I']},
    'res_of_other_in_site': 1.0,
    'res_of_site_in_other': 1.0},
   {'other_site': {'label_asym_id': ['J', 'K']},
    'res_of_other_in_site': 1.0,
    'res_of_site_in_other': 1.0}],
  'nonequivalent': []})

There seems to be one pair of ETA on each protein chain occupying the same site, although the residue numbers on the dataset do not match the ones available in the PDB. One pair is going to be annotated and the other pairs should be automatically detected as equivalent.

<br>

In [273]:
pdb = "4wx2"
print(multiple_updates[pdb.upper()])
get_error(pdb)

{'pdb': '4WX2', 'mods': [[{'auth_asym_id': 'A', 'auth_comp_id': 'F6F', 'auth_seq_id': '301', 'pdbx_PDB_ins_code': 'A'}], [{'auth_asym_id': 'A', 'auth_comp_id': 'F6F', 'auth_seq_id': '302', 'pdbx_PDB_ins_code': 'A'}], [{'auth_asym_id': 'A', 'auth_comp_id': 'F6F', 'auth_seq_id': '404', 'pdbx_PDB_ins_code': 'B'}], [{'auth_asym_id': 'B', 'auth_comp_id': 'F6F', 'auth_seq_id': '301', 'pdbx_PDB_ins_code': 'A'}], [{'auth_asym_id': 'B', 'auth_comp_id': 'F6F', 'auth_seq_id': '302', 'pdbx_PDB_ins_code': 'A'}], [{'auth_asym_id': 'B', 'auth_comp_id': 'F6F', 'auth_seq_id': '404', 'pdbx_PDB_ins_code': 'B'}]]}
ENTRIES: 1 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
4,ASD02900000_2,trpB,Salmonella typhimurium,P0A2K1,4WX2,ASD02908001,F6F,"A,B",Lig,,2-{[4-(TRIFLUOROMETHOXY)BENZOYL]AMINO}ETHYL DI...,"301A,302A,404B",Protein-Protein Interaction,Protein-Protein Interaction,26708480,Visualizing the tunnel in tryptophan synthase ...,Yes,"Chain B:ASP176,ASN171,GLY193,SER178,CYS170,ALA..."


In [274]:
multiple_updates[pdb.upper()]

{'pdb': '4WX2',
 'mods': [[{'auth_asym_id': 'A',
    'auth_comp_id': 'F6F',
    'auth_seq_id': '301',
    'pdbx_PDB_ins_code': 'A'}],
  [{'auth_asym_id': 'A',
    'auth_comp_id': 'F6F',
    'auth_seq_id': '302',
    'pdbx_PDB_ins_code': 'A'}],
  [{'auth_asym_id': 'A',
    'auth_comp_id': 'F6F',
    'auth_seq_id': '404',
    'pdbx_PDB_ins_code': 'B'}],
  [{'auth_asym_id': 'B',
    'auth_comp_id': 'F6F',
    'auth_seq_id': '301',
    'pdbx_PDB_ins_code': 'A'}],
  [{'auth_asym_id': 'B',
    'auth_comp_id': 'F6F',
    'auth_seq_id': '302',
    'pdbx_PDB_ins_code': 'A'}],
  [{'auth_asym_id': 'B',
    'auth_comp_id': 'F6F',
    'auth_seq_id': '404',
    'pdbx_PDB_ins_code': 'B'}]]}

In [275]:
solve_error(pdb, {pdb.upper(): {
    'pdb': '4WX2',
    'mods': [{"auth_comp_id": ["F6F"]}]
}
}, auto_site_grouping=True, stringent_site_grouping=False)

4WX2 [{'auth_comp_id': ['F6F']}]


In [276]:
Site.get(Site.pdb == pdb).modulator, Site.get(Site.pdb == pdb).related_sites

({'label_asym_id': ['H']},
 {'equivalent': [],
  'nonequivalent': [{'other_site': {'label_asym_id': ['C', 'D']},
    'res_of_other_in_site': 0.07692307692307693,
    'res_of_site_in_other': 0.09375}]})

In [277]:
[s.modulator for s in PDB.get(PDB.entry_id == pdb).sites]

[{'label_asym_id': ['H']}, {'label_asym_id': ['C', 'D']}]

Literature does not point to F6F being an allosteric modulator of this protein tryptophane synthase but all sites will be kept as allosteric because the protein has an allosteric regulation mechanism with its own substrates/products. The letters at the end of the residue numbers were originally interpreted as insertion codes but are the chain id of the respective residue numbers.

<br>

In [278]:
pdb = "4zhx"
print(multiple_updates[pdb.upper()])
get_error(pdb)

{'pdb': '4ZHX', 'mods': [[{'auth_asym_id': 'E', 'auth_comp_id': 'C2Z', 'auth_seq_id': '401'}], [{'auth_asym_id': 'F', 'auth_comp_id': 'C2Z', 'auth_seq_id': '402'}]]}
ENTRIES: 1 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
2,ASD00310000_7,PRKAA2,Homo sapiens,P54646,4ZHX,ASD00310181,C2Z,"E,F",Lig,Activator,5-(5-hydroxyl-isoxazol-3-yl)-furan-2-phosphoni...,401402,Allosteric Function,Allosteric Position,26952388,Structural basis of allosteric and synergistic...,No,4ZHX


In [279]:
solve_error(pdb, {pdb.upper(): 
                  multiple_updates[pdb.upper()]
}, auto_site_grouping=True, stringent_site_grouping=False)

4ZHX [[{'auth_asym_id': 'E', 'auth_comp_id': 'C2Z', 'auth_seq_id': '401'}], [{'auth_asym_id': 'F', 'auth_comp_id': 'C2Z', 'auth_seq_id': '402'}]]


In [280]:
Site.get(Site.pdb == pdb).modulator, Site.get(Site.pdb == pdb).related_sites

({'label_asym_id': ['K', 'L']}, {'equivalent': [], 'nonequivalent': []})

They bind together in the same site of a chain.

<br>

In [281]:
pdb = "5im3"
print(multiple_updates[pdb.upper()])
get_error(pdb)

{'pdb': '5IM3', 'mods': [[{'auth_asym_id': 'A', 'auth_comp_id': 'DTP', 'auth_seq_id': '1'}], [{'auth_asym_id': 'B', 'auth_comp_id': 'DTP', 'auth_seq_id': '001'}]]}
ENTRIES: 1 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
9,ASD16930000_1,nrdA,Pseudomonas aeruginosa (strain ATCC 15692 / DS...,Q9I4I1,5IM3,ASD00870004,DTP,"A,B",Lig,Regulator,2'-DEOXYADENOSINE 5'-TRIPHOSPHATE,1001,Allosteric Fuction,Allosteric Position,27133024,Structural Mechanism of Allosteric Activity Re...,No,"Chain B:LYS48,ILE101,PRO43,TYR44,VAL34,ALA52,I..."


In [282]:
multiple_updates[pdb.upper()]

{'pdb': '5IM3',
 'mods': [[{'auth_asym_id': 'A', 'auth_comp_id': 'DTP', 'auth_seq_id': '1'}],
  [{'auth_asym_id': 'B', 'auth_comp_id': 'DTP', 'auth_seq_id': '001'}]]}

In [283]:
solve_error(pdb, {pdb.upper(): {
    'pdb': '5IM3',
    'mods': [
        [{"auth_asym_id": "A", "auth_comp_id": "DTP", "auth_seq_id": "1001"}]
        ]
}
}, auto_site_grouping=True, stringent_site_grouping=False)

5IM3 [[{'auth_asym_id': 'A', 'auth_comp_id': 'DTP', 'auth_seq_id': '1001'}]]


In [284]:
Site.get(Site.pdb == pdb).modulator, Site.get(Site.pdb == pdb).related_sites

({'label_asym_id': ['C', 'D']},
 {'equivalent': [{'other_site': {'label_asym_id': ['H', 'I']},
    'res_of_other_in_site': 0.9666666666666667,
    'res_of_site_in_other': 0.9354838709677419}],
  'nonequivalent': [{'other_site': {'label_asym_id': ['F']},
    'res_of_other_in_site': 0.0,
    'res_of_site_in_other': 0.0},
   {'other_site': {'label_asym_id': ['G']},
    'res_of_other_in_site': 0.0,
    'res_of_site_in_other': 0.0}]})

The annotated resi is actually 1001 and not 1, 001 and both 1001 residues of A and B chains form the same site (together with 1002).

<br>

In [285]:
pdb = "5jyo"
print(multiple_updates[pdb.upper()])
get_error(pdb)

{'pdb': '5JYO', 'mods': [[{'auth_asym_id': 'C', 'auth_comp_id': '63J', 'auth_seq_id': '601'}], [{'auth_asym_id': 'D', 'auth_comp_id': '63J', 'auth_seq_id': '601'}], [{'auth_asym_id': 'F', 'auth_comp_id': '63J', 'auth_seq_id': '601'}]]}
ENTRIES: 1 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
6,ASD08170000_1,GLS,Homo sapiens,O94925,5JYO,,63J,"C,D,F",Lig,Inhibitor,2-(pyridin-2-yl)-N-(5-{4-[6-({[3-(trifluoromet...,601,Allosteric Fuction,Allosteric Position,,Allosteric inhibition of KGA,No,5JYO


In [286]:
solve_error(pdb, {pdb.upper(): 
                  multiple_updates[pdb.upper()]
}, auto_site_grouping=True, stringent_site_grouping=False)

5JYO [[{'auth_asym_id': 'C', 'auth_comp_id': '63J', 'auth_seq_id': '601'}], [{'auth_asym_id': 'D', 'auth_comp_id': '63J', 'auth_seq_id': '601'}], [{'auth_asym_id': 'F', 'auth_comp_id': '63J', 'auth_seq_id': '601'}]]


In [287]:
Site.get(Site.pdb == pdb).modulator, Site.get(Site.pdb == pdb).related_sites

({'label_asym_id': ['I', 'J']},
 {'equivalent': [{'other_site': {'label_asym_id': ['K', 'L']},
    'res_of_other_in_site': 0.8181818181818182,
    'res_of_site_in_other': 1.0}],
  'nonequivalent': []})

In [288]:
Site.get(Site.pdb == pdb).protein_residues.label_asym_id.unique(), Site.get(Site.pdb == pdb).nonredundant_site.protein_residues.label_asym_id.unique()

(array(['A', 'C', 'D', 'E'], dtype=object), array(['A'], dtype=object))

The annotated modulator indeed binds in pairs, with a pair belonging to auth_asym_ids C and D, and the other pair both to F. The first pair binds in an interface in what looks like equivalent sites, and the second one appears to bind also in an equivalent site only in contact with one of the chains, but should be detected as equivalent.

<br>

In [289]:
pdb = "5s4x"
print(multiple_updates[pdb.upper()])
get_error(pdb)

{'pdb': '5S4X', 'mods': [[{'auth_asym_id': 'B', 'auth_comp_id': 'JHD'}], [{'auth_asym_id': 'D', 'auth_comp_id': 'JHD'}]]}
ENTRIES: 1 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
3,ASD03750000_5,TUBB2B,Bos taurus,Q6B856,5S4X,,JHD,"B, D",Lig,Regulator,"1-(3,4-dimethoxyphenyl)methanamine",,,,,,No,


In [290]:
solve_error(pdb, {pdb.upper(): 
                  multiple_updates[pdb.upper()]
}, auto_site_grouping=True, stringent_site_grouping=False)

5S4X [[{'auth_asym_id': 'B', 'auth_comp_id': 'JHD'}], [{'auth_asym_id': 'D', 'auth_comp_id': 'JHD'}]]


In [291]:
Site.get(Site.pdb == pdb).modulator, Site.get(Site.pdb == pdb).related_sites

({'label_asym_id': ['O', 'P']},
 {'equivalent': [{'other_site': {'label_asym_id': ['V']},
    'res_of_other_in_site': 0.9333333333333333,
    'res_of_site_in_other': 0.717948717948718}],
  'nonequivalent': []})

The annotated modulator molecules belonging to chain B are a pair that binds together, while in chain D there is a single one, but in both cases the site is the same and should be detected as equivalent. Interestingly, the modulator is not present in all the copies of the protein.

<br>

In [292]:
pdb = "6q4d"
print(multiple_updates[pdb.upper()])
get_error(pdb)

{'pdb': '6Q4D', 'mods': [[{'auth_asym_id': 'A', 'auth_comp_id': 'HHT', 'auth_seq_id': '302'}], [{'auth_asym_id': 'A', 'auth_comp_id': 'HHT', 'auth_seq_id': '303'}]]}
ENTRIES: 3 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
4,ASD04590000_1,CDK2,Homo sapiens,P24941,6Q4D,,HHT,A,Lig,Inhibitor,2-(4-bromanyl-2-methoxy-phenyl)ethanoic acid,302；303,,,,,,6Q4D


In [293]:
df.query("allosteric_pdb == '6Q4D'")

Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
1582,ASD04590000_1,CDK2,Homo sapiens,P24941,6Q4D,,HHT,A,Lig,Inhibitor,2-(4-bromanyl-2-methoxy-phenyl)ethanoic acid,302；303,,,,,,6Q4D
1583,ASD04590000_1,CDK2,Homo sapiens,P24941,6Q4D,,HHT,A,Lig,Inhibitor,2-(4-bromanyl-2-methoxy-phenyl)ethanoic acid,304,,,,,,6Q4D
1584,ASD04590000_1,CDK2,Homo sapiens,P24941,6Q4D,,HHT,A,Lig,Inhibitor,2-(4-bromanyl-2-methoxy-phenyl)ethanoic acid,305,,,,,,6Q4D


In [294]:
solve_error(pdb, {pdb.upper(): 
                  multiple_updates[pdb.upper()]
}, auto_site_grouping=True, stringent_site_grouping=False)

6Q4D [[{'auth_asym_id': 'A', 'auth_comp_id': 'HHT', 'auth_seq_id': '302'}], [{'auth_asym_id': 'A', 'auth_comp_id': 'HHT', 'auth_seq_id': '303'}]]


In [295]:
Site.get(Site.pdb == pdb).modulator, Site.get(Site.pdb == pdb).related_sites

({'label_asym_id': ['C', 'D', 'E']},
 {'equivalent': [],
  'nonequivalent': [{'other_site': {'label_asym_id': ['B']},
    'res_of_other_in_site': 0.09523809523809523,
    'res_of_site_in_other': 0.09523809523809523},
   {'other_site': {'label_asym_id': ['F']},
    'res_of_other_in_site': 0.0,
    'res_of_site_in_other': 0.0},
   {'other_site': {'label_asym_id': ['G']},
    'res_of_other_in_site': 0.0,
    'res_of_site_in_other': 0.0}]})

The annotated modulators indeed bind together, moreover with an additional molecule of the modulator that is not part of the annotation but appears in an additional simple annotation. There is also an annotation of another copy of the molecule that binds separately, and more importantly the molecule that binds in the active site is not included in the dataset as allosteric. 

<br>

In [296]:
pdb = "6vvh"
print(multiple_updates[pdb.upper()])
get_error(pdb)

{'pdb': '6VVH', 'mods': [[{'auth_asym_id': 'AAA', 'auth_comp_id': 'LYS'}], [{'auth_asym_id': 'BBB', 'auth_comp_id': 'LYS'}], [{'auth_asym_id': 'CCC', 'auth_comp_id': 'LYS'}], [{'auth_asym_id': 'DDD', 'auth_comp_id': 'LYS'}]]}
ENTRIES: 1 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
7,ASD20880000_1,DHDPS1,Arabidopsis thaliana,Q9LZX6,6VVH,,LYS,"AAA, BBB, CCC, DDD",Lig,Regulator,LYSINE,,,,,,,


In [297]:
multiple_updates[pdb.upper()]

{'pdb': '6VVH',
 'mods': [[{'auth_asym_id': 'AAA', 'auth_comp_id': 'LYS'}],
  [{'auth_asym_id': 'BBB', 'auth_comp_id': 'LYS'}],
  [{'auth_asym_id': 'CCC', 'auth_comp_id': 'LYS'}],
  [{'auth_asym_id': 'DDD', 'auth_comp_id': 'LYS'}]]}

In [298]:
solve_error(pdb, {pdb.upper(): {
    'pdb': '6VVH',
    'mods': [
        {'label_asym_id': ["E", "H", "J", "O"]}
    ]
}
}, auto_site_grouping=True, stringent_site_grouping=False)

6VVH [{'label_asym_id': ['E', 'H', 'J', 'O']}]


In [299]:
Site.get(Site.pdb == pdb).modulator, Site.get(Site.pdb == pdb).related_sites

({'label_asym_id': ['E', 'H']},
 {'equivalent': [{'other_site': {'label_asym_id': ['J', 'O']},
    'res_of_other_in_site': 1.0,
    'res_of_site_in_other': 0.9411764705882353}],
  'nonequivalent': []})

The annotated modulator binds in pairs in what looks like are equivalent sites which should be automatically detected.

<br>

##### couldn't retrieve modulator in pdb with using residue name

In [300]:
pdb = "5j8v"
print(multiple_updates[pdb.upper()])
get_error(pdb)

{'pdb': '5J8V', 'mods': [[{'auth_asym_id': 'A', 'auth_comp_id': 'CA'}]]}
ENTRIES: 1 SITES: []
ERROR: ["5j8v, [[{'auth_asym_id': 'A', 'auth_comp_id': 'CA'}]]: couldn't retrieve modulator in pdb with using residue name"]


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
3,ASD06930000_2,RYR1,Oryctolagus cuniculus,P11716,5J8V,,Ca2+,A,Ion,Activator,Ca2+,,,,27573175,Structural insights into Ca(2+)-activated long...,,


In [301]:
errors.update({pdb.lower(): "Wrong annotation" for pdb in ["5J8V"]})

It is a cryoEM structure that doesn't have any residues, ligands or ions besides the protein.

<br>

##### using only residue name retrieves more than one site(group)

In [302]:
pdb = "4p86"
print(multiple_updates[pdb.upper()])
get_error(pdb)

{'pdb': '4P86', 'mods': [[{'auth_asym_id': 'A', 'auth_comp_id': '5GP', 'auth_seq_id': '182183'}]]}
ENTRIES: 1 SITES: []
ERROR: ["4p86, [[{'label_asym_id': 'E', 'auth_comp_id': '5GP', 'auth_seq_id': '201', 'pdbx_PDB_ins_code': '?'}], [{'label_asym_id': 'G', 'auth_comp_id': '5GP', 'auth_seq_id': '201', 'pdbx_PDB_ins_code': '?'}], [{'label_asym_id': 'H', 'auth_comp_id': '5GP', 'auth_seq_id': '202', 'pdbx_PDB_ins_code': '?'}], [{'label_asym_id': 'I', 'auth_comp_id': '5GP', 'auth_seq_id': '201', 'pdbx_PDB_ins_code': '?'}], [{'label_asym_id': 'J', 'auth_comp_id': '5GP', 'auth_seq_id': '202', 'pdbx_PDB_ins_code': '?'}], [{'label_asym_id': 'K', 'auth_comp_id': '5GP', 'auth_seq_id': '201', 'pdbx_PDB_ins_code': '?'}]]: using only residue name retrieves more than one site(group)"]


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
4,ASD16540000_1,pyrR,Bacillus subtilis (strain 168),P39765,4P86,,5GP,A,Lig,Regulator,GUANOSINE-5'-MONOPHOSPHATE,182 183,Allosteric Function,Allosteric Position,,,No,"Chain A:VAL107,THR113,GLY111,LYS40,PRO85,THR11..."


In [303]:
multiple_updates[pdb.upper()]

{'pdb': '4P86',
 'mods': [[{'auth_asym_id': 'A',
    'auth_comp_id': '5GP',
    'auth_seq_id': '182183'}]]}

In [304]:
solve_error(pdb, {pdb.upper(): {
    'pdb': '4P86',
    'mods': [
        [{"auth_asym_id": "A", "auth_comp_id": "5GP", "auth_seq_id": "201"}]
    ]
}
}, auto_site_grouping=True, stringent_site_grouping=False)

4P86 [[{'auth_asym_id': 'A', 'auth_comp_id': '5GP', 'auth_seq_id': '201'}]]


In [305]:
Site.get(Site.pdb == pdb).modulator, Site.get(Site.pdb == pdb).related_sites

({'label_asym_id': ['E']},
 {'equivalent': [{'other_site': {'label_asym_id': ['G']},
    'res_of_other_in_site': 1.0,
    'res_of_site_in_other': 0.9130434782608695},
   {'other_site': {'label_asym_id': ['J']},
    'res_of_other_in_site': 1.0,
    'res_of_site_in_other': 0.8695652173913043},
   {'other_site': {'label_asym_id': ['K']},
    'res_of_other_in_site': 1.0,
    'res_of_site_in_other': 0.9565217391304348}],
  'nonequivalent': [{'other_site': {'label_asym_id': ['H', 'I']},
    'res_of_other_in_site': 0.11764705882352941,
    'res_of_site_in_other': 0.08695652173913043}]})

The proposed correction for the modulator annotation is wrong, as the residue ids were separated by a space instead of ';', ',' or other character and were fused together. In any case, those residue numbers are not present in the PDB. The real allosteric modulators seem to bind to the same site in all 4 subunits and in the same site as other allosteric modulators of this protein as seen in 1XZ8, and the other sites where other molecules of the modulator bind should be automatically recognized as nonequivalent.

<br>

In [306]:
pdb = "5tq2"
print(multiple_updates[pdb.upper()])
get_error(pdb)

{'pdb': '5TQ2', 'mods': [[{'auth_asym_id': 'A', 'auth_comp_id': 'ZN', 'auth_seq_id': '501'}], [{'auth_asym_id': 'B', 'auth_comp_id': 'ZN', 'auth_seq_id': '501'}]]}
ENTRIES: 1 SITES: []
ERROR: ["5tq2, [[{'label_asym_id': 'E', 'auth_comp_id': 'ZN', 'auth_seq_id': '501', 'pdbx_PDB_ins_code': '?'}], [{'label_asym_id': 'L', 'auth_comp_id': 'ZN', 'auth_seq_id': '401', 'pdbx_PDB_ins_code': '?'}]]: using only residue name retrieves more than one site(group)"]


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
5,ASD17140000_1,grin1,Xenopus laevis,A0A1L8F5J9,5TQ2,,ZN,"A,B",Ion,,ZINC ION,501,Inner Protein regulator,Inner Protein,27916457,Molecular Basis for Subtype Specificity and Hi...,No,


In [307]:
multiple_updates[pdb.upper()]

{'pdb': '5TQ2',
 'mods': [[{'auth_asym_id': 'A', 'auth_comp_id': 'ZN', 'auth_seq_id': '501'}],
  [{'auth_asym_id': 'B', 'auth_comp_id': 'ZN', 'auth_seq_id': '501'}]]}

In [308]:
solve_error(pdb, {pdb.upper(): {
    'pdb': '5TQ2',
    'mods': [
        [{"auth_comp_id": "ZN"}]
    ]
}
}, auto_site_grouping=True, stringent_site_grouping=False)

5TQ2 [[{'auth_comp_id': 'ZN'}]]


In [309]:
Site.get(Site.pdb == pdb).modulator, Site.get(Site.pdb == pdb).related_sites

({'label_asym_id': ['E']},
 {'equivalent': [],
  'nonequivalent': [{'other_site': {'label_asym_id': ['L']},
    'res_of_other_in_site': 0.0,
    'res_of_site_in_other': 0.0}]})

In [310]:
[s.modulator for s in PDB.get(PDB.entry_id == pdb).sites]

[{'label_asym_id': ['E']}, {'label_asym_id': ['L']}]

Zinc 501 is not found in chain B (it's 401) but since both chains are annotated and are part of the same receptor both Zincs are going to be annotated.

<br>

In [311]:
pdb = "8dd3"
print(multiple_updates[pdb.upper()])
get_error(pdb)

{'pdb': '8DD3', 'mods': [[{'auth_asym_id': 'Q', 'auth_comp_id': 'R63'}], [{'auth_asym_id': 'T', 'auth_comp_id': 'R63'}], [{'auth_asym_id': 'U', 'auth_comp_id': 'R63'}]]}
ENTRIES: 1 SITES: []
ERROR: ["8dd3, [[{'label_asym_id': 'Q', 'auth_comp_id': 'R63', 'auth_seq_id': '403', 'pdbx_PDB_ins_code': '?'}], [{'label_asym_id': 'T', 'auth_comp_id': 'R63', 'auth_seq_id': '403', 'pdbx_PDB_ins_code': '?'}], [{'label_asym_id': 'U', 'auth_comp_id': 'R63', 'auth_seq_id': '701', 'pdbx_PDB_ins_code': '?'}]]: using only residue name retrieves more than one site(group)"]


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
2,ASD01140000_3,GABRA1,Homo sapiens,P14867,8DD3,ASD01142004,R63,"Q,T,U",Compound,,"methyl 4-ethyl-6,7-dimethoxy-9H-pyrido[3,4-b]i...",,,,35933426,,,8DD3


In [312]:
multiple_updates[pdb.upper()]

{'pdb': '8DD3',
 'mods': [[{'auth_asym_id': 'Q', 'auth_comp_id': 'R63'}],
  [{'auth_asym_id': 'T', 'auth_comp_id': 'R63'}],
  [{'auth_asym_id': 'U', 'auth_comp_id': 'R63'}]]}

In [313]:
solve_error(pdb, {pdb.upper(): {
    'pdb': '8DD3',
    'mods': [
        [{"auth_comp_id": "R63"}]
    ]
}
}, auto_site_grouping=True, stringent_site_grouping=False)

8DD3 [[{'auth_comp_id': 'R63'}]]


In [314]:
Site.get(Site.pdb == pdb).modulator, Site.get(Site.pdb == pdb).related_sites

({'label_asym_id': ['Q']},
 {'equivalent': [{'other_site': {'label_asym_id': ['T']},
    'res_of_other_in_site': 1.0,
    'res_of_site_in_other': 0.7586206896551724}],
  'nonequivalent': [{'other_site': {'label_asym_id': ['U']},
    'res_of_other_in_site': 0.0,
    'res_of_site_in_other': 0.0}]})

In [315]:
[s.modulator for s in PDB.get(PDB.entry_id == pdb).sites]

[{'label_asym_id': ['Q']}, {'label_asym_id': ['U']}]

All modulators bind together in the same site of a single protein chain.

<br>

In [316]:
errors

{'5uvg': 'Wrong annotation', '5j8v': 'Wrong annotation'}

In [317]:
pd.DF(error_entries)

Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
0,ASD15440000_1,UHRF1,Homo sapiens,Q96T88,5C6D,,USP7,"A,B",Pep,modulator,Ubiquitin carboxyl-terminal hydrolase 7,1-322,Protein-Protein Interaction,Protein-Protein Interaction,26299963,An Allosteric Interaction Links USP7 to Deubiq...,No,5C6D
1,ASD17360000_1,SMPD3,Homo sapiens,Q9NY59,5UVG,,CAT,A,Pep,Activator,soluble catalytic domain of nSMase2,,Inner Protein regulator,Inner Protein,28652336,Structure of human nSMase2 reveals an interdom...,No,5UVG
3,ASD06930000_2,RYR1,Oryctolagus cuniculus,P11716,5J8V,,Ca2+,A,Ion,Activator,Ca2+,,,,27573175,Structural insights into Ca(2+)-activated long...,,


## Remaining

In [318]:
remaining = (
    df.merge(
        pd.concat([prots, multiple, unknown]),
        how="outer", indicator=True
    )
    .query(f"_merge == 'left_only'").drop("_merge", axis=1)
)

### Processing

In [319]:
iterremaining = lambda x = (
    remaining.merge(pd.DF(processed + error_entries), how="outer", indicator=True)
    .query(f"_merge == 'left_only'").drop("_merge", axis=1)
): tqdm(x.iterrows(), total=len(x), smoothing=0)

for i, entry in iterremaining():
    process_entry(entry, updates={})
    processed.append(entry)

  0%|          | 0/2888 [00:00<?, ?it/s]

1KFL [[{'auth_asym_id': 'A', 'auth_comp_id': 'PHE', 'auth_seq_id': '1354'}]]
Downloading 1kfl
6AGM [[{'auth_asym_id': 'A', 'auth_comp_id': 'TYR', 'auth_seq_id': '601'}]]
Downloading 6agm
3PG9 [[{'auth_asym_id': 'A', 'auth_comp_id': 'TYR', 'auth_seq_id': '339'}]]
Downloading 3pg9
1OF6 [[{'auth_asym_id': 'A', 'auth_comp_id': 'DTY', 'auth_seq_id': '1370'}]]
Downloading 1of6
1OFR [[{'auth_asym_id': 'H', 'auth_comp_id': 'PHE', 'auth_seq_id': '1002'}]]
Downloading 1ofr
1OG0 [[{'auth_asym_id': 'A', 'auth_comp_id': 'PHE', 'auth_seq_id': '1012'}]]
Downloading 1og0
4UC5 [[{'auth_asym_id': 'A', 'auth_comp_id': 'PHE', 'auth_seq_id': '1354'}]]
Downloading 4uc5
3HRF [[{'auth_asym_id': 'A', 'auth_comp_id': 'P47', 'auth_seq_id': '1374'}]]
Downloading 3hrf
3NAX [[{'auth_asym_id': 'A', 'auth_comp_id': 'MP7', 'auth_seq_id': '363'}]]
Downloading 3nax
3ORX [[{'auth_asym_id': 'A', 'auth_comp_id': '1F8', 'auth_seq_id': '1'}]]
Downloading 3orx
3ORZ [[{'auth_asym_id': 'A', 'auth_comp_id': '2A2', 'auth_seq_id':

In [320]:
errors

{'5uvg': 'Wrong annotation',
 '5j8v': 'Wrong annotation',
 '2ha4': ['Molecules of the annotated modulator(s) bind close together but were not grouped'],
 '1yp2': ['Molecules of the annotated modulator(s) bind close together but were not grouped'],
 '1yp3': ['Molecules of the annotated modulator(s) bind close together but were not grouped'],
 '1yp4': ['Molecules of the annotated modulator(s) bind close together but were not grouped'],
 '1gph': ['Molecules of the annotated modulator(s) bind close together but were not grouped'],
 '1ecb': ['Molecules of the annotated modulator(s) bind close together but were not grouped'],
 '4eag': ['Molecules of the annotated modulator(s) bind close together but were not grouped'],
 '4eak': ['Molecules of the annotated modulator(s) bind close together but were not grouped'],
 '5ezv': ['Molecules of the annotated modulator(s) bind close together but were not grouped'],
 '7dtv': ['combine_sites failed; Molecules of the annotated modulator(s) bind close tog

In [321]:
len(errors)

122

#### Error correction

In [322]:
errors_groups()

{'using only residue name retrieves more than one site(group)': ['1dx5',
  '4tnr',
  '5opm',
  '6n82',
  '6oag',
  '6oah',
  '6qx2',
  '8dd2'],
 'Molecules of the annotated modulator(s) bind close together but were not grouped': ['11bg',
  '1e7c',
  '1ecb',
  '1egy',
  '1eup',
  '1gph',
  '1r1v',
  '1thc',
  '1vm1',
  '1wda',
  '1yp2',
  '1yp3',
  '1yp4',
  '1z62',
  '2d41',
  '2dew',
  '2dex',
  '2dw5',
  '2fsz',
  '2gq1',
  '2ha4',
  '2wrm',
  '3ao1',
  '3cqd',
  '3e3f',
  '3ete',
  '3f3t',
  '3f3u',
  '3f48',
  '3fyh',
  '3k8s',
  '3lsf',
  '3lsl',
  '3mzh',
  '3njq',
  '3umo',
  '3uqd',
  '3zl6',
  '4clz',
  '4dkt',
  '4eag',
  '4eak',
  '4gqq',
  '4i0u',
  '4lrl',
  '4m0z',
  '4ni0',
  '4oyo',
  '4p2t',
  '4p3h',
  '4ple',
  '4qfy',
  '4qsh',
  '4r8z',
  '4rew',
  '4rqo',
  '4z87',
  '5afk',
  '5btr',
  '5cnt',
  '5ezv',
  '5mcp',
  '5olk',
  '5s4r',
  '5s4s',
  '5tc3',
  '5ur3',
  '5ute',
  '5utn',
  '5uv3',
  '5v5d',
  '5v5e',
  '6b0z',
  '6brk',
  '6dja',
  '6i0m',
  '6i0o',
  

In [323]:
def solve_error(pdb, update, auto_site_grouping=True, stringent_site_grouping=True):
    if pdb in errors:
        errors.pop(pdb)
    entry = pd.DF(globals()["error_entries"]).query(f"allosteric_pdb == '{pdb.upper()}'")#.squeeze()
    assert len(entry.squeeze())>0
    globals()["error_entries"] = [
        row for i, row in (
            pd.DF(globals()["error_entries"])
            .merge(entry, how="outer", indicator=True)
            .query(f"_merge == 'left_only'").drop("_merge", axis=1)
            .iterrows()
        )
    ]
    # error_entries.remove(entry)
    updates = {pdb.upper(): {
                      "pdb": pdb.upper(), "mods": update
                  }} if update is not None else {}
    process_entry(entry.squeeze(), 
                  updates = updates, 
                  auto_site_grouping=auto_site_grouping, 
                  stringent_site_grouping=stringent_site_grouping)
    print("SITES:", [(s, s.modulator) for p in [PDB.get_or_none(PDB.entry_id == pdb)] if p is not None for s in p.sites])

##### PDB not found (status_code 404)

In [324]:
errors_groups()["PDB not found (status_code 404)"]

['1hwz', '3mvq', '3mw9', '3qmu', '4gar', '4gau', '4leg']

In [325]:
notfound = remaining.query(f"allosteric_pdb in {[pdb.upper() for pdb in errors_groups()['PDB not found (status_code 404)']]}")
notfound

Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
597,ASD01240000_1,GLUD1,Bos taurus,P00366,1HWZ,ASD00290006,GTP,A,Lig,Inhibitor,"[[(2R,3S,4R,5R)-5-(2-azanyl-6-oxidanylidene-1H...",553,Inner Protein Regulator,Inner Protein,11254391,Structures of bovine glutamate dehydrogenase c...,No,"Chain A:HIS209,GLY210,SER213,ARG217,HIS258,ARG..."
605,ASD01240000_1,GLUD1,Bos taurus,P00366,3MVQ,ASD00290006,GTP,A,Lig,Inhibitor,"[[(2R,3S,4R,5R)-5-(2-azanyl-6-oxidanylidene-1H...",503,Inner Protein Regulator,Inner Protein,21749647,A novel mechanism of V-type zinc inhibition of...,No,"Chain A:HIS209,GLY210,SER213,ARG217,LEU257,HIS..."
606,ASD01240000_1,GLUD1,Bos taurus,P00366,3MW9,ASD00290006,GTP,A,Lig,Inhibitor,"[[(2R,3S,4R,5R)-5-(2-azanyl-6-oxidanylidene-1H...",602,Inner Protein Regulator,Inner Protein,10425679,The structure of bovine glutamate dehydrogenas...,No,"Chain A:HIS209,GLY210,SER213,ARG217,HIS258,ARG..."
607,ASD01240000_1,GLUD1,Bos taurus,P00366,3MW9,ASD06128001,NAI,A,Lig,Inhibitor,"[[(2R,3S,4R,5R)-5-(3-aminocarbonyl-4H-pyridin-...",604,Inner Protein Regulator,Protein-Protein Interaction,10425679,The structure of bovine glutamate dehydrogenas...,No,"Chain A:HIS195,GLN205,GLY206,ASN387,ASN388,HIS..."
608,ASD01240000_1,GLUD1,Bos taurus,P00366,3QMU,ASD01240001,XEG,B,Lig,Inhibitor,"[(2R,3S)-2-(3,4-dihydroxyphenyl)-5,7-dihydroxy...",601,Inner Protein Regulator,Protein-Protein Interaction,21813650,Green Tea Polyphenols Control Dysregulated Glu...,No,"Chain A:ILE203,SER204,HIS209,SER393,ARG396; Ch..."
1608,ASD04800000_1,CTSK,Homo sapiens,P43235,4LEG,ASD04800001,1XF,A,Lig,Inhibitor,2-{[(carbamoylsulfanyl)acetyl]amino}benzoic acid,301,Inner Protein Regulator,Inner Protein,24518821,A novel allosteric mechanism in the cysteine p...,No,4LEG
2190,ASD10220000_1,PG_0396,Escherichia coli,Q7MAW7,4GAU,ASD10228001,NMY,A,Lig,Inhibitor,"(2R,3S,4R,5R,6R)-5-amino-2-(aminomethyl)-6-[(1...",3184,Protein-DNA/RNA Interaction Regulator,Inner DNA/RNA,22902368,Allosteric control of the ribosome by small-mo...,No,Chain A:SEC1923
2504,ASD12030000_1,purN,Escherichia coli,P08179,4GAR,ASD10228001,NMY,A,Lig,Inhibitor,"(2R,3S,4R,5R,6R)-5-amino-2-(aminomethyl)-6-[(1...",3161,Protein-DNA/RNA Interaction Regulator,Inner DNA/RNA,22902368,Allosteric control of the ribosome by small-mo...,No,Chain A:SEC1923


**Confirmed manually:**
- 1hwz is now PDB 6DHD: A, GTP, 604
- 3mw9 is now PDB 6DHM (old cif unavailable) but there's no GTP with residue ID 602, and no molecule with name NAI, although theres NDP (NADP) which is the phosphorylated version of NAI. The correct residue names will be given without more info to attempt the error correction but will be handled separately because it involves two different entries of the dataset.
- 4leg has been superseded by 5J94 andd the modulator occupies the same position with the same IDS
- 4gau has been superseded by 4v9c and the modulator occupies the same position with IDs: /4v9c/CW/DA/NMY`3184
- 4gar has also been superseded by 4v9c with /4v9c/QL/BA/NMY`3161
- 3mvq is superseded by 6DHL but the new pdb doesn't have any GTP (or other) molecule in the same position as in 3mvq, not in the model nor in the assemblies. 6dhl doesn't even have gtp
    - same ^ for 3qmu/6dhq
    - however, if we compare 3mvq with 6dhq and 3qmu with 6dhl (same author in common), there is a better match between the superseded and new structures in terms of the ligands bound to them, and we can find the annotated modulators in them:
        - 3mvq should've been superseded by 6dhq and the IDs are now A GTP 603
        - 3qmu should've been superseded by 6dhl with the same modulator IDs

In [326]:
notfoundd = {
    '1HWZ': {'pdb': '6DHD',
    'mods': [[{'auth_asym_id': 'A',
     'auth_comp_id': 'GTP',
     'auth_seq_id': '604'}]]},
    '3MVQ': {'pdb': '6DHQ',
    'mods': [[{'auth_asym_id': 'A',
     'auth_comp_id': 'GTP',
     'auth_seq_id': '603'}]]},
    '3QMU': {'pdb': '6DHL',
    'mods': [[{'auth_asym_id': 'B',
     'auth_comp_id': 'XEG',
     'auth_seq_id': '601'}]]},
    '4LEG': {'pdb': '5J94',
    'mods': [[{'auth_asym_id': 'A',
     'auth_comp_id': '1XF',
     'auth_seq_id': '301'}]]},
    '4GAU': {'pdb': '4V9C',
    'mods': [[{'auth_asym_id': 'DA',
     'auth_comp_id': 'NMY',
     'auth_seq_id': '3184'}]]},
    '4GAR': {'pdb': '4V9C',
    'mods': [[{'auth_asym_id': 'BA',
     'auth_comp_id': 'NMY',
     'auth_seq_id': '3161'}]]}
}

In [327]:
#sc
all(pdb.upper() in notfoundd.keys() for pdb in errors_groups()["PDB not found (status_code 404)"] if pdb != "3mw9")

True

**\*Already-present updates**

Check that none of these new PDBs are already in the whole, original dataset:

In [328]:
df.query(f"allosteric_pdb in {[v['pdb'] for v in notfoundd.values() if v is not None] + ['6DHM']}")

Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
1608,ASD04800000_1,CTSK,Homo sapiens,P43235,5J94,ASD04800001,1XF,A,Lig,Inhibitor,2-{[(carbamoylsulfanyl)acetyl]amino}benzoic acid,301,Allosteric Function,Allosteric Position,24518821,A novel allosteric mechanism in the cysteine p...,No,5J94


4LEG -> 5J94 update is already in the database, so the old 4LEG entry will simply be dropped

In [329]:
notfoundd.update({'4LEG': None})

In [330]:
error_entries = [
    row for i, row in (
        pd.DF(error_entries)
        .merge(notfound, how="outer", indicator=True)
        .query(f"_merge == 'left_only'").drop("_merge", axis=1)
        .iterrows()
    )
]
for pdb in errors_groups()["PDB not found (status_code 404)"]:
    errors.pop(pdb)

In [331]:
for i, entry in notfound.query("allosteric_pdb != '3MW9'").iterrows():
    process_entry(entry, updates=notfoundd)
    # processed.append(entry)

6DHD [[{'auth_asym_id': 'A', 'auth_comp_id': 'GTP', 'auth_seq_id': '604'}]]
Downloading 6dhd
6DHQ [[{'auth_asym_id': 'A', 'auth_comp_id': 'GTP', 'auth_seq_id': '603'}]]
Downloading 6dhq
6DHL [[{'auth_asym_id': 'B', 'auth_comp_id': 'XEG', 'auth_seq_id': '601'}]]
Downloading 6dhl
4V9C [[{'auth_asym_id': 'DA', 'auth_comp_id': 'NMY', 'auth_seq_id': '3184'}]]
Downloading 4v9c
4V9C [[{'auth_asym_id': 'BA', 'auth_comp_id': 'NMY', 'auth_seq_id': '3161'}]]


In [332]:
for i, entry in notfound.query("allosteric_pdb == '3MW9' and modulator_alias == 'GTP'").iterrows():
    process_entry(entry, updates={'3MW9': {'pdb': '6DHM', 'mods': [[{'auth_comp_id': 'GTP'}]]}})
    # processed.append(entry)

6DHM [[{'auth_comp_id': 'GTP'}]]
Downloading 6dhm


In [333]:
for i, entry in notfound.query("allosteric_pdb == '3MW9' and modulator_alias == 'NAI'").iterrows():
    process_entry(entry, updates={'3MW9': {'pdb': '6DHM', 'mods': [[{'auth_comp_id': 'NDP'}]]}})
    # processed.append(entry)

6DHM [[{'auth_comp_id': 'NDP'}]]


##### couldn't retrieve modulator in pdb with using residue name

In [334]:
errors_groups()["couldn't retrieve modulator in pdb with using residue name"]

['1kmp',
 '1tyr',
 '2bk3',
 '3hfr',
 '3pma',
 '3uvv',
 '3v4f',
 '4g0n',
 '4nil',
 '4x9e',
 '5fu8',
 '6ffh',
 '6ffi',
 '6p1q',
 '6p4v',
 '6sfb',
 '6sfc',
 '7lh2']

In [335]:
pdb = "1kmp"
get_error(pdb)

ENTRIES: 1 SITES: []
ERROR: ["1kmp, [[{'auth_asym_id': 'A', 'auth_comp_id': 'FDC', 'auth_seq_id': '742'}]]: couldn't retrieve modulator in pdb with using residue name"]


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
23,ASD01070000_1,fecA,Escherichia coli,P13036,1KMP,ASD01070001,FDC,A,Lig,Regulator,iron(+3) cation;iron(+3) cation,742,Inner Protein Regulator,Inner Protein,11872840,Structural basis of gating by the outer membra...,No,"Chain A:THR138,GLN176,GLN178,SER180,ARG365,LEU..."


In [336]:
solve_error(pdb, [
            [{"label_entity_id": "2"}],
            [{"label_entity_id": "3"}]
        ],
stringent_site_grouping=False
)

1KMP [[{'label_entity_id': '2'}], [{'label_entity_id': '3'}]]
SITES: [(<Site: 4493>, {'label_asym_id': ['B', 'C', 'E', 'G']})]


FDC is an obsolete ligand annotation (acc. to https://www.ebi.ac.uk/pdbe/static/files/pdbechem_v2/FDC.cif) formed by FE(II) and two citrates.

<br>

In [337]:
pdb = "1tyr"
get_error(pdb)

ENTRIES: 1 SITES: []
ERROR: ["1tyr, [[{'auth_asym_id': 'A', 'auth_comp_id': 'REA', 'auth_seq_id': '131'}]]: couldn't retrieve modulator in pdb with using residue name"]


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
87,ASD10420000_1,TTR,Homo sapiens,P02766,1TYR,ASD05660009,REA,A,Lig,Inhibitor,"(2E,4E,6E,8E)-3,7-dimethyl-9-(2,6,6-trimethyl-...",131,Inner Protein Regulator,Protein-Protein Interaction,8536704,Crystal structure of the transthyretin--retino...,No,1TYR


In [338]:
solve_error(pdb, [
            [{"auth_comp_id": "9CR"}],
        ]
)

1TYR [[{'auth_comp_id': '9CR'}]]
SITES: [(<Site: 4494>, {'label_asym_id': ['C']}), (<Site: 4495>, {'label_asym_id': ['D']})]


In [339]:
for site in PDB.get(PDB.entry_id == "1tyr").sites:
    print(site, site.modulator, site.related_sites)

4494 {'label_asym_id': ['C']} {'equivalent': [], 'nonequivalent': [{'other_site': {'label_asym_id': ['D']}, 'res_of_other_in_site': 0.6666666666666666, 'res_of_site_in_other': 0.625}]}
4495 {'label_asym_id': ['D']} {'equivalent': [], 'nonequivalent': [{'other_site': {'label_asym_id': ['C']}, 'res_of_other_in_site': 0.625, 'res_of_site_in_other': 0.6666666666666666}]}


Same case as before, where REA has been substituted by the enantiomer 9CR. The two molecules mostly form the same site in the two copies of the protein, but they are not recognized as equivalent.

<br>

In [340]:
pdb = "2bk3"
get_error(pdb)

ENTRIES: 1 SITES: []
ERROR: ["2bk3, [[{'auth_asym_id': 'A', 'auth_comp_id': 'FOH', 'auth_seq_id': '1503'}]]: couldn't retrieve modulator in pdb with using residue name"]


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
71,ASD07000000_1,MAOB,Homo sapiens,P27338,2BK3,ASD07000015,FOH,A,Lig,Inhibitor,"(2Z,6Z)-3,7,11-trimethyldodeca-2,6,10-trien-1-ol",1503,Inner Protein Regulator,Inner Protein,15710600,Demonstration of Isoleucine 199 as a Structura...,Yes,2BK3


In [341]:
solve_error(pdb, [
            [{"auth_comp_id": "FOF"}],
        ]
)

2BK3 [[{'auth_comp_id': 'FOF'}]]
SITES: [(<Site: 4496>, {'label_asym_id': ['D']})]


Now the PDB has a FOF ligand which is an isomer of FOH, so it must have been replaced at some point.

<br>

In [342]:
pdb = "3hfr"
get_error(pdb)

ENTRIES: 1 SITES: []
ERROR: ["3hfr, [[{'auth_asym_id': 'A', 'auth_comp_id': '6JZ', 'auth_seq_id': '270'}]]: couldn't retrieve modulator in pdb with using residue name"]


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
47,ASD03450000_4,murI,Listeria monocytogenes serovar 1/2a,Q8Y7N7,3HFR,ASD03450007,6JZ,A,Lig,Inhibitor,1-ethoxy-2-[2-[2-(2-ethoxyethoxy)ethoxy]ethoxy...,270,Inner Protein Regulator,Inner Protein,,Crystal structure of glutamate racemase from L...,No,"Chain A:GLY153,TYR155,LYS156,ASP246,ASP250"


In [343]:
solve_error(pdb, [
            [{"auth_asym_id": "A", "auth_comp_id": "JEF", "auth_seq_id": "270"}]
        ]
)

3HFR [[{'auth_asym_id': 'A', 'auth_comp_id': 'JEF', 'auth_seq_id': '270'}]]
SITES: [(<Site: 4498>, {'label_asym_id': ['F']})]


According to https://www.ebi.ac.uk/pdbe/static/files/pdbechem_v2/6JZ.cif and the corresponding one from the ligand observed in the pdb https://www.ebi.ac.uk/pdbe/static/files/pdbechem_v2/JEF.cif, the ligand is Jeffamine and it might have been substituted in an update. This correspondence is also found in the entry `_pdbx_nonpoly_scheme` of the PDB cif https://www.ebi.ac.uk/pdbe/entry-files/3hfr_updated.cif, in which 6JZ 270 corresponds to JEF 270 of the same auth_asym_id.

<br>

In [344]:
pdb = "3pma"
get_error(pdb)

ENTRIES: 1 SITES: []
ERROR: ["3pma, [[{'auth_asym_id': 'B', 'auth_comp_id': 'SCR', 'auth_seq_id': '248'}]]: couldn't retrieve modulator in pdb with using residue name"]


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
52,ASD03750000_6,F2,Bos taurus,P00735,3PMA,ASD03750006,SCR,B,Lig,Inhibitor,"[(2R,3R,4S,5R,6R)-2-[(2S,3S,4R,5R)-3,4-disulfo...",248,Inner Protein Regulator,Protein-Protein Interaction,21736375,Interaction of thrombin with sucrose octasulfate.,No,"Chain B:ASP125,LYS126,PHE232,LYS235,GLN239; Ch..."


In [345]:
solve_error(pdb, None, auto_site_grouping=True, stringent_site_grouping=False)

3PMA [[{'auth_asym_id': 'B', 'auth_comp_id': 'SCR', 'auth_seq_id': '248'}]]
SITES: [(<Site: 4504>, {'label_asym_id': ['E', 'F']})]


According to https://www.ebi.ac.uk/pdbe/static/files/pdbechem_v2/SCR.cif the ligand has been superseded with a polysaccharide of two residues, and the automatic correction might have failed due to the close proximity of the two molecules of the disaccharyde.

<br>

In [346]:
pdb = "3uvv"
get_error(pdb)

ENTRIES: 1 SITES: []
ERROR: ["3uvv, [[{'auth_asym_id': 'B', 'auth_comp_id': 'REA', 'auth_seq_id': '501'}]]: couldn't retrieve modulator in pdb with using residue name"]


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
67,ASD05660000_1,,Homo sapiens,P19793,3UVV,ASD05660009,REA,B,Lig,Inhibitor,RETINOIC ACID,501,Protein-Protein Interaction Regulator,Inner Protein,22474364,Structural basis for negative cooperativity wi...,No,3UVV


In [347]:
solve_error(pdb, [
            [{"auth_comp_id": "9CR"}],
        ]
)

3UVV [[{'auth_comp_id': '9CR'}]]
SITES: [(<Site: 4505>, {'label_asym_id': ['D']})]


According to https://www.ebi.ac.uk/pdbe/static/files/pdbechem_v2/REA.cif the ligand is retinoic acid and in this PDB is a specific stereoisomer that must have been renamed at some point. The remapping is also present in the PDB's cif https://www.ebi.ac.uk/pdbe/entry-files/3uvv_updated.cif

<br>

In [348]:
pdb = "3v4f"
get_error(pdb)

ENTRIES: 1 SITES: []
ERROR: ["3v4f, [[{'auth_asym_id': 'A', 'auth_comp_id': 'DTT', 'auth_seq_id': '201'}]]: couldn't retrieve modulator in pdb with using residue name"]


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
59,ASD04970000_2,HRAS,Rattus norvegicus,P20171,3V4F,ASD04400004,DTT,A,Lig,Regulator,"(2R,3R)-1,4-bis-sulfanylbutane-2,3-diol",201,Inner Protein Regulator,Inner Protein,22845804,Shift in the Equilibrium between On and Off St...,No,"Chain A:HIS94,ASP92,GLU91,TYR96,GLN99,ARG68,GL..."


In [349]:
solve_error(pdb, [
            [{"auth_comp_id": "DTU"}],
        ]
)

3V4F [[{'auth_comp_id': 'DTU'}]]
SITES: [(<Site: 4506>, {'label_asym_id': ['E']})]


The PDB has a single molecule of the ligand DTU which is an enantiomer of DTT, so it must have gotten replaced at some point.

<br>

In [350]:
pdb = "4g0n"
get_error(pdb)

ENTRIES: 2 SITES: [(<Site: 2950>, {'label_asym_id': ['D']})]
ERROR: ["4g0n, [[{'auth_asym_id': 'A', 'auth_comp_id': 'DTT', 'auth_seq_id': '206'}]]: couldn't retrieve modulator in pdb with using residue name"]


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
58,ASD04970000_1,HRAS,Homo sapiens,P01112,4G0N,ASD04400004,DTT,A,Lig,Regulator,"(2R,3R)-1,4-bis-sulfanylbutane-2,3-diol",206,Inner Protein Regulator,Inner Protein,25684575,Allosteric Effects of the Oncogenic RasQ61L Mu...,No,4G0N


In [351]:
solve_error(pdb, [
            [{"auth_comp_id": "DTU"}],
        ]
)

4G0N [[{'auth_comp_id': 'DTU'}]]
SITES: [(<Site: 2950>, {'label_asym_id': ['D']}), (<Site: 4507>, {'label_asym_id': ['H']})]


Also has a single molecule of DTU.

<br>

In [352]:
pdb = "4nil"
get_error(pdb)

ENTRIES: 1 SITES: []
ERROR: ["4nil, [[{'auth_asym_id': 'A', 'auth_comp_id': '208', 'auth_seq_id': '306'}]]: couldn't retrieve modulator in pdb with using residue name"]


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
87,ASD11910000_1,folP,Bacillus anthracis,Q81VW8,4NIL,ASD11910003,208,A,Lig,Inhibitor,"(2S)-3-[1-[[2-(2-chlorophenyl)-5-methyl-1,3-ox...",306,Inner Protein Regulator,Protein-Protein Interaction,24650357,Identification and characterization of an allo...,No,"Chain A:LEU235,MET264,MET261,GLU260,GLY239,GLU236"


In [353]:
solve_error(pdb, [
            [{"auth_comp_id": "2O8"}],
        ]
)

4NIL [[{'auth_comp_id': '2O8'}]]
SITES: [(<Site: 4508>, {'label_asym_id': ['H']})]


It's probably a typo in which "2O8" is written as "208". Although the modulator_name doesn't match (it matches the name of 208, which is only on PDB 2GTK with a different residue id), the chain, resi and primary citation match the annotated PDB.

<br>

In [354]:
pdb = "4x9e"
get_error(pdb)

ENTRIES: 1 SITES: []
ERROR: ["4x9e, [[{'auth_asym_id': 'G', 'auth_comp_id': 'DNA', 'auth_seq_id': '1'}]]: couldn't retrieve modulator in pdb with using residue name"]


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
80,ASD11580000_1,dgt,Escherichia coli,P15723,4X9E,ASD11580001,DNA,G,Lig,Activator,RNA (5'-R(P*CP*CP*C)-3'),1,Allosteric function,Allosteric position,25694425,Structure of Escherichia coli dGTP Triphosphoh...,No,"Chain G: C1, C2"


In [355]:
solve_error(pdb, [
            [{"label_entity_id": "2"}],
        ]
)

4X9E [[{'label_entity_id': '2'}]]
polyribonucleotide
SITES: [(<Site: 4509>, {'label_asym_id': ['G']})]


A trinucleotide RNA chain is annotated as the allosteric modulator, which corresponds to the two copies of the entity 2 that form the same site in different protein chains.

<br>

In [356]:
pdb = "5fu8"
get_error(pdb)

ENTRIES: 1 SITES: []
ERROR: ["5fu8, [[{'auth_asym_id': 'A', 'auth_comp_id': 'DHS', 'auth_seq_id': '500'}]]: couldn't retrieve modulator in pdb with using residue name"]


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
78,ASD10100000_1,rmlA,Pseudomonas aeruginosa,Q9HU22,5FU8,ASD10100017,DHS,A,Lig,Inhibitor,"N-(6-amino-1-(4-bromo-3-methylbenzyl)-2,4-diox...",500,Allosteric Function,Allosteric Position,,Allosteric Competitive Inhibitors of the Gluco...,No,"Chain A:HIS116,GLN260,ILE256,LYS249,ASP117,HIS..."


In [357]:
solve_error(pdb, [
            [{"auth_comp_id": "DH5"}],
        ]
)

5FU8 [[{'auth_comp_id': 'DH5'}]]
SITES: [(<Site: 4512>, {'label_asym_id': ['K']})]


There is a typo where 'DH5' is written down as 'DHS', because the modulator_name, chain and resi match.

<br>

In [358]:
pdb = "6ffh"
get_error(pdb)

ENTRIES: 1 SITES: []
ERROR: ["6ffh, [[{'auth_asym_id': 'A', 'auth_comp_id': 'D8B', 'auth_seq_id': '4009'}]]: couldn't retrieve modulator in pdb with using residue name"]


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
38,ASD01910000_1,GRM5,Homo sapiens,P41594,6FFH,ASD01917956,D8B,A,Lig,Inhibitor,2-[2-(3-methoxyphenyl)ethynyl]-6-methyl-pyridine,4009,Allosteric function,Allosteric position,29455526,Structure-Based Optimization Strategies for G ...,No,6FFH


In [359]:
df.query("allosteric_pdb == '6FFI'")

Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
883,ASD01910000_1,GRM5,Homo sapiens,P41594,6FFI,ASD01916182,D7W,A,Lig,Inhibitor,1-(3-chlorophenyl)-3-(3-methyl-5-oxidanylidene...,4006,Allosteric function,Allosteric position,29455526,Structure-Based Optimization Strategies for G ...,No,6FFI


In [360]:
errors["6ffi"]

["6ffi, [[{'auth_asym_id': 'A', 'auth_comp_id': 'D7W', 'auth_seq_id': '4006'}]]: couldn't retrieve modulator in pdb with using residue name"]

In [361]:
solve_error(pdb, [
            [{"auth_comp_id": "D7W"}]
        ]
)
solve_error("6ffi", [
            [{"auth_comp_id": "D8B"}]
        ]
)

6FFH [[{'auth_comp_id': 'D7W'}]]
SITES: [(<Site: 4513>, {'label_asym_id': ['G']})]
6FFI [[{'auth_comp_id': 'D8B'}]]
SITES: [(<Site: 4514>, {'label_asym_id': ['J']})]


The annotated modulator residue name in 6FFH is only found in the whole PDB database in 6FFI, and viceversa for the modulator name in 6FFI's entry which is also an error entry. Both PDBs come from the same primary citation, so the modulator names are just going to be exchanged, as there is only one molecule of each in the structures.

<br>

In [362]:
pdb = "6p1q"
get_error(pdb)

ENTRIES: 1 SITES: []
ERROR: ["6p1q, [[{'auth_asym_id': 'B', 'auth_comp_id': 'NQ1', 'auth_seq_id': '1102'}]]: couldn't retrieve modulator in pdb with using residue name"]


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
22,ASD01020000_1,EGFR,Homo sapiens,P00533,6P1Q,,NQ1,B,Lig,Inhibitor,"10-benzyl-8-fluoro-5,10-dihydro-11H-dibenzo[b,...",1102,Allosteric function,Allosteric position,,,No,6P1D


In [363]:
df.query("allosteric_pdb == '6P1D'")

Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
347,ASD01020000_1,EGFR,Homo sapiens,P00533,6P1D,,NQ1,A,Lig,Inhibitor,"10-benzyl-8-fluoro-5,10-dihydro-11H-dibenzo[b,...",1302,,,,,,6P1D


In [364]:
errors[pdb] = "Wrong annotation"

The annotation of 6P1Q is a mistake, as the correct modulator NQ1 is present in 6P1D, which has its own entry in the dataset and moreover is the pdb code that appears in the field "allosteric_site_residue".

<br>

In [365]:
pdb = "6p4v"
get_error(pdb)

ENTRIES: 1 SITES: []
ERROR: ["6p4v, [[{'auth_asym_id': 'A', 'auth_comp_id': '8XY', 'auth_seq_id': '401'}]]: couldn't retrieve modulator in pdb with using residue name"]


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
112,ASD18420000_1,DHPS,Homo sapiens,P49366,6P4V,,8XY,A,,Inhibitor,6-bromo-N-(1H-indol-4-yl)-1-benzothiophene-2-c...,401,,,,,,6PGR


In [366]:
df.query("allosteric_pdb in ['6P4V', '6PGR']")

Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
2946,ASD18420000_1,DHPS,Homo sapiens,P49366,6P4V,,8XY,A,,Inhibitor,6-bromo-N-(1H-indol-4-yl)-1-benzothiophene-2-c...,401,,,,,,6PGR
2947,ASD18420000_1,DHPS,Homo sapiens,P49366,6PGR,,8XY,A,Lig,Inhibitor,6-bromo-N-(1H-indol-4-yl)-1-benzothiophene-2-...,401,Allosteric function,Allosteric position,,,No,6PGR


In [367]:
list(PDB.get(PDB.entry_id == "6pgr").sites)

[<Site: 4270>]

In [368]:
errors[pdb] = "Wrong annotation"

6p4v belongs to the same primary citation as 6pgr, which as checked is already present in the dataset and database and is the only entry in the PDB which indeed contains residue 8XY with the annotated chain and resi. The molecule bound in 6P4V does not appear to be allosteric and this PDB should not be part of an annotation.

<br>

In [369]:
pdb = "6sfb"
get_error(pdb)

ENTRIES: 1 SITES: []
ERROR: ["6sfb, [[{'auth_asym_id': 'A', 'auth_comp_id': 'L9K', 'auth_seq_id': '501'}]]: couldn't retrieve modulator in pdb with using residue name"]


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
96,ASD14640000_1,EED,Homo sapiens,O75530,6SFB,,L9K,A,lig,Inhibitor,(~{E})-~{N}-(1-benzofuran-4-ylmethyl)-8-(4-met...,501,Allosteric function,Allosteric position,,,Yes,6SFB


In [370]:
solve_error(pdb, [
            [{"auth_comp_id": "L9W"}],
        ]
)

6SFB [[{'auth_comp_id': 'L9W'}]]
SITES: [(<Site: 4515>, {'label_asym_id': ['C']})]


L9K is an obsolete residue name and a specific enantiomer of L9W.

<br>

In [371]:
pdb = "6sfc"
get_error(pdb)

ENTRIES: 1 SITES: []
ERROR: ["6sfc, [[{'auth_asym_id': 'A', 'auth_comp_id': 'L9M', 'auth_seq_id': '503'}]]: couldn't retrieve modulator in pdb with using residue name"]


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
96,ASD14640000_1,EED,Homo sapiens,O75530,6SFC,,L9M,A,lig,Inhibitor,~{N}-(1-benzofuran-4-ylmethyl)-8-(4-methylsulf...,503,Allosteric function,Allosteric position,,,Yes,6SFC


In [372]:
solve_error(pdb, [
            [{"auth_comp_id": "L9W"}],
            [{"auth_comp_id": "L9T"}],
        ]
)

6SFC [[{'auth_comp_id': 'L9W'}], [{'auth_comp_id': 'L9T'}]]
SITES: [(<Site: 4516>, {'label_asym_id': ['E']})]


Similarly as before, now instead of two L9[X] ligands of the same time, there is one L9W and also a L9T, both bound in the same site in the two different protein chains. Both are treated similarly in the primary citation that discusses allosteric modulators.

<br>

In [373]:
pdb = "7lh2"
get_error(pdb)

ENTRIES: 1 SITES: []
ERROR: ["7lh2, [[{'auth_asym_id': 'A', 'auth_comp_id': 'CL', 'auth_seq_id': '802'}]]: couldn't retrieve modulator in pdb with using residue name"]


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
116,ASD21950000_1,SLC26A5,Homo sapiens,P58743,7LH2,,CL,A,Ion,Regulator,CHLORIDE ION,802,,,,,,7LH2


In [374]:
df.query("allosteric_pdb in ['7LGU', '7LGW']")

Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
3073,ASD21950000_1,SLC26A5,Homo sapiens,P58743,7LGU,,CL,A,Ion,Regulator,CHLORIDE ION,801,,,,,,7LGU
3074,ASD21950000_1,SLC26A5,Homo sapiens,P58743,7LGW,,CL,A,Ion,Regulator,CHLORIDE ION,801,,,,,,7LGW


In [375]:
list(PDB.select().where(PDB.entry_id.in_([pdb.lower() for pdb in ['7LGU', '7LGW']]))),
[list(pdb.sites) for pdb in PDB.select().where(PDB.entry_id.in_([pdb.lower() for pdb in ['7LGU', '7LGW']]))]

[[<Site: 4443>], [<Site: 4444>]]

In [376]:
errors[pdb] = "Wrong annotation"

The PDB doesn't have any chloride ions but it does have cholesterol (CLR) for which the residue id and chain match. However, in the primary citation associated only the chloride anions are discussed as allosteric, and indeed the two other PDBs associated with the primary citation that indeed contain chloride ions are already included in the dataset and database

<br>

In [377]:
errors_groups()

{'using only residue name retrieves more than one site(group)': ['1dx5',
  '4tnr',
  '5opm',
  '6n82',
  '6oag',
  '6oah',
  '6qx2',
  '8dd2'],
 'Molecules of the annotated modulator(s) bind close together but were not grouped': ['11bg',
  '1e7c',
  '1ecb',
  '1egy',
  '1eup',
  '1gph',
  '1r1v',
  '1thc',
  '1vm1',
  '1wda',
  '1yp2',
  '1yp3',
  '1yp4',
  '1z62',
  '2d41',
  '2dew',
  '2dex',
  '2dw5',
  '2fsz',
  '2gq1',
  '2ha4',
  '2wrm',
  '3ao1',
  '3cqd',
  '3e3f',
  '3ete',
  '3f3t',
  '3f3u',
  '3f48',
  '3fyh',
  '3k8s',
  '3lsf',
  '3lsl',
  '3mzh',
  '3njq',
  '3umo',
  '3uqd',
  '3zl6',
  '4clz',
  '4dkt',
  '4eag',
  '4eak',
  '4gqq',
  '4i0u',
  '4lrl',
  '4m0z',
  '4ni0',
  '4oyo',
  '4p2t',
  '4p3h',
  '4ple',
  '4qfy',
  '4qsh',
  '4r8z',
  '4rew',
  '4rqo',
  '4z87',
  '5afk',
  '5btr',
  '5cnt',
  '5ezv',
  '5mcp',
  '5olk',
  '5s4r',
  '5s4s',
  '5tc3',
  '5ur3',
  '5ute',
  '5utn',
  '5uv3',
  '5v5d',
  '5v5e',
  '6b0z',
  '6brk',
  '6dja',
  '6i0m',
  '6i0o',
  

##### using only residue name retrieves more than one site(group)

In [378]:
errors_groups()['using only residue name retrieves more than one site(group)']

['1dx5', '4tnr', '5opm', '6n82', '6oag', '6oah', '6qx2', '8dd2']

In [379]:
pdb = "1dx5"
get_error(pdb)

ENTRIES: 1 SITES: []
ERROR: ["1dx5, [[{'label_asym_id': 'O', 'auth_comp_id': 'NA', 'auth_seq_id': '503', 'pdbx_PDB_ins_code': '?'}], [{'label_asym_id': 'R', 'auth_comp_id': 'NA', 'auth_seq_id': '503', 'pdbx_PDB_ins_code': '?'}], [{'label_asym_id': 'U', 'auth_comp_id': 'NA', 'auth_seq_id': '503', 'pdbx_PDB_ins_code': '?'}], [{'label_asym_id': 'X', 'auth_comp_id': 'NA', 'auth_seq_id': '503', 'pdbx_PDB_ins_code': '?'}], [{'label_asym_id': 'Z', 'auth_comp_id': 'NA', 'auth_seq_id': '302', 'pdbx_PDB_ins_code': '?'}], [{'label_asym_id': 'FA', 'auth_comp_id': 'NA', 'auth_seq_id': '302', 'pdbx_PDB_ins_code': '?'}], [{'label_asym_id': 'LA', 'auth_comp_id': 'NA', 'auth_seq_id': '302', 'pdbx_PDB_ins_code': '?'}], [{'label_asym_id': 'RA', 'auth_comp_id': 'NA', 'auth_seq_id': '302', 'pdbx_PDB_ins_code': '?'}]]: using only residue name retrieves more than one site(group)"]


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
62,ASD05870000_1,F2,Homo sapiens,P00734,1DX5,ASD03160008,,M,Ion,Activator,sodium(+1) cation,2001,Inner Protein Regulator,Inner Protein,10761923,Structural Basis for the Anticoagulant Activit...,No,1DX5


In [380]:
solve_error(pdb, [
            [{"label_asym_id": "Z"}],
        ]
)

1DX5 [[{'label_asym_id': 'Z'}]]
SITES: [(<Site: 4524>, {'label_asym_id': ['Z']})]


Indeed, the NA ions form 2 sets of sites that are different. According to the remapping in https://www.ebi.ac.uk/pdbe/entry-files/1dx5_updated.cif, the corresponding NA ion now is the one of label_asym_id Z

<br>

In [381]:
pdb = "4tnr"
get_error(pdb)

ENTRIES: 1 SITES: []
ERROR: ["4tnr, [[{'label_asym_id': 'E', 'auth_comp_id': 'DTP', 'auth_seq_id': '701', 'pdbx_PDB_ins_code': '?'}], [{'label_asym_id': 'G', 'auth_comp_id': 'DTP', 'auth_seq_id': '703', 'pdbx_PDB_ins_code': '?'}], [{'label_asym_id': 'I', 'auth_comp_id': 'DTP', 'auth_seq_id': '701', 'pdbx_PDB_ins_code': '?'}], [{'label_asym_id': 'L', 'auth_comp_id': 'DTP', 'auth_seq_id': '704', 'pdbx_PDB_ins_code': '?'}], [{'label_asym_id': 'M', 'auth_comp_id': 'DTP', 'auth_seq_id': '701', 'pdbx_PDB_ins_code': '?'}], [{'label_asym_id': 'P', 'auth_comp_id': 'DTP', 'auth_seq_id': '704', 'pdbx_PDB_ins_code': '?'}], [{'label_asym_id': 'R', 'auth_comp_id': 'DTP', 'auth_seq_id': '702', 'pdbx_PDB_ins_code': '?'}]]: using only residue name retrieves more than one site(group)"]


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
74,ASD09690000_1,SAMHD1,Homo sapiens,Q9Y3Z3,4TNR,ASD00870004,DTP,A,Lig,Activator,2'-DEOXYADENOSINE 5'-TRIPHOSPHATE,701,Allosteric function,Allosteric position,25267621,Structural basis of cellular dNTP regulation b...,Yes,4TNR


In [382]:
solve_error(pdb, [
            [{"auth_asym_id": "A", "auth_comp_id": "DTP", "auth_seq_id": "702"}],
            [{"auth_asym_id": "B", "auth_comp_id": "GTP", "auth_seq_id": "703"}],
            [{"auth_asym_id": "B", "auth_comp_id": "MG", "auth_seq_id": "702"}],
        ],
    auto_site_grouping=True, stringent_site_grouping=False
)

4TNR [[{'auth_asym_id': 'A', 'auth_comp_id': 'DTP', 'auth_seq_id': '702'}], [{'auth_asym_id': 'B', 'auth_comp_id': 'GTP', 'auth_seq_id': '703'}], [{'auth_asym_id': 'B', 'auth_comp_id': 'MG', 'auth_seq_id': '702'}]]
SITES: [(<Site: 4542>, {'label_asym_id': ['F', 'M', 'Q']})]


In [383]:
remaining.query(f'allosteric_pdb in {"4TNP, 4TNQ, 4TNR, 4TNX, 4TNY, 4TNZ, 4TO0, 4TO1, 4TO2, 4TO3, 4TO4, 4TO5, 4TO6".split(", ")}')

Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
2111,ASD09690000_1,SAMHD1,Homo sapiens,Q9Y3Z3,4TNP,ASD02588003,DCP,A,Lig,Activator,2'-DEOXYCYTIDINE-5'-TRIPHOSPHATE,701,Allosteric function,Allosteric position,25267621,Structural basis of cellular dNTP regulation b...,Yes,4TNP
2112,ASD09690000_1,SAMHD1,Homo sapiens,Q9Y3Z3,4TNQ,ASD02040015,TTP,A,Lig,Activator,THYMIDINE-5'-TRIPHOSPHATE,701,Allosteric function,Allosteric position,25267621,Structural basis of cellular dNTP regulation b...,Yes,4TNQ
2113,ASD09690000_1,SAMHD1,Homo sapiens,Q9Y3Z3,4TNR,ASD00870004,DTP,A,Lig,Activator,2'-DEOXYADENOSINE 5'-TRIPHOSPHATE,701,Allosteric function,Allosteric position,25267621,Structural basis of cellular dNTP regulation b...,Yes,4TNR
2114,ASD09690000_1,SAMHD1,Homo sapiens,Q9Y3Z3,4TNX,ASD09690001,DGT,A,Lig,Activator,2'-DEOXYGUANOSINE-5'-TRIPHOSPHATE,701,Allosteric function,Allosteric position,25267621,Structural basis of cellular dNTP regulation b...,Yes,4TNX


In [384]:
for pdb in PDB.select().where(PDB.entry_id.in_([pdb.lower() for pdb in "4TNP, 4TNQ, 4TNR, 4TNX, 4TNY, 4TNZ, 4TO0, 4TO1, 4TO2, 4TO3, 4TO4, 4TO5, 4TO6".split(", ")])):
    for site in pdb.sites:
        print(pdb, site, site.modulator_residues[["label_asym_id", "auth_comp_id", "auth_seq_id"]], "\n\n")

4tnp 3388   label_asym_id auth_comp_id auth_seq_id
0             E          DCP         701 


4tnq 3389   label_asym_id auth_comp_id auth_seq_id
0             E          TTP         701 


4tnr 4542   label_asym_id auth_comp_id auth_seq_id
0             F          GTP         702
1             M          DTP         701
2             Q           MG         701 


4tnx 3390   label_asym_id auth_comp_id auth_seq_id
0             E          DGT         701 




In [385]:
pdb = "4tnp"
Site.get(Site.pdb == pdb).delete_instance()
entry = df.query(f"allosteric_pdb == '{pdb.upper()}'").squeeze()
mods = [
    [{"auth_asym_id": "A", "auth_comp_id": "DCP", "auth_seq_id": "702"}],
    [{"auth_asym_id": "A", "auth_comp_id": "GTP", "auth_seq_id": "703"}],
    [{"auth_asym_id": "D", "auth_comp_id": "MG", "auth_seq_id": "701"}],
]
process_entry(entry, updates={pdb.upper(): {"pdb": pdb.upper(), "mods": mods}},
    auto_site_grouping=True, stringent_site_grouping=False)
print(Site.get(Site.pdb == pdb).modulator)

4TNP [[{'auth_asym_id': 'A', 'auth_comp_id': 'DCP', 'auth_seq_id': '702'}], [{'auth_asym_id': 'A', 'auth_comp_id': 'GTP', 'auth_seq_id': '703'}], [{'auth_asym_id': 'D', 'auth_comp_id': 'MG', 'auth_seq_id': '701'}]]
{'label_asym_id': ['F', 'G', 'Q']}


In [386]:
pdb = "4tnq"
Site.get(Site.pdb == pdb).delete_instance()
entry = df.query(f"allosteric_pdb == '{pdb.upper()}'").squeeze()
mods = [
    [{"auth_asym_id": "A", "auth_comp_id": "TTP", "auth_seq_id": "703"}],
    [{"auth_asym_id": "B", "auth_comp_id": "GTP", "auth_seq_id": "702"}],
    [{"auth_asym_id": "B", "auth_comp_id": "MG", "auth_seq_id": "701"}],
]
process_entry(entry, updates={pdb.upper(): {"pdb": pdb.upper(), "mods": mods}},
    auto_site_grouping=True, stringent_site_grouping=False)
print(Site.get(Site.pdb == pdb).modulator)

4TNQ [[{'auth_asym_id': 'A', 'auth_comp_id': 'TTP', 'auth_seq_id': '703'}], [{'auth_asym_id': 'B', 'auth_comp_id': 'GTP', 'auth_seq_id': '702'}], [{'auth_asym_id': 'B', 'auth_comp_id': 'MG', 'auth_seq_id': '701'}]]
{'label_asym_id': ['F', 'H', 'I']}


In [387]:
pdb = "4tnx"
Site.get(Site.pdb == pdb).delete_instance()
entry = df.query(f"allosteric_pdb == '{pdb.upper()}'").squeeze()
mods = [
    [{"auth_asym_id": "A", "auth_comp_id": "DGT", "auth_seq_id": "701"}],
    [{"auth_asym_id": "A", "auth_comp_id": "GTP", "auth_seq_id": "702"}],
    [{"auth_asym_id": "C", "auth_comp_id": "MG", "auth_seq_id": "701"}],
]
process_entry(entry, updates={pdb.upper(): {"pdb": pdb.upper(), "mods": mods}},
    auto_site_grouping=True, stringent_site_grouping=False)
print(Site.get(Site.pdb == pdb).modulator)

4TNX [[{'auth_asym_id': 'A', 'auth_comp_id': 'DGT', 'auth_seq_id': '701'}], [{'auth_asym_id': 'A', 'auth_comp_id': 'GTP', 'auth_seq_id': '702'}], [{'auth_asym_id': 'C', 'auth_comp_id': 'MG', 'auth_seq_id': '701'}]]
{'label_asym_id': ['E', 'F', 'M']}


The molecules of DTP indeed form two sets of sites, one set that happens in every protein chain individually, and another set in which it binds together with GTP in sites involving multiple chains, but the A-DTP-701 residue is not present in the remapping of the PDB cif. According to the citation stored in the entry (the PDB primary citation), the allosteric sites are the ones formed by DTP, GTP and Mg, while other single nucleotides are at the catalytic site. Therefore, one whole allosteric site will be annotated for correct recognition of the related set of sites.


<br>

In [388]:
pdb = "5opm"
get_error(pdb)

ENTRIES: 2 SITES: [(<Site: 1088>, {'label_asym_id': ['E']})]
ERROR: ["5opm, [[{'label_asym_id': 'C', 'auth_comp_id': 'MG', 'auth_seq_id': '502', 'pdbx_PDB_ins_code': '?'}], [{'label_asym_id': 'D', 'auth_comp_id': 'MG', 'auth_seq_id': '503', 'pdbx_PDB_ins_code': '?'}]]: using only residue name retrieves more than one site(group)"]


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
21,ASD00870000_1,NT5C2,Homo sapiens,P49902,5OPM,ASD00330030,MG,A,Ion,Activator,MAGNESIUM ION,610,Allosteric function,Allosteric position,29535428,Relapsed acute lymphoblastic leukemia-specific...,No,5OPM


In [389]:
solve_error(pdb, [
            [{"auth_asym_id": "A", "auth_comp_id": "MG", "auth_seq_id": "503"}],
            [{"auth_asym_id": "A", "auth_comp_id": "DTP", "auth_seq_id": "504"}]
        ]
)

5OPM [[{'auth_asym_id': 'A', 'auth_comp_id': 'MG', 'auth_seq_id': '503'}], [{'auth_asym_id': 'A', 'auth_comp_id': 'DTP', 'auth_seq_id': '504'}]]
SITES: [(<Site: 4598>, {'label_asym_id': ['D', 'E']})]


The ATP-MG complex of the structure is bound on the allosteric activators site, according to the article included as pubmed_id in the dataset entry.

<br>

In [390]:
remaining.query("allosteric_pdb == '6N82'")

Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
1641,ASD04840000_1,FDPS,Homo sapiens,P14324,6N82,,YF7,F,lig,Inhibitor,"[(1S)-1-{[6-(3-chloro-4-methylphenyl)thieno[2,...",404,Allosteric function,Allosteric position,,,No,6N82


In [391]:
pdb = "6n82"
get_error(pdb)

ENTRIES: 1 SITES: []
ERROR: ["6n82, [[{'label_asym_id': 'C', 'auth_comp_id': 'YF7', 'auth_seq_id': '402', 'pdbx_PDB_ins_code': '?'}], [{'label_asym_id': 'D', 'auth_comp_id': 'YF7', 'auth_seq_id': '403', 'pdbx_PDB_ins_code': '?'}]]: using only residue name retrieves more than one site(group)"]


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
53,ASD04840000_1,FDPS,Homo sapiens,P14324,6N82,,YF7,F,lig,Inhibitor,"[(1S)-1-{[6-(3-chloro-4-methylphenyl)thieno[2,...",404,Allosteric function,Allosteric position,,,No,6N82


In [392]:
solve_error(pdb, [
            [{"auth_asym_id": "F", "auth_comp_id": "YF7", "auth_seq_id": "402"}]
        ]
)

6N82 [[{'auth_asym_id': 'F', 'auth_comp_id': 'YF7', 'auth_seq_id': '402'}]]
SITES: [(<Site: 4599>, {'label_asym_id': ['C']})]


In [393]:
remaining.query(f'allosteric_pdb in {"6N7Y, 6N7Z, 6N82, 6N83, 6OAG, 6OAH".split(", ")}')

Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
1639,ASD04840000_1,FDPS,Homo sapiens,P14324,6N7Y,,KFA,F,lig,Inhibitor,"[(1R)-1-{[6-(4-methylphenyl)thieno[2,3-d]pyrim...",404,Allosteric function,Allosteric position,,,No,6N7Y
1640,ASD04840000_1,FDPS,Homo sapiens,P14324,6N7Z,,KF7,F,lig,Inhibitor,"[(1S)-1-{[6-(4-methylphenyl)thieno[2,3-d]pyrim...",404,Allosteric function,Allosteric position,,,No,6N7Z
1641,ASD04840000_1,FDPS,Homo sapiens,P14324,6N82,,YF7,F,lig,Inhibitor,"[(1S)-1-{[6-(3-chloro-4-methylphenyl)thieno[2,...",404,Allosteric function,Allosteric position,,,No,6N82
1642,ASD04840000_1,FDPS,Homo sapiens,P14324,6N83,,YL6,F,lig,Inhibitor,"[(1R)-1-{[6-(3-chloro-4-methylphenyl)thieno[2,...",404,Allosteric function,Allosteric position,,,No,6N83
1643,ASD04840000_1,FDPS,Homo sapiens,P14324,6OAG,,M2Y,F,lig,Inhibitor,"[(1S)-1-{[6-(3-chloro-4-methylphenyl)thieno[2,...",404,Allosteric function,Allosteric position,,,No,6OAG
1644,ASD04840000_1,FDPS,Homo sapiens,P14324,6OAH,,M2V,F,lig,Inhibitor,"[(1R)-1-{[6-(3-chloro-4-methylphenyl)thieno[2,...",404,Allosteric function,Allosteric position,,,No,6OAH


In [394]:
for pdb in PDB.select().where(PDB.entry_id.in_([pdb.lower() for pdb in "6N7Y, 6N7Z, 6N82, 6N83, 6OAG, 6OAH".split(", ")])):
    for site in pdb.sites:
        print(pdb, site, site.modulator_residues[["label_asym_id", "auth_comp_id", "auth_seq_id"]], "\n\n")

6n7y 2917   label_asym_id auth_comp_id auth_seq_id
0             C          KFA         402 


6n7z 2918   label_asym_id auth_comp_id auth_seq_id
0             C          KF7         402 


6n82 4599   label_asym_id auth_comp_id auth_seq_id
0             C          YF7         402 


6n83 2919   label_asym_id auth_comp_id auth_seq_id
0             B          YL6         401 




The primary citation does not discuss two binding sites, and moreover the related structure 6N83 only exhibits one bound molecule, which is the one that is going to be annotated in this case.

<br>

In [395]:
pdb = "6oag"
get_error(pdb)

ENTRIES: 1 SITES: []
ERROR: ["6oag, [[{'label_asym_id': 'B', 'auth_comp_id': 'M2Y', 'auth_seq_id': '401', 'pdbx_PDB_ins_code': '?'}], [{'label_asym_id': 'C', 'auth_comp_id': 'M2Y', 'auth_seq_id': '402', 'pdbx_PDB_ins_code': '?'}]]: using only residue name retrieves more than one site(group)"]


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
53,ASD04840000_1,FDPS,Homo sapiens,P14324,6OAG,,M2Y,F,lig,Inhibitor,"[(1S)-1-{[6-(3-chloro-4-methylphenyl)thieno[2,...",404,Allosteric function,Allosteric position,,,No,6OAG


In [396]:
solve_error(pdb, [
            [{"auth_asym_id": "F", "auth_comp_id": "M2Y", "auth_seq_id": "401"}]
        ]
)

6OAG [[{'auth_asym_id': 'F', 'auth_comp_id': 'M2Y', 'auth_seq_id': '401'}]]
SITES: [(<Site: 4600>, {'label_asym_id': ['B']})]


In [397]:
Site.get(Site.pdb == pdb).related_sites

{'equivalent': [],
 'nonequivalent': [{'other_site': {'label_asym_id': ['C']},
   'res_of_other_in_site': 0.23076923076923078,
   'res_of_site_in_other': 0.125}]}

Same as before.

<br>

In [398]:
pdb = "6oah"
get_error(pdb)

ENTRIES: 1 SITES: []
ERROR: ["6oah, [[{'label_asym_id': 'C', 'auth_comp_id': 'M2V', 'auth_seq_id': '402', 'pdbx_PDB_ins_code': '?'}], [{'label_asym_id': 'D', 'auth_comp_id': 'M2V', 'auth_seq_id': '403', 'pdbx_PDB_ins_code': '?'}]]: using only residue name retrieves more than one site(group)"]


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
53,ASD04840000_1,FDPS,Homo sapiens,P14324,6OAH,,M2V,F,lig,Inhibitor,"[(1R)-1-{[6-(3-chloro-4-methylphenyl)thieno[2,...",404,Allosteric function,Allosteric position,,,No,6OAH


In [399]:
solve_error(pdb, [
            [{"auth_asym_id": "F", "auth_comp_id": "M2V", "auth_seq_id": "402"}]
        ]
)

6OAH [[{'auth_asym_id': 'F', 'auth_comp_id': 'M2V', 'auth_seq_id': '402'}]]
SITES: [(<Site: 4601>, {'label_asym_id': ['C']})]


Same as before.

<br>

In [400]:
pdb = "6qx2"
get_error(pdb)

ENTRIES: 1 SITES: []
ERROR: ["6qx2, [[{'label_asym_id': 'KA', 'auth_comp_id': 'JK8', 'auth_seq_id': '501', 'pdbx_PDB_ins_code': '?'}], [{'label_asym_id': 'LA', 'auth_comp_id': 'JK8', 'auth_seq_id': '501', 'pdbx_PDB_ins_code': '?'}], [{'label_asym_id': 'MA', 'auth_comp_id': 'JK8', 'auth_seq_id': '501', 'pdbx_PDB_ins_code': '?'}], [{'label_asym_id': 'NA', 'auth_comp_id': 'JK8', 'auth_seq_id': '501', 'pdbx_PDB_ins_code': '?'}], [{'label_asym_id': 'OA', 'auth_comp_id': 'JK8', 'auth_seq_id': '501', 'pdbx_PDB_ins_code': '?'}], [{'label_asym_id': 'PA', 'auth_comp_id': 'JK8', 'auth_seq_id': '701', 'pdbx_PDB_ins_code': '?'}], [{'label_asym_id': 'QA', 'auth_comp_id': 'JK8', 'auth_seq_id': '701', 'pdbx_PDB_ins_code': '?'}], [{'label_asym_id': 'RA', 'auth_comp_id': 'JK8', 'auth_seq_id': '701', 'pdbx_PDB_ins_code': '?'}], [{'label_asym_id': 'SA', 'auth_comp_id': 'JK8', 'auth_seq_id': '701', 'pdbx_PDB_ins_code': '?'}], [{'label_asym_id': 'TA', 'auth_comp_id': 'JK8', 'auth_seq_id': '501', 'pdbx_PDB_i

Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
111,ASD22030000_1,gyrB,Staphylococcus aureus,P0A0K8,6QX2,,JK8,B,Lig,Inhibitor,"(2~{R})-2-[[5-(2-chlorophenyl)-1,2-benzoxazol-...",702,,,,,,


In [401]:
solve_error(pdb, [
            [{"label_asym_id": "KA"}],
            [{"label_asym_id": "LA"}],
        ]
)

6QX2 [[{'label_asym_id': 'KA'}], [{'label_asym_id': 'LA'}]]
SITES: [(<Site: 4602>, {'label_asym_id': ['KA']}), (<Site: 4611>, {'label_asym_id': ['TA']})]


There are no copies of the modulator in chain B and/or with that residue ID, and all the copies of the modulator are seemingly occupying the same site, but the protein entities are 9 entities in total of the same two proteins (protein A and B), and the sites are precluded from being identified as equivalent because they belong to different entities. Thus, only the two molecules that appear in assembly 1 are going to be put down as modulators.

<br>

In [402]:
pdb = "8dd2"
get_error(pdb)

ENTRIES: 1 SITES: []
ERROR: ["8dd2, [[{'label_asym_id': 'P', 'auth_comp_id': 'R5R', 'auth_seq_id': '402', 'pdbx_PDB_ins_code': '?'}], [{'label_asym_id': 'S', 'auth_comp_id': 'R5R', 'auth_seq_id': '402', 'pdbx_PDB_ins_code': '?'}], [{'label_asym_id': 'U', 'auth_comp_id': 'R5R', 'auth_seq_id': '601', 'pdbx_PDB_ins_code': '?'}]]: using only residue name retrieves more than one site(group)"]


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
23,ASD01140000_3,GABRA1,Homo sapiens,P14867,8DD2,ASD01140041,R5R,E,Compound,,"N,N-dimethyl-2-(6-methyl-2-p-tolylimidazo[1,2-...",,,,35933426,,,8DD2


In [403]:
solve_error(pdb, [
            [{"auth_comp_id": "R5R"}]
        ]
)

8DD2 [[{'auth_comp_id': 'R5R'}]]
SITES: [(<Site: 4612>, {'label_asym_id': ['P']}), (<Site: 4614>, {'label_asym_id': ['U']})]


Same as in 8DD3, the three molecules are the allosteric modulators (2 of them form the same site, the other doesn't). Although in this entry the label_entity_id are not used.




<br>

In [404]:
errors_groups()

{'Molecules of the annotated modulator(s) bind close together but were not grouped': ['11bg',
  '1e7c',
  '1ecb',
  '1egy',
  '1eup',
  '1gph',
  '1r1v',
  '1thc',
  '1vm1',
  '1wda',
  '1yp2',
  '1yp3',
  '1yp4',
  '1z62',
  '2d41',
  '2dew',
  '2dex',
  '2dw5',
  '2fsz',
  '2gq1',
  '2ha4',
  '2wrm',
  '3ao1',
  '3cqd',
  '3e3f',
  '3ete',
  '3f3t',
  '3f3u',
  '3f48',
  '3fyh',
  '3k8s',
  '3lsf',
  '3lsl',
  '3mzh',
  '3njq',
  '3umo',
  '3uqd',
  '3zl6',
  '4clz',
  '4dkt',
  '4eag',
  '4eak',
  '4gqq',
  '4i0u',
  '4lrl',
  '4m0z',
  '4ni0',
  '4oyo',
  '4p2t',
  '4p3h',
  '4ple',
  '4qfy',
  '4qsh',
  '4r8z',
  '4rew',
  '4rqo',
  '4z87',
  '5afk',
  '5btr',
  '5cnt',
  '5ezv',
  '5mcp',
  '5olk',
  '5s4r',
  '5s4s',
  '5tc3',
  '5ur3',
  '5ute',
  '5utn',
  '5uv3',
  '5v5d',
  '5v5e',
  '6b0z',
  '6brk',
  '6dja',
  '6i0m',
  '6i0o',
  '6l57',
  '6ms7',
  '6q4d',
  '6qku',
  '6qkw',
  '6qxa',
  '6zxm',
  '7a5y',
  '7dtv',
  '7e6t'],
 'W': ['5j8v', '5uvg', '6p1q', '6p4v', '7lh2'

##### combine_sites failed; Molecules of the annotated modulator(s) bind close together but were not grouped

In [405]:
errors_groups()["combine_sites failed; Molecules of the annotated modulator(s) bind close together but were not grouped"]

['3ete', '3fyh', '4lrl', '6l57', '7dtv', '7e6t']

In [406]:
pdb = "3ete"
get_error(pdb)

ENTRIES: 2 SITES: [(<Site: 491>, {'label_asym_id': ['R', 'DA']}), (<Site: 1649>, {'label_asym_id': ['I']})]
ERROR: ['combine_sites failed; Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
28,ASD01240000_1,GLUD1,Bos taurus,P00366,3ETE,ASD00290006,GTP,A,Lig,Inhibitor,"[[(2R,3S,4R,5R)-5-(2-azanyl-6-oxidanylidene-1H...",553,Inner Protein Regulator,Inner Protein,19531491,Novel Inhibitors Complexed with Glutamate Dehy...,No,"Chain A:ILE212,SER213,ARG217,LEU257,HIS258,ARG..."


In [407]:
df.query(f"allosteric_pdb == '{pdb.upper()}'")

Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
600,ASD01240000_1,GLUD1,Bos taurus,P00366,3ETE,ASD01240021;ASD01240021;ASD01240021;ASD0124002...,H3P;H3P;H3P;H3P;H3P;H3P,A;B;C;C;D;F,Lig,Inhibitor,"3,4,6-trichloro-2-[(2,3,5-trichloro-6-hydroxy-...",552;552;552;554;552;552,Inner Protein Regulator,Protein-Protein Interaction,19531491,Novel Inhibitors Complexed with Glutamate Dehy...,No,"Chain A:THR186,ILE187,TYR190; Chain E:MET150,L..."
601,ASD01240000_1,GLUD1,Bos taurus,P00366,3ETE,ASD00290006,GTP,A,Lig,Inhibitor,"[[(2R,3S,4R,5R)-5-(2-azanyl-6-oxidanylidene-1H...",553,Inner Protein Regulator,Inner Protein,19531491,Novel Inhibitors Complexed with Glutamate Dehy...,No,"Chain A:ILE212,SER213,ARG217,LEU257,HIS258,ARG..."


In [408]:
Site.get(Site.pdb == pdb, Site.modulator == {'label_asym_id': ['I']}).related_sites

{'equivalent': [{'other_site': {'label_asym_id': ['CA']},
   'res_of_other_in_site': 0.9285714285714286,
   'res_of_site_in_other': 0.7222222222222222},
  {'other_site': {'label_asym_id': ['M']},
   'res_of_other_in_site': 1.0,
   'res_of_site_in_other': 0.8333333333333334},
  {'other_site': {'label_asym_id': ['Q']},
   'res_of_other_in_site': 1.0,
   'res_of_site_in_other': 0.7777777777777778},
  {'other_site': {'label_asym_id': ['V']},
   'res_of_other_in_site': 0.8666666666666667,
   'res_of_site_in_other': 0.7222222222222222},
  {'other_site': {'label_asym_id': ['Z']},
   'res_of_other_in_site': 0.875,
   'res_of_site_in_other': 0.7777777777777778}],
 'nonequivalent': []}

In [409]:
Site.get(Site.pdb == pdb, Site.modulator == {'label_asym_id': ['I']}).info

{'modulator_info': [{'modulator': [{'label_asym_id': 'I'}],
   'label_entity_id': '4',
   'type': 'non-polymer',
   'pdbx_description': "GUANOSINE-5'-TRIPHOSPHATE"}],
 'interacting_chains_info': [{'label_entity_id': '1',
   'interacting_chains': {'label_asym_id': ['A']},
   'polymer_type': 'polypeptide(L)',
   'Uniprot': ['P00366']}],
 'source': {'allosteric_database': [{'entry': [{'target_id': 'ASD01240000_1',
      'target_gene': 'GLUD1',
      'organism': 'Bos taurus',
      'pdb_uniprot': 'P00366',
      'allosteric_pdb': '3ETE',
      'modulator_serial': 'ASD00290006',
      'modulator_alias': 'GTP',
      'modulator_chain': 'A',
      'modulator_class': 'Lig',
      'modulator_feature': 'Inhibitor',
      'modulator_name': '[[(2R,3S,4R,5R)-5-(2-azanyl-6-oxidanylidene-1H-purin-9-yl)-3,4-bis(oxidanyl)oxolan-2-yl]methoxy-oxidanyl-phosphoryl] phosphono hydrogen phosphate',
      'modulator_resi': '553',
      'function': 'Inner Protein Regulator',
      'position': 'Inner Protein',
 

In [410]:
error_entries = [
    row for i, row in (
        pd.DF(error_entries)
        .merge(df.query(f"allosteric_pdb == '{pdb.upper()}'"), how="outer", indicator=True)
        .query(f"_merge == 'left_only'").drop("_merge", axis=1)
        .iterrows()
    )
]
errors.pop(pdb)

['combine_sites failed; Molecules of the annotated modulator(s) bind close together but were not grouped']

The already-existing sites of the PDB are already correctly annotated and a successful execution of the `combine_sites` function wouldn't have changed anything.

<br>

In [411]:
pdb = "3fyh"
get_error(pdb)

ENTRIES: 2 SITES: [(<Site: 520>, {'label_asym_id': ['F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'AA', 'BA', 'CA']}), (<Site: 3323>, {'label_asym_id': ['B']})]
ERROR: ['combine_sites failed; Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
64,ASD08840000_1,radA,Methanococcus voltae,O73948,3FYH,ASD01720027,ADP,A,Lig,Regulator,"[(2R,3S,4R,5R)-5-(6-aminopurin-9-yl)-3,4-dihyd...",401,Protein-DNA/RNA Interaction Regulator,Inner Protein,19555119,Crystal structure of an archaeal Rad51 homolog...,No,"Chain A:VAL106,PHE107,GLY108,SER109,GLY110,LYS..."


In [412]:
df.query(f"allosteric_pdb == '{pdb.upper()}'")

Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
2035,ASD08840000_1,radA,Methanococcus voltae,O73948,3FYH,ASD01720027,ADP,A,Lig,Regulator,"[(2R,3S,4R,5R)-5-(6-aminopurin-9-yl)-3,4-dihyd...",401,Protein-DNA/RNA Interaction Regulator,Inner Protein,19555119,Crystal structure of an archaeal Rad51 homolog...,No,"Chain A:VAL106,PHE107,GLY108,SER109,GLY110,LYS..."
2036,ASD08840000_1,radA,Methanococcus voltae,O73948,3FYH,ASD08840007;ASD08840007;ASD08840007;ASD0884000...,W;W;W;W;W;W;W;W;W;W;W;W;W;W;W;W;W;W;W;W;W;W;W;W,A;A;A;A;A;A;A;A;A;A;A;A;A;A;A;A;A;A;A;A;A;A;A;A,Ion,Regulator,tungsten(6+),534;511;512;513;514;515;516;517;518;519;520;52...,Protein-DNA/RNA Interaction Regulator,Inner Protein,19555119,Crystal structure of an archaeal Rad51 homolog...,No,"Chain A:ARG218,ARG230"


In [413]:
error_entries = [
    row for i, row in (
        pd.DF(error_entries)
        .merge(df.query(f"allosteric_pdb == '{pdb.upper()}'"), how="outer", indicator=True)
        .query(f"_merge == 'left_only'").drop("_merge", axis=1)
        .iterrows()
    )
]
errors.pop(pdb)

['combine_sites failed; Molecules of the annotated modulator(s) bind close together but were not grouped']

Same as before, the already-existing sites of the PDB are already correctly annotated and a successful execution of the `combine_sites` function wouldn't have changed anything.

<br>

In [414]:
pdb = "4lrl"
get_error(pdb)

ENTRIES: 2 SITES: [(<Site: 3132>, {'label_asym_id': ['F']}), (<Site: 3133>, {'label_asym_id': ['J']})]
ERROR: ['combine_sites failed; Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
56,ASD06820000_1,EF_1143,Enterococcus faecalis,Q836G9,4LRL,ASD02040015,TTP,B,Lig,Inhibitor,"[(2R,3S,5R)-3-hydroxy-5-(5-methyl-2,4-dioxo-py...",503,Inner Protein Regulator,Protein-Protein Interaction,24338016,Mechanisms of Allosteric Activation and Inhibi...,No,"Chain A:THR55,PHE56,THR197,GLN241,HIS245; Chai..."


In [415]:
df.query(f"allosteric_pdb == '{pdb.upper()}'")

Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
1845,ASD06820000_1,EF_1143,Enterococcus faecalis,Q836G9,4LRL,ASD09690001,DGT,B,Lig,Activator,"[[(2R,3S,5R)-5-(2-amino-6-oxo-1H-purin-9-yl)-3...",502,Inner Protein Regulator,Protein-Protein Interaction,24338016,Mechanisms of Allosteric Activation and Inhibi...,No,"Chain A:PHE54,THR55,VAL247,ARG326,LYS330; Chai..."
1846,ASD06820000_1,EF_1143,Enterococcus faecalis,Q836G9,4LRL,ASD02040015,TTP,B,Lig,Inhibitor,"[(2R,3S,5R)-3-hydroxy-5-(5-methyl-2,4-dioxo-py...",503,Inner Protein Regulator,Protein-Protein Interaction,24338016,Mechanisms of Allosteric Activation and Inhibi...,No,"Chain A:THR55,PHE56,THR197,GLN241,HIS245; Chai..."


In [416]:
Site.get(Site.pdb == pdb, Site.modulator == {'label_asym_id': ['J']}).delete_instance()

1

In [417]:
solve_error(pdb, [[{"label_asym_id": "J"}]], auto_site_grouping=True, stringent_site_grouping=False)

4LRL [[{'label_asym_id': 'J'}]]
SITES: [(<Site: 4622>, {'label_asym_id': ['I', 'J']})]


In [418]:
Site.get(Site.pdb == pdb).related_sites, Site.get(Site.pdb == pdb).info

({'equivalent': [{'other_site': {'label_asym_id': ['F']},
    'res_of_other_in_site': 1.0,
    'res_of_site_in_other': 0.5945945945945946},
   {'other_site': {'label_asym_id': ['O']},
    'res_of_other_in_site': 1.0,
    'res_of_site_in_other': 0.5945945945945946},
   {'other_site': {'label_asym_id': ['R', 'S']},
    'res_of_other_in_site': 0.9736842105263158,
    'res_of_site_in_other': 1.0}],
  'nonequivalent': []},
 {'modulator_info': [{'modulator': [{'label_asym_id': 'I'}],
    'label_entity_id': '3',
    'type': 'non-polymer',
    'pdbx_description': "2'-DEOXYGUANOSINE-5'-TRIPHOSPHATE"},
   {'modulator': [{'label_asym_id': 'J'}],
    'label_entity_id': '5',
    'type': 'non-polymer',
    'pdbx_description': "THYMIDINE-5'-TRIPHOSPHATE"}],
  'interacting_chains_info': [{'label_entity_id': '1',
    'interacting_chains': {'label_asym_id': ['A', 'B', 'D']},
    'polymer_type': 'polypeptide(L)',
    'Uniprot': ['Q836G9']}],
  'source': {'allosteric_database': [{'entry': [{'target_id': '

According to the primary citation, the Model has a total of four allosteric sites where DGT binds, and in two of them a TTP molecule binds together, and these cases should have been automatically identified by combine_sites with the rest of single-DGT sites included as equivalent sites.

<br>

In [419]:
pdb = "6l57"
get_error(pdb)

ENTRIES: 3 SITES: [(<Site: 4116>, {'label_asym_id': ['C']}), (<Site: 4154>, {'label_asym_id': ['D']}), (<Site: 4155>, {'label_asym_id': ['F']})]
ERROR: ['combine_sites failed; Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
93,ASD16670000_1,IDH3G,Homo sapiens,P51553,6L57,,ATP,B,Lig,Activator,ADENOSINE-5'-TRIPHOSPHATE,403,Allosteric function,Allosteric position,,,No,6L57


In [420]:
df.query(f"allosteric_pdb == '{pdb.upper()}'")

Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
2767,ASD16420000_1,IDH3A,Homo sapiens,P50213,6L57,,Mg,A,Ion,Regulator,MAGNESIUM ION,401,,,,,,6L57
2806,ASD16670000_1,IDH3G,Homo sapiens,P51553,6L57,,CIT,B,Lig,Regulator,CITRIC ACID,401,,,,,,6L57
2807,ASD16670000_1,IDH3G,Homo sapiens,P51553,6L57,,ATP,B,Lig,Activator,ADENOSINE-5'-TRIPHOSPHATE,403,Allosteric function,Allosteric position,,,No,6L57


In [421]:
[s.delete_instance() for s in PDB.get(PDB.entry_id == pdb).sites]

[1, 1, 1]

In [422]:
process_entry(
    entry=df.query(f"allosteric_pdb == '{pdb.upper()}' and modulator_alias == 'Mg'").squeeze(),
    updates={
        pdb.upper(): {
            "pdb": pdb.upper(),
            "mods": [[{"auth_asym_id": "B", "auth_comp_id": "MG", "auth_seq_id": "402"}]]
        }
    }
)

6L57 [[{'auth_asym_id': 'B', 'auth_comp_id': 'MG', 'auth_seq_id': '402'}]]


In [423]:
process_entry(
    entry=df.query(f"allosteric_pdb == '{pdb.upper()}' and modulator_alias == 'CIT'").squeeze(),
)

6L57 [[{'auth_asym_id': 'B', 'auth_comp_id': 'CIT', 'auth_seq_id': '401'}]]


In [424]:
[s.modulator for s in PDB.get(PDB.entry_id == "6l57").sites]

[{'label_asym_id': ['D', 'E']}]

In [425]:
solve_error(pdb, update = [[{"label_asym_id": "F"}]], auto_site_grouping=True, stringent_site_grouping=False)

6L57 [[{'label_asym_id': 'F'}]]
SITES: [(<Site: 4636>, {'label_asym_id': ['D', 'E', 'F']})]


In [426]:
Site.get(Site.pdb == pdb).info

{'modulator_info': [{'modulator': [{'label_asym_id': 'D'}],
   'label_entity_id': '4',
   'type': 'non-polymer',
   'pdbx_description': 'CITRIC ACID'},
  {'modulator': [{'label_asym_id': 'E'}],
   'label_entity_id': '3',
   'type': 'non-polymer',
   'pdbx_description': 'MAGNESIUM ION'},
  {'modulator': [{'label_asym_id': 'F'}],
   'label_entity_id': '5',
   'type': 'non-polymer',
   'pdbx_description': "ADENOSINE-5'-TRIPHOSPHATE"}],
 'interacting_chains_info': [{'label_entity_id': '1',
   'interacting_chains': {'label_asym_id': ['A']},
   'polymer_type': 'polypeptide(L)',
   'Uniprot': ['P50213']},
  {'label_entity_id': '2',
   'interacting_chains': {'label_asym_id': ['B']},
   'polymer_type': 'polypeptide(L)',
   'Uniprot': ['P51553']}],
 'source': {'allosteric_database': [{'entry': [{'target_id': 'ASD16420000_1',
      'target_gene': 'IDH3A',
      'organism': 'Homo sapiens',
      'pdb_uniprot': 'P50213',
      'allosteric_pdb': '6L57',
      'modulator_serial': None,
      'modulat

The annotation is wrong, as the allosteric Mg ion is the one in auth_asym_id B, according to the primary citation of the PDB.

<br>

In [427]:
pdb = "7dtv"
get_error(pdb)

ENTRIES: 3 SITES: [(<Site: 963>, {'label_asym_id': ['I']}), (<Site: 964>, {'label_asym_id': ['H']}), (<Site: 965>, {'label_asym_id': ['G']})]
ERROR: ['combine_sites failed; Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
12,ASD00460000_1,CASR,Homo sapiens,P41180,7DTV,,TRP,A,Lig,Activator,TRYPTOPHAN,1101,,,,,,7DTV


In [428]:
df.query(f"allosteric_pdb == '{pdb.upper()}'")

Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
157,ASD00460000_1,CASR,Homo sapiens,P41180,7DTV,,CA,A,Ion,Activator,CALCIUM ION,1108,,,,,,7DTV
158,ASD00460000_1,CASR,Homo sapiens,P41180,7DTV,,CA,A,Ion,Activator,CALCIUM ION,1102,,,,,,7DTV
159,ASD00460000_1,CASR,Homo sapiens,P41180,7DTV,,TRP,A,Lig,Activator,TRYPTOPHAN,1101,,,,,,7DTV


In [429]:
Site.get(Site.pdb == pdb, Site.modulator == {'label_asym_id': ['G']}).delete_instance()

1

In [430]:
solve_error(pdb, update = [[{"label_asym_id": "G"}]], auto_site_grouping=True, stringent_site_grouping=False)

7DTV [[{'label_asym_id': 'G'}]]
SITES: [(<Site: 4639>, {'label_asym_id': ['I']}), (<Site: 4644>, {'label_asym_id': ['G', 'H']})]


In [431]:
Site.get(Site.pdb == pdb, Site.modulator == {'label_asym_id': ['G', 'H']}).info

{'modulator_info': [{'modulator': [{'label_asym_id': 'G'}],
   'label_entity_id': '4',
   'type': 'non-polymer',
   'pdbx_description': 'TRYPTOPHAN'},
  {'modulator': [{'label_asym_id': 'H'}],
   'label_entity_id': '5',
   'type': 'non-polymer',
   'pdbx_description': 'CALCIUM ION'}],
 'interacting_chains_info': [{'label_entity_id': '1',
   'interacting_chains': {'label_asym_id': ['A']},
   'polymer_type': 'polypeptide(L)',
   'Uniprot': ['P41180']}],
 'source': {'allosteric_database': [{'entry': [{'target_id': 'ASD00460000_1',
      'target_gene': 'CASR',
      'organism': 'Homo sapiens',
      'pdb_uniprot': 'P41180',
      'allosteric_pdb': '7DTV',
      'modulator_serial': None,
      'modulator_alias': 'CA',
      'modulator_chain': 'A',
      'modulator_class': 'Ion',
      'modulator_feature': 'Activator',
      'modulator_name': 'CALCIUM ION',
      'modulator_resi': '1102',
      'function': None,
      'position': None,
      'pubmed_id': None,
      'ref_title': None,
      

The annotations are correct, the grouping probably failed because of the stringency argument.

<br>

In [432]:
pdb = "7e6t"
get_error(pdb)

ENTRIES: 4 SITES: [(<Site: 966>, {'label_asym_id': ['D']}), (<Site: 968>, {'label_asym_id': ['E']}), (<Site: 969>, {'label_asym_id': ['I']}), (<Site: 972>, {'label_asym_id': ['F']})]
ERROR: ['combine_sites failed; Molecules of the annotated modulator(s) bind close together but were not grouped', 'combine_sites failed; Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
13,ASD00460000_1,CASR,Homo sapiens,P41180,7E6T,,TCR,A,Lig,Activator,CYCLOMETHYLTRYPTOPHAN,907,,,,,,7E6T
14,ASD00460000_1,CASR,Homo sapiens,P41180,7E6T,,CA,A,Ion,Activator,CALCIUM ION,904,,,,,,7E6T


In [433]:
df.query(f"allosteric_pdb == '{pdb.upper()}'")

Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
160,ASD00460000_1,CASR,Homo sapiens,P41180,7E6T,,CA,A,Ion,Activator,CALCIUM ION,902,,,,,,7E6T
161,ASD00460000_1,CASR,Homo sapiens,P41180,7E6T,,CA,A,Ion,Activator,CALCIUM ION,903,,,,,,7E6T
162,ASD00460000_1,CASR,Homo sapiens,P41180,7E6T,,TCR,A,Lig,Activator,CYCLOMETHYLTRYPTOPHAN,907,,,,,,7E6T
163,ASD00460000_1,CASR,Homo sapiens,P41180,7E6T,,CA,A,Ion,Activator,CALCIUM ION,904,,,,,,7E6T


In [434]:
error_entries = [
    row for i, row in (
        pd.DF(error_entries)
        .merge(df.query(f"allosteric_pdb == '{pdb.upper()}' and modulator_resi == '904'"), how="outer", indicator=True)
        .query(f"_merge == 'left_only'").drop("_merge", axis=1)
        .iterrows()
    )
]

In [435]:
[(s, s.modulator, s.modulator_residues[["auth_asym_id", "auth_comp_id", "auth_seq_id"]], s.related_sites) for s in PDB.get(PDB.entry_id == pdb).sites]

[(<Site: 966>,
  {'label_asym_id': ['D']},
    auth_asym_id auth_comp_id auth_seq_id
  0            B           CA         902,
  {'equivalent': [{'other_site': {'label_asym_id': ['K']},
     'res_of_other_in_site': 1.0,
     'res_of_site_in_other': 1.0}],
   'nonequivalent': [{'other_site': {'label_asym_id': ['E']},
     'res_of_other_in_site': 0.0,
     'res_of_site_in_other': 0.0},
    {'other_site': {'label_asym_id': ['L']},
     'res_of_other_in_site': 0.0,
     'res_of_site_in_other': 0.0},
    {'other_site': {'label_asym_id': ['F']},
     'res_of_other_in_site': 0.0,
     'res_of_site_in_other': 0.0},
    {'other_site': {'label_asym_id': ['M']},
     'res_of_other_in_site': 0.0,
     'res_of_site_in_other': 0.0}]}),
 (<Site: 968>,
  {'label_asym_id': ['E']},
    auth_asym_id auth_comp_id auth_seq_id
  0            B           CA         903,
  {'equivalent': [{'other_site': {'label_asym_id': ['L']},
     'res_of_other_in_site': 1.0,
     'res_of_site_in_other': 0.833333333333333

In [436]:
Site.get(Site.pdb == pdb, Site.modulator == {'label_asym_id': ['I']}).delete_instance()

1

In [437]:
solve_error(pdb, update = [[{"label_asym_id": "I"}]], auto_site_grouping=True, stringent_site_grouping=False)

7E6T [[{'label_asym_id': 'I'}]]
SITES: [(<Site: 4647>, {'label_asym_id': ['E']}), (<Site: 4648>, {'label_asym_id': ['F']}), (<Site: 4654>, {'label_asym_id': ['D', 'I']})]


In [438]:
Site.get(Site.pdb == pdb, Site.modulator == {'label_asym_id': ['D', 'I']}).info

{'modulator_info': [{'modulator': [{'label_asym_id': 'D'}],
   'label_entity_id': '3',
   'type': 'non-polymer',
   'pdbx_description': 'CALCIUM ION'},
  {'modulator': [{'label_asym_id': 'I'}],
   'label_entity_id': '5',
   'type': 'non-polymer',
   'pdbx_description': 'CYCLOMETHYLTRYPTOPHAN'}],
 'interacting_chains_info': [{'label_entity_id': '1',
   'interacting_chains': {'label_asym_id': ['A']},
   'polymer_type': 'polypeptide(L)',
   'Uniprot': ['P41180']}],
 'source': {'allosteric_database': [{'entry': [{'target_id': 'ASD00460000_1',
      'target_gene': 'CASR',
      'organism': 'Homo sapiens',
      'pdb_uniprot': 'P41180',
      'allosteric_pdb': '7E6T',
      'modulator_serial': None,
      'modulator_alias': 'CA',
      'modulator_chain': 'A',
      'modulator_class': 'Ion',
      'modulator_feature': 'Activator',
      'modulator_name': 'CALCIUM ION',
      'modulator_resi': '902',
      'function': None,
      'position': None,
      'pubmed_id': None,
      'ref_title': No

The annotations are correct; Calcium is considered to bind in the orthosteric site but it's "an allosteric modulator of itself" https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8033781/.

<br>

In [439]:
errors_groups()

{'Molecules of the annotated modulator(s) bind close together but were not grouped': ['11bg',
  '1e7c',
  '1ecb',
  '1egy',
  '1eup',
  '1gph',
  '1r1v',
  '1thc',
  '1vm1',
  '1wda',
  '1yp2',
  '1yp3',
  '1yp4',
  '1z62',
  '2d41',
  '2dew',
  '2dex',
  '2dw5',
  '2fsz',
  '2gq1',
  '2ha4',
  '2wrm',
  '3ao1',
  '3cqd',
  '3e3f',
  '3f3t',
  '3f3u',
  '3f48',
  '3k8s',
  '3lsf',
  '3lsl',
  '3mzh',
  '3njq',
  '3umo',
  '3uqd',
  '3zl6',
  '4clz',
  '4dkt',
  '4eag',
  '4eak',
  '4gqq',
  '4i0u',
  '4m0z',
  '4ni0',
  '4oyo',
  '4p2t',
  '4p3h',
  '4ple',
  '4qfy',
  '4qsh',
  '4r8z',
  '4rew',
  '4rqo',
  '4z87',
  '5afk',
  '5btr',
  '5cnt',
  '5ezv',
  '5mcp',
  '5olk',
  '5s4r',
  '5s4s',
  '5tc3',
  '5ur3',
  '5ute',
  '5utn',
  '5uv3',
  '5v5d',
  '5v5e',
  '6b0z',
  '6brk',
  '6dja',
  '6i0m',
  '6i0o',
  '6ms7',
  '6q4d',
  '6qku',
  '6qkw',
  '6qxa',
  '6zxm',
  '7a5y'],
 'W': ['5j8v', '5uvg', '6p1q', '6p4v', '7lh2']}

##### Molecules of the annotated modulator(s) bind close together but were not grouped

In [440]:
errors_groups()["Molecules of the annotated modulator(s) bind close together but were not grouped"]

['11bg',
 '1e7c',
 '1ecb',
 '1egy',
 '1eup',
 '1gph',
 '1r1v',
 '1thc',
 '1vm1',
 '1wda',
 '1yp2',
 '1yp3',
 '1yp4',
 '1z62',
 '2d41',
 '2dew',
 '2dex',
 '2dw5',
 '2fsz',
 '2gq1',
 '2ha4',
 '2wrm',
 '3ao1',
 '3cqd',
 '3e3f',
 '3f3t',
 '3f3u',
 '3f48',
 '3k8s',
 '3lsf',
 '3lsl',
 '3mzh',
 '3njq',
 '3umo',
 '3uqd',
 '3zl6',
 '4clz',
 '4dkt',
 '4eag',
 '4eak',
 '4gqq',
 '4i0u',
 '4m0z',
 '4ni0',
 '4oyo',
 '4p2t',
 '4p3h',
 '4ple',
 '4qfy',
 '4qsh',
 '4r8z',
 '4rew',
 '4rqo',
 '4z87',
 '5afk',
 '5btr',
 '5cnt',
 '5ezv',
 '5mcp',
 '5olk',
 '5s4r',
 '5s4s',
 '5tc3',
 '5ur3',
 '5ute',
 '5utn',
 '5uv3',
 '5v5d',
 '5v5e',
 '6b0z',
 '6brk',
 '6dja',
 '6i0m',
 '6i0o',
 '6ms7',
 '6q4d',
 '6qku',
 '6qkw',
 '6qxa',
 '6zxm',
 '7a5y']

In [441]:
pdb = "11bg"
get_error(pdb)

ENTRIES: 1 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
39,ASD03310000_1,SRN,Bos taurus,P00669,11BG,ASD03310002,U2G,A,Lig,Activator,"[(2R,3S,4R,5R)-5-(2-amino-6-oxo-1H-purin-9-yl)...",131,Inner Protein Regulator,Protein-Protein Interaction,10543951,A potential allosteric subsite generated by do...,No,"Chain A:ASP14,ASN24,ASN27,LEU28,ASN94,CYS95; C..."


In [442]:
solve_error(pdb, [
    [{"label_asym_id": "H"}],
    [{"label_asym_id": "I"}],
    [{"label_asym_id": "J"}],
    [{"label_asym_id": "N"}],
], auto_site_grouping=False, stringent_site_grouping=True)

11BG [[{'label_asym_id': 'H'}], [{'label_asym_id': 'I'}], [{'label_asym_id': 'J'}], [{'label_asym_id': 'N'}]]
SITES: [(<Site: 4655>, {'label_asym_id': ['H']}), (<Site: 4656>, {'label_asym_id': ['I']}), (<Site: 4657>, {'label_asym_id': ['J']})]


In [443]:
[s.delete_instance() for s in PDB.get(PDB.entry_id == pdb).sites if s.modulator != {'label_asym_id': ['I']}]

[1, 1]

The two molecules of the modulator that produced the error are not the annotated ones, but for consistency in ´.related_sites´, they will be added individually as sites because they do not seem to bind totally together.

<br>

In [444]:
pdb = "1e7c"
get_error(pdb)

ENTRIES: 2 SITES: [(<Site: 336>, {'label_asym_id': ['G', 'H', 'I']})]
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
36,ASD02670000_1,ALB,Homo sapiens,P02768,1E7C,ASD02670007,HLT,A,Lig,Inhibitor,"(2R)-2-bromo-2-chloro-1,1,1-trifluoro-ethane",4005,Inner Protein Regulator,Inner Protein,10940306,Binding of the general anesthetics propofol an...,No,1E7C


In [445]:
df.query(f"allosteric_pdb == '{pdb.upper()}'")

Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
1094,ASD02670000_1,ALB,Homo sapiens,P02768,1E7C,ASD02670007,HLT,A,Lig,Inhibitor,"(2R)-2-bromo-2-chloro-1,1,1-trifluoro-ethane",4005,Inner Protein Regulator,Inner Protein,10940306,Binding of the general anesthetics propofol an...,No,1E7C
1095,ASD02670000_1,ALB,Homo sapiens,P02768,1E7C,ASD02670007;ASD02670007,HLT;HLT,A;A,Lig,Inhibitor,"(2R)-2-bromo-2-chloro-1,1,1-trifluoro-ethane",4001;4002,Inner Protein Regulator,Inner Protein,10940306,Binding of the general anesthetics propofol an...,No,1E7C


In [446]:
solve_error(pdb, [
    {"label_asym_id": ["J", "K"]},
], auto_site_grouping=True, stringent_site_grouping=False)

1E7C [{'label_asym_id': ['J', 'K']}]
SITES: [(<Site: 336>, {'label_asym_id': ['G', 'H', 'I']}), (<Site: 4665>, {'label_asym_id': ['J', 'K']})]


The existing site was successfuly manually corrected before, and the other simple annotation is again a site formed by two molecules of the modulator that bind together.

<br>

In [447]:
pdb = "1ecb"
get_error(pdb)

ENTRIES: 1 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
8,ASD00280000_2,purF,Escherichia coli,P0AG16,1ECB,ASD00860008,5GP,A,Lig,Inhibitor,"[(2R,3S,4R,5R)-5-(2-amino-6-oxo-1H-purin-9-yl)...",506,Inner Protein Regulator,Protein-Protein Interaction,9333323,Coupled formation of an amidotransferase inter...,Yes,"Chain A:HIS25,TYR258,ALA260,ARG261,PRO262,ARG2..."


In [448]:
solve_error(pdb, [
    [{"label_asym_id": "F"}],
    [{"label_asym_id": "G"}],
    [{"label_asym_id": "I"}],
    [{"label_asym_id": "J"}],
    [{"label_asym_id": "L"}],
    [{"label_asym_id": "M"}],
    [{"label_asym_id": "O"}],
    [{"label_asym_id": "P"}],
], auto_site_grouping=False, stringent_site_grouping=True)

1ECB [[{'label_asym_id': 'F'}], [{'label_asym_id': 'G'}], [{'label_asym_id': 'I'}], [{'label_asym_id': 'J'}], [{'label_asym_id': 'L'}], [{'label_asym_id': 'M'}], [{'label_asym_id': 'O'}], [{'label_asym_id': 'P'}]]
SITES: [(<Site: 4666>, {'label_asym_id': ['F']}), (<Site: 4667>, {'label_asym_id': ['G']})]


In [449]:
[s.delete_instance() for s in PDB.get(PDB.entry_id == pdb).sites if s.modulator != {'label_asym_id': ['G']}]

[1]

In [450]:
PDB.get(PDB.entry_id == pdb).sites[0].related_sites

{'equivalent': [{'other_site': {'label_asym_id': ['J']},
   'res_of_other_in_site': 1.08,
   'res_of_site_in_other': 1.08},
  {'other_site': {'label_asym_id': ['M']},
   'res_of_other_in_site': 1.08,
   'res_of_site_in_other': 1.08},
  {'other_site': {'label_asym_id': ['P']},
   'res_of_other_in_site': 1.0833333333333333,
   'res_of_site_in_other': 1.04}],
 'nonequivalent': [{'other_site': {'label_asym_id': ['F']},
   'res_of_other_in_site': 0.30434782608695654,
   'res_of_site_in_other': 0.28},
  {'other_site': {'label_asym_id': ['I']},
   'res_of_other_in_site': 0.34782608695652173,
   'res_of_site_in_other': 0.32},
  {'other_site': {'label_asym_id': ['L']},
   'res_of_other_in_site': 0.30434782608695654,
   'res_of_site_in_other': 0.28},
  {'other_site': {'label_asym_id': ['O']},
   'res_of_other_in_site': 0.34782608695652173,
   'res_of_site_in_other': 0.32}]}

The annotated molecule is indeed bound in the allosteric site (confirmed with primary citation) but it is adjacent to the active site-bound molecule of the same entity and thus must be manually corrected.

<br>

In [451]:
pdb = "1egy"
get_error(pdb)

ENTRIES: 1 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
16,ASD00850000_1,eryF,Saccharopolyspora erythraea,Q00441,1EGY,ASD00850001,9AP,A,Lig,Activator,phenanthren-9-amine,801,Inner Protein Regulator,Inner Protein,10716705,Crystal structures of ligand complexes of P450...,Yes,"Chain A:ALA74,PHE78,PHE86,ARG185,VAL237,ALA241..."


In [452]:
solve_error(pdb, [
    [{"label_asym_id": "C"}],
    [{"label_asym_id": "D"}],
], auto_site_grouping=False, stringent_site_grouping=True)

1EGY [[{'label_asym_id': 'C'}], [{'label_asym_id': 'D'}]]
SITES: [(<Site: 4668>, {'label_asym_id': ['C']}), (<Site: 4669>, {'label_asym_id': ['D']})]


In [453]:
[s.delete_instance() for s in PDB.get(PDB.entry_id == pdb).sites if s.modulator != {'label_asym_id': ['D']}]

[1]

The annotated molecule is indeed bound in the allosteric site (confirmed by primary citation, in which it looks like the mechanism is cooperative binding of the two to the active site) but it is adjacent to the active site-bound molecule of the same entity and thus must be manually corrected.

<br>

In [454]:
pdb = "1eup"
get_error(pdb)

ENTRIES: 1 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
16,ASD00850000_1,eryF,Saccharopolyspora erythraea,Q00441,1EUP,ASD09650004,ASD,A,Lig,Activator,"(8R,9S,10R,13S,14S)-10,13-dimethyl-2,6,7,8,9,1...",452,Inner Protein Regulator,Inner Protein,10716705,Crystal structures of ligand complexes of P450...,Yes,"Chain A:ALA74,TYR75,PHE86,SER171,ILE174,LEU175..."


In [455]:
solve_error(pdb, [
    [{"label_asym_id": "C"}],
    [{"label_asym_id": "D"}],
], auto_site_grouping=False, stringent_site_grouping=True)

1EUP [[{'label_asym_id': 'C'}], [{'label_asym_id': 'D'}]]
SITES: [(<Site: 4670>, {'label_asym_id': ['C']}), (<Site: 4671>, {'label_asym_id': ['D']})]


In [456]:
[s.delete_instance() for s in PDB.get(PDB.entry_id == pdb).sites if s.modulator != {'label_asym_id': ['D']}]

[1]

Same as before.

<br>

In [457]:
pdb = "1gph"
get_error(pdb)

ENTRIES: 1 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
7,ASD00280000_1,purF,Bacillus subtilis,P00497,1GPH,ASD00030001,AMP,1,Lig,Inhibitor,"[(2R,3S,4R,5R)-5-(6-aminopurin-9-yl)-3,4-bis(o...",468,Inner Protein Regulator,Protein-Protein Interaction,8197456,Structure of the allosteric regulatory enzyme ...,Yes,"Chain 1:HIS25,TYR242,SER244,ARG245,PRO246,ARG2..."


In [458]:
solve_error(pdb, [
    [{"label_asym_id": "F"}],
    [{"label_asym_id": "G"}],
    [{"label_asym_id": "I"}],
    [{"label_asym_id": "J"}],
    [{"label_asym_id": "L"}],
    [{"label_asym_id": "M"}],
    [{"label_asym_id": "O"}],
    [{"label_asym_id": "P"}],
], auto_site_grouping=False, stringent_site_grouping=True)

1GPH [[{'label_asym_id': 'F'}], [{'label_asym_id': 'G'}], [{'label_asym_id': 'I'}], [{'label_asym_id': 'J'}], [{'label_asym_id': 'L'}], [{'label_asym_id': 'M'}], [{'label_asym_id': 'O'}], [{'label_asym_id': 'P'}]]
SITES: [(<Site: 4672>, {'label_asym_id': ['F']}), (<Site: 4673>, {'label_asym_id': ['G']})]


In [459]:
[s.delete_instance() for s in PDB.get(PDB.entry_id == pdb).sites if s.modulator != {'label_asym_id': ['G']}]

[1]

The annotated molecule is indeed bound in the allosteric site ("suggested" allostery by primary citation) but it is adjacent to the active site-bound molecule of the same entity and thus must be manually corrected.

<br>

In [460]:
pdb = "1r1v"
get_error(pdb)

ENTRIES: 1 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
41,ASD04630000_1,rzcA,Staphylococcus aureus,O85142,1R1V,ASD01310003,ZN,A,Ion,Regulator,zinc(+2) cation,501,Protein-DNA/RNA Interaction Regulator,Inner Protein,14568530,A metal-ligand-mediated intersubunit allosteri...,No,"Chain A:ASP84,HIS86"


In [461]:
solve_error(pdb, [
    [{"label_asym_id": "C"}],
    [{"label_asym_id": "D"}],
    [{"label_asym_id": "E"}],
    [{"label_asym_id": "F"}]
], auto_site_grouping=False, stringent_site_grouping=True)

1R1V [[{'label_asym_id': 'C'}], [{'label_asym_id': 'D'}], [{'label_asym_id': 'E'}], [{'label_asym_id': 'F'}]]
SITES: [(<Site: 4674>, {'label_asym_id': ['C']}), (<Site: 4676>, {'label_asym_id': ['E']}), (<Site: 4677>, {'label_asym_id': ['F']})]


In [462]:
[s.delete_instance() for s in PDB.get(PDB.entry_id == pdb).sites if s.modulator != {'label_asym_id': ['C']}]

[1, 1]

The annotated molecule is indeed the allosteric modulator (confirmed by primary citation) but it is close to additional molecules of the modulator and should be annotated manually.

<br>

In [463]:
pdb = "1thc"
get_error(pdb)

ENTRIES: 1 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
57,ASD10420000_1,TTR,Homo sapiens,P02766,1THC,ASD10420084,FL9,A,Lig,Inhibitor,"2-[(3,5-dibromo-2,4-dihydroxy-phenyl)methylide...",131,Inner Protein Regulator,Protein-Protein Interaction,1631168,Crystal structure determination at 2.3-A resol...,No,1THC


In [464]:
solve_error(pdb, None, auto_site_grouping=True, stringent_site_grouping=False)

1THC [[{'auth_asym_id': 'A', 'auth_comp_id': 'FL9', 'auth_seq_id': '131'}]]
SITES: [(<Site: 4679>, {'label_asym_id': ['C', 'D']})]


The annotated molecule is indeed the allosteric modulator but it shares occupancy with another copy and they were not recognized as occupying the same site for some reason.

<br>

In [465]:
pdb = "1vm1"
get_error(pdb)

ENTRIES: 1 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
64,ASD12070000_1,bla,Klebsiella pneumoniae,P0AD64,1VM1,ASD12070001,MA4,A,Lig,Inhibitor,"(2R,3R,4S,5S,6R)-2-[(2R,3S,4R,5R,6R)-6-(6-cycl...",300,Inner Protein Regulator,Inner Protein,11327849,Inhibition of the SHV-1 beta-lactamase by sulf...,Yes,"Chain A:SER26,VAL224,PRO226,ILE231,ILE246,ALA2..."


In [466]:
solve_error(pdb, None, auto_site_grouping=False, stringent_site_grouping=False)

1VM1 [[{'auth_asym_id': 'A', 'auth_comp_id': 'MA4', 'auth_seq_id': '300'}]]
SITES: [(<Site: 4680>, {'label_asym_id': ['B']})]


The annotated molecule is correct but there is a fragment of another molecule of the modulator that is proposed that is part of another complex of the protein with another molecule of the modulator bound in the same position (primary citation).

<br>

In [467]:
pdb = "1wda"
get_error(pdb)

ENTRIES: 2 SITES: [(<Site: 419>, {'label_asym_id': ['C', 'D', 'F']})]
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
56,ASD11630000_1,,Homo sapiens,Q9UM07,1WDA,ASD11630001,CA,A,Ion,Activator,CALCIUM ION,903,Inner Protein Regulator,Inner Protein,15247907,Structural basis for Ca(2+)-induced activation...,No,1WDA


In [468]:
df.query(f"allosteric_pdb == '{pdb.upper()}'")

Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
2360,ASD11630000_1,,Homo sapiens,Q9UM07,1WDA,ASD11630001,CA,A,Ion,Activator,CALCIUM ION,903,Inner Protein Regulator,Inner Protein,15247907,Structural basis for Ca(2+)-induced activation...,No,1WDA
2361,ASD11630000_1,,Homo sapiens,Q9UM07,1WDA,ASD11630001;ASD11630001;ASD11630001,CA;CA;CA,A;A;A,Ion,Activator,CALCIUM ION;CALCIUM ION;CALCIUM ION,901;902;904,Inner Protein Regulator,Inner Protein,15247907,Structural basis for Ca(2+)-induced activation...,No,1WDA


In [469]:
solve_error(pdb, None, auto_site_grouping=True, stringent_site_grouping=False)

1WDA [[{'auth_asym_id': 'A', 'auth_comp_id': 'CA', 'auth_seq_id': '903'}]]
1wda ['combine_sites failed; len(all_sites) > (len(old_sites) + len(new_sites))']
SITES: [(<Site: 419>, {'label_asym_id': ['C', 'D', 'F']}), (<Site: 4684>, {'label_asym_id': ['E']})]


In [470]:
[s.info for s in PDB.get(PDB.entry_id == pdb).sites]

[{'modulator_info': [{'modulator': [{'label_asym_id': 'C'}],
    'label_entity_id': '2',
    'type': 'non-polymer',
    'pdbx_description': 'CALCIUM ION'},
   {'modulator': [{'label_asym_id': 'D'}],
    'label_entity_id': '2',
    'type': 'non-polymer',
    'pdbx_description': 'CALCIUM ION'},
   {'modulator': [{'label_asym_id': 'F'}],
    'label_entity_id': '2',
    'type': 'non-polymer',
    'pdbx_description': 'CALCIUM ION'}],
  'interacting_chains_info': [{'label_entity_id': '1',
    'interacting_chains': {'label_asym_id': ['A']},
    'polymer_type': 'polypeptide(L)',
    'Uniprot': ['Q9UM07']}],
  'source': {'allosteric_database': [{'entry': [{'target_id': 'ASD11630000_1',
       'target_gene': None,
       'organism': 'Homo sapiens',
       'pdb_uniprot': 'Q9UM07',
       'allosteric_pdb': '1WDA',
       'modulator_serial': 'ASD11630001;ASD11630001;ASD11630001',
       'modulator_alias': 'CA;CA;CA',
       'modulator_chain': 'A;A;A',
       'modulator_class': 'Ion',
       'modula

As previously corrected for 1WDA, it is going to be corrected manually. `combine_sites` is going to fail but the saved sites are correct.

<br>

In [471]:
pdb = "1yp2"
get_error(pdb)

ENTRIES: 2 SITES: [(<Site: 432>, {'label_asym_id': ['E', 'G']})]
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
4,ASD00250000_6,,Solanum tuberosum,P23509,1YP2,ASD02880002,SO4,A,Ion,Activator,sulfate,2007,Inner Protein Regulator,Inner Protein,15692569,Crystal structure of potato tuber ADP-glucose ...,No,"Chain A:LYS69,GLU133,HIS134,THR135; Chain D:ARG83"


In [472]:
df.query(f"allosteric_pdb == '{pdb.upper()}'")

Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
56,ASD00250000_6,,Solanum tuberosum,P23509,1YP2,ASD02880002;ASD02880002,SO4;SO4,A;A,Ion,Activator,sulfate;sulfate,2000;2008,Inner Protein Regulator,Protein-Protein Interaction,15692569,Crystal structure of potato tuber ADP-glucose ...,No,"Chain A:ARG41,PRO47,ARG53,HIS84,GLN314,ARG316,..."
57,ASD00250000_6,,Solanum tuberosum,P23509,1YP2,ASD02880002,SO4,A,Ion,Activator,sulfate,2007,Inner Protein Regulator,Inner Protein,15692569,Crystal structure of potato tuber ADP-glucose ...,No,"Chain A:LYS69,GLU133,HIS134,THR135; Chain D:ARG83"


In [473]:
solve_error(pdb, None, auto_site_grouping=True, stringent_site_grouping=False)

1YP2 [[{'auth_asym_id': 'A', 'auth_comp_id': 'SO4', 'auth_seq_id': '2007'}]]
SITES: [(<Site: 432>, {'label_asym_id': ['E', 'G']}), (<Site: 4689>, {'label_asym_id': ['J']})]


In [474]:
[s.info for s in PDB.get(PDB.entry_id == pdb).sites]

[{'modulator_info': [{'modulator': [{'label_asym_id': 'E'}],
    'label_entity_id': '2',
    'type': 'non-polymer',
    'pdbx_description': 'SULFATE ION'},
   {'modulator': [{'label_asym_id': 'G'}],
    'label_entity_id': '2',
    'type': 'non-polymer',
    'pdbx_description': 'SULFATE ION'}],
  'interacting_chains_info': [{'label_entity_id': '1',
    'interacting_chains': {'label_asym_id': ['A']},
    'polymer_type': 'polypeptide(L)',
    'Uniprot': ['P23509']}],
  'source': {'allosteric_database': [{'entry': [{'target_id': 'ASD00250000_6',
       'target_gene': None,
       'organism': 'Solanum tuberosum',
       'pdb_uniprot': 'P23509',
       'allosteric_pdb': '1YP2',
       'modulator_serial': 'ASD02880002;ASD02880002',
       'modulator_alias': 'SO4;SO4',
       'modulator_chain': 'A;A',
       'modulator_class': 'Ion',
       'modulator_feature': 'Activator',
       'modulator_name': 'sulfate;sulfate',
       'modulator_resi': '2000;2008',
       'function': 'Inner Protein Regul

Similar to the previous.

<br>

In [475]:
pdb = "1yp3"
get_error(pdb)

ENTRIES: 2 SITES: [(<Site: 446>, {'label_asym_id': ['E', 'G']})]
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
5,ASD00250000_6,,Solanum tuberosum,P23509,1YP3,ASD02880002,SO4,A,Ion,Activator,sulfate,1007,Inner Protein Regulator,Inner Protein,15692569,Crystal structure of potato tuber ADP-glucose ...,No,"Chain A:LYS69,GLU133,HIS134,THR135; Chain D:ARG83"


In [476]:
df.query(f"allosteric_pdb == '{pdb.upper()}'")

Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
58,ASD00250000_6,,Solanum tuberosum,P23509,1YP3,ASD02880002;ASD02880002,SO4;SO4,A;A,Ion,Activator,sulfate;sulfate,1000;1008,Inner Protein Regulator,Protein-Protein Interaction,15692569,Crystal structure of potato tuber ADP-glucose ...,No,"Chain A:ARG41,PRO47,ARG53,SER80,HIS84,GLN314,A..."
59,ASD00250000_6,,Solanum tuberosum,P23509,1YP3,ASD02880002,SO4,A,Ion,Activator,sulfate,1007,Inner Protein Regulator,Inner Protein,15692569,Crystal structure of potato tuber ADP-glucose ...,No,"Chain A:LYS69,GLU133,HIS134,THR135; Chain D:ARG83"


In [477]:
solve_error(pdb, None, auto_site_grouping=True, stringent_site_grouping=False)

1YP3 [[{'auth_asym_id': 'A', 'auth_comp_id': 'SO4', 'auth_seq_id': '1007'}]]
SITES: [(<Site: 446>, {'label_asym_id': ['E', 'G']}), (<Site: 4691>, {'label_asym_id': ['F']})]


In [478]:
[s.info for s in PDB.get(PDB.entry_id == pdb).sites]

[{'modulator_info': [{'modulator': [{'label_asym_id': 'E'}],
    'label_entity_id': '2',
    'type': 'non-polymer',
    'pdbx_description': 'SULFATE ION'},
   {'modulator': [{'label_asym_id': 'G'}],
    'label_entity_id': '2',
    'type': 'non-polymer',
    'pdbx_description': 'SULFATE ION'}],
  'interacting_chains_info': [{'label_entity_id': '1',
    'interacting_chains': {'label_asym_id': ['A']},
    'polymer_type': 'polypeptide(L)',
    'Uniprot': ['P23509']}],
  'source': {'allosteric_database': [{'entry': [{'target_id': 'ASD00250000_6',
       'target_gene': None,
       'organism': 'Solanum tuberosum',
       'pdb_uniprot': 'P23509',
       'allosteric_pdb': '1YP3',
       'modulator_serial': 'ASD02880002;ASD02880002',
       'modulator_alias': 'SO4;SO4',
       'modulator_chain': 'A;A',
       'modulator_class': 'Ion',
       'modulator_feature': 'Activator',
       'modulator_name': 'sulfate;sulfate',
       'modulator_resi': '1000;1008',
       'function': 'Inner Protein Regul

Same as before.

<br>

In [479]:
pdb = "1yp4"
get_error(pdb)

ENTRIES: 2 SITES: [(<Site: 463>, {'label_asym_id': ['E', 'G']})]
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
5,ASD00250000_6,,Solanum tuberosum,P23509,1YP4,ASD02880002,SO4,A,Ion,Activator,sulfate,1007,Inner Protein Regulator,Inner Protein,15692569,Crystal structure of potato tuber ADP-glucose ...,No,"Chain A:LYS69,GLU133,HIS134,THR135"


In [480]:
df.query(f"allosteric_pdb == '{pdb.upper()}'")

Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
60,ASD00250000_6,,Solanum tuberosum,P23509,1YP4,ASD02880002;ASD02880002,SO4;SO4,A;A,Ion,Activator,sulfate;sulfate,1000;1008,Inner Protein Regulator,Protein-Protein Interaction,15692569,Crystal structure of potato tuber ADP-glucose ...,No,"Chain A:ARG41,PRO47,ARG53,SER80,ARG83,HIS84,GL..."
61,ASD00250000_6,,Solanum tuberosum,P23509,1YP4,ASD02880002,SO4,A,Ion,Activator,sulfate,1007,Inner Protein Regulator,Inner Protein,15692569,Crystal structure of potato tuber ADP-glucose ...,No,"Chain A:LYS69,GLU133,HIS134,THR135"


In [481]:
solve_error(pdb, None, auto_site_grouping=True, stringent_site_grouping=False)

1YP4 [[{'auth_asym_id': 'A', 'auth_comp_id': 'SO4', 'auth_seq_id': '1007'}]]
SITES: [(<Site: 463>, {'label_asym_id': ['E', 'G']}), (<Site: 4697>, {'label_asym_id': ['K']})]


In [482]:
[s.info for s in PDB.get(PDB.entry_id == pdb).sites]

[{'modulator_info': [{'modulator': [{'label_asym_id': 'E'}],
    'label_entity_id': '2',
    'type': 'non-polymer',
    'pdbx_description': 'SULFATE ION'},
   {'modulator': [{'label_asym_id': 'G'}],
    'label_entity_id': '2',
    'type': 'non-polymer',
    'pdbx_description': 'SULFATE ION'}],
  'interacting_chains_info': [{'label_entity_id': '1',
    'interacting_chains': {'label_asym_id': ['A']},
    'polymer_type': 'polypeptide(L)',
    'Uniprot': ['P23509']}],
  'source': {'allosteric_database': [{'entry': [{'target_id': 'ASD00250000_6',
       'target_gene': None,
       'organism': 'Solanum tuberosum',
       'pdb_uniprot': 'P23509',
       'allosteric_pdb': '1YP4',
       'modulator_serial': 'ASD02880002;ASD02880002',
       'modulator_alias': 'SO4;SO4',
       'modulator_chain': 'A;A',
       'modulator_class': 'Ion',
       'modulator_feature': 'Activator',
       'modulator_name': 'sulfate;sulfate',
       'modulator_resi': '1000;1008',
       'function': 'Inner Protein Regul

Same as before.

<br>

In [483]:
pdb = "1z62"
get_error(pdb)

ENTRIES: 2 SITES: [(<Site: 467>, {'label_asym_id': ['C', 'D']})]
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
18,ASD01320000_3,PYGM,Oryctolagus cuniculus,P00489,1Z62,ASD01320116,IAA,A,Lig,Inhibitor,2-[[(2Z)-2-(2-oxo-7H-indol-3-ylidene)-7H-indol...,992,Inner Protein Regulator,Inner Protein,,Indirubin-3'-Aminooxy-Acetate Inhibits Glycoge...,No,"Chain A:ASN282,PHE285,GLU382,GLU572,ALA610,GLY..."


In [484]:
df.query(f"allosteric_pdb == '{pdb.upper()}'")

Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
665,ASD01320000_3,PYGM,Oryctolagus cuniculus,P00489,1Z62,ASD01320116;ASD01320116,IAA;IAA,A;A,Lig;Lig,Inhibitor,2-[[(2Z)-2-(2-oxo-7H-indol-3-ylidene)-7H-indol...,990;991,Inner Protein Regulator,Inner Protein,,Indirubin-3'-Aminooxy-Acetate Inhibits Glycoge...,No,"Chain A:TRP67,ILE68,GLN71,GLN72,TYR75,ARG193,P..."
666,ASD01320000_3,PYGM,Oryctolagus cuniculus,P00489,1Z62,ASD01320116,IAA,A,Lig,Inhibitor,2-[[(2Z)-2-(2-oxo-7H-indol-3-ylidene)-7H-indol...,992,Inner Protein Regulator,Inner Protein,,Indirubin-3'-Aminooxy-Acetate Inhibits Glycoge...,No,"Chain A:ASN282,PHE285,GLU382,GLU572,ALA610,GLY..."


In [485]:
solve_error(pdb, None, auto_site_grouping=True, stringent_site_grouping=False)

1Z62 [[{'auth_asym_id': 'A', 'auth_comp_id': 'IAA', 'auth_seq_id': '992'}]]
SITES: [(<Site: 467>, {'label_asym_id': ['C', 'D']}), (<Site: 4700>, {'label_asym_id': ['E']})]


In [486]:
[s.info for s in PDB.get(PDB.entry_id == pdb).sites]

[{'modulator_info': [{'modulator': [{'label_asym_id': 'C'}],
    'label_entity_id': '3',
    'type': 'non-polymer',
    'pdbx_description': "({[(3E)-2'-OXO-2',7'-DIHYDRO-2,3'-BIINDOL-3(7H)-YLIDENE]AMINO}OXY)ACETIC ACID"},
   {'modulator': [{'label_asym_id': 'D'}],
    'label_entity_id': '3',
    'type': 'non-polymer',
    'pdbx_description': "({[(3E)-2'-OXO-2',7'-DIHYDRO-2,3'-BIINDOL-3(7H)-YLIDENE]AMINO}OXY)ACETIC ACID"}],
  'interacting_chains_info': [{'label_entity_id': '1',
    'interacting_chains': {'label_asym_id': ['A']},
    'polymer_type': 'polypeptide(L)',
    'Uniprot': ['P00489']}],
  'source': {'allosteric_database': [{'entry': [{'target_id': 'ASD01320000_3',
       'target_gene': 'PYGM',
       'organism': 'Oryctolagus cuniculus',
       'pdb_uniprot': 'P00489',
       'allosteric_pdb': '1Z62',
       'modulator_serial': 'ASD01320116;ASD01320116',
       'modulator_alias': 'IAA;IAA',
       'modulator_chain': 'A;A',
       'modulator_class': 'Lig;Lig',
       'modulator_fe

Similar as before.

<br>

In [487]:
pdb = "2d41"
get_error(pdb)

ENTRIES: 1 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
30,ASD03740000_2,,Hepatitis C virus,Q99AU2,2D41,ASD01418019,SNH,A,Lig,Inhibitor,"3-[(2,4-dimethylphenyl)sulfonylamino]-5-(5-eth...",1001,Inner Protein Regulator,Inner Protein,16828488,Non-nucleoside Inhibitors Binding to Hepatitis...,No,"Chain A:LEU419,ARG422,MET423,LEU474,HIS475,SER..."


In [488]:
solve_error(pdb, None, auto_site_grouping=True, stringent_site_grouping=False)

2D41 [[{'auth_asym_id': 'A', 'auth_comp_id': 'SNH', 'auth_seq_id': '1001'}]]
SITES: [(<Site: 4705>, {'label_asym_id': ['C', 'D']})]


There are two molecules of the modulator that bind in the same site, stacked.

<br>

In [489]:
pdb = "2dew"
get_error(pdb)

ENTRIES: 2 SITES: [(<Site: 481>, {'label_asym_id': ['D', 'E', 'G']})]
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
52,ASD11630000_1,,Homo sapiens,Q9UM07,2DEW,ASD11630001,CA,X,Ion,Activator,CALCIUM ION,903,Inner Protein Regulator,Inner Protein,16567635,Structural basis for histone N-terminal recogn...,No,2DEW


In [490]:
df.query(f"allosteric_pdb == '{pdb.upper()}'")

Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
2362,ASD11630000_1,,Homo sapiens,Q9UM07,2DEW,ASD11630001,CA,X,Ion,Activator,CALCIUM ION,903,Inner Protein Regulator,Inner Protein,16567635,Structural basis for histone N-terminal recogn...,No,2DEW
2363,ASD11630000_1,,Homo sapiens,Q9UM07,2DEW,ASD11630001;ASD11630001;ASD11630001,CA;CA;CA,X;X;X,Ion,Activator,CALCIUM ION;CALCIUM ION;CALCIUM ION,901;902;904,Inner Protein Regulator,Inner Protein,16567635,Structural basis for histone N-terminal recogn...,No,2DEW


In [491]:
solve_error(pdb, None, auto_site_grouping=True, stringent_site_grouping=False)

2DEW [[{'auth_asym_id': 'X', 'auth_comp_id': 'CA', 'auth_seq_id': '903'}]]
2dew ['combine_sites failed; len(all_sites) > (len(old_sites) + len(new_sites))']
SITES: [(<Site: 481>, {'label_asym_id': ['D', 'E', 'G']}), (<Site: 4709>, {'label_asym_id': ['F']})]


In [492]:
[s.info for s in PDB.get(PDB.entry_id == pdb).sites]

[{'modulator_info': [{'modulator': [{'label_asym_id': 'D'}],
    'label_entity_id': '3',
    'type': 'non-polymer',
    'pdbx_description': 'CALCIUM ION'},
   {'modulator': [{'label_asym_id': 'E'}],
    'label_entity_id': '3',
    'type': 'non-polymer',
    'pdbx_description': 'CALCIUM ION'},
   {'modulator': [{'label_asym_id': 'G'}],
    'label_entity_id': '3',
    'type': 'non-polymer',
    'pdbx_description': 'CALCIUM ION'}],
  'interacting_chains_info': [{'label_entity_id': '1',
    'interacting_chains': {'label_asym_id': ['A']},
    'polymer_type': 'polypeptide(L)',
    'Uniprot': ['Q9UM07']}],
  'source': {'allosteric_database': [{'entry': [{'target_id': 'ASD11630000_1',
       'target_gene': None,
       'organism': 'Homo sapiens',
       'pdb_uniprot': 'Q9UM07',
       'allosteric_pdb': '2DEW',
       'modulator_serial': 'ASD11630001;ASD11630001;ASD11630001',
       'modulator_alias': 'CA;CA;CA',
       'modulator_chain': 'X;X;X',
       'modulator_class': 'Ion',
       'modula

Similar as before.

<br>

In [493]:
pdb = "2dex"
get_error(pdb)

ENTRIES: 2 SITES: [(<Site: 482>, {'label_asym_id': ['D', 'E', 'G']})]
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
52,ASD11630000_1,,Homo sapiens,Q9UM07,2DEX,ASD11630001,CA,X,Ion,Activator,CALCIUM ION,903,Inner Protein Regulator,Inner Protein,16567635,Structural basis for histone N-terminal recogn...,No,2DEX


In [494]:
df.query(f"allosteric_pdb == '{pdb.upper()}'")

Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
2364,ASD11630000_1,,Homo sapiens,Q9UM07,2DEX,ASD11630001,CA,X,Ion,Activator,CALCIUM ION,903,Inner Protein Regulator,Inner Protein,16567635,Structural basis for histone N-terminal recogn...,No,2DEX
2365,ASD11630000_1,,Homo sapiens,Q9UM07,2DEX,ASD11630001;ASD11630001;ASD11630001,CA;CA;CA,X;X;X,Ion,Activator,CALCIUM ION;CALCIUM ION;CALCIUM ION,901;902;904,Inner Protein Regulator,Inner Protein,16567635,Structural basis for histone N-terminal recogn...,No,2DEX


In [495]:
solve_error(pdb, None, auto_site_grouping=True, stringent_site_grouping=False)

2DEX [[{'auth_asym_id': 'X', 'auth_comp_id': 'CA', 'auth_seq_id': '903'}]]
2dex ['combine_sites failed; len(all_sites) > (len(old_sites) + len(new_sites))']
SITES: [(<Site: 482>, {'label_asym_id': ['D', 'E', 'G']}), (<Site: 4713>, {'label_asym_id': ['F']})]


In [496]:
[s.info for s in PDB.get(PDB.entry_id == pdb).sites]

[{'modulator_info': [{'modulator': [{'label_asym_id': 'D'}],
    'label_entity_id': '3',
    'type': 'non-polymer',
    'pdbx_description': 'CALCIUM ION'},
   {'modulator': [{'label_asym_id': 'E'}],
    'label_entity_id': '3',
    'type': 'non-polymer',
    'pdbx_description': 'CALCIUM ION'},
   {'modulator': [{'label_asym_id': 'G'}],
    'label_entity_id': '3',
    'type': 'non-polymer',
    'pdbx_description': 'CALCIUM ION'}],
  'interacting_chains_info': [{'label_entity_id': '1',
    'interacting_chains': {'label_asym_id': ['A']},
    'polymer_type': 'polypeptide(L)',
    'Uniprot': ['Q9UM07']}],
  'source': {'allosteric_database': [{'entry': [{'target_id': 'ASD11630000_1',
       'target_gene': None,
       'organism': 'Homo sapiens',
       'pdb_uniprot': 'Q9UM07',
       'allosteric_pdb': '2DEX',
       'modulator_serial': 'ASD11630001;ASD11630001;ASD11630001',
       'modulator_alias': 'CA;CA;CA',
       'modulator_chain': 'X;X;X',
       'modulator_class': 'Ion',
       'modula

Similar as before.

<br>

In [497]:
pdb = "2dw5"
get_error(pdb)

ENTRIES: 2 SITES: [(<Site: 483>, {'label_asym_id': ['C', 'D', 'F']})]
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
52,ASD11630000_1,,Homo sapiens,Q9UM07,2DW5,ASD11630001,CA,A,Ion,Activator,CALCIUM ION,903,Inner Protein Regulator,Inner Protein,17002273,Inhibitors and Inactivators of Protein Arginin...,No,2DW5


In [498]:
df.query(f"allosteric_pdb == '{pdb.upper()}'")

Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
2368,ASD11630000_1,,Homo sapiens,Q9UM07,2DW5,ASD11630001,CA,A,Ion,Activator,CALCIUM ION,903,Inner Protein Regulator,Inner Protein,17002273,Inhibitors and Inactivators of Protein Arginin...,No,2DW5
2369,ASD11630000_1,,Homo sapiens,Q9UM07,2DW5,ASD11630001;ASD11630001;ASD11630001,CA;CA;CA,A;A;A,Ion,Activator,CALCIUM ION;CALCIUM ION;CALCIUM ION,901;902;904,Inner Protein Regulator,Inner Protein,17002273,Inhibitors and Inactivators of Protein Arginin...,No,2DW5


In [499]:
solve_error(pdb, None, auto_site_grouping=True, stringent_site_grouping=False)

2DW5 [[{'auth_asym_id': 'A', 'auth_comp_id': 'CA', 'auth_seq_id': '903'}]]
2dw5 ['combine_sites failed; len(all_sites) > (len(old_sites) + len(new_sites))']
SITES: [(<Site: 483>, {'label_asym_id': ['C', 'D', 'F']}), (<Site: 4717>, {'label_asym_id': ['E']})]


In [500]:
[s.info for s in PDB.get(PDB.entry_id == pdb).sites]

[{'modulator_info': [{'modulator': [{'label_asym_id': 'C'}],
    'label_entity_id': '2',
    'type': 'non-polymer',
    'pdbx_description': 'CALCIUM ION'},
   {'modulator': [{'label_asym_id': 'D'}],
    'label_entity_id': '2',
    'type': 'non-polymer',
    'pdbx_description': 'CALCIUM ION'},
   {'modulator': [{'label_asym_id': 'F'}],
    'label_entity_id': '2',
    'type': 'non-polymer',
    'pdbx_description': 'CALCIUM ION'}],
  'interacting_chains_info': [{'label_entity_id': '1',
    'interacting_chains': {'label_asym_id': ['A']},
    'polymer_type': 'polypeptide(L)',
    'Uniprot': ['Q9UM07']}],
  'source': {'allosteric_database': [{'entry': [{'target_id': 'ASD11630000_1',
       'target_gene': None,
       'organism': 'Homo sapiens',
       'pdb_uniprot': 'Q9UM07',
       'allosteric_pdb': '2DW5',
       'modulator_serial': 'ASD11630001;ASD11630001;ASD11630001',
       'modulator_alias': 'CA;CA;CA',
       'modulator_chain': 'A;A;A',
       'modulator_class': 'Ion',
       'modula

Similar as before.

<br>

In [501]:
pdb = "2fsz"
get_error(pdb)

ENTRIES: 1 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
8,ASD00680000_1,ESR2,Homo sapiens,Q92731,2FSZ,ASD00680003,OHT,B,Lig,Inhibitor,4-[(Z)-1-[4-(2-dimethylaminoethoxy)phenyl]-2-p...,104,Protein-Protein Interaction Regulator,Protein-Protein Interaction,16782818,A second binding site for hydroxytamoxifen wit...,Yes,2FSZ


In [502]:
solve_error(pdb, [
    [{"label_asym_id": "C"}],
    [{"label_asym_id": "D"}],
    [{"label_asym_id": "E"}],
    [{"label_asym_id": "F"}]
], auto_site_grouping=False, stringent_site_grouping=True)

2FSZ [[{'label_asym_id': 'C'}], [{'label_asym_id': 'D'}], [{'label_asym_id': 'E'}], [{'label_asym_id': 'F'}]]
SITES: [(<Site: 4718>, {'label_asym_id': ['C']}), (<Site: 4719>, {'label_asym_id': ['D']})]


In [503]:
[(s, s.modulator, s.related_sites) for s in PDB.get(PDB.entry_id == pdb).sites]

[(<Site: 4718>,
  {'label_asym_id': ['C']},
  {'equivalent': [{'other_site': {'label_asym_id': ['E']},
     'res_of_other_in_site': 1.0,
     'res_of_site_in_other': 1.0}],
   'nonequivalent': [{'other_site': {'label_asym_id': ['D']},
     'res_of_other_in_site': 0.21428571428571427,
     'res_of_site_in_other': 0.10714285714285714},
    {'other_site': {'label_asym_id': ['F']},
     'res_of_other_in_site': 0.15384615384615385,
     'res_of_site_in_other': 0.07142857142857142}]}),
 (<Site: 4719>,
  {'label_asym_id': ['D']},
  {'equivalent': [{'other_site': {'label_asym_id': ['F']},
     'res_of_other_in_site': 1.0,
     'res_of_site_in_other': 0.9285714285714286}],
   'nonequivalent': [{'other_site': {'label_asym_id': ['C']},
     'res_of_other_in_site': 0.10714285714285714,
     'res_of_site_in_other': 0.21428571428571427},
    {'other_site': {'label_asym_id': ['E']},
     'res_of_other_in_site': 0.10714285714285714,
     'res_of_site_in_other': 0.21428571428571427}]})]

In [504]:
[s.delete_instance() for s in PDB.get(PDB.entry_id == pdb).sites if s.modulator != {'label_asym_id': ['D']}]

[1]

Similar as previous cases, the allosteric modulator molecule binds close to a molecule bound in the orthosteric site. The annotated molecule of the dataset entry corresponds to `label_asym_id` F.

<br>

In [505]:
pdb = "2gq1"
get_error(pdb)

ENTRIES: 1 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
12,ASD01120000_7,fbp,Escherichia coli,P0A993,2GQ1,ASD02880002,SO4,A,Ion,Activator,sulfate,344,Inner Protein Regulator,Protein-Protein Interaction,16670087,Novel Allosteric Activation Site in Escherichi...,No,"Chain A:THR3,LEU4,GLY5,LYS30"


In [506]:
solve_error(pdb, None, auto_site_grouping=True, stringent_site_grouping=False)

2GQ1 [[{'auth_asym_id': 'A', 'auth_comp_id': 'SO4', 'auth_seq_id': '344'}]]
SITES: [(<Site: 4722>, {'label_asym_id': ['D']})]


Only one sulfate is annotated that binds by itself, and the error arises from the grouping of other molecules of the sulfate.

<br>

In [507]:
pdb = "2ha4"
get_error(pdb)

ENTRIES: 1 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
3,ASD00230000_2,Ache,Mus musculus,P21836,2HA4,ASD00230019,ACH,A,Lig,Regulator,2-acetyloxyethyl-trimethyl-azanium,545,Inner Protein Regulator,Inner Protein,16837465,Substrate and product trafficking through the ...,Yes,"Chain A:TYR72,TYR124,TRP286,PHE297,PHE338,TYR341"


In [508]:
solve_error(pdb, [
    [{"label_asym_id": "D"}],
    [{"label_asym_id": "E"}],
    [{"label_asym_id": "H"}],
    [{"label_asym_id": "I"}]
], auto_site_grouping=False, stringent_site_grouping=True)

2HA4 [[{'label_asym_id': 'D'}], [{'label_asym_id': 'E'}], [{'label_asym_id': 'H'}], [{'label_asym_id': 'I'}]]
SITES: [(<Site: 4723>, {'label_asym_id': ['D']}), (<Site: 4724>, {'label_asym_id': ['E']})]


In [509]:
[s.delete_instance() for s in PDB.get(PDB.entry_id == pdb).sites if s.modulator != {'label_asym_id': ['D']}]

[1]

The annotated molecule is indeed the allosteric modulator (confirmed by primary citation) and thus must be annotated manually.

<br>

In [510]:
pdb = "2wrm"
get_error(pdb)

ENTRIES: 1 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
27,ASD03740000_2,,Hepatitis C virus,P26663,2WRM,ASD01410467,QQ3,A,Lig,Inhibitor,"(3R)-3-(4-methyl-1,3-dioxo-pyrrolo[3,4-c]quino...",1532,Inner Protein Regulator,Inner Protein,,Identification of Novel Allosteric Inhibitors ...,No,"Chain A:ARG422,MET423,TYR477,LEU497,ARG498,ARG..."


In [511]:
solve_error(pdb, None, auto_site_grouping=True, stringent_site_grouping=False)

2WRM [[{'auth_asym_id': 'A', 'auth_comp_id': 'QQ3', 'auth_seq_id': '1532'}]]
SITES: [(<Site: 4724>, {'label_asym_id': ['B']})]


The annotated molecule binds by itself and the error is probably due to other two molecules of the modulator that indeed bind together, stacked.

<br>

In [512]:
pdb = "3ao1"
get_error(pdb)

ENTRIES: 1 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
33,ASD05060000_1,pol,Human immunodeficiency virus type 1,Q72498,3AO1,ASD05062025,BZX,A,Lig,Inhibitor,"1,3-benzodioxol-5-ol",213,Protein-Protein Interaction Regulator,Protein-Protein Interaction,21275048,Fragment-based design of ligands targeting a n...,No,"Chain A:TYR83,ASN184,HIS185,GLY197,VAL201; Cha..."


In [513]:
solve_error(pdb, [
    [{"label_asym_id": "F"}],
    [{"label_asym_id": "G"}],
    [{"label_asym_id": "J"}]
], auto_site_grouping=False, stringent_site_grouping=True)

3AO1 [[{'label_asym_id': 'F'}], [{'label_asym_id': 'G'}], [{'label_asym_id': 'J'}]]
SITES: [(<Site: 4725>, {'label_asym_id': ['F']}), (<Site: 4726>, {'label_asym_id': ['G']})]


In [514]:
[s.delete_instance() for s in PDB.get(PDB.entry_id == pdb).sites if s.modulator != {'label_asym_id': ['F']}]

[1]

The annotated molecule binds close to an additional molecule of the modulator, which in the primary citation is identified as an artifact of crystal contacts.

<br>

In [515]:
pdb = "3cqd"
get_error(pdb)

ENTRIES: 1 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
21,ASD02220000_1,pfkB,Escherichia coli,P06999,3CQD,ASD00150001,ATP,B,Lig,Inhibitor,"[[(2R,3S,4R,5R)-5-(6-aminopurin-9-yl)-3,4-dihy...",312,Inner Protein Regulator,Protein-Protein Interaction,18762190,Crystallographic structure of phosphofructokin...,Yes,"Chain A:ASN187,LYS189,GLY226,PRO227,THR251; Ch..."


In [516]:
solve_error(pdb, [
    [{"label_asym_id": "F"}],
    [{"label_asym_id": "G"}],
    [{"label_asym_id": "I"}],
    [{"label_asym_id": "J"}]
], auto_site_grouping=False, stringent_site_grouping=True)

3CQD [[{'label_asym_id': 'F'}], [{'label_asym_id': 'G'}], [{'label_asym_id': 'I'}], [{'label_asym_id': 'J'}]]
SITES: [(<Site: 4726>, {'label_asym_id': ['F']}), (<Site: 4727>, {'label_asym_id': ['G']})]


In [517]:
[(s, s.modulator, s.related_sites) for s in PDB.get(PDB.entry_id == pdb).sites]

[(<Site: 4726>,
  {'label_asym_id': ['F']},
  {'equivalent': [{'other_site': {'label_asym_id': ['I']},
     'res_of_other_in_site': 0.9736842105263158,
     'res_of_site_in_other': 1.0}],
   'nonequivalent': [{'other_site': {'label_asym_id': ['G']},
     'res_of_other_in_site': 0.625,
     'res_of_site_in_other': 0.2702702702702703},
    {'other_site': {'label_asym_id': ['J']},
     'res_of_other_in_site': 0.5882352941176471,
     'res_of_site_in_other': 0.2702702702702703}]}),
 (<Site: 4727>,
  {'label_asym_id': ['G']},
  {'equivalent': [{'other_site': {'label_asym_id': ['J']},
     'res_of_other_in_site': 0.9411764705882353,
     'res_of_site_in_other': 1.0}],
   'nonequivalent': [{'other_site': {'label_asym_id': ['F']},
     'res_of_other_in_site': 0.2702702702702703,
     'res_of_site_in_other': 0.625},
    {'other_site': {'label_asym_id': ['I']},
     'res_of_other_in_site': 0.2631578947368421,
     'res_of_site_in_other': 0.625}]})]

In [518]:
[s.delete_instance() for s in PDB.get(PDB.entry_id == pdb).sites if s.modulator != {'label_asym_id': ['G']}]

[1]

The annotated molecule binds close to an additional molecule of the modulator bound in the active site, as confirmed by the primary citation. The annotation in the dataset corresponds to `label_asym_id` J.

<br>

In [519]:
pdb = "3e3f"
get_error(pdb)

ENTRIES: 1 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
14,ASD01400000_1,can,Haemophilus influenzae,P45148,3E3F,ASD01400003,BCT,B,Ion,Inhibitor,hydrogen carbonate,231,Inner Protein Regulator,Inner Protein,20359198,Evidence for a bicarbonate,No,"Chain B:TRP39,SER45,ARG46,ALA47,ALA49,ARG64"


In [520]:
solve_error(pdb, None, auto_site_grouping=True, stringent_site_grouping=False)

3E3F [[{'auth_asym_id': 'B', 'auth_comp_id': 'BCT', 'auth_seq_id': '231'}]]
SITES: [(<Site: 4733>, {'label_asym_id': ['D', 'E', 'I']})]


There are three molecules of the modulator binding together in the interface of the two protein chains forming a unique site.

<br>

In [521]:
pdb = "3f3t"
get_error(pdb)

ENTRIES: 1 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
7,ASD00740000_1,SRC,Gallus gallus,P00523,3F3T,ASD00740003,1AU,A,Lig,Inhibitor,3-[2-(3-aminophenyl)-5-tert-butyl-pyrazol-3-yl...,1,Inner Protein Regulator,Inner Protein,19396179,A new screening assay for allosteric inhibitor...,Yes,"Chain A:ALA293,LYS295,GLU310,VAL313,MET314,LEU..."


In [522]:
solve_error(pdb, [
    [{"label_asym_id": "C"}],
    [{"label_asym_id": "D"}],
], auto_site_grouping=False, stringent_site_grouping=True)

3F3T [[{'label_asym_id': 'C'}], [{'label_asym_id': 'D'}]]
SITES: [(<Site: 4734>, {'label_asym_id': ['C']}), (<Site: 4735>, {'label_asym_id': ['D']})]


In [523]:
[s.delete_instance() for s in PDB.get(PDB.entry_id == pdb).sites if s.modulator != {'label_asym_id': ['C']}]

[1]

The annotated molecule is correct binds close to an additional molecule of the modulator bound in the active site, as confirmed by the primary citation (supp. info).

<br>

In [524]:
pdb = "3f3u"
get_error(pdb)

ENTRIES: 1 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
8,ASD00740000_1,SRC,Gallus gallus,P00523,3F3U,ASD00740002,1AW,A,Lig,Inhibitor,3-[2-(3-aminophenyl)-5-tert-butyl-pyrazol-3-yl...,1,Inner Protein Regulator,Inner Protein,19396179,A new screening assay for allosteric inhibitor...,Yes,"Chain A:GLU310,MET314,LEU317,LEU322,VAL323,THR..."


In [525]:
solve_error(pdb, [
    [{"label_asym_id": "C"}],
    [{"label_asym_id": "D"}],
    [{"label_asym_id": "E"}],
], auto_site_grouping=False, stringent_site_grouping=True)

3F3U [[{'label_asym_id': 'C'}], [{'label_asym_id': 'D'}], [{'label_asym_id': 'E'}]]
SITES: [(<Site: 4735>, {'label_asym_id': ['C']}), (<Site: 4736>, {'label_asym_id': ['D']})]


In [526]:
[s.delete_instance() for s in PDB.get(PDB.entry_id == pdb).sites if s.modulator != {'label_asym_id': ['C']}]

[1]

Same as before.

<br>

In [527]:
pdb = "3f48"
get_error(pdb)

ENTRIES: 1 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
34,ASD07990000_1,snf,Aquifex aeolicus,O67854,3F48,ASD07990006,BOG,A,Lig,Inhibitor,"(2R,3S,4S,5R,6R)-2-(hydroxymethyl)-6-octoxy-ox...",706,Inner Protein Regulator,Inner Protein,19074341,A competitive inhibitor traps LeuT in an open-...,Yes,"Chain A:LEU25,GLY26,TYR108,ILE111,PHE253,GLY31..."


In [528]:
solve_error(pdb, None, auto_site_grouping=True, stringent_site_grouping=False)

3F48 [[{'auth_asym_id': 'A', 'auth_comp_id': 'BOG', 'auth_seq_id': '706'}]]
SITES: [(<Site: 4741>, {'label_asym_id': ['G']})]


The annotated molecule binds by itself and the error is probably due to other molecules of the modulator that might bind together, establishing pi-interactions.

<br>

In [529]:
pdb = "3k8s"
get_error(pdb)

ENTRIES: 1 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
38,ASD09120000_1,PPARG,Homo sapiens,P37231,3K8S,ASD09120003,Z27,B,Lig,Activator,"2-chloro-N-[3-chloro-4-[(5-chloro-1,3-benzothi...",2,Inner Protein Regulator,Inner Protein,18263587,"T2384, a novel antidiabetic agent with unique ...",Yes,3K8S


In [530]:
solve_error(pdb, [
    [{"label_asym_id": "C"}],
    [{"label_asym_id": "D"}],
    [{"label_asym_id": "E"}],
], auto_site_grouping=False, stringent_site_grouping=True)

3K8S [[{'label_asym_id': 'C'}], [{'label_asym_id': 'D'}], [{'label_asym_id': 'E'}]]
SITES: [(<Site: 4743>, {'label_asym_id': ['D']}), (<Site: 4744>, {'label_asym_id': ['E']})]


In [531]:
[s.delete_instance() for s in PDB.get(PDB.entry_id == pdb).sites if s.modulator != {'label_asym_id': ['D']}]

[1]

The annotated molecule is discussed in the primary citation as a second binding site but no allostery is analyzed, however it will be retained.

<br>

In [532]:
pdb = "3lsf"
get_error(pdb)

ENTRIES: 2 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped', 'Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
8,ASD01230000_2,Gria2,Rattus norvegicus,P19491,3LSF,ASD04920002,PZI,E,Lig,Activator,2-(2-oxopyrrolidin-1-yl)ethanamide,800,Inner Protein Regulator,Protein-Protein Interaction,20163115,Piracetam Defines a New Binding Site for Allos...,No,"Chain B:PRO105,SER108,SER217,LYS218,GLY219; Ch..."
9,ASD01230000_2,Gria2,Rattus norvegicus,P19491,3LSF,ASD04920002,PZI,E,Lig,Activator,2-(2-oxopyrrolidin-1-yl)ethanamide,802,Inner Protein Regulator,Protein-Protein Interaction,20163115,Piracetam Defines a New Binding Site for Allos...,No,"Chain B:ASP248,ASN252; Chain E:ASP216,SER217"


In [533]:
df.query(f"allosteric_pdb == '{pdb.upper()}'")

Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
552,ASD01230000_2,Gria2,Rattus norvegicus,P19491,3LSF,ASD04920002,PZI,E,Lig,Activator,2-(2-oxopyrrolidin-1-yl)ethanamide,800,Inner Protein Regulator,Protein-Protein Interaction,20163115,Piracetam Defines a New Binding Site for Allos...,No,"Chain B:PRO105,SER108,SER217,LYS218,GLY219; Ch..."
553,ASD01230000_2,Gria2,Rattus norvegicus,P19491,3LSF,ASD04920002,PZI,E,Lig,Activator,2-(2-oxopyrrolidin-1-yl)ethanamide,802,Inner Protein Regulator,Protein-Protein Interaction,20163115,Piracetam Defines a New Binding Site for Allos...,No,"Chain B:ASP248,ASN252; Chain E:ASP216,SER217"


In [534]:
error_entries = [
    row for i, row in (
        pd.DF(error_entries)
        .merge(df.query(f"allosteric_pdb == '{pdb.upper()}' and modulator_resi == '802'"), how="outer", indicator=True)
        .query(f"_merge == 'left_only'").drop("_merge", axis=1)
        .iterrows()
    )
]

In [535]:
solve_error(pdb, None, auto_site_grouping=True, stringent_site_grouping=False)

3LSF [[{'auth_asym_id': 'E', 'auth_comp_id': 'PZI', 'auth_seq_id': '800'}]]
SITES: [(<Site: 4751>, {'label_asym_id': ['E', 'F', 'H', 'I']})]


In [536]:
error_entries.append(df.query(f"allosteric_pdb == '{pdb.upper()}' and modulator_resi == '802'").squeeze())

In [537]:
solve_error(pdb, None, auto_site_grouping=True, stringent_site_grouping=False)

3LSF [[{'auth_asym_id': 'E', 'auth_comp_id': 'PZI', 'auth_seq_id': '802'}]]
SITES: [(<Site: 4767>, {'label_asym_id': ['E', 'F', 'H', 'I']})]


In [538]:
Site.get(Site.pdb == pdb).info

{'modulator_info': [{'modulator': [{'label_asym_id': 'E'}],
   'label_entity_id': '3',
   'type': 'non-polymer',
   'pdbx_description': '2-(2-oxopyrrolidin-1-yl)acetamide'},
  {'modulator': [{'label_asym_id': 'F'}],
   'label_entity_id': '3',
   'type': 'non-polymer',
   'pdbx_description': '2-(2-oxopyrrolidin-1-yl)acetamide'},
  {'modulator': [{'label_asym_id': 'H'}],
   'label_entity_id': '3',
   'type': 'non-polymer',
   'pdbx_description': '2-(2-oxopyrrolidin-1-yl)acetamide'},
  {'modulator': [{'label_asym_id': 'I'}],
   'label_entity_id': '3',
   'type': 'non-polymer',
   'pdbx_description': '2-(2-oxopyrrolidin-1-yl)acetamide'}],
 'interacting_chains_info': [{'label_entity_id': '1',
   'interacting_chains': {'label_asym_id': ['A', 'B']},
   'polymer_type': 'polypeptide(L)',
   'Uniprot': ['P19491']}],
 'source': {'allosteric_database': [{'entry': [{'target_id': 'ASD01230000_2',
      'target_gene': 'Gria2',
      'organism': 'Rattus norvegicus',
      'pdb_uniprot': 'P19491',
    

In [539]:
Site.get(Site.pdb == pdb).protein_residues.label_asym_id.unique(), Site.get(Site.pdb == pdb).nonredundant_site.protein_residues.label_asym_id.unique()

(array(['A', 'B'], dtype=object), array(['A'], dtype=object))

There are many molecules of the modulator binding together in the interface of the two protein chains forming a unique site.

<br>

In [540]:
pdb = "3lsl"
get_error(pdb)

ENTRIES: 2 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped', 'Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
8,ASD01230000_2,Gria2,Rattus norvegicus,P19491,3LSL,ASD04920002,PZI,A,Lig,Activator,2-(2-oxopyrrolidin-1-yl)ethanamide,801,Inner Protein Regulator,Protein-Protein Interaction,20163115,Piracetam Defines a New Binding Site for Allos...,No,"Chain A:ILE92,PRO105,MET107,SER108,SER217,LYS2..."
9,ASD01230000_2,Gria2,Rattus norvegicus,P19491,3LSL,ASD04920002,PZI,A,Lig,Activator,2-(2-oxopyrrolidin-1-yl)ethanamide,802,Inner Protein Regulator,Protein-Protein Interaction,20163115,Piracetam Defines a New Binding Site for Allos...,No,"Chain A:ASP248,ASN252; Chain D:ASP216,SER217"


In [541]:
df.query(f"allosteric_pdb == '{pdb.upper()}'")

Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
554,ASD01230000_2,Gria2,Rattus norvegicus,P19491,3LSL,ASD04920002,PZI,A,Lig,Activator,2-(2-oxopyrrolidin-1-yl)ethanamide,801,Inner Protein Regulator,Protein-Protein Interaction,20163115,Piracetam Defines a New Binding Site for Allos...,No,"Chain A:ILE92,PRO105,MET107,SER108,SER217,LYS2..."
555,ASD01230000_2,Gria2,Rattus norvegicus,P19491,3LSL,ASD04920002,PZI,A,Lig,Activator,2-(2-oxopyrrolidin-1-yl)ethanamide,802,Inner Protein Regulator,Protein-Protein Interaction,20163115,Piracetam Defines a New Binding Site for Allos...,No,"Chain A:ASP248,ASN252; Chain D:ASP216,SER217"


In [542]:
error_entries = [
    row for i, row in (
        pd.DF(error_entries)
        .merge(df.query(f"allosteric_pdb == '{pdb.upper()}' and modulator_resi == '802'"), how="outer", indicator=True)
        .query(f"_merge == 'left_only'").drop("_merge", axis=1)
        .iterrows()
    )
]

In [543]:
solve_error(pdb, None, auto_site_grouping=True, stringent_site_grouping=False)

3LSL [[{'auth_asym_id': 'A', 'auth_comp_id': 'PZI', 'auth_seq_id': '801'}]]
SITES: [(<Site: 4780>, {'label_asym_id': ['E', 'F', 'G', 'K', 'L', 'M']})]


In [544]:
error_entries.append(df.query(f"allosteric_pdb == '{pdb.upper()}' and modulator_resi == '802'").squeeze())

In [545]:
solve_error(pdb, None, auto_site_grouping=True, stringent_site_grouping=False)

3LSL [[{'auth_asym_id': 'A', 'auth_comp_id': 'PZI', 'auth_seq_id': '802'}]]
SITES: [(<Site: 4806>, {'label_asym_id': ['E', 'F', 'G', 'K', 'L', 'M']})]


In [546]:
Site.get(Site.pdb == pdb).info

{'modulator_info': [{'modulator': [{'label_asym_id': 'E'}],
   'label_entity_id': '3',
   'type': 'non-polymer',
   'pdbx_description': '2-(2-oxopyrrolidin-1-yl)acetamide'},
  {'modulator': [{'label_asym_id': 'F'}],
   'label_entity_id': '3',
   'type': 'non-polymer',
   'pdbx_description': '2-(2-oxopyrrolidin-1-yl)acetamide'},
  {'modulator': [{'label_asym_id': 'G'}],
   'label_entity_id': '3',
   'type': 'non-polymer',
   'pdbx_description': '2-(2-oxopyrrolidin-1-yl)acetamide'},
  {'modulator': [{'label_asym_id': 'K'}],
   'label_entity_id': '3',
   'type': 'non-polymer',
   'pdbx_description': '2-(2-oxopyrrolidin-1-yl)acetamide'},
  {'modulator': [{'label_asym_id': 'L'}],
   'label_entity_id': '3',
   'type': 'non-polymer',
   'pdbx_description': '2-(2-oxopyrrolidin-1-yl)acetamide'},
  {'modulator': [{'label_asym_id': 'M'}],
   'label_entity_id': '3',
   'type': 'non-polymer',
   'pdbx_description': '2-(2-oxopyrrolidin-1-yl)acetamide'}],
 'interacting_chains_info': [{'label_entity_i

In [547]:
Site.get(Site.pdb == pdb).protein_residues.label_asym_id.unique(), Site.get(Site.pdb == pdb).nonredundant_site.protein_residues.label_asym_id.unique()

(array(['A', 'B'], dtype=object), array(['A'], dtype=object))

Same as before.

<br>

In [548]:
pdb = "3mzh"
get_error(pdb)

ENTRIES: 1 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
6,ASD00480000_2,crp,Mycobacterium tuberculosis,P9WMH2,3MZH,ASD00480003,CMP,A,Lig,Activator,"(1R,4R,6S,7R,8R)-8-(6-aminopurin-9-yl)-4-hydro...",623,Protein-DNA/RNA Interaction Regulator,Protein-Protein Interaction,,Crystal Structure of Camp Receptor Protein fro...,No,"Chain A:PHE38,ILE57,LEU69,PHE78,GLY79,GLU80,LE..."


In [549]:
solve_error(pdb, [
    [{"label_asym_id": "E"}],
    [{"label_asym_id": "F"}],
    [{"label_asym_id": "G"}],
], auto_site_grouping=False, stringent_site_grouping=True)

3MZH [[{'label_asym_id': 'E'}], [{'label_asym_id': 'F'}], [{'label_asym_id': 'G'}]]
SITES: [(<Site: 4807>, {'label_asym_id': ['E']}), (<Site: 4808>, {'label_asym_id': ['F']})]


In [550]:
[s.delete_instance() for s in PDB.get(PDB.entry_id == pdb).sites if s.modulator != {'label_asym_id': ['E']}]

[1]

There is an additional molecule of the modulator that binds in a different site than the annotated one, but nor structure or the entry have a citation related to them, so the dataset annotation will be taken as correct.

<br>

In [551]:
pdb = "3njq"
get_error(pdb)

ENTRIES: 1 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
40,ASD12140000_1,,Human herpesvirus 8 type M,P88911,3NJQ,ASD04992001,NJQ,A,Lig,Inhibitor,4-[[6-(cyclohexylmethyl)pyridin-2-yl]carbonyla...,197,Protein-Protein Interaction Regulator,Inner Protein,21723875,Enzyme inhibition by allosteric capture of an ...,No,"Chain A:ILE44,PHE76,LEU79,ALA80,LEU83,ILE105,T..."


In [552]:
solve_error(pdb, None, auto_site_grouping=True, stringent_site_grouping=False)

3NJQ [[{'auth_asym_id': 'A', 'auth_comp_id': 'NJQ', 'auth_seq_id': '197'}]]
SITES: [(<Site: 4808>, {'label_asym_id': ['D']})]


The annotated molecule binds by itself and the error is probably due to the other molecule of the modulator that binds on the same site in another copy of the protein which is stacked with another molecule of the modulator, probably an artifact due to crystal contacts.

<br>

In [553]:
pdb = "3umo"
get_error(pdb)

ENTRIES: 1 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
13,ASD02220000_1,pfkB,Escherichia coli,P06999,3UMO,ASD00150001,ATP,B,Lig,Inhibitor,"[[(2R,3S,4R,5R)-5-(6-aminopurin-9-yl)-3,4-dihy...",312,Inner Protein Regulator,Protein-Protein Interaction,23823238,A Ribokinase Family Conserved Monovalent Catio...,Yes,"Chain A:ASN187,LYS189,GLY226,PRO227,THR251; Ch..."


In [554]:
solve_error(pdb, [
    [{"label_asym_id": "E"}],
    [{"label_asym_id": "H"}],
    [{"label_asym_id": "I"}],
    [{"label_asym_id": "K"}],
], auto_site_grouping=False, stringent_site_grouping=True)

3UMO [[{'label_asym_id': 'E'}], [{'label_asym_id': 'H'}], [{'label_asym_id': 'I'}], [{'label_asym_id': 'K'}]]
SITES: [(<Site: 4809>, {'label_asym_id': ['E']}), (<Site: 4810>, {'label_asym_id': ['H']})]


In [555]:
[(s, s.modulator, s.related_sites) for s in PDB.get(PDB.entry_id == pdb).sites]

[(<Site: 4809>,
  {'label_asym_id': ['E']},
  {'equivalent': [{'other_site': {'label_asym_id': ['K']},
     'res_of_other_in_site': 1.0,
     'res_of_site_in_other': 1.0}],
   'nonequivalent': [{'other_site': {'label_asym_id': ['H']},
     'res_of_other_in_site': 0.5882352941176471,
     'res_of_site_in_other': 0.2857142857142857},
    {'other_site': {'label_asym_id': ['I']},
     'res_of_other_in_site': 0.5555555555555556,
     'res_of_site_in_other': 0.2857142857142857}]}),
 (<Site: 4810>,
  {'label_asym_id': ['H']},
  {'equivalent': [{'other_site': {'label_asym_id': ['I']},
     'res_of_other_in_site': 0.9444444444444444,
     'res_of_site_in_other': 1.0}],
   'nonequivalent': [{'other_site': {'label_asym_id': ['E']},
     'res_of_other_in_site': 0.2857142857142857,
     'res_of_site_in_other': 0.5882352941176471},
    {'other_site': {'label_asym_id': ['K']},
     'res_of_other_in_site': 0.2857142857142857,
     'res_of_site_in_other': 0.5882352941176471}]})]

In [556]:
[s.delete_instance() for s in PDB.get(PDB.entry_id == pdb).sites if s.modulator != {'label_asym_id': ['H']}]

[1]

Similar to 3CQD. The `label_asym_id` of the annotated molecule is I. 

<br>

In [557]:
pdb = "3uqd"
get_error(pdb)

ENTRIES: 1 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
14,ASD02220000_1,pfkB,Escherichia coli,P06999,3UQD,,ATP,A,Lig,Inhibitor,ADENOSINE-5'-TRIPHOSPHATE,404,,,,Studying the phosphoryl transfer mechanism of ...,,


In [558]:
solve_error(pdb, [
    [{"label_asym_id": "E"}],
    [{"label_asym_id": "L"}],
    [{"label_asym_id": "M"}],
    [{"label_asym_id": "Q"}],
], auto_site_grouping=False, stringent_site_grouping=True)

3UQD [[{'label_asym_id': 'E'}], [{'label_asym_id': 'L'}], [{'label_asym_id': 'M'}], [{'label_asym_id': 'Q'}]]
SITES: [(<Site: 4811>, {'label_asym_id': ['E']}), (<Site: 4812>, {'label_asym_id': ['L']})]


In [559]:
[s.delete_instance() for s in PDB.get(PDB.entry_id == pdb).sites if s.modulator != {'label_asym_id': ['L']}]

[1]

Same as before. The annotated molecule is not found in the PDB but the corresponding allosteric molecule can be identified due to similarity with previous cases of the same protein.

<br>

In [560]:
pdb = "3zl6"
get_error(pdb)

ENTRIES: 1 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
54,ASD16900000_1,ispA,Pseudomonas aeruginosa (strain ATCC 15692 / DS...,Q9HWY4,3ZL6,ASD01670005,NVU,A,Lig,Inhibitor,"2-(1,2-benzoxazol-3-yl)ethanoic acid",1297,Allosteric function,Allosteric position,25760619,Structural Characterization of Substrate and I...,No,"Chain A:ILE290,ARG293,LEU286,PHE218,TYR289,LEU..."


In [561]:
solve_error(pdb, None, auto_site_grouping=True, stringent_site_grouping=False)

3ZL6 [[{'auth_asym_id': 'A', 'auth_comp_id': 'NVU', 'auth_seq_id': '1297'}]]
SITES: [(<Site: 4815>, {'label_asym_id': ['E', 'F']})]


According to the primary citation, the two molecules of the modulator that bind together form the allosteric site and thus should be annotated together.

<br>

In [562]:
pdb = "4clz"
get_error(pdb)

ENTRIES: 1 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
34,ASD11940000_1,,Homo sapiens,Q96PN6,4CLZ,ASD01100001,ACT,A,Lig,Activator,ACETATE ION,1469,Inner Protein Regulator,Inner Protein,24567411,Crystal Structures of Human Soluble Adenylyl C...,No,4CLZ


In [563]:
solve_error(pdb, None, auto_site_grouping=True, stringent_site_grouping=False)

4CLZ [[{'auth_asym_id': 'A', 'auth_comp_id': 'ACT', 'auth_seq_id': '1469'}]]
SITES: [(<Site: 4816>, {'label_asym_id': ['B']})]


In [564]:
Site.get(Site.pdb == pdb).related_sites

{'equivalent': [],
 'nonequivalent': [{'other_site': {'label_asym_id': ['C']},
   'res_of_other_in_site': 0.0,
   'res_of_site_in_other': 0.0},
  {'other_site': {'label_asym_id': ['D', 'E']},
   'res_of_other_in_site': 0.0,
   'res_of_site_in_other': 0.0}]}

The annotated molecule binds by itself and the error is probably due to other molecules of the modulator that might bind closer together elsewhere.

<br>

In [565]:
pdb = "4dkt"
get_error(pdb)

ENTRIES: 2 SITES: [(<Site: 570>, {'label_asym_id': ['D', 'E', 'G']})]
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
33,ASD11630000_1,,Homo sapiens,Q9UM07,4DKT,ASD11630001,CA,A,Ion,Activator,CALCIUM ION,704,Inner Protein Regulator,Inner Protein,22004374,Synthesis and Screening of a Haloacetamidine C...,No,4DKT


In [566]:
df.query(f"allosteric_pdb == '{pdb.upper()}'")

Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
2374,ASD11630000_1,,Homo sapiens,Q9UM07,4DKT,ASD11630001,CA,A,Ion,Activator,CALCIUM ION,704,Inner Protein Regulator,Inner Protein,22004374,Synthesis and Screening of a Haloacetamidine C...,No,4DKT
2375,ASD11630000_1,,Homo sapiens,Q9UM07,4DKT,ASD11630001;ASD11630001;ASD11630001,CA;CA;CA,A;A;A,Ion,Activator,CALCIUM ION;CALCIUM ION;CALCIUM ION,702;703;705,Inner Protein Regulator,Inner Protein,22004374,Synthesis and Screening of a Haloacetamidine C...,No,4DKT


In [567]:
solve_error(pdb, None, auto_site_grouping=True, stringent_site_grouping=False)

4DKT [[{'auth_asym_id': 'A', 'auth_comp_id': 'CA', 'auth_seq_id': '704'}]]
SITES: [(<Site: 570>, {'label_asym_id': ['D', 'E', 'G']}), (<Site: 4820>, {'label_asym_id': ['F']})]


In [568]:
[s.info for s in PDB.get(PDB.entry_id == pdb).sites]

[{'modulator_info': [{'modulator': [{'label_asym_id': 'D'}],
    'label_entity_id': '3',
    'type': 'non-polymer',
    'pdbx_description': 'CALCIUM ION'},
   {'modulator': [{'label_asym_id': 'E'}],
    'label_entity_id': '3',
    'type': 'non-polymer',
    'pdbx_description': 'CALCIUM ION'},
   {'modulator': [{'label_asym_id': 'G'}],
    'label_entity_id': '3',
    'type': 'non-polymer',
    'pdbx_description': 'CALCIUM ION'}],
  'interacting_chains_info': [{'label_entity_id': '1',
    'interacting_chains': {'label_asym_id': ['A']},
    'polymer_type': 'polypeptide(L)',
    'Uniprot': ['Q9UM07']}],
  'source': {'allosteric_database': [{'entry': [{'target_id': 'ASD11630000_1',
       'target_gene': None,
       'organism': 'Homo sapiens',
       'pdb_uniprot': 'Q9UM07',
       'allosteric_pdb': '4DKT',
       'modulator_serial': 'ASD11630001;ASD11630001;ASD11630001',
       'modulator_alias': 'CA;CA;CA',
       'modulator_chain': 'A;A;A',
       'modulator_class': 'Ion',
       'modula

Same as previous cases with Calciums and existing sites (e.g., 1wda).

<br>

In [569]:
pdb = "4eag"
get_error(pdb)

ENTRIES: 1 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
3,ASD00310000_1,Prkag1,Rattus norvegicus,P80385,4EAG,ASD00150001,ATP,C,Lig,Activator,"[[(2R,3S,4R,5R)-5-(6-aminopurin-9-yl)-3,4-dihy...",402,Protein-Protein Interaction Regulator,Inner Protein,22659875,AMP-activated protein kinase undergoes nucleot...,No,"Chain C:HIS150,HIS168,THR199,ASN202,ILE203,ALA..."


In [570]:
df.query(f"allosteric_pdb in {'4EAG, 4EAI, 4EAJ, 4EAK, 4EAL'.split(', ')}")

Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
75,ASD00310000_1,Prkag1,Rattus norvegicus,P80385,4EAG,ASD00150001,ATP,C,Lig,Activator,"[[(2R,3S,4R,5R)-5-(6-aminopurin-9-yl)-3,4-dihy...",402,Protein-Protein Interaction Regulator,Inner Protein,22659875.0,AMP-activated protein kinase undergoes nucleot...,No,"Chain C:HIS150,HIS168,THR199,ASN202,ILE203,ALA..."
76,ASD00310000_1,Prkag1,Rattus norvegicus,P80385,4EAJ,ASD00030001,AMP,C,Lig,Activator,"[(2R,3S,4R,5R)-5-(6-aminopurin-9-yl)-3,4-bis(o...",403,Protein-Protein Interaction Regulator,Inner Protein,,AMP-activated protein kinase undergoes nucleot...,No,"Chain C:HIS150,THR199,ASN202,ILE203,ALA204,VAL..."
77,ASD00310000_1,Prkag1,Rattus norvegicus,P80385,4EAK,ASD00150001,ATP,C,Lig,Activator,"[[(2R,3S,4R,5R)-5-(6-aminopurin-9-yl)-3,4-dihy...",402,Protein-Protein Interaction Regulator,Inner Protein,22659875.0,AMP-activated protein kinase undergoes nucleot...,No,"Chain C:HIS150,HIS168,THR199,ASN202,ILE203,ALA..."
78,ASD00310000_1,Prkag1,Rattus norvegicus,P80385,4EAL,ASD00030001,AMP,C,Lig,Activator,"[(2R,3S,4R,5R)-5-(6-aminopurin-9-yl)-3,4-bis(o...",402,Protein-Protein Interaction Regulator,Inner Protein,22659875.0,AMP-activated protein kinase undergoes nucleot...,No,"Chain C:HIS150,THR199,ILE203,ALA204,VAL224,SER..."


In [571]:
solve_error(pdb, None, auto_site_grouping=True, stringent_site_grouping=False)

4EAG [[{'auth_asym_id': 'C', 'auth_comp_id': 'ATP', 'auth_seq_id': '402'}]]
SITES: [(<Site: 4823>, {'label_asym_id': ['D', 'E']})]


In [572]:
pdb = "4eak"
get_error(pdb)

ENTRIES: 1 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
4,ASD00310000_1,Prkag1,Rattus norvegicus,P80385,4EAK,ASD00150001,ATP,C,Lig,Activator,"[[(2R,3S,4R,5R)-5-(6-aminopurin-9-yl)-3,4-dihy...",402,Protein-Protein Interaction Regulator,Inner Protein,22659875,AMP-activated protein kinase undergoes nucleot...,No,"Chain C:HIS150,HIS168,THR199,ASN202,ILE203,ALA..."


In [573]:
solve_error(pdb, None, auto_site_grouping=True, stringent_site_grouping=False)

4EAK [[{'auth_asym_id': 'C', 'auth_comp_id': 'ATP', 'auth_seq_id': '402'}]]
SITES: [(<Site: 4826>, {'label_asym_id': ['D', 'E']})]


In [574]:
[(s.pdb, s, s.modulator_residues[["auth_asym_id", "auth_comp_id", "auth_seq_id"]]) for p in '4EAG, 4EAI, 4EAJ, 4EAK, 4EAL'.split(', ') if PDB.get_or_none(PDB.entry_id == p.lower()) is not None for s in PDB.get(PDB.entry_id == p.lower()).sites]

[(<PDB: 4eag>,
  <Site: 4823>,
    auth_asym_id auth_comp_id auth_seq_id
  0            C          ATP         401
  1            C          ATP         402),
 (<PDB: 4eaj>,
  <Site: 873>,
    auth_asym_id auth_comp_id auth_seq_id
  0            C          AMP         403),
 (<PDB: 4eak>,
  <Site: 4826>,
    auth_asym_id auth_comp_id auth_seq_id
  0            C          ATP         401
  1            C          ATP         402),
 (<PDB: 4eal>,
  <Site: 875>,
    auth_asym_id auth_comp_id auth_seq_id
  0            C          AMP         402)]

In [575]:
pdb = "4eaj"
process_entry(
    df.query(f"allosteric_pdb == '{pdb.upper()}'").squeeze(),
    {pdb.upper(): {"pdb": pdb.upper(), "mods": [[{"auth_comp_id": "ATP"}]]}},
    auto_site_grouping=True,
    stringent_site_grouping=False
)

4EAJ [[{'auth_comp_id': 'ATP'}]]


In [576]:
pdb = "4eal"
Site.get(Site.pdb == pdb).delete_instance()
process_entry(
    df.query(f"allosteric_pdb == '{pdb.upper()}'").squeeze(),
    {pdb.upper(): {"pdb": pdb.upper(), "mods": [
        {"label_asym_id": ["D", "E"]}
    ]}},
    auto_site_grouping=False,
    stringent_site_grouping=True
)

4EAL [{'label_asym_id': ['D', 'E']}]


In [577]:
pdb = "4eai"
for p in set('4EAG, 4EAI, 4EAJ, 4EAK, 4EAL'.split(', ')) - set([pdb.upper()]):
    process_entry(
        df.query(f"allosteric_pdb == '{p}'").squeeze(),
        {p: {"pdb": pdb.upper(), "mods": [
        {"label_asym_id": ["D", "E", "F"]}
        ]}},
        auto_site_grouping=False,
        stringent_site_grouping=True
    )

4EAI [{'label_asym_id': ['D', 'E', 'F']}]
Downloading 4eai
4EAI [{'label_asym_id': ['D', 'E', 'F']}]
4EAI [{'label_asym_id': ['D', 'E', 'F']}]
4EAI [{'label_asym_id': ['D', 'E', 'F']}]


In [578]:
[(s.pdb, s, s.modulator_residues[["auth_asym_id", "auth_comp_id", "auth_seq_id"]]) for p in '4EAG, 4EAI, 4EAJ, 4EAK, 4EAL'.split(', ') if PDB.get_or_none(PDB.entry_id == p.lower()) is not None for s in PDB.get(PDB.entry_id == p.lower()).sites]

[(<PDB: 4eag>,
  <Site: 4823>,
    auth_asym_id auth_comp_id auth_seq_id
  0            C          ATP         401
  1            C          ATP         402),
 (<PDB: 4eai>,
  <Site: 4842>,
    auth_asym_id auth_comp_id auth_seq_id
  0            C          AMP         401
  1            C          AMP         402
  2            C          AMP         403),
 (<PDB: 4eaj>,
  <Site: 4834>,
    auth_asym_id auth_comp_id auth_seq_id
  0            C          ATP         401
  1            C          ATP         402
  2            C          AMP         403),
 (<PDB: 4eak>,
  <Site: 4826>,
    auth_asym_id auth_comp_id auth_seq_id
  0            C          ATP         401
  1            C          ATP         402),
 (<PDB: 4eal>,
  <Site: 4835>,
    auth_asym_id auth_comp_id auth_seq_id
  0            C          AMP         401
  1            C          AMP         402)]

All adenine-containing molecules are allosteric modulators of the gamma subunit of the AMPK protein studied in this PDB and all its related entries of the primary citation and thus should all be annotated.

<br>

In [579]:
pdb = "4gqq"
get_error(pdb)

ENTRIES: 4 SITES: [(<Site: 3293>, {'label_asym_id': ['F']})]
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped', 'Molecules of the annotated modulator(s) bind close together but were not grouped', 'Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
25,ASD08770000_1,AMY2A,Homo sapiens,P04746,4GQQ,ASD08770020,0XR,A,Lig,Inhibitor,"ethyl (2E)-3-(3,4-dihydroxyphenyl)prop-2- enoate",502,Inner Protein Regulator,Inner Protein,23050660,Order and disorder: differential structural im...,No,4GQQ
26,ASD08770000_1,AMY2A,Homo sapiens,P04746,4GQQ,ASD08770020,0XR,A,Lig,Inhibitor,"ethyl (2E)-3-(3,4-dihydroxyphenyl)prop-2- enoate",503,Inner Protein Regulator,Inner Protein,23050660,Order and disorder: differential structural im...,No,4GQQ
27,ASD08770000_1,AMY2A,Homo sapiens,P04746,4GQQ,ASD08770020,0XR,A,Lig,Inhibitor,"ethyl (2E)-3-(3,4-dihydroxyphenyl)prop-2- enoate",504,Inner Protein Regulator,Inner Protein,23050660,Order and disorder: differential structural im...,No,4GQQ


In [580]:
df.query(f"allosteric_pdb == '{pdb.upper()}'")

Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
2001,ASD08770000_1,AMY2A,Homo sapiens,P04746,4GQQ,ASD08770020,0XR,A,Lig,Inhibitor,"ethyl (2E)-3-(3,4-dihydroxyphenyl)prop-2- enoate",502,Inner Protein Regulator,Inner Protein,23050660,Order and disorder: differential structural im...,No,4GQQ
2002,ASD08770000_1,AMY2A,Homo sapiens,P04746,4GQQ,ASD08770020,0XR,A,Lig,Inhibitor,"ethyl (2E)-3-(3,4-dihydroxyphenyl)prop-2- enoate",503,Inner Protein Regulator,Inner Protein,23050660,Order and disorder: differential structural im...,No,4GQQ
2003,ASD08770000_1,AMY2A,Homo sapiens,P04746,4GQQ,ASD08770020,0XR,A,Lig,Inhibitor,"ethyl (2E)-3-(3,4-dihydroxyphenyl)prop-2- enoate",504,Inner Protein Regulator,Inner Protein,23050660,Order and disorder: differential structural im...,No,4GQQ
2004,ASD08770000_1,AMY2A,Homo sapiens,P04746,4GQQ,ASD01630003,CL,A,Ion,Activator,chloride,505,Inner Protein Regulator,Inner Protein,23050660,Order and disorder: differential structural im...,Yes,4GQQ


In [581]:
error_entries = [
    row for i, row in (
        pd.DF(error_entries)
        .merge(df.query(f"allosteric_pdb == '{pdb.upper()}' and modulator_resi in ['503', '504']"), how="outer", indicator=True)
        .query(f"_merge == 'left_only'").drop("_merge", axis=1)
        .iterrows()
    )
]

In [582]:
solve_error(pdb, None, auto_site_grouping=True, stringent_site_grouping=False)

4GQQ [[{'auth_asym_id': 'A', 'auth_comp_id': '0XR', 'auth_seq_id': '502'}]]
SITES: [(<Site: 3293>, {'label_asym_id': ['F']}), (<Site: 4843>, {'label_asym_id': ['C']})]


In [583]:
error_entries.append(df.query(f"allosteric_pdb == '{pdb.upper()}' and modulator_resi == '503'").squeeze())

In [584]:
solve_error(pdb, None, auto_site_grouping=True, stringent_site_grouping=False)

4GQQ [[{'auth_asym_id': 'A', 'auth_comp_id': '0XR', 'auth_seq_id': '503'}]]
SITES: [(<Site: 3293>, {'label_asym_id': ['F']}), (<Site: 4843>, {'label_asym_id': ['C']}), (<Site: 4847>, {'label_asym_id': ['D', 'E']})]


In [585]:
error_entries.append(df.query(f"allosteric_pdb == '{pdb.upper()}' and modulator_resi == '504'").squeeze())

In [586]:
solve_error(pdb, None, auto_site_grouping=True, stringent_site_grouping=False)

4GQQ [[{'auth_asym_id': 'A', 'auth_comp_id': '0XR', 'auth_seq_id': '504'}]]
SITES: [(<Site: 4852>, {'label_asym_id': ['F']}), (<Site: 4853>, {'label_asym_id': ['C']}), (<Site: 4856>, {'label_asym_id': ['D', 'E']})]


In [587]:
[s.info for s in PDB.get(PDB.entry_id == pdb).sites]

[{'modulator_info': [{'modulator': [{'label_asym_id': 'F'}],
    'label_entity_id': '4',
    'type': 'non-polymer',
    'pdbx_description': 'CHLORIDE ION'}],
  'interacting_chains_info': [{'label_entity_id': '1',
    'interacting_chains': {'label_asym_id': ['A']},
    'polymer_type': 'polypeptide(L)',
    'Uniprot': ['P04746']}],
  'source': {'allosteric_database': [{'entry': [{'target_id': 'ASD08770000_1',
       'target_gene': 'AMY2A',
       'organism': 'Homo sapiens',
       'pdb_uniprot': 'P04746',
       'allosteric_pdb': '4GQQ',
       'modulator_serial': 'ASD01630003',
       'modulator_alias': 'CL',
       'modulator_chain': 'A',
       'modulator_class': 'Ion',
       'modulator_feature': 'Activator',
       'modulator_name': 'chloride',
       'modulator_resi': '505',
       'function': 'Inner Protein Regulator',
       'position': 'Inner Protein',
       'pubmed_id': '23050660',
       'ref_title': 'Order and disorder: differential structural impacts of myricetin and ethyl 

Two of the annotated modulators bind close together and both are allosteric so should proceed with automatic grouping.

<br>

In [588]:
pdb = "4i0u"
get_error(pdb)

ENTRIES: 1 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
23,ASD07400000_1,,Thermotoga maritima,Q9WZ31,4I0U,ASD00330030,MG,A,Ion,Regulator,MAGNESIUM ION,401,Inner Protein Regulator,Inner Protein,23425532,Exploring the structure and function of Thermo...,No,"Chain D:ASP253,ASP179,ASP175,TYR171,HIS257"


In [589]:
solve_error(pdb, [
    {"label_asym_id": ["K", "FA"]},
    {"label_asym_id": ["N", "O"]},
    {"label_asym_id": ["P", "Y"]},
    {"label_asym_id": ["Z", "JA"]},
    {"label_asym_id": ["GA", "KA"]},
    {"label_asym_id": ["EA"]},
    {"label_asym_id": ["AB", "ZA"]},
    {"label_asym_id": ["PB", "QB"]},
    {"label_asym_id": ["MB", "LB"]},
    {"label_asym_id": ["KB", "TA"]},
    {"label_asym_id": ["SA", "YA"]},
    {"label_asym_id": ["OB"]}, {"label_asym_id": ["WA"]},
    {"label_asym_id": ["XA"]},
], auto_site_grouping=False, stringent_site_grouping=True)

4I0U [{'label_asym_id': ['K', 'FA']}, {'label_asym_id': ['N', 'O']}, {'label_asym_id': ['P', 'Y']}, {'label_asym_id': ['Z', 'JA']}, {'label_asym_id': ['GA', 'KA']}, {'label_asym_id': ['EA']}, {'label_asym_id': ['AB', 'ZA']}, {'label_asym_id': ['PB', 'QB']}, {'label_asym_id': ['MB', 'LB']}, {'label_asym_id': ['KB', 'TA']}, {'label_asym_id': ['SA', 'YA']}, {'label_asym_id': ['OB']}, {'label_asym_id': ['WA']}, {'label_asym_id': ['XA']}]
SITES: [(<Site: 4857>, {'label_asym_id': ['K', 'FA']}), (<Site: 4862>, {'label_asym_id': ['EA']}), (<Site: 4869>, {'label_asym_id': ['WA']}), (<Site: 4870>, {'label_asym_id': ['XA']})]


In [590]:
[s.delete_instance() for s in PDB.get(PDB.entry_id == pdb).sites if s.modulator != {'label_asym_id': ['K', 'FA']}]

[1, 1, 1]

In [591]:
[s.related_sites for s in PDB.get(PDB.entry_id == pdb).sites]

[{'equivalent': [{'other_site': {'label_asym_id': ['N', 'O']},
    'res_of_other_in_site': 0.875,
    'res_of_site_in_other': 0.9333333333333333},
   {'other_site': {'label_asym_id': ['P', 'Y']},
    'res_of_other_in_site': 0.8235294117647058,
    'res_of_site_in_other': 0.9333333333333333},
   {'other_site': {'label_asym_id': ['Z', 'JA']},
    'res_of_other_in_site': 0.9333333333333333,
    'res_of_site_in_other': 0.9333333333333333},
   {'other_site': {'label_asym_id': ['GA', 'KA']},
    'res_of_other_in_site': 0.8235294117647058,
    'res_of_site_in_other': 0.9333333333333333},
   {'other_site': {'label_asym_id': ['ZA', 'AB']},
    'res_of_other_in_site': 0.875,
    'res_of_site_in_other': 0.9333333333333333},
   {'other_site': {'label_asym_id': ['PB', 'QB']},
    'res_of_other_in_site': 0.8823529411764706,
    'res_of_site_in_other': 1.0},
   {'other_site': {'label_asym_id': ['LB', 'MB']},
    'res_of_other_in_site': 0.8235294117647058,
    'res_of_site_in_other': 0.933333333333333

According to literature (not only primary citation), the pairs of Mg ions that bind in the interfaces between the monomers of the transproters are all responsible for allosteric conformational regulation and thus should be annotated. However, the Mg pairs are slightly too far away from each other to be grouped automatically (6.3 Angstroms instead of 6.1) and groups will have to be provided manually. The annotated ion that will be part of a pair that must be the annotated one has `label_asym_id` K.

<br>

In [592]:
pdb = "4m0z"
get_error(pdb)

ENTRIES: 1 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
24,ASD08820000_2,ITK,Homo sapiens,Q08881,4M0Z,ASD08820001,M0Z,A,Lig,Inhibitor,4-(aminocarbonylamino)-1-(7-methoxynaphthalen-...,701,Inner Protein Regulator,Inner Protein,24593284,Selectively targeting an inactive conformation...,Yes,4M0Z


In [593]:
solve_error(pdb, [
    {"label_asym_id": ["B"]},
    {"label_asym_id": ["C"]},
], auto_site_grouping=False, stringent_site_grouping=True)

4M0Z [{'label_asym_id': ['B']}, {'label_asym_id': ['C']}]
SITES: [(<Site: 4858>, {'label_asym_id': ['B']}), (<Site: 4859>, {'label_asym_id': ['C']})]


In [594]:
[s.delete_instance() for s in PDB.get(PDB.entry_id == pdb).sites if s.modulator != {'label_asym_id': ['B']}]

[1]

The annotated molecule is correct but binds close together with another molecule of the modulator bound in the active site and thus must be annotated manually.

<br>

In [595]:
pdb = "4ni0"
get_error(pdb)

ENTRIES: 1 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
15,ASD03910000_1,HBA1,Homo sapiens,P69905,4NI0,ASD03910126,2P3,A,Lig,Activator,"5-[(2S)-2,3-dihydro-1,4-benzodioxin-2-yl]-2,4-...",203,Allosteric function,Allosteric position,25061917,Identification of a Small Molecule that Increa...,No,4NI0


In [596]:
solve_error(pdb, [
    {"label_asym_id": ["E"]},
    {"label_asym_id": ["M"]},
], auto_site_grouping=True, stringent_site_grouping=False)

4NI0 [{'label_asym_id': ['E']}, {'label_asym_id': ['M']}]
SITES: [(<Site: 4861>, {'label_asym_id': ['M']}), (<Site: 4863>, {'label_asym_id': ['E', 'F', 'N']})]


All of the molecules of the modulator that appear in the structure are allosterically modulating the affinity of the protein for its main substracte, according to the primary reference, and thus should all be annotated.

<br>

In [597]:
pdb = "4oyo"
get_error(pdb)

ENTRIES: 1 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
26,ASD11940000_1,,Homo sapiens,Q96PN6,4OYO,ASD11940024,1WC,A,Lig,Inhibitor,4-(2-chlorophenyl)-5-methyl-1H-pyrazole,502,Inner Protein Regulator,Inner Protein,24616449,Crystal structure of human soluble adenylate c...,No,4OYO


In [598]:
solve_error(pdb, [
    {"label_asym_id": ["B"]},
    {"label_asym_id": ["C"]},
    {"label_asym_id": ["D"]},
], auto_site_grouping=False, stringent_site_grouping=True)

4OYO [{'label_asym_id': ['B']}, {'label_asym_id': ['C']}, {'label_asym_id': ['D']}]
SITES: [(<Site: 4864>, {'label_asym_id': ['B']}), (<Site: 4865>, {'label_asym_id': ['C']}), (<Site: 4866>, {'label_asym_id': ['D']})]


In [599]:
[s.delete_instance() for s in PDB.get(PDB.entry_id == pdb).sites if s.modulator != {'label_asym_id': ['C']}]

[1, 1]

In [600]:
df.query(f"allosteric_pdb in {'4OYA, 4OYB, 4OYI, 4OYM, 4OYO, 4OYP, 4OYW, 4OYX, 4OYZ, 4OZ2, 4OZ3'.split(', ')}")

Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
2475,ASD11940000_1,,Homo sapiens,Q96PN6,4OYA,ASD11940004,1VE,A,Lig,Inhibitor,"(4-azanyl-1,2,5-oxadiazol-3-yl)-[3-(1H-benzimi...",501,Inner Protein Regulator,Inner Protein,24616449,Crystal structure of human soluble adenylate c...,No,4OYA
2476,ASD11940000_1,,Homo sapiens,Q96PN6,4OYB,ASD11940005,1VJ,A,Lig,Inhibitor,"ethyl 2-[3-[(4-azanyl-1,2,5-oxadiazol-3-yl)car...",502,Inner Protein Regulator,Inner Protein,24616449,Crystal structure of human soluble adenylate c...,No,4OYB
2477,ASD11940000_1,,Homo sapiens,Q96PN6,4OYI,ASD11940006,1VK,A,Lig,Inhibitor,"(4-azanyl-1,2,5-oxadiazol-3-yl)-phenyl-methanone",501,Inner Protein Regulator,Inner Protein,24616449,Crystal structure of human soluble adenylate c...,No,4OYI
2478,ASD11940000_1,,Homo sapiens,Q96PN6,4OYM,ASD11940007,1ZC,A,Lig,Inhibitor,"(4-azanyl-1,2,5-oxadiazol-3-yl)-(3-methoxyphen...",501,Inner Protein Regulator,Inner Protein,24616449,Crystal structure of human soluble adenylate c...,No,4OYM
2479,ASD11940000_1,,Homo sapiens,Q96PN6,4OYO,ASD11940024,1WC,A,Lig,Inhibitor,4-(2-chlorophenyl)-5-methyl-1H-pyrazole,502,Inner Protein Regulator,Inner Protein,24616449,Crystal structure of human soluble adenylate c...,No,4OYO
2480,ASD11940000_1,,Homo sapiens,Q96PN6,4OYP,ASD11940009,BZ2,A,Lig,Inhibitor,1-benzofuran-2-carboxylic acid,502,Inner Protein Regulator,Inner Protein,24616449,Crystal structure of human soluble adenylate c...,No,4OYP
2481,ASD11940000_1,,Homo sapiens,Q96PN6,4OYZ,ASD01400003,BCT,A,Lig,Activator,BICARBONATE ION,502,Inner Protein Regulator,Inner Protein,24616449,Crystal structure of human soluble adenylate c...,No,4OYZ
2482,ASD11940000_1,,Homo sapiens,Q96PN6,4OZ2,ASD11940026,1Z6,A,Lig,Inhibitor,4-(4-fluorophenyl)-3-methyl-1H-pyrazole,501,Inner Protein Regulator,Inner Protein,24616449,Crystal structure of human soluble adenylate c...,No,4OZ2
2483,ASD11940000_1,,Homo sapiens,Q96PN6,4OZ3,ASD11940011,1Z8,A,Lig,Inhibitor,4-phenyl-3-(trifluoromethyl)-1H-pyrazole,501,Inner Protein Regulator,Inner Protein,24616449,Crystal structure of human soluble adenylate c...,No,4OZ3


In [601]:
[(s.pdb, s, s.modulator_residues[["auth_asym_id", "auth_comp_id", "auth_seq_id"]]) for p in '4OYA, 4OYB, 4OYI, 4OYM, 4OYO, 4OYP, 4OYW, 4OYX, 4OYZ, 4OZ2, 4OZ3'.split(', ') if PDB.get_or_none(PDB.entry_id == p.lower()) is not None for s in PDB.get(PDB.entry_id == p.lower()).sites]

[(<PDB: 4oya>,
  <Site: 3822>,
    auth_asym_id auth_comp_id auth_seq_id
  0            A          1VE         501),
 (<PDB: 4oyb>,
  <Site: 3823>,
    auth_asym_id auth_comp_id auth_seq_id
  0            A          1VJ         502),
 (<PDB: 4oyi>,
  <Site: 3824>,
    auth_asym_id auth_comp_id auth_seq_id
  0            A          1VK         501),
 (<PDB: 4oym>,
  <Site: 3825>,
    auth_asym_id auth_comp_id auth_seq_id
  0            A          1ZC         501),
 (<PDB: 4oyo>,
  <Site: 4865>,
    auth_asym_id auth_comp_id auth_seq_id
  0            A          1WC         502),
 (<PDB: 4oyp>,
  <Site: 3826>,
    auth_asym_id auth_comp_id auth_seq_id
  0            A          BZ2         502),
 (<PDB: 4oyz>,
  <Site: 3827>,
    auth_asym_id auth_comp_id auth_seq_id
  0            A          BCT         502),
 (<PDB: 4oz2>,
  <Site: 3828>,
    auth_asym_id auth_comp_id auth_seq_id
  0            A          1Z6         501),
 (<PDB: 4oz3>,
  <Site: 3829>,
    auth_asym_id auth_comp_id aut

According to the primary citation, only the bicarbonate binding site (4OYZ) is the allosteric site, and thus the molecules overlapping it should be the ones annotated among all the PDBs related to the primary citation, which already seems to be the case.

<br>

In [602]:
pdb = "4p2t"
get_error(pdb)

ENTRIES: 1 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
26,ASD12130000_1,ORF17,Human herpesvirus 8 type P,Q2HRB6,4P2T,ASD12130001,24Q,A,Lig,Inhibitor,4-[[6-(cyclohexylmethyl)pyridin-2-yl]carbonyla...,201,Protein-Protein Interaction Regulator,Inner Protein,24977643,Broad-spectrum allosteric inhibition of herpes...,No,"Chain A:ILE44,PHE76,LEU79,ALA80,LEU83,ALA90,IL..."


In [603]:
solve_error(pdb, [
    {"label_asym_id": ["C"]},
    {"label_asym_id": ["E"]},
    {"label_asym_id": ["F"]},
], auto_site_grouping=False, stringent_site_grouping=True)

4P2T [{'label_asym_id': ['C']}, {'label_asym_id': ['E']}, {'label_asym_id': ['F']}]
SITES: [(<Site: 4866>, {'label_asym_id': ['C']}), (<Site: 4867>, {'label_asym_id': ['E']})]


In [604]:
[(s, s.modulator, s.related_sites) for s in PDB.get(PDB.entry_id == pdb).sites]

[(<Site: 4866>,
  {'label_asym_id': ['C']},
  {'equivalent': [{'other_site': {'label_asym_id': ['F']},
     'res_of_other_in_site': 0.8235294117647058,
     'res_of_site_in_other': 0.4827586206896552}],
   'nonequivalent': [{'other_site': {'label_asym_id': ['E']},
     'res_of_other_in_site': 0.7,
     'res_of_site_in_other': 0.7241379310344828}]}),
 (<Site: 4867>,
  {'label_asym_id': ['E']},
  {'equivalent': [],
   'nonequivalent': [{'other_site': {'label_asym_id': ['C']},
     'res_of_other_in_site': 0.7241379310344828,
     'res_of_site_in_other': 0.7},
    {'other_site': {'label_asym_id': ['F']},
     'res_of_other_in_site': 0.5294117647058824,
     'res_of_site_in_other': 0.3}]})]

In [605]:
[s.delete_instance() for s in PDB.get(PDB.entry_id == pdb).sites if s.modulator != {'label_asym_id': ['E']}]

[1]

According to the primary citation, the most probably binding mode on the modulator in solution (and not in the crystal lattice) is the buried one on chain B (hypothesis), while the exposed modulator molecule binding close to it on the same protein chain is termed "bridging" as linking between symmetry mates. Surprisingly, the binding sites of the "buried"-A and the "bridging" have more in common than the two "buried", so only the proposed "buried"-B site is going to be left as annotation.

<br>

In [606]:
pdb = "4p3h"
get_error(pdb)

ENTRIES: 1 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
17,ASD04990000_1,,Human herpesvirus 8,O36607,4P3H,ASD04990001,25G,A,Lig,Inhibitor,4-[[6-(cyclohexylmethyl)pyridin-2-yl]carbonyla...,201,Protein-Protein Interaction Regulator,Inner Protein,24977643,Broad-spectrum allosteric inhibition of herpes...,No,"Chain A:ILE44,PHE76,LEU79,ALA80,LEU83,ALA90,IL..."


In [607]:
solve_error(pdb, [
    {"label_asym_id": ["C"]},
    {"label_asym_id": ["G"]},
    {"label_asym_id": ["H"]},
], auto_site_grouping=False, stringent_site_grouping=True)

4P3H [{'label_asym_id': ['C']}, {'label_asym_id': ['G']}, {'label_asym_id': ['H']}]
SITES: [(<Site: 4869>, {'label_asym_id': ['G']}), (<Site: 4870>, {'label_asym_id': ['H']})]


In [608]:
[(s, s.modulator, s.related_sites) for s in PDB.get(PDB.entry_id == pdb).sites]

[(<Site: 4869>,
  {'label_asym_id': ['G']},
  {'equivalent': [{'other_site': {'label_asym_id': ['C']},
     'res_of_other_in_site': 0.8076923076923077,
     'res_of_site_in_other': 0.6774193548387096}],
   'nonequivalent': [{'other_site': {'label_asym_id': ['H']},
     'res_of_other_in_site': 0.5294117647058824,
     'res_of_site_in_other': 0.2903225806451613}]}),
 (<Site: 4870>,
  {'label_asym_id': ['H']},
  {'equivalent': [],
   'nonequivalent': [{'other_site': {'label_asym_id': ['C']},
     'res_of_other_in_site': 0.4230769230769231,
     'res_of_site_in_other': 0.6470588235294118},
    {'other_site': {'label_asym_id': ['G']},
     'res_of_other_in_site': 0.2903225806451613,
     'res_of_site_in_other': 0.5294117647058824}]})]

In [609]:
[s.delete_instance() for s in PDB.get(PDB.entry_id == pdb).sites if s.modulator != {'label_asym_id': ['G']}]

[1]

Same as before (4P2T), but now the two bound molecules in each protein chain are recognized as being in an equivalent site (and the "bridging" will be eliminated).

<br>

In [610]:
pdb = "4ple"
get_error(pdb)

ENTRIES: 1 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
42,ASD17180000_1,NR5A2,Homo sapiens,O00482,4PLE,ASD17180001,CPS,A,Lig,activator,3-[(3-CHOLAMIDOPROPYL)DIMETHYLAMMONIO]-1-PROPA...,603,,,26553876,Unexpected Allosteric Network Contributes to L...,,4PLE


In [611]:
errors[pdb] = "Wrong annotation"

In [612]:
df.query("allosteric_pdb == '4PLD'")

Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue


The primary citation doesn't discuss the allosteric role of the annotated molecule, and in fact it looks like the allosteric modulator is the phospholipid, and thus this annotation is wrong.

<br>

In [613]:
pdb = "4qfy"
get_error(pdb)

ENTRIES: 1 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
23,ASD09690000_1,,Homo sapiens,Q9Y3Z3,4QFY,ASD09690001,DGT,A,Lig,Activator,2'-DEOXYGUANOSINE-5'-TRIPHOSPHATE,701,Inner Protein Regulator,Protein-Protein Interaction,25288794,Structural Basis of Allosteric Activation of S...,No,4QFY


In [614]:
solve_error(pdb, [
    {"label_asym_id": ["E", "S", "O"]},
], auto_site_grouping=True, stringent_site_grouping=False)

4QFY [{'label_asym_id': ['E', 'S', 'O']}]
SITES: [(<Site: 4886>, {'label_asym_id': ['E', 'O', 'S']})]


In [615]:
df.query(f"allosteric_pdb in {'4QFX, 4QFY, 4QFZ, 4QG0, 4QG1, 4QG2, 4QG4'.split(', ')}")

Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
2098,ASD09690000_1,,Homo sapiens,Q9Y3Z3,4QFX,ASD09690001,DGT,A,Lig,Activator,2'-DEOXYGUANOSINE-5'-TRIPHOSPHATE,701.0,Inner Protein Regulator,Protein-Protein Interaction,25288794,Structural Basis of Allosteric Activation of S...,No,4QFX
2099,ASD09690000_1,,Homo sapiens,Q9Y3Z3,4QFY,ASD09690001,DGT,A,Lig,Activator,2'-DEOXYGUANOSINE-5'-TRIPHOSPHATE,701.0,Inner Protein Regulator,Protein-Protein Interaction,25288794,Structural Basis of Allosteric Activation of S...,No,4QFY
2100,ASD09690000_1,,Homo sapiens,Q9Y3Z3,4QFZ,ASD09690001,DGT,A,Lig,Activator,2'-DEOXYGUANOSINE-5'-TRIPHOSPHATE,701.0,Inner Protein Regulator,Protein-Protein Interaction,25288794,Structural Basis of Allosteric Activation of S...,No,4QFZ
2101,ASD09690000_1,,Homo sapiens,Q9Y3Z3,4QG0,ASD09690001,DGT,A,Lig,Activator,2'-DEOXYGUANOSINE-5'-TRIPHOSPHATE,701.0,Inner Protein Regulator,Protein-Protein Interaction,25288794,Structural Basis of Allosteric Activation of S...,No,4QG0
2102,ASD09690000_1,SAMHD1,Homo sapiens,Q9Y3Z3,4QG1,,GTP,,,Activator,GUANOSINE-5'-TRIPHOSPHATE,,,,25288794,Structural Basis of Allosteric Activation of S...,,4QG1
2103,ASD09690000_1,,Homo sapiens,Q9Y3Z3,4QG2,ASD00290006,GTP,A,Lig,Activator,GUANOSINE-5'-TRIPHOSPHATE,701.0,Inner Protein Regulator,Protein-Protein Interaction,25288794,Structural Basis of Allosteric Activation of S...,No,4QG2
2104,ASD09690000_1,,Homo sapiens,Q9Y3Z3,4QG4,ASD00290006,GTP,A,Lig,Activator,GUANOSINE-5'-TRIPHOSPHATE,701.0,Inner Protein Regulator,Protein-Protein Interaction,25288794,Structural Basis of Allosteric Activation of S...,No,4QG4


In [616]:
[(s.pdb, s, s.modulator_residues[["auth_asym_id", "auth_comp_id", "auth_seq_id"]]) for p in '4QFX, 4QFY, 4QFZ, 4QG0, 4QG1, 4QG2, 4QG4'.split(', ') if PDB.get_or_none(PDB.entry_id == p.lower()) is not None for s in PDB.get(PDB.entry_id == p.lower()).sites]

[(<PDB: 4qfx>,
  <Site: 3379>,
    auth_asym_id auth_comp_id auth_seq_id
  0            A          DGT         701),
 (<PDB: 4qfy>,
  <Site: 4886>,
    auth_asym_id auth_comp_id auth_seq_id
  0            A          DGT         701
  1            C          DGT         701
  2            C           MG         705),
 (<PDB: 4qfz>,
  <Site: 3380>,
    auth_asym_id auth_comp_id auth_seq_id
  0            A          DGT         701),
 (<PDB: 4qg0>,
  <Site: 3381>,
    auth_asym_id auth_comp_id auth_seq_id
  0            A          DGT         701),
 (<PDB: 4qg1>,
  <Site: 3382>,
    auth_asym_id auth_comp_id auth_seq_id
  0            B          GTP         701),
 (<PDB: 4qg2>,
  <Site: 3383>,
    auth_asym_id auth_comp_id auth_seq_id
  0            A          GTP         701),
 (<PDB: 4qg4>,
  <Site: 3384>,
    auth_asym_id auth_comp_id auth_seq_id
  0            A          GTP         701)]

In [617]:
pdb = "4qfx"
error_entries.append(df.query(f"allosteric_pdb == '{pdb.upper()}'").squeeze())
Site.get(Site.pdb == pdb).delete_instance()
solve_error(pdb, [
    {"label_asym_id": ["G", "I", "Q"]},
], auto_site_grouping=True, stringent_site_grouping=False)

4QFX [{'label_asym_id': ['G', 'I', 'Q']}]
SITES: [(<Site: 4907>, {'label_asym_id': ['E', 'P', 'S']})]


In [618]:
pdb = "4qfz"
error_entries.append(df.query(f"allosteric_pdb == '{pdb.upper()}'").squeeze())
Site.get(Site.pdb == pdb).delete_instance()
solve_error(pdb, [
    {"label_asym_id": ["G", "L", "P"]},
], auto_site_grouping=True, stringent_site_grouping=False)

4QFZ [{'label_asym_id': ['G', 'L', 'P']}]
SITES: [(<Site: 4929>, {'label_asym_id': ['E', 'N', 'Y']})]


In [619]:
pdb = "4qg0"
error_entries.append(df.query(f"allosteric_pdb == '{pdb.upper()}'").squeeze())
Site.get(Site.pdb == pdb).delete_instance()
solve_error(pdb, [
    {"label_asym_id": ["G", "H", "P"]},
], auto_site_grouping=True, stringent_site_grouping=False)

4QG0 [{'label_asym_id': ['G', 'H', 'P']}]
SITES: [(<Site: 4949>, {'label_asym_id': ['E', 'O', 'V']})]


In [620]:
pdb = "4qg1"
error_entries.append(df.query(f"allosteric_pdb == '{pdb.upper()}'").squeeze())
Site.get(Site.pdb == pdb).delete_instance()
solve_error(pdb, [
    {"label_asym_id": ["I", "O", "Q"]},
], auto_site_grouping=True, stringent_site_grouping=False)

4QG1 [{'label_asym_id': ['I', 'O', 'Q']}]
SITES: [(<Site: 4966>, {'label_asym_id': ['F', 'H', 'M']})]


In [621]:
pdb = "4qg2"
error_entries.append(df.query(f"allosteric_pdb == '{pdb.upper()}'").squeeze())
Site.get(Site.pdb == pdb).delete_instance()
solve_error(pdb, [
    {"label_asym_id": ["F", "H", "J"]},
], auto_site_grouping=True, stringent_site_grouping=False)

4QG2 [{'label_asym_id': ['F', 'H', 'J']}]
SITES: [(<Site: 4979>, {'label_asym_id': ['E', 'L', 'P']})]


In [622]:
pdb = "4qg4"
error_entries.append(df.query(f"allosteric_pdb == '{pdb.upper()}'").squeeze())
Site.get(Site.pdb == pdb).delete_instance()
solve_error(pdb, [
    {"label_asym_id": ["F", "M", "K"]},
], auto_site_grouping=True, stringent_site_grouping=False)

4QG4 [{'label_asym_id': ['F', 'M', 'K']}]
SITES: [(<Site: 4992>, {'label_asym_id': ['E', 'O', 'S']})]


The allosteric sites in this multimer protein binds pairs of dNTPs, together with a Mg (confirmed by the primary citation), thus all of the PDBs associated to it will be corrected.

<br>

In [623]:
pdb = "4qsh"
get_error(pdb)

ENTRIES: 1 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
20,ASD05550000_5,,Listeria monocytogenes R479a,L8DYY9,4QSH,ASD05550002,2BA,A,Lig,Regulator,"(2R,3R,3aS,5R,7aR,9R,10R,10aS,12R,14aR)-2,9- b...",2004,Inner Protein Regulator,Inner Protein,25215494,The Cyclic Dinucleotide c-di-AMP Is an Alloste...,No,Chain D:SER756


In [624]:
solve_error(pdb, None, auto_site_grouping=True, stringent_site_grouping=False)

4QSH [[{'auth_asym_id': 'A', 'auth_comp_id': '2BA', 'auth_seq_id': '2004'}]]
SITES: [(<Site: 4996>, {'label_asym_id': ['G', 'H']})]


The annotated molecule of the modulator binds together with another one, stacked. Interestingly, the annotated is probably an artifact of crystal contacts, while the one that it binds together with looks like the more plausible binding mode, and moreover has another molecule binding in the same way in the additional protein chain, in the same site.

<br>

In [625]:
pdb = "4r8z"
get_error(pdb)

ENTRIES: 1 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
42,ASD17410000_1,,Pseudomonas aeruginosa (strain ATCC 15692 / DS...,Q9HV27,4R8Z,,NI,,Ion,,NICKEL (II) ION,401,Allosteric function,Allosteric position,25691523,Structural basis of functional diversification...,No,


In [626]:
solve_error(pdb, None, auto_site_grouping=True, stringent_site_grouping=False)

4R8Z [[{'auth_comp_id': 'NI', 'auth_seq_id': '401'}]]
SITES: [(<Site: 5001>, {'label_asym_id': ['C', 'D']})]


They are a pair of NI ions that definitely bind together.

<br>

In [627]:
pdb = "4rew"
get_error(pdb)

ENTRIES: 1 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
36,ASD16240000_1,PRKAG1,Homo sapiens,P54619,4REW,ASD00030001,AMP,,Lig,Regulator,ADENOSINE MONOPHOSPHATE,401,Protein-Protein Interaction,Protein-Protein Interaction,25412657,Structural basis of AMPK regulation by adenine...,No,4REW


In [628]:
solve_error(pdb, [
    {"label_asym_id": ["F", "E", "G"]}
], auto_site_grouping=False, stringent_site_grouping=True)

4REW [{'label_asym_id': ['F', 'E', 'G']}]
SITES: [(<Site: 5002>, {'label_asym_id': ['E', 'F', 'G']})]


In [629]:
pdb = '4rer'
[s.modulator for s in PDB.get(PDB.entry_id == pdb).sites]

[{'label_asym_id': ['D']}]

In [630]:
error_entries.append(df.query(f"allosteric_pdb == '{pdb.upper()}'").squeeze())
solve_error(pdb, [
    {"label_asym_id": ["G", "H", "I"]},
], auto_site_grouping=False, stringent_site_grouping=True)

4RER [{'label_asym_id': ['G', 'H', 'I']}]
SITES: [(<Site: 893>, {'label_asym_id': ['D']}), (<Site: 5003>, {'label_asym_id': ['G', 'H', 'I']})]


Similarly to another previous case (4EAK), all the nucleotides bound to the gamma subunit of this kinase are allosteric modulators.

<br>

In [631]:
pdb = "4rqo"
get_error(pdb)

ENTRIES: 1 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
39,ASD17210000_1,tdcG,Legionella pneumophila,A0A130G9H0,4RQO,,Cys,,Lig,Inhibitor,C-Terminal Cysteine residue,458,Allosteric Function,Allosteric Position,,,Yes,


In [632]:
errors[pdb] = "Wrong annotation"

Primary citation does not support this annotation.

<br>

In [633]:
pdb = "4z87"
get_error(pdb)

ENTRIES: 3 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped', 'Molecules of the annotated modulator(s) bind close together but were not grouped', 'Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
23,ASD13050000_1,,Ashbya gossypii,Q756Z6,4Z87,ASD00860009,GDP,A,Lig,Inhibitor,GUANOSINE-5'-DIPHOSPHATE,602,Allosteric Function,Allosteric Position,26558346,Guanine nucleotide binding to the Bateman doma...,No,"Chain B:LYS208,ARG167,LYS207,THR206"
24,ASD13050000_1,,Ashbya gossypii,Q756Z6,4Z87,ASD00860009,GDP,A,Lig,Inhibitor,GUANOSINE-5'-DIPHOSPHATE,603,Allosteric Function,Allosteric Position,26558346,Guanine nucleotide binding to the Bateman doma...,No,"Chain A:MET223,SER166,LYS115,ASN118,GLY147,ILE..."
25,ASD13050000_1,,Ashbya gossypii,Q756Z6,4Z87,ASD00860009,GDP,A,Lig,Inhibitor,GUANOSINE-5'-DIPHOSPHATE,604,Allosteric Function,Allosteric Position,26558346,Guanine nucleotide binding to the Bateman doma...,No,"Chain A:GLU197,ALA242,ASN118,LYS231,GLU117,SER..."


In [634]:
error_entries = [
    row for i, row in (
        pd.DF(error_entries)
        .merge(df.query(f"allosteric_pdb == '{pdb.upper()}' and modulator_resi in ['603', '604']"), how="outer", indicator=True)
        .query(f"_merge == 'left_only'").drop("_merge", axis=1)
        .iterrows()
    )
]

In [635]:
solve_error(pdb, None, auto_site_grouping=True, stringent_site_grouping=False)

4Z87 [[{'auth_asym_id': 'A', 'auth_comp_id': 'GDP', 'auth_seq_id': '602'}]]
SITES: [(<Site: 5020>, {'label_asym_id': ['F', 'G', 'H', 'L', 'M', 'N']})]


In [636]:
error_entries.append(df.query(f"allosteric_pdb == '{pdb.upper()}' and modulator_resi == '603'").squeeze())

In [637]:
solve_error(pdb, None, auto_site_grouping=True, stringent_site_grouping=False)

4Z87 [[{'auth_asym_id': 'A', 'auth_comp_id': 'GDP', 'auth_seq_id': '603'}]]
SITES: [(<Site: 5054>, {'label_asym_id': ['F', 'G', 'H', 'L', 'M', 'N']})]


In [638]:
error_entries.append(df.query(f"allosteric_pdb == '{pdb.upper()}' and modulator_resi == '604'").squeeze())

In [639]:
solve_error(pdb, None, auto_site_grouping=True, stringent_site_grouping=False)

4Z87 [[{'auth_asym_id': 'A', 'auth_comp_id': 'GDP', 'auth_seq_id': '604'}]]
SITES: [(<Site: 5088>, {'label_asym_id': ['F', 'G', 'H', 'L', 'M', 'N']})]


In [640]:
Site.get(Site.pdb == pdb).related_sites

{'equivalent': [{'other_site': {'label_asym_id': ['Y', 'Z', 'AA']},
   'res_of_other_in_site': 0.984375,
   'res_of_site_in_other': 0.984375},
  {'other_site': {'label_asym_id': ['R', 'S', 'T']},
   'res_of_other_in_site': 0.9206349206349206,
   'res_of_site_in_other': 0.90625}],
 'nonequivalent': []}

Indeed, the three molecules bind together in each protein chain. However, the trio is also together in the model with the trio of the adjacent protein chains, so they must be annotated manually.

<br>

In [641]:
pdb = "5afk"
get_error(pdb)

ENTRIES: 1 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
12,ASD03090000_1,CHRNA7,Homo sapiens,P36544,5AFK,ASD03091862,5VU,A,Lig,Regulator,"N-(2,4-difluorophenyl)pyrrolidine-1-carboxamide",1207,Allosteric Function,Allosteric Position,25918415,Molecular Blueprint of Allosteric Binding Site...,No,5AFK


In [642]:
solve_error(pdb, [
    {"label_asym_id": ["M"]},
    {"label_asym_id": ["P"]},
    {"label_asym_id": ["T"]},
    {"label_asym_id": ["V"]},
    {"label_asym_id": ["Y"]},
    {"label_asym_id": ["Z"]},
], auto_site_grouping=False, stringent_site_grouping=True)

5AFK [{'label_asym_id': ['M']}, {'label_asym_id': ['P']}, {'label_asym_id': ['T']}, {'label_asym_id': ['V']}, {'label_asym_id': ['Y']}, {'label_asym_id': ['Z']}]
SITES: [(<Site: 5089>, {'label_asym_id': ['M']}), (<Site: 5093>, {'label_asym_id': ['Y']})]


In [643]:
[s.delete_instance() for s in PDB.get(PDB.entry_id == pdb).sites if s.modulator != {'label_asym_id': ['M']}]

[1]

In [644]:
df.query(f"allosteric_pdb in {'5AFH, 5AFJ, 5AFK, 5AFL, 5AFM, 5AFN'.split(', ')}")

Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
1163,ASD03090000_1,CHRNA7,Homo sapiens,P36544,5AFJ,ASD03091859,42R,A,Lig,Regulator,"(3S)-6-(4-bromophenyl)-3-hydroxy-1,3-dimethyl-...",1205,Allosteric Function,Allosteric Position,25918415,Molecular Blueprint of Allosteric Binding Site...,No,5AFJ
1164,ASD03090000_1,CHRNA7,Homo sapiens,P36544,5AFJ,ASD03091859,42R,A,Lig,Regulator,"(3S)-6-(4-bromophenyl)-3-hydroxy-1,3-dimethyl-...",1206,Allosteric Function,Allosteric Position,25918415,Molecular Blueprint of Allosteric Binding Site...,No,5AFJ
1165,ASD03090000_1,CHRNA7,Homo sapiens,P36544,5AFK,ASD03091862,5VU,A,Lig,Regulator,"N-(2,4-difluorophenyl)pyrrolidine-1-carboxamide",1207,Allosteric Function,Allosteric Position,25918415,Molecular Blueprint of Allosteric Binding Site...,No,5AFK
1166,ASD03090000_1,CHRNA7,Homo sapiens,P36544,5AFL,ASD03091906,FHV,E,Lig,Regulator,N-(3-METHYLPHENYL)PYRROLIDINE-1-CARBOXAMIDE,1207,Allosteric Function,Allosteric Position,25918415,Molecular Blueprint of Allosteric Binding Site...,No,5AFL
1167,ASD03090000_1,CHRNA7,Homo sapiens,P36544,5AFM,ASD03091903,9Z0,A,Lig,Regulator,"4,5-dibromo-N-(3-hydroxypropyl)-1H-pyrrole- 2-...",1207,Allosteric Function,Allosteric Position,25918415,Molecular Blueprint of Allosteric Binding Site...,No,5AFM
1168,ASD03090000_1,CHRNA7,Homo sapiens,P36544,5AFN,ASD03091910,OJD,A,Lig,Regulator,(4R)-4-(2-phenylethyl)pyrrolidin-2-one,1215,Allosteric Function,Allosteric Position,25918415,Molecular Blueprint of Allosteric Binding Site...,No,5AFN


In [645]:
[(s.pdb, s, s.modulator_residues[["auth_asym_id", "auth_comp_id", "auth_seq_id"]]) for p in '5AFH, 5AFJ, 5AFK, 5AFL, 5AFM, 5AFN'.split(', ') if PDB.get_or_none(PDB.entry_id == p.lower()) is not None for s in PDB.get(PDB.entry_id == p.lower()).sites]

[(<PDB: 5afj>,
  <Site: 2339>,
    auth_asym_id auth_comp_id auth_seq_id
  0            E          42R        1205),
 (<PDB: 5afj>,
  <Site: 2340>,
    auth_asym_id auth_comp_id auth_seq_id
  0            D          42R        1206),
 (<PDB: 5afk>,
  <Site: 5089>,
    auth_asym_id auth_comp_id auth_seq_id
  0            A          5VU        1207),
 (<PDB: 5afl>,
  <Site: 2341>,
    auth_asym_id auth_comp_id auth_seq_id
  0            E          FHV        1207),
 (<PDB: 5afm>,
  <Site: 2343>,
    auth_asym_id auth_comp_id auth_seq_id
  0            A          9Z0        1207),
 (<PDB: 5afn>,
  <Site: 2344>,
    auth_asym_id auth_comp_id auth_seq_id
  0            E          OJD        1215)]

The primary citation does not discuss the covalently bound molecule that binds close to the annotated molecule of the modulator in two of the 5 monomers, so for consistency they will not be grouped and only the annotated molecule will be retained.

<br>

In [646]:
pdb = "5btr"
get_error(pdb)

ENTRIES: 3 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped', 'Molecules of the annotated modulator(s) bind close together but were not grouped', 'Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
17,ASD05250000_1,SIRT1,Homo sapiens,Q96EB6,5BTR,ASD05250001,STL,A,Lig,Regulator,RESVERATROL,702,Allosteric Function,Allosteric Position,26109052,"Structural basis for allosteric, substrate-dep...",No,5BTR
18,ASD05250000_1,SIRT1,Homo sapiens,Q96EB6,5BTR,ASD05250001,STL,D,Lig,Regulator,RESVERATROL,102,Allosteric Function,Allosteric Position,26109052,"Structural basis for allosteric, substrate-dep...",No,5BTR
19,ASD05250000_1,SIRT1,Homo sapiens,Q96EB6,5BTR,ASD05250001,STL,D,Lig,Regulator,RESVERATROL,102,Allosteric Function,Allosteric Position,26109052,"Structural basis for allosteric, substrate-dep...",No,5BTR


In [647]:
error_entries = [
    row for i, row in (
        pd.DF(error_entries)
        .merge(df.query(f"allosteric_pdb == '{pdb.upper()}' and modulator_resi == '102'"), how="outer", indicator=True)
        .query(f"_merge == 'left_only'").drop("_merge", axis=1)
        .iterrows()
    )
]

In [648]:
solve_error(pdb, None, auto_site_grouping=True, stringent_site_grouping=False)

5BTR [[{'auth_asym_id': 'A', 'auth_comp_id': 'STL', 'auth_seq_id': '702'}]]
SITES: [(<Site: 5100>, {'label_asym_id': ['H', 'O', 'P']})]


In [649]:
error_entries.append(df.query(f"allosteric_pdb == '{pdb.upper()}' and modulator_resi == '102'").drop_duplicates().squeeze())

In [650]:
solve_error(pdb, None, auto_site_grouping=True, stringent_site_grouping=False)

5BTR [[{'auth_asym_id': 'D', 'auth_comp_id': 'STL', 'auth_seq_id': '102'}]]
SITES: [(<Site: 5122>, {'label_asym_id': ['H', 'O', 'P']})]


In [651]:
Site.get(Site.pdb == pdb).related_sites

{'equivalent': [{'other_site': {'label_asym_id': ['J', 'K', 'Q']},
   'res_of_other_in_site': 0.9523809523809523,
   'res_of_site_in_other': 1.0},
  {'other_site': {'label_asym_id': ['M', 'N', 'R']},
   'res_of_other_in_site': 0.95,
   'res_of_site_in_other': 0.95}],
 'nonequivalent': []}

The three of them are considered to bind on the same allosteric site.

<br>

In [652]:
pdb = "5cnt"
get_error(pdb)

ENTRIES: 1 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
11,ASD02580000_1,nrdA,Escherichia coli,P00452,5CNT,ASD00870004,DTP,A,Lig,Regulator,2'-DEOXYADENOSINE 5'-TRIPHOSPHATE,802,Allosteric function,Allosteric position,26754917,Molecular basis for allosteric specificity reg...,Yes,"Chain A:GLY95,VAL7,ASP11,THR8,GLN96,ARG10,PHE8..."


In [653]:
solve_error(pdb, [
    {"label_asym_id": ["J", "K", "L"]}
], auto_site_grouping=True, stringent_site_grouping=False)

5CNT [{'label_asym_id': ['J', 'K', 'L']}]
SITES: [(<Site: 5143>, {'label_asym_id': ['BA', 'CA', 'DA']})]


In [654]:
Site.get(Site.pdb == pdb).related_sites

{'equivalent': [{'other_site': {'label_asym_id': ['J', 'K', 'L']},
   'res_of_other_in_site': 1.0,
   'res_of_site_in_other': 0.967741935483871},
  {'other_site': {'label_asym_id': ['P', 'Q', 'R']},
   'res_of_other_in_site': 0.96875,
   'res_of_site_in_other': 1.0},
  {'other_site': {'label_asym_id': ['V', 'W', 'X']},
   'res_of_other_in_site': 0.9655172413793104,
   'res_of_site_in_other': 0.9032258064516129}],
 'nonequivalent': [{'other_site': {'label_asym_id': ['EA', 'FA']},
   'res_of_other_in_site': 0.0,
   'res_of_site_in_other': 0.0},
  {'other_site': {'label_asym_id': ['M', 'N']},
   'res_of_other_in_site': 0.0,
   'res_of_site_in_other': 0.0},
  {'other_site': {'label_asym_id': ['S', 'T']},
   'res_of_other_in_site': 0.0,
   'res_of_site_in_other': 0.0},
  {'other_site': {'label_asym_id': ['Y', 'Z']},
   'res_of_other_in_site': 0.0,
   'res_of_site_in_other': 0.0}]}

In [655]:
df.query(f"allosteric_pdb in {'5CNS, 5CNT, 5CNU, 5CNV'.split(', ')}")

Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
1055,ASD02580000_1,nrdA,Escherichia coli,P00452,5CNS,ASD00870004,DTP,A,Lig,Regulator,2'-DEOXYADENOSINE 5'-TRIPHOSPHATE,804,Allosteric function,Allosteric position,26754917,Molecular basis for allosteric specificity reg...,Yes,"Chain B:GLN250,GLY295,LYS246,GLN294,SER291,GLY..."
1056,ASD02580000_1,nrdA,Escherichia coli,P00452,5CNS,ASD00870004,DTP,A,Lig,Inhibitor,2'-DEOXYADENOSINE 5'-TRIPHOSPHATE,804,Allosteric Function,Allosteric Position,26754917,Molecular basis for allosteric specificity reg...,No,"Chain B:GLN250,GLY295,LYS246,GLN294,SER291,GLY..."
1057,ASD02580000_1,nrdA,Escherichia coli,P00452,5CNT,ASD00870004,DTP,A,Lig,Regulator,2'-DEOXYADENOSINE 5'-TRIPHOSPHATE,802,Allosteric function,Allosteric position,26754917,Molecular basis for allosteric specificity reg...,Yes,"Chain A:GLY95,VAL7,ASP11,THR8,GLN96,ARG10,PHE8..."
1058,ASD02580000_1,nrdA,Escherichia coli,P00452,5CNU,ASD09690001,DGT,A,Lig,Regulator,2'-DEOXYGUANOSINE-5'-TRIPHOSPHATE,801,Allosteric function,Allosteric position,26754917,Molecular basis for allosteric specificity reg...,Yes,"Chain B:GLN250,GLY295,LYS246,GLN294,SER291,GLY..."
1059,ASD02580000_1,nrdA,Escherichia coli,P00452,5CNV,ASD02040015,TTP,A,Lig,Regulator,THYMIDINE-5'-TRIPHOSPHATE,804,Allosteric function,Allosteric position,26754917,Molecular basis for allosteric specificity reg...,Yes,"Chain A:LYS91,PHE87,PHE97,GLU15,LYS9,LYS21,ARG..."


In [656]:
[(s.pdb, s, s.modulator_residues[["auth_asym_id", "auth_comp_id", "auth_seq_id"]]) for p in '5CNS, 5CNT, 5CNU, 5CNV'.split(', ') if PDB.get_or_none(PDB.entry_id == p.lower()) is not None for s in PDB.get(PDB.entry_id == p.lower()).sites]

[(<PDB: 5cns>,
  <Site: 2214>,
    auth_asym_id auth_comp_id auth_seq_id
  0            D          DTP         804),
 (<PDB: 5cnt>,
  <Site: 5143>,
    auth_asym_id auth_comp_id auth_seq_id
  0            D          DTP         802
  1            D           MG         803
  2            D          DTP         804),
 (<PDB: 5cnu>,
  <Site: 2215>,
    auth_asym_id auth_comp_id auth_seq_id
  0            D          DGT         804),
 (<PDB: 5cnv>,
  <Site: 2216>,
    auth_asym_id auth_comp_id auth_seq_id
  0            D          TTP         804)]

In [657]:
pdb = "5cns"
error_entries.append(df.query(f"allosteric_pdb == '{pdb.upper()}'").iloc[0])
Site.get(Site.pdb == pdb).delete_instance()
solve_error(pdb, [
    {"label_asym_id": ["K", "J"]},
], auto_site_grouping=True, stringent_site_grouping=False)

5CNS [{'label_asym_id': ['K', 'J']}]
SITES: [(<Site: 5156>, {'label_asym_id': ['J', 'K']})]


In [658]:
pdb = "5cnu"
error_entries.append(df.query(f"allosteric_pdb == '{pdb.upper()}'").squeeze())
Site.get(Site.pdb == pdb).delete_instance()
solve_error(pdb, [
    {"label_asym_id": ["L", "M"]},
], auto_site_grouping=True, stringent_site_grouping=False)

5CNU [{'label_asym_id': ['L', 'M']}]
SITES: [(<Site: 5169>, {'label_asym_id': ['L', 'M']})]


In [659]:
pdb = "5cnv"
error_entries.append(df.query(f"allosteric_pdb == '{pdb.upper()}'").squeeze())
Site.get(Site.pdb == pdb).delete_instance()
solve_error(pdb, [
    {"label_asym_id": ["J", "K", "L"]},
], auto_site_grouping=True, stringent_site_grouping=False)

5CNV [{'label_asym_id': ['J', 'K', 'L']}]
SITES: [(<Site: 5190>, {'label_asym_id': ['BA', 'CA', 'DA']})]


In [660]:
[(s.pdb, s, s.modulator_residues[["auth_asym_id", "auth_comp_id", "auth_seq_id"]]) for p in '5CNS, 5CNT, 5CNU, 5CNV'.split(', ') if PDB.get_or_none(PDB.entry_id == p.lower()) is not None for s in PDB.get(PDB.entry_id == p.lower()).sites]

[(<PDB: 5cns>,
  <Site: 5156>,
    auth_asym_id auth_comp_id auth_seq_id
  0            A          DAT         802
  1            A           MG         803),
 (<PDB: 5cnt>,
  <Site: 5143>,
    auth_asym_id auth_comp_id auth_seq_id
  0            D          DTP         802
  1            D           MG         803
  2            D          DTP         804),
 (<PDB: 5cnu>,
  <Site: 5169>,
    auth_asym_id auth_comp_id auth_seq_id
  0            A          DAT         804
  1            A           MG         805),
 (<PDB: 5cnv>,
  <Site: 5190>,
    auth_asym_id auth_comp_id auth_seq_id
  0            D          DAT         802
  1            D           MG         803
  2            D          TTP         804)]

The primary citation does not discuss the (d)NTP dimers bridged by Mg ions, but this site is indeed the "allosteric activity site". Other PDBs associated to the same primary citation have wrong annotated allosteric modulators and will be corrected.

<br>

In [661]:
pdb = "5ezv"
get_error(pdb)

ENTRIES: 1 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
3,ASD00310000_7,PRKAA2,Homo sapiens,P54646,5EZV,ASD00310181,C2Z,E,Lig,Acttivator,5-(5-hydroxyl-isoxazol-3-yl)-furan-2-phosphoni...,401,Protein-Protein Interaction,Protein-Protein Interaction,26952388,Structural basis of allosteric and synergistic...,No,5EZV


In [662]:
solve_error(pdb, None, auto_site_grouping=True, stringent_site_grouping=False)

5EZV [[{'auth_asym_id': 'E', 'auth_comp_id': 'C2Z', 'auth_seq_id': '401'}]]
SITES: [(<Site: 5195>, {'label_asym_id': ['K', 'L']})]


The molecules of the modulator bind in pairs in the same site of a protein chain with a symmetric fold.

<br>

In [663]:
pdb = "5mcp"
get_error(pdb)

ENTRIES: 2 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped', 'Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
18,ASD13050000_1,,Ashbya gossypii,Q756Z6,5MCP,ASD00150001,ATP,A,Lig,Inhibitor,ADENOSINE-5'-TRIPHOSPHATE,601,Inner Protein Regulator,Inner Protein,28572600,A nucleotide-controlled conformational switch ...,No,"Chain E:SER166,GLN170,ARG167,PHE171"
19,ASD13050000_1,,Ashbya gossypii,Q756Z6,5MCP,ASD00150001,ATP,A,Lig,Inhibitor,ADENOSINE-5'-TRIPHOSPHATE,602,Inner Protein Regulator,Inner Protein,28572600,A nucleotide-controlled conformational switch ...,No,"Chain E:GLN170,PHE171"


In [664]:
error_entries = [
    row for i, row in (
        pd.DF(error_entries)
        .merge(df.query(f"allosteric_pdb == '{pdb.upper()}' and modulator_resi == '602'"), how="outer", indicator=True)
        .query(f"_merge == 'left_only'").drop("_merge", axis=1)
        .iterrows()
    )
]

In [665]:
solve_error(pdb, [
    {"label_asym_id": ["I", "J", "DA"]},
    {"label_asym_id": ["AA", "BA", "EA"]},
    {"label_asym_id": ["L", "M", "EA"]},
    {"label_asym_id": ["O", "P", "Q"]},
    {"label_asym_id": ["R", "S", "U"]},
    {"label_asym_id": ["W", "X", "KA"]},
    {"label_asym_id": ["Z", "HA", "IA"]},
], auto_site_grouping=False, stringent_site_grouping=True)

5MCP [{'label_asym_id': ['I', 'J', 'DA']}, {'label_asym_id': ['AA', 'BA', 'EA']}, {'label_asym_id': ['L', 'M', 'EA']}, {'label_asym_id': ['O', 'P', 'Q']}, {'label_asym_id': ['R', 'S', 'U']}, {'label_asym_id': ['W', 'X', 'KA']}, {'label_asym_id': ['Z', 'HA', 'IA']}]
SITES: [(<Site: 5197>, {'label_asym_id': ['AA', 'BA', 'EA']})]


In [666]:
error_entries.append(df.query(f"allosteric_pdb == '{pdb.upper()}' and modulator_resi == '602'").squeeze())

In [667]:
solve_error(pdb, [
    {"label_asym_id": ["I", "J", "DA"]},
    {"label_asym_id": ["AA", "BA", "EA"]},
    {"label_asym_id": ["L", "M", "EA"]},
    {"label_asym_id": ["O", "P", "Q"]},
    {"label_asym_id": ["R", "S", "U"]},
    {"label_asym_id": ["W", "X", "KA"]},
    {"label_asym_id": ["Z", "HA", "IA"]},
], auto_site_grouping=False, stringent_site_grouping=True)

5MCP [{'label_asym_id': ['I', 'J', 'DA']}, {'label_asym_id': ['AA', 'BA', 'EA']}, {'label_asym_id': ['L', 'M', 'EA']}, {'label_asym_id': ['O', 'P', 'Q']}, {'label_asym_id': ['R', 'S', 'U']}, {'label_asym_id': ['W', 'X', 'KA']}, {'label_asym_id': ['Z', 'HA', 'IA']}]
SITES: [(<Site: 5200>, {'label_asym_id': ['AA', 'BA', 'EA']})]


In [668]:
Site.get(Site.pdb == pdb).protein_residues.label_asym_id.unique(), Site.get(Site.pdb == pdb).nonredundant_site.protein_residues.label_asym_id.unique()

(array(['A', 'E'], dtype=object), array(['A', 'E'], dtype=object))

The separately annotated molecules indeed bind together, also with a Mg. However, the trio binds close to the trio of another monomer of the complex, and thus must be annotated individually.

<br>

In [669]:
pdb = "5olk"
get_error(pdb)

ENTRIES: 1 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
25,ASD16710000_1,,Leeuwenhoekiella blandensis (strain CECT 7118 ...,A3XHF9,5OLK,ASD00870004,DTP,A,Lig,Inhibitor,2'-DEOXYADENOSINE 5'-TRIPHOSPHATE,1001,Protein-Protein Interaction,Protein-Protein Interaction,29388911,Novel ATP-cone-driven allosteric regulation of...,No,"Chain A:VAL75,GLN72,ARG12,VAL20,ILE9,VAL68,ALA..."


In [670]:
solve_error(pdb, None, auto_site_grouping=True, stringent_site_grouping=False)

5OLK [[{'auth_asym_id': 'A', 'auth_comp_id': 'DTP', 'auth_seq_id': '1001'}]]
SITES: [(<Site: 5209>, {'label_asym_id': ['E', 'F']})]


Confirmed by the primary citation, the two molecules bind together.

<br>

In [671]:
pdb = "5s4r"
get_error(pdb)

ENTRIES: 1 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
10,ASD03750000_5,TUBB2B,Bos taurus,Q6B856,5S4R,,NW7,B,Lig,Regulator,"3-ethyl-5-methyl-N-(5-methyl-1,2-oxazol-3-yl)-...",,,,,,No,


In [672]:
solve_error(pdb, None, auto_site_grouping=True, stringent_site_grouping=False)

5S4R [[{'auth_asym_id': 'B', 'auth_comp_id': 'NW7'}]]
SITES: [(<Site: 5211>, {'label_asym_id': ['O']}), (<Site: 5213>, {'label_asym_id': ['N', 'P']})]


The structure comes from a massive fragment screening effort on tubuline where all bound fragments have been annotated as allosteric sites and thus there is no functional impact in grouping molecules that bind close together as part of the same site.

<br>

In [673]:
pdb = "5s4s"
get_error(pdb)

ENTRIES: 1 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
11,ASD03750000_5,TUBB2B,Bos taurus,Q6B856,5S4S,,K0M,A,Lig,Regulator,"3-methyl-N-(1-methyl-1H-pyrazol-3-yl)-1,2-oxaz...",,,,,,No,


In [674]:
solve_error(pdb, None, auto_site_grouping=True, stringent_site_grouping=False)

5S4S [[{'auth_asym_id': 'A', 'auth_comp_id': 'K0M'}]]
SITES: [(<Site: 5218>, {'label_asym_id': ['J', 'O']})]


Same as before.

<br>

In [675]:
pdb = "5tc3"
get_error(pdb)

ENTRIES: 3 SITES: [(<Site: 3966>, {'label_asym_id': ['D', 'I']})]
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped', 'Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
16,ASD13050000_1,,Ashbya gossypii,Q756Z6,5TC3,ASD00860009,GDP,A,Lig,Inhibitor,GUANOSINE-5'-DIPHOSPHATE,602,Inner Protein Regulator,Inner Protein,28572600,A nucleotide-controlled conformational switch ...,No,"Chain A:MET223,SER166,LYS115,ASN118,GLY147,ILE..."
17,ASD13050000_1,,Ashbya gossypii,Q756Z6,5TC3,ASD00860009,GDP,A,Lig,Inhibitor,GUANOSINE-5'-DIPHOSPHATE,603,Inner Protein Regulator,Inner Protein,28572600,A nucleotide-controlled conformational switch ...,No,"Chain A:GLU197,ALA242,GLN233,ASN118,LYS231,GLU..."


In [676]:
df.query(f"allosteric_pdb == '{pdb.upper()}'")

Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
2607,ASD13050000_1,,Ashbya gossypii,Q756Z6,5TC3,ASD00150001,ATP,A,Lig,Inhibitor,ADENOSINE-5'-TRIPHOSPHATE,601,Inner Protein Regulator,Inner Protein,28572600,A nucleotide-controlled conformational switch ...,No,"Chain B:ARG226,MET230,ARG167,LYS207"
2608,ASD13050000_1,,Ashbya gossypii,Q756Z6,5TC3,ASD00860009,GDP,A,Lig,Inhibitor,GUANOSINE-5'-DIPHOSPHATE,602,Inner Protein Regulator,Inner Protein,28572600,A nucleotide-controlled conformational switch ...,No,"Chain A:MET223,SER166,LYS115,ASN118,GLY147,ILE..."
2609,ASD13050000_1,,Ashbya gossypii,Q756Z6,5TC3,ASD00860009,GDP,A,Lig,Inhibitor,GUANOSINE-5'-DIPHOSPHATE,603,Inner Protein Regulator,Inner Protein,28572600,A nucleotide-controlled conformational switch ...,No,"Chain A:GLU197,ALA242,GLN233,ASN118,LYS231,GLU..."


In [677]:
error_entries = [
    row for i, row in (
        pd.DF(error_entries)
        .merge(df.query(f"allosteric_pdb == '{pdb.upper()}' and modulator_resi == '603'"), how="outer", indicator=True)
        .query(f"_merge == 'left_only'").drop("_merge", axis=1)
        .iterrows()
    )
]

In [678]:
solve_error(pdb, None, auto_site_grouping=True, stringent_site_grouping=False)

5TC3 [[{'auth_asym_id': 'A', 'auth_comp_id': 'GDP', 'auth_seq_id': '602'}]]
SITES: [(<Site: 5232>, {'label_asym_id': ['D', 'E', 'F', 'I', 'J', 'K']})]


In [679]:
error_entries.append(df.query(f"allosteric_pdb == '{pdb.upper()}' and modulator_resi == '603'").squeeze())

In [680]:
solve_error(pdb, None, auto_site_grouping=True, stringent_site_grouping=False)

5TC3 [[{'auth_asym_id': 'A', 'auth_comp_id': 'GDP', 'auth_seq_id': '603'}]]
SITES: [(<Site: 5246>, {'label_asym_id': ['D', 'E', 'F', 'I', 'J', 'K']})]


In [681]:
Site.get(Site.pdb == pdb).info

{'modulator_info': [{'modulator': [{'label_asym_id': 'D'}],
   'label_entity_id': '3',
   'type': 'non-polymer',
   'pdbx_description': "ADENOSINE-5'-TRIPHOSPHATE"},
  {'modulator': [{'label_asym_id': 'E'}],
   'label_entity_id': '4',
   'type': 'non-polymer',
   'pdbx_description': "GUANOSINE-5'-DIPHOSPHATE"},
  {'modulator': [{'label_asym_id': 'F'}],
   'label_entity_id': '4',
   'type': 'non-polymer',
   'pdbx_description': "GUANOSINE-5'-DIPHOSPHATE"},
  {'modulator': [{'label_asym_id': 'I'}],
   'label_entity_id': '3',
   'type': 'non-polymer',
   'pdbx_description': "ADENOSINE-5'-TRIPHOSPHATE"},
  {'modulator': [{'label_asym_id': 'J'}],
   'label_entity_id': '4',
   'type': 'non-polymer',
   'pdbx_description': "GUANOSINE-5'-DIPHOSPHATE"},
  {'modulator': [{'label_asym_id': 'K'}],
   'label_entity_id': '4',
   'type': 'non-polymer',
   'pdbx_description': "GUANOSINE-5'-DIPHOSPHATE"}],
 'interacting_chains_info': [{'label_entity_id': '1',
   'interacting_chains': {'label_asym_id': 

In [682]:
Site.get(Site.pdb == pdb).protein_residues.label_asym_id.unique(), Site.get(Site.pdb == pdb).nonredundant_site.protein_residues.label_asym_id.unique()

(array(['A', 'B'], dtype=object), array(['A'], dtype=object))

Similar to a previous case (5MCP), the three molecules form a single site (and are automatically grouped as they bind close together with the ones from another chain).

<br>

In [683]:
pdb = "5ur3"
get_error(pdb)

ENTRIES: 1 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
15,ASD15170000_1,ORF17,Human herpesvirus 8,O40922,5UR3,ASD15170006,8OY,A,Lig,Inhibitor,4-{[6-(cyclohexylmethyl)pyridine-2-carbonyl]am...,201,Allosteric function,Allosteric position,,Kaposi's Sarcoma Herpesvirus Protease in Compl...,No,"Chain A:PRO192,ILE105,TRP109,ILE44,LEU106,PHE1..."


In [684]:
solve_error(pdb, [
    {"label_asym_id": ["C"]},
    {"label_asym_id": ["D"]},
    {"label_asym_id": ["E"]}
], auto_site_grouping=False, stringent_site_grouping=True)

5UR3 [{'label_asym_id': ['C']}, {'label_asym_id': ['D']}, {'label_asym_id': ['E']}]
SITES: [(<Site: 5249>, {'label_asym_id': ['E']})]


In [685]:
Site.get(Site.pdb == pdb).related_sites

{'equivalent': [{'other_site': {'label_asym_id': ['C']},
   'res_of_other_in_site': 0.8518518518518519,
   'res_of_site_in_other': 0.71875},
  {'other_site': {'label_asym_id': ['D']},
   'res_of_other_in_site': 0.7142857142857143,
   'res_of_site_in_other': 0.3125}],
 'nonequivalent': []}

There is no associated primary reference and the paper in which they claim to solve this PDB does not discuss the existance of two bound molecules. Similar to a case before (4P2T), the more solvent-exposed molecule should not going to be annotated.

<br>

In [686]:
pdb = "5ute"
get_error(pdb)

ENTRIES: 1 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
16,ASD15170000_1,ORF17,Human herpesvirus 8,O40922,5UTE,ASD15170001,8M4,A,Lig,Inhibitor,4-{[6-(cyclohexylmethyl)pyridine-2-carbonyl]am...,201,Allosteric function,Allosteric position,,Kaposi's Sarcoma Herpesvirus Protease in Compl...,No,"Chain A:ALA75,ILE105,TRP109,PRO192,ILE44,LEU10..."


In [687]:
solve_error(pdb, [
    {"label_asym_id": ["C"]},
    {"label_asym_id": ["D"]},
    {"label_asym_id": ["E"]}
], auto_site_grouping=False, stringent_site_grouping=True)

5UTE [{'label_asym_id': ['C']}, {'label_asym_id': ['D']}, {'label_asym_id': ['E']}]
SITES: [(<Site: 5250>, {'label_asym_id': ['C']}), (<Site: 5252>, {'label_asym_id': ['E']})]


In [688]:
[s.delete_instance() for s in PDB.get(PDB.entry_id == pdb).sites if s.modulator != {'label_asym_id': ['C']}]

[1]

In [689]:
Site.get(Site.pdb == pdb).related_sites

{'equivalent': [],
 'nonequivalent': [{'other_site': {'label_asym_id': ['D']},
   'res_of_other_in_site': 0.45454545454545453,
   'res_of_site_in_other': 0.22727272727272727},
  {'other_site': {'label_asym_id': ['E']},
   'res_of_other_in_site': 0.47058823529411764,
   'res_of_site_in_other': 0.7272727272727273}]}

Same as before, but now the site of the solvent-exposed "bridging" molecule is separated and must be deleted.

<br>

In [690]:
pdb = "5utn"
get_error(pdb)

ENTRIES: 1 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
16,ASD15170000_1,ORF17,Human herpesvirus 8,O40922,5UTN,ASD15170002,8MA,A,Lig,Inhibitor,4-{[6-(cyclohexylmethyl)pyridine-2-carbonyl]am...,201,Allosteric function,Allosteric position,,Kaposi's Sarcoma Herpesvirus Protease in Compl...,No,"Chain A:PRO192,ILE105,TRP109,ILE44,LEU106,PHE1..."


In [691]:
solve_error(pdb, [
    {"label_asym_id": ["C"]},
    {"label_asym_id": ["D"]},
    {"label_asym_id": ["E"]}
], auto_site_grouping=False, stringent_site_grouping=True)

5UTN [{'label_asym_id': ['C']}, {'label_asym_id': ['D']}, {'label_asym_id': ['E']}]
SITES: [(<Site: 5252>, {'label_asym_id': ['D']}), (<Site: 5253>, {'label_asym_id': ['E']})]


In [692]:
[(s, s.modulator, s.related_sites) for s in PDB.get(PDB.entry_id == pdb).sites]

[(<Site: 5252>,
  {'label_asym_id': ['D']},
  {'equivalent': [{'other_site': {'label_asym_id': ['C']},
     'res_of_other_in_site': 0.9473684210526315,
     'res_of_site_in_other': 0.6}],
   'nonequivalent': [{'other_site': {'label_asym_id': ['E']},
     'res_of_other_in_site': 0.6363636363636364,
     'res_of_site_in_other': 0.23333333333333334}]}),
 (<Site: 5253>,
  {'label_asym_id': ['E']},
  {'equivalent': [],
   'nonequivalent': [{'other_site': {'label_asym_id': ['C']},
     'res_of_other_in_site': 0.42105263157894735,
     'res_of_site_in_other': 0.7272727272727273},
    {'other_site': {'label_asym_id': ['D']},
     'res_of_other_in_site': 0.23333333333333334,
     'res_of_site_in_other': 0.6363636363636364}]})]

In [693]:
[s.delete_instance() for s in PDB.get(PDB.entry_id == pdb).sites if s.modulator != {'label_asym_id': ['D']}]

[1]

Same as before.

<br>

In [694]:
pdb = "5uv3"
get_error(pdb)

ENTRIES: 1 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
16,ASD15170000_1,ORF17,Human herpesvirus 8,O40922,5UV3,ASD15170003,8N4,A,Lig,Inhibitor,4-{[6-(cyclohexylmethyl)pyridine-2-carbonyl]am...,201,Allosteric function,Allosteric position,,Kaposi's Sarcoma Herpesvirus Protease in Compl...,No,"Chain A:PRO192,ILE105,TRP109,LEU47,ILE44,LEU10..."


In [695]:
solve_error(pdb, [
    {"label_asym_id": ["C"]},
    {"label_asym_id": ["D"]},
    {"label_asym_id": ["E"]}
], auto_site_grouping=False, stringent_site_grouping=True)

5UV3 [{'label_asym_id': ['C']}, {'label_asym_id': ['D']}, {'label_asym_id': ['E']}]
SITES: [(<Site: 5255>, {'label_asym_id': ['E']})]


Same as before.

<br>

In [696]:
pdb = "5v5d"
get_error(pdb)

ENTRIES: 1 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
16,ASD15170000_1,ORF17,Human herpesvirus 8,O40922,5V5D,ASD15170006,8OY,A,Lig,Inhibitor,4-{[6-(cyclohexylmethyl)pyridine-2-carbonyl]am...,201,Inner Protein Regulator,Inner Protein,28759216,"Allosteric Inhibitors, Crystallography, and Co...",No,"Chain A:PRO192,ILE105,TRP109,ILE44,LEU106,PHE1..."


In [697]:
solve_error(pdb, [
    {"label_asym_id": ["C"]},
    {"label_asym_id": ["D"]},
    {"label_asym_id": ["E"]}
], auto_site_grouping=False, stringent_site_grouping=True)

5V5D [{'label_asym_id': ['C']}, {'label_asym_id': ['D']}, {'label_asym_id': ['E']}]
SITES: [(<Site: 5257>, {'label_asym_id': ['D']}), (<Site: 5258>, {'label_asym_id': ['E']})]


In [698]:
[(s, s.modulator, s.related_sites) for s in PDB.get(PDB.entry_id == pdb).sites]

[(<Site: 5257>,
  {'label_asym_id': ['D']},
  {'equivalent': [{'other_site': {'label_asym_id': ['C']},
     'res_of_other_in_site': 0.875,
     'res_of_site_in_other': 0.65625}],
   'nonequivalent': [{'other_site': {'label_asym_id': ['E']},
     'res_of_other_in_site': 0.7333333333333333,
     'res_of_site_in_other': 0.34375}]}),
 (<Site: 5258>,
  {'label_asym_id': ['E']},
  {'equivalent': [],
   'nonequivalent': [{'other_site': {'label_asym_id': ['C']},
     'res_of_other_in_site': 0.4583333333333333,
     'res_of_site_in_other': 0.7333333333333333},
    {'other_site': {'label_asym_id': ['D']},
     'res_of_other_in_site': 0.34375,
     'res_of_site_in_other': 0.7333333333333333}]})]

In [699]:
[s.delete_instance() for s in PDB.get(PDB.entry_id == pdb).sites if s.modulator != {'label_asym_id': ['D']}]

[1]

Same as before.

<br>

In [700]:
pdb = "5v5e"
get_error(pdb)

ENTRIES: 1 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
16,ASD15170000_1,ORF17,Human herpesvirus 8,O40922,5V5E,ASD15170003,8N4,A,Lig,Inhibitor,4-{[6-(cyclohexylmethyl)pyridine-2-carbonyl]am...,201,Inner Protein Regulator,Inner Protein,28759216,"Allosteric Inhibitors, Crystallography, and Co...",No,"Chain A:PRO192,ILE105,TRP109,LEU47,LEU110,ILE4..."


In [701]:
solve_error(pdb, [
    {"label_asym_id": ["C"]},
    {"label_asym_id": ["D"]},
    {"label_asym_id": ["E"]}
], auto_site_grouping=False, stringent_site_grouping=True)

5V5E [{'label_asym_id': ['C']}, {'label_asym_id': ['D']}, {'label_asym_id': ['E']}]
SITES: [(<Site: 5259>, {'label_asym_id': ['D']})]


In [702]:
Site.get(Site.pdb == pdb).related_sites

{'equivalent': [{'other_site': {'label_asym_id': ['C']},
   'res_of_other_in_site': 0.8461538461538461,
   'res_of_site_in_other': 0.6875},
  {'other_site': {'label_asym_id': ['E']},
   'res_of_other_in_site': 0.75,
   'res_of_site_in_other': 0.28125}],
 'nonequivalent': []}

Same as before.

<br>

In [703]:
pdb = "6b0z"
get_error(pdb)

ENTRIES: 1 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
14,ASD13040000_1,IDH1,Homo sapiens,O75874,6B0Z,ASD01720194,C81,A,Lig,Inhibitor,(4R)-4-[(1S)-1-fluoroethyl]-3-[2-({(1S)-1-[4-m...,502,Allosteric function,Allosteric position,29057061,Discovery and Evaluation of Clinical Candidate...,No,6B0Z


In [704]:
solve_error(pdb, None, auto_site_grouping=True, stringent_site_grouping=False)

6B0Z [[{'auth_asym_id': 'A', 'auth_comp_id': 'C81', 'auth_seq_id': '502'}]]
SITES: [(<Site: 5267>, {'label_asym_id': ['F', 'H', 'J']})]


Three molecules of the annotated modulator bind together in an interfacial site, with one of them in the middle bridging the other two that bind in the same site in different chains of the same protein, respectively.

<br>

In [705]:
pdb = "6brk"
get_error(pdb)

ENTRIES: 1 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
15,ASD17150000_1,Samhd1,Mus musculus,Q60710,6BRK,ASD09690001,DGT,A,Lig,Activator,2'-DEOXYGUANOSINE-5'-TRIPHOSPHATE,701,Allosteric function,Allosteric position,29379009,The SAM domain of mouse SAMHD1 is critical for...,No,"Chain A:ALA416,TYR417,GLY185,ASN423,HIS194,LYS..."


In [706]:
solve_error(pdb, [
    {"label_asym_id": ["C"]},
    {"label_asym_id": ["D"]},
    {"label_asym_id": ["E"]}
], auto_site_grouping=True, stringent_site_grouping=False)

6BRK [{'label_asym_id': ['C']}, {'label_asym_id': ['D']}, {'label_asym_id': ['E']}]
SITES: [(<Site: 5272>, {'label_asym_id': ['C', 'D', 'E']})]


According to the primary citation, the allosteric site is the one with the nucleotides that (can) bind in pairs (and together with a Mg).

<br>

In [707]:
pdb = "6dja"
get_error(pdb)

ENTRIES: 1 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
20,ASD21450000_1,,Bacillus cereus,P14488,6DJA,,ZN,A,ion,activator,ZINC ION,258,Allosteric function,Allosteric position,,,Yes,


In [708]:
solve_error(pdb, None, auto_site_grouping=True, stringent_site_grouping=False)

6DJA [[{'auth_asym_id': 'A', 'auth_comp_id': 'ZN', 'auth_seq_id': '258'}]]
SITES: [(<Site: 5275>, {'label_asym_id': ['B', 'C']})]


Similarly to a previous case, the pair of ZN bind together.

<br>

In [709]:
pdb = "6i0m"
get_error(pdb)

ENTRIES: 3 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped', 'Molecules of the annotated modulator(s) bind close together but were not grouped', 'Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
4,ASD01630000_1,IMPDH2,Homo sapiens,P12268,6I0M,,GDP,A,Lig,Inhibitor,GUANOSINE-5'-DIPHOSPHATE,603,,,30664871,A Nucleotide-Dependent Conformational Switch C...,,6I0M
5,ASD01630000_1,IMPDH2,Homo sapiens,P12268,6I0M,,GDP,A,Lig,Inhibitor,GUANOSINE-5'-DIPHOSPHATE,604,,,30664871,A Nucleotide-Dependent Conformational Switch C...,,6I0M
6,ASD01630000_1,IMPDH2,Homo sapiens,P12268,6I0M,,GDP,A,Lig,Inhibitor,GUANOSINE-5'-DIPHOSPHATE,602,,,30664871,A Nucleotide-Dependent Conformational Switch C...,,6I0M


In [710]:
error_entries = [
    row for i, row in (
        pd.DF(error_entries)
        .merge(df.query(f"allosteric_pdb == '{pdb.upper()}' and modulator_resi in ['603', '604']"), how="outer", indicator=True)
        .query(f"_merge == 'left_only'").drop("_merge", axis=1)
        .iterrows()
    )
]

In [711]:
solve_error(pdb, None, auto_site_grouping=True, stringent_site_grouping=False)

6I0M [[{'auth_asym_id': 'A', 'auth_comp_id': 'GDP', 'auth_seq_id': '602'}]]
SITES: [(<Site: 5284>, {'label_asym_id': ['D', 'E', 'F', 'O', 'P', 'Q']})]


In [712]:
error_entries.append(df.query(f"allosteric_pdb == '{pdb.upper()}' and modulator_resi == '603'").squeeze())

In [713]:
solve_error(pdb, None, auto_site_grouping=True, stringent_site_grouping=False)

6I0M [[{'auth_asym_id': 'A', 'auth_comp_id': 'GDP', 'auth_seq_id': '603'}]]
SITES: [(<Site: 5302>, {'label_asym_id': ['D', 'E', 'F', 'O', 'P', 'Q']})]


In [714]:
error_entries.append(df.query(f"allosteric_pdb == '{pdb.upper()}' and modulator_resi == '604'").squeeze())

In [715]:
solve_error(pdb, None, auto_site_grouping=True, stringent_site_grouping=False)

6I0M [[{'auth_asym_id': 'A', 'auth_comp_id': 'GDP', 'auth_seq_id': '604'}]]
SITES: [(<Site: 5320>, {'label_asym_id': ['D', 'E', 'F', 'O', 'P', 'Q']})]


In [716]:
Site.get(Site.pdb == pdb).info

{'modulator_info': [{'modulator': [{'label_asym_id': 'D'}],
   'label_entity_id': '3',
   'type': 'non-polymer',
   'pdbx_description': "GUANOSINE-5'-DIPHOSPHATE"},
  {'modulator': [{'label_asym_id': 'E'}],
   'label_entity_id': '3',
   'type': 'non-polymer',
   'pdbx_description': "GUANOSINE-5'-DIPHOSPHATE"},
  {'modulator': [{'label_asym_id': 'F'}],
   'label_entity_id': '3',
   'type': 'non-polymer',
   'pdbx_description': "GUANOSINE-5'-DIPHOSPHATE"},
  {'modulator': [{'label_asym_id': 'O'}],
   'label_entity_id': '3',
   'type': 'non-polymer',
   'pdbx_description': "GUANOSINE-5'-DIPHOSPHATE"},
  {'modulator': [{'label_asym_id': 'P'}],
   'label_entity_id': '3',
   'type': 'non-polymer',
   'pdbx_description': "GUANOSINE-5'-DIPHOSPHATE"},
  {'modulator': [{'label_asym_id': 'Q'}],
   'label_entity_id': '3',
   'type': 'non-polymer',
   'pdbx_description': "GUANOSINE-5'-DIPHOSPHATE"}],
 'interacting_chains_info': [{'label_entity_id': '1',
   'interacting_chains': {'label_asym_id': ['

Similar to a previous case (5MCP), the three molecules form a single site (and are automatically grouped as they bind close together with the ones from another chain).

<br>

In [717]:
pdb = "6i0o"
get_error(pdb)

ENTRIES: 3 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped', 'Molecules of the annotated modulator(s) bind close together but were not grouped', 'Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
4,ASD01630000_1,IMPDH2,Homo sapiens,P12268,6I0O,,GTP,A,Lig,Inhibitor,GUANOSINE-5'-TRIPHOSPHATE,601,,,30664871,A Nucleotide-Dependent Conformational Switch C...,,6I0O
5,ASD01630000_1,IMPDH2,Homo sapiens,P12268,6I0O,,GTP,A,Lig,Inhibitor,GUANOSINE-5'-TRIPHOSPHATE,602,,,30664871,A Nucleotide-Dependent Conformational Switch C...,,6I0O
6,ASD01630000_1,IMPDH2,Homo sapiens,P12268,6I0O,,GTP,A,Lig,Inhibitor,GUANOSINE-5'-TRIPHOSPHATE,603,,,30664871,A Nucleotide-Dependent Conformational Switch C...,,6I0O


In [718]:
error_entries = [
    row for i, row in (
        pd.DF(error_entries)
        .merge(df.query(f"allosteric_pdb == '{pdb.upper()}' and modulator_resi in ['602', '603']"), how="outer", indicator=True)
        .query(f"_merge == 'left_only'").drop("_merge", axis=1)
        .iterrows()
    )
]

In [719]:
solve_error(pdb, None, auto_site_grouping=True, stringent_site_grouping=False)

6I0O [[{'auth_asym_id': 'A', 'auth_comp_id': 'GTP', 'auth_seq_id': '601'}]]
SITES: [(<Site: 5329>, {'label_asym_id': ['C', 'D', 'E', 'P', 'Q', 'R']})]


In [720]:
error_entries.append(df.query(f"allosteric_pdb == '{pdb.upper()}' and modulator_resi == '602'").squeeze())

In [721]:
solve_error(pdb, None, auto_site_grouping=True, stringent_site_grouping=False)

6I0O [[{'auth_asym_id': 'A', 'auth_comp_id': 'GTP', 'auth_seq_id': '602'}]]
SITES: [(<Site: 5347>, {'label_asym_id': ['C', 'D', 'E', 'P', 'Q', 'R']})]


In [722]:
error_entries.append(df.query(f"allosteric_pdb == '{pdb.upper()}' and modulator_resi == '603'").squeeze())

In [723]:
solve_error(pdb, None, auto_site_grouping=True, stringent_site_grouping=False)

6I0O [[{'auth_asym_id': 'A', 'auth_comp_id': 'GTP', 'auth_seq_id': '603'}]]
SITES: [(<Site: 5365>, {'label_asym_id': ['C', 'D', 'E', 'P', 'Q', 'R']})]


In [724]:
Site.get(Site.pdb == pdb).info

{'modulator_info': [{'modulator': [{'label_asym_id': 'C'}],
   'label_entity_id': '2',
   'type': 'non-polymer',
   'pdbx_description': "GUANOSINE-5'-TRIPHOSPHATE"},
  {'modulator': [{'label_asym_id': 'D'}],
   'label_entity_id': '2',
   'type': 'non-polymer',
   'pdbx_description': "GUANOSINE-5'-TRIPHOSPHATE"},
  {'modulator': [{'label_asym_id': 'E'}],
   'label_entity_id': '2',
   'type': 'non-polymer',
   'pdbx_description': "GUANOSINE-5'-TRIPHOSPHATE"},
  {'modulator': [{'label_asym_id': 'P'}],
   'label_entity_id': '2',
   'type': 'non-polymer',
   'pdbx_description': "GUANOSINE-5'-TRIPHOSPHATE"},
  {'modulator': [{'label_asym_id': 'Q'}],
   'label_entity_id': '2',
   'type': 'non-polymer',
   'pdbx_description': "GUANOSINE-5'-TRIPHOSPHATE"},
  {'modulator': [{'label_asym_id': 'R'}],
   'label_entity_id': '2',
   'type': 'non-polymer',
   'pdbx_description': "GUANOSINE-5'-TRIPHOSPHATE"}],
 'interacting_chains_info': [{'label_entity_id': '1',
   'interacting_chains': {'label_asym_i

Similar to a previous case (5MCP), the three molecules form a single site (and are automatically grouped as they bind close together with the ones from another chain).

<br>

In [725]:
pdb = "6ms7"
get_error(pdb)

ENTRIES: 1 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
6,ASD09120000_1,PPARG,Homo sapiens,P37231,6MS7,,V77,A,Lig,Activator,{[(1S)-1-(4-chlorophenyl)octyl]oxy}acetic acid,600,Allosteric function,Allosteric position,,,No,6MS7


In [726]:
solve_error(pdb, None, auto_site_grouping=True, stringent_site_grouping=False)

6MS7 [[{'auth_asym_id': 'A', 'auth_comp_id': 'V77', 'auth_seq_id': '600'}]]
SITES: [(<Site: 5368>, {'label_asym_id': ['C', 'D']})]


Both molecules are identified as allosteric in the primary citation and will be annotated as a single allosteric site.

<br>

In [727]:
pdb = "6q4d"
get_error(pdb)

ENTRIES: 3 SITES: [(<Site: 778>, {'label_asym_id': ['C', 'D', 'E']})]
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped', 'Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
4,ASD04590000_1,CDK2,Homo sapiens,P24941,6Q4D,,HHT,A,Lig,Inhibitor,2-(4-bromanyl-2-methoxy-phenyl)ethanoic acid,304,,,,,,6Q4D
5,ASD04590000_1,CDK2,Homo sapiens,P24941,6Q4D,,HHT,A,Lig,Inhibitor,2-(4-bromanyl-2-methoxy-phenyl)ethanoic acid,305,,,,,,6Q4D


In [728]:
df.query(f"allosteric_pdb == '{pdb.upper()}'")

Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
1582,ASD04590000_1,CDK2,Homo sapiens,P24941,6Q4D,,HHT,A,Lig,Inhibitor,2-(4-bromanyl-2-methoxy-phenyl)ethanoic acid,302；303,,,,,,6Q4D
1583,ASD04590000_1,CDK2,Homo sapiens,P24941,6Q4D,,HHT,A,Lig,Inhibitor,2-(4-bromanyl-2-methoxy-phenyl)ethanoic acid,304,,,,,,6Q4D
1584,ASD04590000_1,CDK2,Homo sapiens,P24941,6Q4D,,HHT,A,Lig,Inhibitor,2-(4-bromanyl-2-methoxy-phenyl)ethanoic acid,305,,,,,,6Q4D


In [729]:
error_entries = [
    row for i, row in (
        pd.DF(error_entries)
        .merge(df.query(f"allosteric_pdb == '{pdb.upper()}' and modulator_resi == '305'"), how="outer", indicator=True)
        .query(f"_merge == 'left_only'").drop("_merge", axis=1)
        .iterrows()
    )
]

In [730]:
solve_error(pdb, None, auto_site_grouping=True, stringent_site_grouping=False)

6Q4D [[{'auth_asym_id': 'A', 'auth_comp_id': 'HHT', 'auth_seq_id': '304'}]]
SITES: [(<Site: 5382>, {'label_asym_id': ['C', 'D', 'E']})]


In [731]:
Site.get(Site.pdb == pdb).info

{'modulator_info': [{'modulator': [{'label_asym_id': 'C'}],
   'label_entity_id': '2',
   'type': 'non-polymer',
   'pdbx_description': '2-(4-bromanyl-2-methoxy-phenyl)ethanoic acid'},
  {'modulator': [{'label_asym_id': 'D'}],
   'label_entity_id': '2',
   'type': 'non-polymer',
   'pdbx_description': '2-(4-bromanyl-2-methoxy-phenyl)ethanoic acid'},
  {'modulator': [{'label_asym_id': 'E'}],
   'label_entity_id': '2',
   'type': 'non-polymer',
   'pdbx_description': '2-(4-bromanyl-2-methoxy-phenyl)ethanoic acid'}],
 'interacting_chains_info': [{'label_entity_id': '1',
   'interacting_chains': {'label_asym_id': ['A']},
   'polymer_type': 'polypeptide(L)',
   'Uniprot': ['P24941']}],
 'source': {'allosteric_database': [{'entry': [{'target_id': 'ASD04590000_1',
      'target_gene': 'CDK2',
      'organism': 'Homo sapiens',
      'pdb_uniprot': 'P24941',
      'allosteric_pdb': '6Q4D',
      'modulator_serial': None,
      'modulator_alias': 'HHT',
      'modulator_chain': 'A',
      'modul

In [732]:
error_entries.append(df.query(f"allosteric_pdb == '{pdb.upper()}' and modulator_resi == '305'").squeeze())

In [733]:
solve_error(pdb, None, auto_site_grouping=True, stringent_site_grouping=False)

6Q4D [[{'auth_asym_id': 'A', 'auth_comp_id': 'HHT', 'auth_seq_id': '305'}]]
SITES: [(<Site: 5382>, {'label_asym_id': ['C', 'D', 'E']}), (<Site: 5387>, {'label_asym_id': ['F']})]


Corrections implemented as described before for this PDB and its "multiple" annotation.

<br>

In [734]:
pdb = "6qku"
get_error(pdb)

ENTRIES: 1 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
8,ASD19220000_1,RPA1163,Rhodopseudomonas palustris (strain ATCC BAA-98...,Q6NAM1,6QKU,,R3W,A,Lig,Regulator,chloroacetic acid,402,Allosteric function,Allosteric position,,,Yes,


In [735]:
solve_error(pdb, None, auto_site_grouping=True, stringent_site_grouping=False)

6QKU [[{'auth_asym_id': 'A', 'auth_comp_id': 'R3W', 'auth_seq_id': '402'}]]
SITES: [(<Site: 5390>, {'label_asym_id': ['C', 'D']})]


The modulator is small and the two molecules in the structure bind together in the same site.

<br>

In [736]:
pdb = "6qkw"
get_error(pdb)

ENTRIES: 1 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
9,ASD19220000_1,RPA1163,Rhodopseudomonas palustris (strain ATCC BAA-98...,Q6NAM1,6QKW,,FAH,B,Lig,Regulator,fluoroacetic acid,402,Allosteric function,Allosteric position,,,Yes,


In [737]:
solve_error(pdb, None, auto_site_grouping=True, stringent_site_grouping=False)

6QKW [[{'auth_asym_id': 'B', 'auth_comp_id': 'FAH', 'auth_seq_id': '402'}]]
SITES: [(<Site: 5393>, {'label_asym_id': ['D', 'E']})]


Same as before.

<br>

In [738]:
pdb = "6qxa"
get_error(pdb)

ENTRIES: 1 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
9,ASD21620000_1,hppA,Thermotoga maritima,Q9S5X0,6QXA,,GQB,A,lig,Inhibitor,"~{N}-[(2-azanyl-3~{H}-1,3-benzothiazol-6-yl)me...",809,Allosteric function,Allosteric position,,,No,


In [739]:
solve_error(pdb, None, auto_site_grouping=True, stringent_site_grouping=False)

6QXA [[{'auth_asym_id': 'A', 'auth_comp_id': 'GQB', 'auth_seq_id': '809'}]]
SITES: [(<Site: 5397>, {'label_asym_id': ['M', 'N']})]


The annotated molecule of the modulator binds together and stacked with another molecule and thus should be grouped.

<br>

In [740]:
pdb = "6zxm"
get_error(pdb)

ENTRIES: 1 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
9,ASD21890000_1,LEPBI_p0053,Leptospira biflexa serovar Patoc,B0SUI1,6ZXM,,C2E,A,Lig,Inhibitor,"9,9'-[(2R,3R,3aS,5S,7aR,9R,10R,10aS,12S,14aR)-...",301,,,,,,


In [741]:
solve_error(pdb, None, auto_site_grouping=True, stringent_site_grouping=False)

6ZXM [[{'auth_asym_id': 'A', 'auth_comp_id': 'C2E', 'auth_seq_id': '301'}]]
SITES: [(<Site: 5410>, {'label_asym_id': ['G', 'H']})]


The annotated molecule of the modulator binds together and stacked with another molecule and thus should be grouped.

<br>

In [742]:
pdb = "7a5y"
get_error(pdb)

ENTRIES: 1 SITES: []
ERROR: ['Molecules of the annotated modulator(s) bind close together but were not grouped']


Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
4,ASD09690000_1,SAMHD1,Homo sapiens,Q9Y3Z3,7A5Y,,MG,A,Ion,Regulator,MAGNESIUM ION,703,,,,,,7A5Y


In [743]:
solve_error(pdb, [
    {"label_asym_id": ["EA", "N", "M"]}
], auto_site_grouping=True, stringent_site_grouping=False)

7A5Y [{'label_asym_id': ['EA', 'N', 'M']}]
SITES: [(<Site: 5463>, {'label_asym_id': ['W', 'X', 'DA']})]


In [744]:
Site.get(Site.pdb == pdb).related_sites

{'equivalent': [{'other_site': {'label_asym_id': ['ZA', 'AB', 'FB']},
   'res_of_other_in_site': 0.975,
   'res_of_site_in_other': 1.0},
  {'other_site': {'label_asym_id': ['M', 'N', 'EA']},
   'res_of_other_in_site': 0.975,
   'res_of_site_in_other': 1.0},
  {'other_site': {'label_asym_id': ['O', 'P', 'FA']},
   'res_of_other_in_site': 1.0,
   'res_of_site_in_other': 1.0},
  {'other_site': {'label_asym_id': ['PA', 'XA', 'GB']},
   'res_of_other_in_site': 1.0,
   'res_of_site_in_other': 1.0},
  {'other_site': {'label_asym_id': ['QA', 'RA', 'HB']},
   'res_of_other_in_site': 1.0,
   'res_of_site_in_other': 1.0},
  {'other_site': {'label_asym_id': ['NA', 'SA', 'TA']},
   'res_of_other_in_site': 1.0,
   'res_of_site_in_other': 1.0},
  {'other_site': {'label_asym_id': ['Q', 'R', 'Y']},
   'res_of_other_in_site': 0.975,
   'res_of_site_in_other': 1.0}],
 'nonequivalent': [{'other_site': {'label_asym_id': ['AA', 'BA', 'CA']},
   'res_of_other_in_site': 0.02857142857142857,
   'res_of_site_in

According to the primary citation, the allosteric site binds two GTPs together with a Mg, while the active site will only have on GTP with Mg and Fe, and thus the annotation must be corrected.

<br>

In [745]:
errors_groups()

{'combine_sites failed; len(all_sites) > (len(old_sites) + len(new_sites))': ['1wda',
  '2dew',
  '2dex',
  '2dw5'],
 'W': ['4ple', '4rqo', '5j8v', '5uvg', '6p1q', '6p4v', '7lh2']}

## Unknowns

In [746]:
unknown

Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
482,ASD01140000_3,GABRA1,Homo sapiens,P14867,6I53,,,,Pep,Regulator,Megabody38,,,,30602789.0,Cryo-EM structure of the human alpha 1 beta 3 ...,,6I53
1892,ASD07480000_1,NTRK1,Homo sapiens,P04629,6D22,,,,,,,,,,29672039.0,,,6D22
2911,ASD17660000_1,spuE,Pseudomonas aeruginosa (strain ATCC 15692 / DS...,Q9I6J0,6IKM,,,,Pep,Inhibitor,Anti-SpuE Antibody,,,,,,,
2912,ASD17720000_1,lpxD,Pseudomonas aeruginosa (strain ATCC 15692 / DS...,Q9HXY6,6UEC,,,,Lig,,,,Inner Protein Regulator,Inner Protein,,,No,
3039,ASD21730000_1,lpxA,Pseudomonas aeruginosa,A6V1E4,6UEE,,,,Lig,,,,Inner Protein Regulator,Inner Protein,,,No,
3040,ASD21730000_1,lpxA,Pseudomonas aeruginosa,A6V1E4,6UEG,,,,Lig,,,,Inner Protein Regulator,Inner Protein,,,No,
3079,ASD22000000_1,,Sulfolobus islandicus,Q54324,6GVT,,,,DNA,Regulator,5'-D(*CP*TP*GP*TP*GP*CP*TP*CP*A)-3',,,,,,,
3080,ASD22010000_1,csm2,Streptococcus thermophilus,A0A0A7HIX1,6NUD,,,,RNA,Activator,target ssRNA,,,,,,,


In [747]:
for pdb in unknown.allosteric_pdb:
    Pdb = PDB.get_or_none(PDB.entry_id == pdb.lower())
    if Pdb is not None:
        print(Pdb)
        try:
            print([(s, s.modulator) for s in Pdb.sites])
        except:
            pass

6i53
[(<Site: 6>, {'label_asym_id': ['F']})]
6ikm
[(<Site: 113>, {'label_asym_id': ['BA']})]


The two 'unknown' entries that are annotated as _class peptides were already corrected.

### Error correction

In [748]:
get_unknown = lambda pdb: unknown.query(f"allosteric_pdb == '{pdb.upper()}'").squeeze()    

In [749]:
pdb = "6d22"
get_unknown(pdb)

target_id                  ASD07480000_1
target_gene                        NTRK1
organism                    Homo sapiens
pdb_uniprot                       P04629
allosteric_pdb                      6D22
modulator_serial                     NaN
modulator_alias                      NaN
modulator_chain                      NaN
modulator_class                      NaN
modulator_feature                    NaN
modulator_name                       NaN
modulator_resi                       NaN
function                             NaN
position                             NaN
pubmed_id                       29672039
ref_title                            NaN
site_overlap                         NaN
allosteric_site_residue             6D22
Name: 1892, dtype: object

In [750]:
errors[pdb] = "Wrong annotation"

6D22 does not have any other molecule besides the protein and water that could be identified as the allosteric modulator so it is a wrong entry.

<br>

In [751]:
pdb = "6uec"
get_unknown(pdb)

target_id                                                      ASD17720000_1
target_gene                                                             lpxD
organism                   Pseudomonas aeruginosa (strain ATCC 15692 / DS...
pdb_uniprot                                                           Q9HXY6
allosteric_pdb                                                          6UEC
modulator_serial                                                         NaN
modulator_alias                                                          NaN
modulator_chain                                                          NaN
modulator_class                                                          Lig
modulator_feature                                                        NaN
modulator_name                                                           NaN
modulator_resi                                                           NaN
function                                             Inner Protein Regulator

In [752]:
process_entry(
    get_unknown(pdb),
    updates = {
        pdb.upper(): {
            "pdb": pdb.upper(),
            "mods": [[{"label_asym_id": "B"}]]
        }
    },
    auto_site_grouping=True,
    stringent_site_grouping=True
)

6UEC [[{'label_asym_id': 'B'}]]
Downloading 6uec


In [753]:
pdb = "6uee"
process_entry(
    get_unknown(pdb),
    updates = {
        pdb.upper(): {
            "pdb": pdb.upper(),
            "mods": [[{"label_asym_id": "G"}]]
        }
    },
    auto_site_grouping=True,
    stringent_site_grouping=True
)
[(s, s.modulator, s.related_sites) for s in PDB.get(PDB.entry_id == pdb).sites]

6UEE [[{'label_asym_id': 'G'}]]
Downloading 6uee


[(<Site: 5465>,
  {'label_asym_id': ['G']},
  {'equivalent': [{'other_site': {'label_asym_id': ['J']},
     'res_of_other_in_site': 0.8260869565217391,
     'res_of_site_in_other': 0.95},
    {'other_site': {'label_asym_id': ['M']},
     'res_of_other_in_site': 0.9523809523809523,
     'res_of_site_in_other': 1.0},
    {'other_site': {'label_asym_id': ['O']},
     'res_of_other_in_site': 0.8636363636363636,
     'res_of_site_in_other': 0.95},
    {'other_site': {'label_asym_id': ['P']},
     'res_of_other_in_site': 0.8260869565217391,
     'res_of_site_in_other': 0.95},
    {'other_site': {'label_asym_id': ['Q']},
     'res_of_other_in_site': 0.9523809523809523,
     'res_of_site_in_other': 1.0}],
   'nonequivalent': [{'other_site': {'label_asym_id': ['H']},
     'res_of_other_in_site': 0.0,
     'res_of_site_in_other': 0.0},
    {'other_site': {'label_asym_id': ['R']},
     'res_of_other_in_site': 0.0,
     'res_of_site_in_other': 0.0}]})]

In [754]:
pdb = "6ueg"
process_entry(
    get_unknown(pdb),
    updates = {
        pdb.upper(): {
            "pdb": pdb.upper(),
            "mods": [[{"label_asym_id": "G"}]]
        }
    },
    auto_site_grouping=True,
    stringent_site_grouping=True
)
[(s, s.modulator, s.related_sites) for s in PDB.get(PDB.entry_id == pdb).sites]

6UEG [[{'label_asym_id': 'G'}]]
Downloading 6ueg


[(<Site: 5466>,
  {'label_asym_id': ['G']},
  {'equivalent': [{'other_site': {'label_asym_id': ['H']},
     'res_of_other_in_site': 0.8214285714285714,
     'res_of_site_in_other': 0.8518518518518519},
    {'other_site': {'label_asym_id': ['J']},
     'res_of_other_in_site': 0.8888888888888888,
     'res_of_site_in_other': 0.8888888888888888},
    {'other_site': {'label_asym_id': ['K']},
     'res_of_other_in_site': 0.9629629629629629,
     'res_of_site_in_other': 0.9629629629629629},
    {'other_site': {'label_asym_id': ['L']},
     'res_of_other_in_site': 0.92,
     'res_of_site_in_other': 0.8518518518518519},
    {'other_site': {'label_asym_id': ['M']},
     'res_of_other_in_site': 0.8571428571428571,
     'res_of_site_in_other': 0.8888888888888888}],
   'nonequivalent': []})]

The primary reference supports that the ligands could be allosteric modulators.

<br>

In [755]:
pdb = "6gvt"
get_unknown(pdb)

target_id                                        ASD22000000_1
target_gene                                                NaN
organism                                 Sulfolobus islandicus
pdb_uniprot                                             Q54324
allosteric_pdb                                            6GVT
modulator_serial                                           NaN
modulator_alias                                            NaN
modulator_chain                                            NaN
modulator_class                                            DNA
modulator_feature                                    Regulator
modulator_name             5'-D(*CP*TP*GP*TP*GP*CP*TP*CP*A)-3'
modulator_resi                                             NaN
function                                                   NaN
position                                                   NaN
pubmed_id                                                  NaN
ref_title                                              

In [756]:
process_entry(
    get_unknown(pdb),
    updates = {
        pdb.upper(): {
            "pdb": pdb.upper(),
            "mods": [[{"label_entity_id": "3"}]]
        }
    },
    auto_site_grouping=True,
    stringent_site_grouping=True
)
[(s, s.modulator, s.related_sites) for s in PDB.get(PDB.entry_id == pdb).sites]

6GVT [[{'label_entity_id': '3'}]]
Downloading 6gvt


[(<Site: 5469>,
  {'label_asym_id': ['C', 'D']},
  {'equivalent': [], 'nonequivalent': []})]

In the primary reference it is discussed how the DNA allows for ATP binding due to a conformational change, but then the ATP molecules kickstart a conformational switch that enables the sequence specificity of the DNA. As a single DNA Molecule annotated as the allosteric modulator will be an outlier in the database and the entry is complete as-is, the ATP molecules that are also discussed in a follow-up paper as allosteric are going to be annotated.

<br>

In [757]:
pdb = "6nud"
get_unknown(pdb)

target_id                               ASD22010000_1
target_gene                                      csm2
organism                   Streptococcus thermophilus
pdb_uniprot                                A0A0A7HIX1
allosteric_pdb                                   6NUD
modulator_serial                                  NaN
modulator_alias                                   NaN
modulator_chain                                   NaN
modulator_class                                   RNA
modulator_feature                           Activator
modulator_name                           target ssRNA
modulator_resi                                    NaN
function                                          NaN
position                                          NaN
pubmed_id                                         NaN
ref_title                                         NaN
site_overlap                                      NaN
allosteric_site_residue                           NaN
Name: 3080, dtype: object

In [758]:
errors[pdb] = "Wrong annotation"

In the primary reference, it is discussed how the 3' end of the guide ssRNA most probably allosterically regulates the DNase at a long distance, but this end is not solved in structure and also for uniformity, as this would be the only RNA sequence in the database, it is not going to be annotated.

<br>

In [759]:
errors_groups()

{'combine_sites failed; len(all_sites) > (len(old_sites) + len(new_sites))': ['1wda',
  '2dew',
  '2dex',
  '2dw5'],
 'W': ['4ple', '4rqo', '5j8v', '5uvg', '6d22', '6nud', '6p1q', '6p4v', '7lh2']}

# Statistics

In [760]:
# Total number of sites
len(Site.select())

3088

In [761]:
# Total number of different PDBs
len(PDB.select())

2953

In [762]:
# PDBs with no sites (expected 0)
[(pdb.entry_id, pdb) for p in PDB.select() if len(p.sites) == 0]

[]

In [763]:
# Number of sites in PDBs
set(len(p.sites) for p in PDB.select())

{1, 2, 3, 4}

In [764]:
# Exploration of the PDBs that have 4 different sites
[p for p in PDB.select() if len(p.sites) == 4]

[<PDB: 1iq5>, <PDB: 3j41>, <PDB: 3l76>]

In [765]:
[(s, s.modulator, s.info) for s in PDB.get(PDB.entry_id == "1iq5").sites]

[(<Site: 117>,
  {'label_asym_id': ['C']},
  {'modulator_info': [{'modulator': [{'label_asym_id': 'C'}],
     'label_entity_id': '3',
     'type': 'non-polymer',
     'pdbx_description': 'CALCIUM ION'}],
   'interacting_chains_info': [{'label_entity_id': '1',
     'interacting_chains': {'label_asym_id': ['A']},
     'polymer_type': 'polypeptide(L)',
     'Uniprot': ['P0DP33']}],
   'source': {'allosteric_database': [{'entry': [{'target_id': 'ASD00470000_3',
        'target_gene': 'CALM1',
        'organism': 'Homo sapiens',
        'pdb_uniprot': 'P62158',
        'allosteric_pdb': '1IQ5',
        'modulator_serial': 'ASD11630001;ASD11630001;ASD11630001;ASD11630001',
        'modulator_alias': 'CA;CA;CA;CA',
        'modulator_chain': 'A;A;A;A',
        'modulator_class': 'Ion',
        'modulator_feature': 'Regulator',
        'modulator_name': 'CALCIUM ION',
        'modulator_resi': '361;362;363;364',
        'function': 'Inner Protein Regulator',
        'position': 'Inner Protein'

In [766]:
[(s, s.modulator, s.info) for s in PDB.get(PDB.entry_id == "3j41").sites]

[(<Site: 124>,
  {'label_asym_id': ['G']},
  {'modulator_info': [{'modulator': [{'label_asym_id': 'G'}],
     'label_entity_id': '3',
     'type': 'non-polymer',
     'pdbx_description': 'CALCIUM ION'}],
   'interacting_chains_info': [{'label_entity_id': '2',
     'interacting_chains': {'label_asym_id': ['E']},
     'polymer_type': 'polypeptide(L)',
     'Uniprot': ['P0DP23']}],
   'source': {'allosteric_database': [{'entry': [{'target_id': 'ASD00470000_3',
        'target_gene': 'CALM1',
        'organism': 'Homo sapiens',
        'pdb_uniprot': 'P62158',
        'allosteric_pdb': '3J41',
        'modulator_serial': 'ASD11630001;ASD11630001;ASD11630001;ASD11630001',
        'modulator_alias': 'CA;CA;CA;CA',
        'modulator_chain': 'E;E;E;E',
        'modulator_class': 'Ion',
        'modulator_feature': 'Regulator',
        'modulator_name': 'CALCIUM ION',
        'modulator_resi': '201;202;203;204',
        'function': 'Inner Protein Regulator',
        'position': 'Inner Protein'

In [767]:
[(s, s.modulator, s.info) for s in PDB.get(PDB.entry_id == "3l76").sites]

[(<Site: 925>,
  {'label_asym_id': ['E']},
  {'modulator_info': [{'modulator': [{'label_asym_id': 'E'}],
     'label_entity_id': '3',
     'type': 'non-polymer',
     'pdbx_description': 'LYSINE'}],
   'interacting_chains_info': [{'label_entity_id': '1',
     'interacting_chains': {'label_asym_id': ['A']},
     'polymer_type': 'polypeptide(L)',
     'Uniprot': ['P74569']}],
   'source': {'allosteric_database': [{'entry': [{'target_id': 'ASD00350000_3',
        'target_gene': 'lysC',
        'organism': 'Synechocystis sp.',
        'pdb_uniprot': 'P74569',
        'allosteric_pdb': '3L76',
        'modulator_serial': 'ASD00910001',
        'modulator_alias': 'LYS',
        'modulator_chain': 'A',
        'modulator_class': 'Lig',
        'modulator_feature': 'Inhibitor',
        'modulator_name': '[(5S)-5-amino-6-hydroxy-6-oxo-hexyl]azanium',
        'modulator_resi': '603',
        'function': 'Inner Protein Regulator',
        'position': 'Inner Protein',
        'pubmed_id': '2039867

They indeed bind far away from each other.

<br>

In [768]:
[p for p in PDB.select() if len(p.sites) == 3]

[<PDB: 1wd9>,
 <PDB: 2dey>,
 <PDB: 3b1t>,
 <PDB: 3b1u>,
 <PDB: 7e6t>,
 <PDB: 3vq7>,
 <PDB: 4gqq>,
 <PDB: 3kgf>,
 <PDB: 3nud>,
 <PDB: 5ex4>]

In [769]:
# Modulator identifier fields used
set(tuple(s.modulator.keys()) for s in Site.select())

{('label_asym_id',)}

In [770]:
# Number of different entity instances (different "label_asym_id") annotated as modulators to look for outliers
modulator_chains = dict(
    sorted(
        {s.id: len(s.modulator["label_asym_id"]) for s in Site.select()}.items(),
        key=lambda i: i[-1], reverse=True
    )
)
modulator_chains

{520: 24,
 2807: 9,
 2819: 9,
 582: 8,
 608: 6,
 4806: 6,
 5088: 6,
 5246: 6,
 5320: 6,
 5365: 6,
 160: 4,
 187: 4,
 193: 4,
 368: 4,
 380: 4,
 412: 4,
 684: 4,
 741: 4,
 4493: 4,
 4767: 4,
 336: 3,
 419: 3,
 481: 3,
 482: 3,
 483: 3,
 552: 3,
 569: 3,
 570: 3,
 592: 3,
 599: 3,
 615: 3,
 624: 3,
 2319: 3,
 2327: 3,
 2334: 3,
 2633: 3,
 4542: 3,
 4559: 3,
 4577: 3,
 4590: 3,
 4636: 3,
 4733: 3,
 4834: 3,
 4842: 3,
 4863: 3,
 4886: 3,
 4907: 3,
 4929: 3,
 4949: 3,
 4966: 3,
 4979: 3,
 4992: 3,
 5002: 3,
 5003: 3,
 5122: 3,
 5143: 3,
 5190: 3,
 5200: 3,
 5267: 3,
 5272: 3,
 5382: 3,
 5463: 3,
 5: 2,
 14: 2,
 51: 2,
 75: 2,
 81: 2,
 91: 2,
 92: 2,
 93: 2,
 94: 2,
 95: 2,
 97: 2,
 136: 2,
 140: 2,
 144: 2,
 147: 2,
 150: 2,
 153: 2,
 168: 2,
 175: 2,
 198: 2,
 248: 2,
 254: 2,
 269: 2,
 272: 2,
 288: 2,
 318: 2,
 328: 2,
 339: 2,
 371: 2,
 383: 2,
 418: 2,
 432: 2,
 446: 2,
 463: 2,
 467: 2,
 480: 2,
 488: 2,
 491: 2,
 523: 2,
 526: 2,
 529: 2,
 530: 2,
 573: 2,
 585: 2,
 627: 2,
 748: 2,


In [773]:
# for posterity
modulator_chains = {520: 24,
 2807: 9,
 2819: 9,
 582: 8,
 608: 6,
 4806: 6,
 5088: 6,
 5246: 6,
 5320: 6,
 5365: 6}

## Outlier correction

In [774]:
site = Site.get(Site.id == list(modulator_chains.keys())[0])
site.pdb, site.modulator, site.related_sites, site.info

(<PDB: 3fyh>,
 {'label_asym_id': ['F',
   'G',
   'H',
   'I',
   'J',
   'K',
   'L',
   'M',
   'N',
   'O',
   'P',
   'Q',
   'R',
   'S',
   'T',
   'U',
   'V',
   'W',
   'X',
   'Y',
   'Z',
   'AA',
   'BA',
   'CA']},
 {'equivalent': [], 'nonequivalent': []},
 {'modulator_info': [{'modulator': [{'label_asym_id': 'F'}],
    'label_entity_id': '5',
    'type': 'non-polymer',
    'pdbx_description': 'TUNGSTEN ION'},
   {'modulator': [{'label_asym_id': 'G'}],
    'label_entity_id': '5',
    'type': 'non-polymer',
    'pdbx_description': 'TUNGSTEN ION'},
   {'modulator': [{'label_asym_id': 'H'}],
    'label_entity_id': '5',
    'type': 'non-polymer',
    'pdbx_description': 'TUNGSTEN ION'},
   {'modulator': [{'label_asym_id': 'I'}],
    'label_entity_id': '5',
    'type': 'non-polymer',
    'pdbx_description': 'TUNGSTEN ION'},
   {'modulator': [{'label_asym_id': 'J'}],
    'label_entity_id': '5',
    'type': 'non-polymer',
    'pdbx_description': 'TUNGSTEN ION'},
   {'modulator': 

It is a cluster of tungstens and it is thus correct.

<br>

In [775]:
site = Site.get(Site.id == list(modulator_chains.keys())[1])
site.pdb, list(PDB.get(PDB.entry_id == site.pdb).sites), site.modulator, site.related_sites, site.info

(<PDB: 3cag>,
 [<Site: 2807>],
 {'label_asym_id': ['G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O']},
 {'equivalent': [], 'nonequivalent': []},
 {'modulator_info': [{'modulator': [{'label_asym_id': 'G'}],
    'label_entity_id': '2',
    'type': 'non-polymer',
    'pdbx_description': 'ARGININE'},
   {'modulator': [{'label_asym_id': 'H'}],
    'label_entity_id': '2',
    'type': 'non-polymer',
    'pdbx_description': 'ARGININE'},
   {'modulator': [{'label_asym_id': 'I'}],
    'label_entity_id': '2',
    'type': 'non-polymer',
    'pdbx_description': 'ARGININE'},
   {'modulator': [{'label_asym_id': 'J'}],
    'label_entity_id': '2',
    'type': 'non-polymer',
    'pdbx_description': 'ARGININE'},
   {'modulator': [{'label_asym_id': 'K'}],
    'label_entity_id': '2',
    'type': 'non-polymer',
    'pdbx_description': 'ARGININE'},
   {'modulator': [{'label_asym_id': 'L'}],
    'label_entity_id': '2',
    'type': 'non-polymer',
    'pdbx_description': 'ARGININE'},
   {'modulator': [{'label_asym_i

In [776]:
site.delete_instance()

1

In [777]:
df.query(f"allosteric_pdb == '{site.pdb.entry_id.upper()}'")

Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
1545,ASD04250000_3,argR,Mycobacterium tuberculosis,P9WPY9,3CAG,ASD02000001,ARG,A,Lig,Activator,[amino-[[(4S)-4-amino-5-hydroxy-5-oxo-pentyl]a...,300,Protein-DNA/RNA Interaction Regulator,Protein-Protein Interaction,18703843,Structure of the C-terminal domain of the argi...,No,"Chain A:HIS125,ALA128,SER129,ASP132,THR142,ILE..."


In [778]:
process_entry(
    df.query(f"allosteric_pdb == '{site.pdb.entry_id.upper()}'").squeeze(),
    updates = {
        site.pdb.entry_id.upper(): {
            "pdb": site.pdb.entry_id.upper(),
            "mods": [
                {"label_asym_id": ["J", "N", "K"]},
                {"label_asym_id": ["L", "M", "G"]},
                {"label_asym_id": ["O", "H", "I"]},
            ]
        }
    },
    auto_site_grouping=False,
    stringent_site_grouping=True,
)

3CAG [{'label_asym_id': ['J', 'N', 'K']}, {'label_asym_id': ['L', 'M', 'G']}, {'label_asym_id': ['O', 'H', 'I']}]


In [779]:
site = PDB.get(PDB.entry_id == site.pdb).sites[0]
site.pdb, list(PDB.get(PDB.entry_id == site.pdb).sites), site.modulator, site.related_sites, site.info

(<PDB: 3cag>,
 [<Site: 5471>],
 {'label_asym_id': ['G', 'L', 'M']},
 {'equivalent': [{'other_site': {'label_asym_id': ['J', 'K', 'N']},
    'res_of_other_in_site': 1.5555555555555556,
    'res_of_site_in_other': 1.4482758620689655},
   {'other_site': {'label_asym_id': ['H', 'I', 'O']},
    'res_of_other_in_site': 1.5769230769230769,
    'res_of_site_in_other': 1.4137931034482758}],
  'nonequivalent': []},
 {'modulator_info': [{'modulator': [{'label_asym_id': 'G'}],
    'label_entity_id': '2',
    'type': 'non-polymer',
    'pdbx_description': 'ARGININE'},
   {'modulator': [{'label_asym_id': 'L'}],
    'label_entity_id': '2',
    'type': 'non-polymer',
    'pdbx_description': 'ARGININE'},
   {'modulator': [{'label_asym_id': 'M'}],
    'label_entity_id': '2',
    'type': 'non-polymer',
    'pdbx_description': 'ARGININE'}],
  'interacting_chains_info': [{'label_entity_id': '1',
    'interacting_chains': {'label_asym_id': ['A', 'C', 'D', 'F']},
    'polymer_type': 'polypeptide(L)',
    'Un

In [780]:
site.protein_residues.label_asym_id.unique(), site.nonredundant_site.protein_residues.label_asym_id.unique()

(array(['A', 'C', 'D', 'F'], dtype=object), array(['A', 'C'], dtype=object))

The 9 modulator molecules that bind together can be grouped in 3 symmetry groups.

<br>

In [781]:
site = Site.get(Site.id == list(modulator_chains.keys())[2])
site.pdb, list(PDB.get(PDB.entry_id == site.pdb).sites), site.modulator, site.related_sites, site.info

(<PDB: 3fhz>,
 [<Site: 2819>],
 {'label_asym_id': ['M', 'N', 'P', 'Q', 'S', 'T', 'U', 'W', 'X']},
 {'equivalent': [], 'nonequivalent': []},
 {'modulator_info': [{'modulator': [{'label_asym_id': 'M'}],
    'label_entity_id': '4',
    'type': 'non-polymer',
    'pdbx_description': 'ARGININE'},
   {'modulator': [{'label_asym_id': 'N'}],
    'label_entity_id': '4',
    'type': 'non-polymer',
    'pdbx_description': 'ARGININE'},
   {'modulator': [{'label_asym_id': 'P'}],
    'label_entity_id': '4',
    'type': 'non-polymer',
    'pdbx_description': 'ARGININE'},
   {'modulator': [{'label_asym_id': 'Q'}],
    'label_entity_id': '4',
    'type': 'non-polymer',
    'pdbx_description': 'ARGININE'},
   {'modulator': [{'label_asym_id': 'S'}],
    'label_entity_id': '4',
    'type': 'non-polymer',
    'pdbx_description': 'ARGININE'},
   {'modulator': [{'label_asym_id': 'T'}],
    'label_entity_id': '4',
    'type': 'non-polymer',
    'pdbx_description': 'ARGININE'},
   {'modulator': [{'label_asym_i

In [782]:
site.delete_instance()

1

In [783]:
df.query(f"allosteric_pdb == '{site.pdb.entry_id.upper()}'")

Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
1546,ASD04250000_3,argR,Mycobacterium tuberculosis,P9WPY8,3FHZ,ASD02000001,ARG,A,Lig,Activator,[amino-[[(4S)-4-amino-5-hydroxy-5-oxo-pentyl]a...,200,Protein-DNA/RNA Interaction Regulator,Protein-Protein Interaction,19265706,The structure of the arginine repressor from M...,No,"Chain A:HIS125,SER129,ASP132,THR142,ILE143,ALA..."


In [784]:
process_entry(
    df.query(f"allosteric_pdb == '{site.pdb.entry_id.upper()}'").squeeze(),
    updates = {
        site.pdb.entry_id.upper(): {
            "pdb": site.pdb.entry_id.upper(),
            "mods": [
                {"label_asym_id": ["U", "X", "Q"]},
                {"label_asym_id": ["P", "S", "N"]},
                {"label_asym_id": ["T", "W", "M"]},
            ]
        }
    },
    auto_site_grouping=False,
    stringent_site_grouping=True,
)

3FHZ [{'label_asym_id': ['U', 'X', 'Q']}, {'label_asym_id': ['P', 'S', 'N']}, {'label_asym_id': ['T', 'W', 'M']}]


In [785]:
site = PDB.get(PDB.entry_id == site.pdb).sites[0]
site.pdb, list(PDB.get(PDB.entry_id == site.pdb).sites), site.modulator, site.related_sites, site.info

(<PDB: 3fhz>,
 [<Site: 5474>],
 {'label_asym_id': ['M', 'T', 'W']},
 {'equivalent': [{'other_site': {'label_asym_id': ['Q', 'U', 'X']},
    'res_of_other_in_site': 1.24,
    'res_of_site_in_other': 1.3478260869565217},
   {'other_site': {'label_asym_id': ['N', 'P', 'S']},
    'res_of_other_in_site': 1.25,
    'res_of_site_in_other': 1.3043478260869565}],
  'nonequivalent': []},
 {'modulator_info': [{'modulator': [{'label_asym_id': 'M'}],
    'label_entity_id': '4',
    'type': 'non-polymer',
    'pdbx_description': 'ARGININE'},
   {'modulator': [{'label_asym_id': 'T'}],
    'label_entity_id': '4',
    'type': 'non-polymer',
    'pdbx_description': 'ARGININE'},
   {'modulator': [{'label_asym_id': 'W'}],
    'label_entity_id': '4',
    'type': 'non-polymer',
    'pdbx_description': 'ARGININE'}],
  'interacting_chains_info': [{'label_entity_id': '1',
    'interacting_chains': {'label_asym_id': ['A', 'C', 'D', 'F']},
    'polymer_type': 'polypeptide(L)',
    'Uniprot': ['P9WPY9']}],
  'sou

In [786]:
site.protein_residues.label_asym_id.unique(), site.nonredundant_site.protein_residues.label_asym_id.unique()

(array(['A', 'C', 'D', 'F'], dtype=object), array(['A', 'C'], dtype=object))

The 9 modulator molecules that bind together can be grouped in 3 symmetry groups; although this time one of the arginines on each group is only partially solved.

<br>

In [787]:
site = Site.get(Site.id == list(modulator_chains.keys())[3])
site.pdb, list(PDB.get(PDB.entry_id == site.pdb).sites), site.modulator, site.related_sites, site.info

(<PDB: 4dqw>,
 [<Site: 582>],
 {'label_asym_id': ['C', 'D', 'E', 'F', 'I', 'J', 'K', 'L']},
 {'equivalent': [], 'nonequivalent': []},
 {'modulator_info': [{'modulator': [{'label_asym_id': 'C'}],
    'label_entity_id': '2',
    'type': 'non-polymer',
    'pdbx_description': "ADENOSINE-5'-TRIPHOSPHATE"},
   {'modulator': [{'label_asym_id': 'D'}],
    'label_entity_id': '3',
    'type': 'non-polymer',
    'pdbx_description': 'MANGANESE (II) ION'},
   {'modulator': [{'label_asym_id': 'E'}],
    'label_entity_id': '2',
    'type': 'non-polymer',
    'pdbx_description': "ADENOSINE-5'-TRIPHOSPHATE"},
   {'modulator': [{'label_asym_id': 'F'}],
    'label_entity_id': '3',
    'type': 'non-polymer',
    'pdbx_description': 'MANGANESE (II) ION'},
   {'modulator': [{'label_asym_id': 'I'}],
    'label_entity_id': '2',
    'type': 'non-polymer',
    'pdbx_description': "ADENOSINE-5'-TRIPHOSPHATE"},
   {'modulator': [{'label_asym_id': 'J'}],
    'label_entity_id': '3',
    'type': 'non-polymer',
    

In [788]:
site = Site.get(Site.id == list(modulator_chains.keys())[4])
site.pdb, list(PDB.get(PDB.entry_id == site.pdb).sites), site.modulator, site.related_sites, site.info

(<PDB: 4kh0>,
 [<Site: 608>],
 {'label_asym_id': ['G', 'H', 'I', 'L', 'M', 'N']},
 {'equivalent': [], 'nonequivalent': []},
 {'modulator_info': [{'modulator': [{'label_asym_id': 'G'}],
    'label_entity_id': '5',
    'type': 'non-polymer',
    'pdbx_description': "ADENOSINE-5'-TRIPHOSPHATE"},
   {'modulator': [{'label_asym_id': 'H'}],
    'label_entity_id': '5',
    'type': 'non-polymer',
    'pdbx_description': "ADENOSINE-5'-TRIPHOSPHATE"},
   {'modulator': [{'label_asym_id': 'I'}],
    'label_entity_id': '6',
    'type': 'non-polymer',
    'pdbx_description': 'MAGNESIUM ION'},
   {'modulator': [{'label_asym_id': 'L'}],
    'label_entity_id': '5',
    'type': 'non-polymer',
    'pdbx_description': "ADENOSINE-5'-TRIPHOSPHATE"},
   {'modulator': [{'label_asym_id': 'M'}],
    'label_entity_id': '5',
    'type': 'non-polymer',
    'pdbx_description': "ADENOSINE-5'-TRIPHOSPHATE"},
   {'modulator': [{'label_asym_id': 'N'}],
    'label_entity_id': '6',
    'type': 'non-polymer',
    'pdbx_de

In [789]:
site = Site.get(Site.id == list(modulator_chains.keys())[5])
site.pdb, list(PDB.get(PDB.entry_id == site.pdb).sites), site.modulator, site.related_sites, site.info

(<PDB: 3lsl>,
 [<Site: 4806>],
 {'label_asym_id': ['E', 'F', 'G', 'K', 'L', 'M']},
 {'equivalent': [{'other_site': {'label_asym_id': ['Q', 'R', 'S']},
    'res_of_other_in_site': 1.0,
    'res_of_site_in_other': 0.6896551724137931}],
  'nonequivalent': []},
 {'modulator_info': [{'modulator': [{'label_asym_id': 'E'}],
    'label_entity_id': '3',
    'type': 'non-polymer',
    'pdbx_description': '2-(2-oxopyrrolidin-1-yl)acetamide'},
   {'modulator': [{'label_asym_id': 'F'}],
    'label_entity_id': '3',
    'type': 'non-polymer',
    'pdbx_description': '2-(2-oxopyrrolidin-1-yl)acetamide'},
   {'modulator': [{'label_asym_id': 'G'}],
    'label_entity_id': '3',
    'type': 'non-polymer',
    'pdbx_description': '2-(2-oxopyrrolidin-1-yl)acetamide'},
   {'modulator': [{'label_asym_id': 'K'}],
    'label_entity_id': '3',
    'type': 'non-polymer',
    'pdbx_description': '2-(2-oxopyrrolidin-1-yl)acetamide'},
   {'modulator': [{'label_asym_id': 'L'}],
    'label_entity_id': '3',
    'type': '

In [790]:
site = Site.get(Site.id == list(modulator_chains.keys())[6])
site.pdb, list(PDB.get(PDB.entry_id == site.pdb).sites), site.modulator, site.related_sites, site.info

(<PDB: 4z87>,
 [<Site: 5088>],
 {'label_asym_id': ['F', 'G', 'H', 'L', 'M', 'N']},
 {'equivalent': [{'other_site': {'label_asym_id': ['Y', 'Z', 'AA']},
    'res_of_other_in_site': 0.984375,
    'res_of_site_in_other': 0.984375},
   {'other_site': {'label_asym_id': ['R', 'S', 'T']},
    'res_of_other_in_site': 0.9206349206349206,
    'res_of_site_in_other': 0.90625}],
  'nonequivalent': []},
 {'modulator_info': [{'modulator': [{'label_asym_id': 'F'}],
    'label_entity_id': '3',
    'type': 'non-polymer',
    'pdbx_description': "GUANOSINE-5'-DIPHOSPHATE"},
   {'modulator': [{'label_asym_id': 'G'}],
    'label_entity_id': '3',
    'type': 'non-polymer',
    'pdbx_description': "GUANOSINE-5'-DIPHOSPHATE"},
   {'modulator': [{'label_asym_id': 'H'}],
    'label_entity_id': '3',
    'type': 'non-polymer',
    'pdbx_description': "GUANOSINE-5'-DIPHOSPHATE"},
   {'modulator': [{'label_asym_id': 'L'}],
    'label_entity_id': '3',
    'type': 'non-polymer',
    'pdbx_description': "GUANOSINE-5'

In [791]:
site = Site.get(Site.id == list(modulator_chains.keys())[7])
site.pdb, list(PDB.get(PDB.entry_id == site.pdb).sites), site.modulator, site.related_sites, site.info

(<PDB: 5tc3>,
 [<Site: 5246>],
 {'label_asym_id': ['D', 'E', 'F', 'I', 'J', 'K']},
 {'equivalent': [], 'nonequivalent': []},
 {'modulator_info': [{'modulator': [{'label_asym_id': 'D'}],
    'label_entity_id': '3',
    'type': 'non-polymer',
    'pdbx_description': "ADENOSINE-5'-TRIPHOSPHATE"},
   {'modulator': [{'label_asym_id': 'E'}],
    'label_entity_id': '4',
    'type': 'non-polymer',
    'pdbx_description': "GUANOSINE-5'-DIPHOSPHATE"},
   {'modulator': [{'label_asym_id': 'F'}],
    'label_entity_id': '4',
    'type': 'non-polymer',
    'pdbx_description': "GUANOSINE-5'-DIPHOSPHATE"},
   {'modulator': [{'label_asym_id': 'I'}],
    'label_entity_id': '3',
    'type': 'non-polymer',
    'pdbx_description': "ADENOSINE-5'-TRIPHOSPHATE"},
   {'modulator': [{'label_asym_id': 'J'}],
    'label_entity_id': '4',
    'type': 'non-polymer',
    'pdbx_description': "GUANOSINE-5'-DIPHOSPHATE"},
   {'modulator': [{'label_asym_id': 'K'}],
    'label_entity_id': '4',
    'type': 'non-polymer',
  

These cases are correct.

<br>

In [792]:
db.close()

True