# Issues with ASD Tab Delimited File (ASD_Release_202309_AS.txt)

## Read the data from the tab delimited file

Download the `ASD_Release_202309_AS.tar.gz` archive, which contains the tab-delimited file `ASD_Release_202309_AS.txt` with the allosteric site descriptions of all entries available in the ASD. Place the extracted file in the `../data` directory, or provide the path to the extracted file.

The `allosteric_site_residue` column of the `ASD_Release_202309_AS.txt` file contains many erroneous entries, such as a PDB ID in place of a list of amino acid residues.


In [1]:
import pandas as pd

# Load the tab-delimited ASD release; the first line of the file supplies the
# column names.
df_asd = pd.read_csv('../data/ASD_Release_202309_AS.txt', sep='\t')

# Summarise the table: total rows, then distinct PDB IDs and UniProt accessions
# (missing values are not counted by nunique()).
for summary_label, summary_value in (
    ('Number of Rows:             ', len(df_asd)),
    ('Number of Unique PDB IDs:   ', df_asd['allosteric_pdb'].nunique()),
    ('Number of Unique UniProt AC:', df_asd['pdb_uniprot'].nunique()),
):
    print(summary_label, summary_value)

Number of Rows:              3102
Number of Unique PDB IDs:    2963
Number of Unique UniProt AC: 696


## Rows with Empty Cells in the Column for Allosteric Site Residue

In [2]:
# Show every row whose allosteric_site_residue cell is empty (NaN).
df_asd.loc[lambda frame: frame['allosteric_site_residue'].isnull()]

Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
1,ASD00020000_2,aroF,Escherichia coli,P00888,6AGM,,TYR,A,Lig,Inhibitor,TYROSINE,601,Inner Protein Regulator,Inner Protein,,,No,
64,ASD00250000_7,glgC,Rhizobium radiobacter,P39669,5W5T,,9X7,F,Lig,Activator,ethyl 2-oxopropanoate,503,Allosteric function,Allosteric position,30401744,Structural analysis reveals a pyruvate-binding...,No,
234,ASD00600000_2,,Clostridium pasteurianum,P00268,4XNV,,BUR,A,Lig,Inhibitor,1-[2-(2-tert-butylphenoxy)pyridin-3-yl]-3- [4-...,1101,Inner Protein Regulator,Allosteric Position,25822790,Two disparate ligand-binding sites in the huma...,No,
587,ASD01230000_2,Gria2,Rattus norvegicus,P19491,6FAZ,,D45,B,Lig,modulator,"6,6'-(Ethane-1,2-diyl)bis(4-methyl-3,4-dihydro...",301,,,29775064,Enhancing Action of Positive Allosteric Modula...,,
588,ASD01230000_2,Gria2,Rattus norvegicus,P19491,6HC9,,FXW,B,Lig,Activator,"6,6'-(ETHANE-1,2-DIYL)BIS(4-CYCLOPROPYL-3,4-DI...",313,Allosteric function,Allosteric position,,,No,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3081,ASD22020000_1,,Bothrops moojeni,Q9I834,6PWH,,VRD,A,Lig,,({3-[amino(oxo)acetyl]-1-benzyl-2-ethyl-1H-ind...,201,,,,,,
3082,ASD22030000_1,gyrB,Staphylococcus aureus,P0A0K8,6QX1,,JK8,B,Lig,Inhibitor,"(2~{R})-2-[[5-(2-chlorophenyl)-1,2-benzoxazol-...",702,,,,,,
3083,ASD22030000_1,gyrB,Staphylococcus aureus,P0A0K8,6QX2,,JK8,B,Lig,Inhibitor,"(2~{R})-2-[[5-(2-chlorophenyl)-1,2-benzoxazol-...",702,,,,,,
3086,ASD22050000_1,pfk,Trypanosoma brucei brucei,O15648,6QU3,,JJ5,A,Lig,Inhibitor,"1-[(3,4-dichlorophenyl)methyl]-7~{H}-pyrrolo[3...",1001,,,,,,


In [3]:
# Step 1: discard the rows that have no allosteric_site_residue value at all.
df_asd_not_na = df_asd.dropna(subset=['allosteric_site_residue'])

# Step 2: of the remaining rows, keep those whose value does NOT begin with
# 'Chain'. In the rows displayed below, the value repeats the PDB ID
# instead of listing residues.
df_sub = df_asd_not_na.loc[
    lambda frame: ~frame['allosteric_site_residue'].str.startswith('Chain')
]
df_sub

Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
7,ASD00060000_1,PDPK1,Homo sapiens,O15530,3HRF,ASD00060001,P47,A,Lig,Activator,(Z)-5-(4-chlorophenyl)-3-phenyl-pent-2-enoic acid,1374,Inner Protein Regulator,Inner Protein,19718043,Structure and allosteric effects of low-molecu...,No,3HRF
8,ASD00060000_1,PDPK1,Homo sapiens,O15530,3NAX,ASD00060027,MP7,A,Lig,Inhibitor,"1-[(3,4-difluorophenyl)methyl]-2-oxo-N-[(1R)-2...",363,Inner Protein Regulator,Inner Protein,21118801,Genetic and pharmacological inhibition of PDK1...,Yes,3NAX
9,ASD00060000_1,PDPK1,Homo sapiens,O15530,3ORX,ASD00060035,1F8,A,Lig,Inhibitor,2-methyl-N-(2-sulfanylethyl)-1-benzofuran-3-ca...,1,Inner Protein Regulator,Inner Protein,21430264,Turning a protein kinase on or off from a sing...,No,3ORX
10,ASD00060000_1,PDPK1,Homo sapiens,O15530,3ORZ,ASD00068003,2A2,A,Lig,Activator,1-[4-(3-chlorophenyl)piperazin-1-yl]-4-sulfany...,1,Inner Protein Regulator,Inner Protein,21430264,Turning a protein kinase on or off from a sing...,No,3ORZ
11,ASD00060000_1,PDPK1,Homo sapiens,O15530,3OTU,ASD00068007,J30,A,Lig,Activator,1-[4-(naphthalen-1-ylmethyl)piperazin-1-yl]-4-...,1,Inner Protein Regulator,Inner Protein,21430264,Turning a protein kinase on or off from a sing...,No,3OTU
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3073,ASD21950000_1,SLC26A5,Homo sapiens,P58743,7LGU,,CL,A,Ion,Regulator,CHLORIDE ION,801,,,,,,7LGU
3074,ASD21950000_1,SLC26A5,Homo sapiens,P58743,7LGW,,CL,A,Ion,Regulator,CHLORIDE ION,801,,,,,,7LGW
3075,ASD21950000_1,SLC26A5,Homo sapiens,P58743,7LH2,,CL,A,Ion,Regulator,CHLORIDE ION,802,,,,,,7LH2
3084,ASD22040000_1,LDHA,Homo sapiens,P00338,6SBU,,L5N,A,Lig,Inhibitor,4-[[4-[(5-chloranylthiophen-2-yl)carbonylamino...,1001,,,,,,6SBU


In [4]:
# The PDB IDs shown in this notebook are 4 characters long; list the rows of
# df_sub whose allosteric_site_residue value is longer than that.
df_sub.loc[df_sub['allosteric_site_residue'].str.len().gt(4)]

Unnamed: 0,target_id,target_gene,organism,pdb_uniprot,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,modulator_resi,function,position,pubmed_id,ref_title,site_overlap,allosteric_site_residue
1668,ASD04970000_1,HRAS,Homo sapiens,P01112,5e+95,,NS1,B,Pep,Inhibitor,a synthetic binding protein (monobody),1-97,Allosteric Function,Allosteric Position,27820802,Inhibition of RAS function through targeting a...,No,5e+95


In [5]:
# Same three summary counts as for the full table, now restricted to df_sub
# (rows whose allosteric_site_residue is present but does not start with 'Chain').
sub_unique_counts = df_sub[['allosteric_pdb', 'pdb_uniprot']].nunique()
print('Number of Rows:             ', len(df_sub))
print('Number of Unique PDB IDs:   ', sub_unique_counts['allosteric_pdb'])
print('Number of Unique UniProt AC:', sub_unique_counts['pdb_uniprot'])

Number of Rows:              1409
Number of Unique PDB IDs:    1338
Number of Unique UniProt AC: 256
