In [1]:
import gzip
import json
import re
from pathlib import Path

import pandas as pd
import requests
import tqdm
from bs4 import BeautifulSoup


def table2array(content, indent=None):
    """Convert the rows of an HTML table into plain Python data.

    Args:
        content: HTML markup containing ``<tr>`` rows.
        indent: Unused; kept only so existing callers that pass it still work.

    Returns:
        A list with one entry per non-empty row:

        * if the markup has a ``<thead>`` and at least one ``<th>``, each entry
          is a dict mapping the lower-cased, stripped header text to the cell
          text (cell text is kept verbatim, not stripped);
        * otherwise each entry is a list of stripped cell texts.

        Rows without any ``<td>`` cells (e.g. the header row itself) are skipped.
    """
    soup = BeautifulSoup(content, "html.parser")

    # Column names are only collected when the markup declares a <thead>.
    headers = []
    if soup.find("thead"):
        headers = [th.text.strip().lower() for th in soup.find_all("th")]

    data = []
    for row in soup.find_all("tr"):
        cells = row.find_all("td")
        if headers:
            # zip() stops at the shorter sequence, so a row with fewer cells
            # than headers yields a partial dict instead of raising IndexError.
            items = {name: cell.text for name, cell in zip(headers, cells)}
        else:
            items = [cell.text.strip() for cell in cells]
        if items:
            data.append(items)
    return data

In [2]:
# Load the saved ASD-Sites HTML page as text.
string = Path('../data/ASD-Sites/ASD-Sites.html').read_text()

soup = BeautifulSoup(string, 'html.parser')
# Every element whose id matches the grid-view pattern is treated as an entry.
entries = soup.find_all(id=re.compile(r'gridview-1011-[\s\d]*'))

# Keep the first parsed row of each entry; identical rows are dropped afterwards.
site_columns = ['Target', 'Organism', 'Modulator', 'Allosteric Site', 'Orthosteric Site', 'PDB_ID']
site_rows = [table2array(str(entry))[0] for entry in entries]
df = pd.DataFrame(site_rows, columns=site_columns)
df = df.drop_duplicates()

In [3]:
# Show the de-duplicated site table built from the saved HTML page.
df

Unnamed: 0,Target,Organism,Modulator,Allosteric Site,Orthosteric Site,PDB_ID
0,Amine oxidase,Homo sapiens,XCG,A_ASD0700_1_2XCG_1,O_ASD07000000_1GOS,2XCG
1,Amine oxidase,Homo sapiens,XCG,A_ASD0700_1_2XFN_1,O_ASD07000000_1GOS,2XFN
2,Amine oxidase,Homo sapiens,XCG,A_ASD0700_1_2XFO_1,O_ASD07000000_1GOS,2XFO
3,Amine oxidase,Homo sapiens,XCG,A_ASD0700_1_2XFP_1,O_ASD07000000_1GOS,2XFP
4,Amine oxidase,Homo sapiens,XCG,A_ASD0700_1_2XFQ_1,O_ASD07000000_1GOS,2XFQ
...,...,...,...,...,...,...
3097,GTPase Kras,Homo sapiens,V4T,A_ASD0639_1_7O70_1,,7O70
3098,GTPase Kras,Homo sapiens,V52,A_ASD0639_1_7O83_1,,7O83
3099,GTPase Kras,Homo sapiens,VLE,A_ASD0639_1_7OO7_1,,7OO7
3100,Calcium/calmodulin-dependent protein kinase ki...,Homo sapiens,609,A_ASD2015_1_2ZV2_1,,2ZV2


Failed to download the following orthosteric sites:
* O_ASD0950_1_3ATH
* O_ASD2057_1_3U1I
* O_ASD0322_1_6N2W
* O_ASD1652_1_6H5O
* O_ASD2165_1_6R02

In [4]:
import glob
import tqdm
import os
from Bio.PDB import PDBParser
from collections import defaultdict
import pandas as pd

from Bio import BiopythonWarning
import warnings

# Collect, for every orthosteric-site PDB file, the residues it contains.
orthosteric_sites = []
with warnings.catch_warnings():
    # Silence Biopython warnings while parsing; the previous filters are
    # restored when the ``with`` block exits.
    warnings.simplefilter('ignore', BiopythonWarning)

    parser = PDBParser()

    # glob.glob() returns paths in arbitrary order; sorting keeps the row order
    # (and therefore the DataFrame index) reproducible between runs.
    for pdb_file in sorted(glob.glob('../data/ASD-Sites/Orthosteric_Sites/*.pdb')):
        # The file name without its extension is used as the site label.
        site_label = os.path.splitext(os.path.basename(pdb_file))[0]
        structure = parser.get_structure('structure', pdb_file)
        # One "<chain id>-<residue name>-<residue number>" label per residue.
        residue_list = [
            f'{residue.get_parent().id}-{residue.resname}-{residue.id[1]}'
            for residue in structure.get_residues()
        ]
        orthosteric_sites.append([site_label, residue_list])

df_ortho_sites = pd.DataFrame(orthosteric_sites, columns=['Orthosteric Site', 'Orthosteric Site Residues'])
df_ortho_sites['Number of Orthosteric Site Residues'] = df_ortho_sites['Orthosteric Site Residues'].apply(len)
df_ortho_sites

Unnamed: 0,Orthosteric Site,Orthosteric Site Residues,Number of Orthosteric Site Residues
0,O_ASD00020000_1KFL,"[A-CYS-61, A-ARG-92, A-TYR-94, A-GLU-96, A-LYS...",23
1,O_ASD00020000_1RZM,"[A-CYS-102, A-LYS-131, A-PRO-132, A-GLU-164, A...",15
2,O_ASD00060000_3HRF,"[A-HOH-16, A-HOH-47, A-LEU-88, A-GLY-89, A-GLU...",47
3,O_ASD0006_1_4XX9,"[A-LYS-86, A-ILE-87, A-LEU-88, A-GLY-89, A-GLU...",19
4,O_ASD0006_1_5ACK,"[A-GLY-85, A-LYS-86, A-ILE-87, A-LEU-88, A-GLY...",30
...,...,...,...
204,O_ASD1435_1_5WDY,"[A-GLU-226, A-GLY-228, A-ARG-229, A-GLY-230, A...",33
205,O_ASD1457_1_2P02,"[A-ILE-227, A-SER-228, A-HIS-231, A-LEU-237, A...",24
206,O_ASD1522_1_1QOQ,"[A-PHE-19, A-VAL-20, A-PRO-21, A-PHE-22, A-VAL...",54
207,O_ASD1634_1_5NQQ,"[A-ASP-5, A-LEU-7, A-ILE-8, A-HIS-9, A-ASN-10,...",75


In [5]:
# Sanity check: list orthosteric sites for which no residue was parsed.
df_ortho_sites.loc[df_ortho_sites['Number of Orthosteric Site Residues'].lt(1)]

Unnamed: 0,Orthosteric Site,Orthosteric Site Residues,Number of Orthosteric Site Residues


In [6]:
import glob
import tqdm
import os
from Bio.PDB import PDBParser
from collections import defaultdict
import pandas as pd
from Bio import BiopythonWarning
import warnings

# Collect, for every allosteric-site PDB file, the residues it contains.
allosteric_sites = []
with warnings.catch_warnings():
    # Silence Biopython warnings while parsing; the previous filters are
    # restored when the ``with`` block exits.
    warnings.simplefilter('ignore', BiopythonWarning)

    parser = PDBParser()

    # glob.glob() returns paths in arbitrary order; sorting keeps the row order
    # (and therefore the DataFrame index) reproducible between runs.
    for pdb_file in sorted(glob.glob('../data/ASD-Sites/Allosteric_Sites/*.pdb')):
        # The file name without its extension is used as the site label.
        site_label = os.path.splitext(os.path.basename(pdb_file))[0]
        structure = parser.get_structure('structure', pdb_file)
        # One "<chain id>-<residue name>-<residue number>" label per residue.
        residue_list = [
            f'{residue.get_parent().id}-{residue.resname}-{residue.id[1]}'
            for residue in structure.get_residues()
        ]
        allosteric_sites.append([site_label, residue_list])

df_allo_sites = pd.DataFrame(allosteric_sites, columns=['Allosteric Site', 'Allosteric Site Residues'])
df_allo_sites['Number of Allosteric Site Residues'] = df_allo_sites['Allosteric Site Residues'].apply(len)
df_allo_sites

Unnamed: 0,Allosteric Site,Allosteric Site Residues,Number of Allosteric Site Residues
0,A_ASD0002_1_1KFL_1,"[A-ARG-40, A-MSE-147, A-ILE-148, A-PRO-150, A-...",24
1,A_ASD0002_2_6AGM_1,"[A-LYS-24, A-ARG-41, A-ASP-147, A-PRO-148, A-A...",21
2,A_ASD0002_4_3PG9_1,"[A-SER-31, A-LYS-32, A-GLY-33, A-GLN-34, A-GLU...",23
3,A_ASD0002_5_1OF6_1,"[A-ARG-55, A-ASP-161, A-THR-162, A-ILE-163, A-...",28
4,A_ASD0002_5_1OFR_1,"[G-ARG-55, G-ASP-161, G-THR-162, G-ILE-163, G-...",26
...,...,...,...
3069,A_ASD8006_2_3OWZ_1,"[A-G-32, A-A-33, A-A-34, A-G-35, A-G-36, A-A-3...",19
3070,A_ASD8006_2_3OXE_1,"[A-G-32, A-A-33, A-A-34, A-G-35, A-G-36, A-A-6...",18
3071,A_ASD8006_2_3OXJ_1,"[A-G-32, A-A-33, A-A-34, A-G-35, A-G-36, A-A-3...",22
3072,A_ASD8006_2_3OXM_1,"[A-G-32, A-A-33, A-A-34, A-G-35, A-G-36, A-A-6...",21


In [7]:
# Sanity check: list allosteric sites for which no residue was parsed.
df_allo_sites.loc[df_allo_sites['Number of Allosteric Site Residues'].lt(1)]

Unnamed: 0,Allosteric Site,Allosteric Site Residues,Number of Allosteric Site Residues
87,A_ASD0031_5_4RER_1,[],0
251,A_ASD0075_1_5TKV_1,[],0
339,A_ASD0102_1_6P1D_1,[],0
340,A_ASD0102_1_6P1L_1,[],0
341,A_ASD0102_1_6P1Q_1,[],0
...,...,...,...
3041,A_ASD2192_1_7AIA_1,[],0
3047,A_ASD2195_1_7LH2_1,[],0
3051,A_ASD2200_1_6GVT_1,[],0
3052,A_ASD2201_1_6NUD_1,[],0


In [8]:
# Keep only entries for which both an allosteric-site and an orthosteric-site
# residue table exist (inner joins on the respective site label).
df_with_allo = df.merge(df_allo_sites, how='inner', on='Allosteric Site')
df_filtered = df_with_allo.merge(df_ortho_sites, how='inner', on='Orthosteric Site')
df_filtered

Unnamed: 0,Target,Organism,Modulator,Allosteric Site,Orthosteric Site,PDB_ID,Allosteric Site Residues,Number of Allosteric Site Residues,Orthosteric Site Residues,Number of Orthosteric Site Residues
0,Amine oxidase,Homo sapiens,XCG,A_ASD0700_1_2XCG_1,O_ASD07000000_1GOS,2XCG,"[A-GLU-84, A-LEU-88, A-PHE-99, A-GLY-101, A-PR...",28,"[A-GLY-57, A-GLY-58, A-SER-59, A-TYR-60, A-PHE...",17
1,Amine oxidase,Homo sapiens,XCG,A_ASD0700_1_2XFN_1,O_ASD07000000_1GOS,2XFN,"[A-GLU-84, A-LEU-88, A-PHE-99, A-GLY-101, A-PR...",26,"[A-GLY-57, A-GLY-58, A-SER-59, A-TYR-60, A-PHE...",17
2,Amine oxidase,Homo sapiens,XCG,A_ASD0700_1_2XFO_1,O_ASD07000000_1GOS,2XFO,"[A-GLU-84, A-LEU-88, A-PHE-99, A-GLY-101, A-PR...",22,"[A-GLY-57, A-GLY-58, A-SER-59, A-TYR-60, A-PHE...",17
3,Amine oxidase,Homo sapiens,XCG,A_ASD0700_1_2XFP_1,O_ASD07000000_1GOS,2XFP,"[A-GLU-84, A-LEU-88, A-PHE-99, A-GLY-101, A-PR...",28,"[A-GLY-57, A-GLY-58, A-SER-59, A-TYR-60, A-PHE...",17
4,Amine oxidase,Homo sapiens,XCG,A_ASD0700_1_2XFQ_1,O_ASD07000000_1GOS,2XFQ,"[A-GLU-84, A-LEU-88, A-PHE-99, A-GLY-101, A-PR...",24,"[A-GLY-57, A-GLY-58, A-SER-59, A-TYR-60, A-PHE...",17
...,...,...,...,...,...,...,...,...,...,...
1401,2-C-methyl-D-erythritol 4-phosphate cytidylylt...,Arabidopsis thaliana,Q9P,A_ASD0401_1_5MRO_1,O_ASD0401_1_1VPA,5MRO,"[A-ARG-157, A-GLN-158, A-VAL-161, A-ILE-177, A...",23,"[A-LEU-7, A-ALA-8, A-ALA-9, A-GLY-10, A-LYS-11...",27
1402,2-C-methyl-D-erythritol 4-phosphate cytidylylt...,Arabidopsis thaliana,6BC,A_ASD0401_1_5MRP_1,O_ASD0401_1_1VPA,5MRP,"[A-GLU-156, A-ARG-157, A-GLN-158, A-ASP-159, A...",25,"[A-LEU-7, A-ALA-8, A-ALA-9, A-GLY-10, A-LYS-11...",27
1403,Tryptophan synthase beta chain,Mycobacterium tuberculosis (strain ATCC 25618 ...,79V,A_ASD1522_1_5TCI_1,O_ASD1522_1_1QOQ,5TCI,"[A-TYR-62, A-ASP-64, A-PRO-65, A-GLY-66, A-MET...",111,"[A-PHE-19, A-VAL-20, A-PRO-21, A-PHE-22, A-VAL...",54
1404,Tryptophan synthase beta chain,Mycobacterium tuberculosis (strain ATCC 25618 ...,79V,A_ASD1522_1_5TCJ_1,O_ASD1522_1_1QOQ,5TCJ,"[A-TYR-62, A-ASP-64, A-PRO-65, A-GLY-66, A-MET...",112,"[A-PHE-19, A-VAL-20, A-PRO-21, A-PHE-22, A-VAL...",54


In [9]:
# PDB id -> UniProt id mapping taken from the ASD release table.
df_asd = (
    pd.read_csv('../output/ASD_Release_202306.csv', usecols=['allosteric_pdb', 'pdb_uniprot'])
    .rename(columns={'allosteric_pdb': 'PDB_ID', 'pdb_uniprot': 'UniProt ID'})
    .drop_duplicates()
)
# Left join so that sites without a UniProt mapping are kept (with NaN).
df_merged = df_filtered.merge(df_asd, how='left', on='PDB_ID')

In [16]:
# Entries whose PDB id has no UniProt id in the ASD release table.
df_merged.loc[df_merged['UniProt ID'].isna()]

Unnamed: 0,Target,Organism,Modulator,Allosteric Site,Orthosteric Site,PDB_ID,Allosteric Site Residues,Number of Allosteric Site Residues,Orthosteric Site Residues,Number of Orthosteric Site Residues,UniProt ID
234,Aspartate transcarbamoylase,Escherichia coli,ATP,A_ASD0033_2_4KGV_1,O_ASD00330000_4KH0,4KGV,"[B-GLU-10, B-ALA-11, B-ILE-12, B-LYS-13, B-VAL...",24,"[A-PHE-48, A-GLU-50, A-ALA-51, A-SER-52, A-THR...",38,
235,Aspartate transcarbamoylase,Escherichia coli,CTP,A_ASD0033_2_4KGX_1,O_ASD00330000_4KH0,4KGX,"[B-GLU-10, B-ALA-11, B-ILE-12, B-LYS-13, B-VAL...",23,"[A-PHE-48, A-GLU-50, A-ALA-51, A-SER-52, A-THR...",38,
236,Aspartate transcarbamoylase,Escherichia coli,ATP;MG;ATP,A_ASD0033_2_4KH0_1,O_ASD00330000_4KH0,4KH0,"[B-ASP-19, B-HIS-20, B-LYS-56, B-LYS-94, B-HOH...",8,"[A-PHE-48, A-GLU-50, A-ALA-51, A-SER-52, A-THR...",38,
237,Aspartate transcarbamoylase,Escherichia coli,CTP;MG;UTP,A_ASD0033_2_4KH1_1,O_ASD00330000_4KH0,4KH1,"[B-ASP-19, B-HIS-20, B-LYS-56, B-HOH-301, B-HO...",5,"[A-PHE-48, A-GLU-50, A-ALA-51, A-SER-52, A-THR...",38,
273,Glutamate dehydrogenase (Bovine),Bos taurus,XEG,A_ASD0124_1_3QMU_1,O_ASD01240000_3MW9,3QMU,"[A-ILE-203, A-SER-204, A-GLN-205, A-GLY-206, A...",30,"[A-MET-111, A-LYS-114, A-LYS-126, A-PRO-167, A...",13,
274,Glutamate dehydrogenase (Bovine),Bos taurus,GTP,A_ASD0124_1_3MVQ_1,O_ASD01240000_3MW9,3MVQ,"[A-HIS-209, A-GLY-210, A-ARG-211, A-ILE-212, A...",17,"[A-MET-111, A-LYS-114, A-LYS-126, A-PRO-167, A...",13,
283,Glutamate dehydrogenase (Bovine),Bos taurus,GTP,A_ASD0124_1_1HWZ_1,O_ASD01240000_3MW9,1HWZ,"[A-HIS-209, A-GLY-210, A-ARG-211, A-ILE-212, A...",16,"[A-MET-111, A-LYS-114, A-LYS-126, A-PRO-167, A...",13,
284,Glutamate dehydrogenase (Bovine),Bos taurus,GTP,A_ASD0124_1_3MW9_1,O_ASD01240000_3MW9,3MW9,"[A-HIS-209, A-GLY-210, A-ARG-211, A-ILE-212, A...",20,"[A-MET-111, A-LYS-114, A-LYS-126, A-PRO-167, A...",13,
285,Glutamate dehydrogenase (Bovine),Bos taurus,NAI,A_ASD0124_1_3MW9_2,O_ASD01240000_3MW9,3MW9,"[A-HIS-195, A-LYS-201, A-ILE-203, A-SER-204, A...",43,"[A-MET-111, A-LYS-114, A-LYS-126, A-PRO-167, A...",13,
1297,Pyruvate carboxylase,Listeria monocytogenes R479a,2BA,A_ASD0555_5_4QSH_1,O_ASD05550000_4JX6,4QSH,"[A-TYR-722, A-SER-756, A-2BA-2003, A-HOH-2147,...",6,"[A-ARG-548, A-ASP-549, A-GLN-552, A-GLY-586, A...",18,
