# Test Set Creation

In [1]:
import ast
import gzip
import itertools
import json
import os
import urllib.request
import warnings
from glob import glob

# import get_pdb_data
import pandas as pd
import requests
from Bio.PDB import PDBParser
from SPARQLWrapper import SPARQLWrapper, JSON
from tqdm import tqdm


def uniref50_clusters(uniprot_ids, batch_size=200):
    """Look up the UniRef50 cluster of each UniProt accession via SPARQL.

    The accessions are sent to the UniProt SPARQL endpoint in batches, each
    batch being inlined into the ``VALUES`` clause of one query.

    Parameters
    ----------
    uniprot_ids : iterable of str
        UniProt accessions to look up (a list or a pandas Series both work).
    batch_size : int, default 200
        Number of accessions sent per query. Must be at least 1.

    Returns
    -------
    pandas.DataFrame
        One row per binding returned by the endpoint, with columns
        ``uniprot_id`` and ``uniref50_cluster``. Accessions for which the
        endpoint returns no binding do not appear in the result.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # Materialise the input once so the number of batches is known up front
    # and the progress bar can show a total.
    accessions = list(uniprot_ids)
    n_batches = (len(accessions) + batch_size - 1) // batch_size

    # Set the SPARQL endpoint (UniProt); the return format does not change
    # between queries, so configure it once outside the loop.
    sparql = SPARQLWrapper("https://sparql.uniprot.org/sparql")
    sparql.setReturnFormat(JSON)

    output = []
    for uniprot_subset in tqdm(itertools.batched(accessions, batch_size), total=n_batches):
        uniprot_string = ' '.join(f'uniprotkb:{accession}' for accession in uniprot_subset)

        # Define the query
        query_string = f"""
PREFIX up: <http://purl.uniprot.org/core/>
PREFIX uniprotkb: <http://purl.uniprot.org/uniprot/>
PREFIX uniref: <http://purl.uniprot.org/uniref/>

SELECT ?primaryAccession ?cluster_id
FROM <http://sparql.uniprot.org/uniref>
FROM <http://sparql.uniprot.org/uniprot>
WHERE
{{
    VALUES ?protein {{ {uniprot_string} }}
    ?cluster up:member/up:sequenceFor ?protein;
             up:identity 0.5 .
    BIND(substr(str(?protein), strlen(str(uniprotkb:))+1) AS ?primaryAccession)
    BIND(substr(str(?cluster), strlen(str(uniref:))+1) AS ?cluster_id)
}}"""
        sparql.setQuery(query_string)

        # Run the SPARQL query and convert to the defined format
        results = sparql.query().convert()

        # Collect one (accession, cluster) pair per returned binding
        for result in results["results"]["bindings"]:
            output.append([result["primaryAccession"]["value"],
                           result["cluster_id"]["value"]])
    return pd.DataFrame(output, columns=['uniprot_id', 'uniref50_cluster'])


# One-letter -> three-letter (upper-case) codes for the 20 amino acids listed here.
protein_letters_1to3 = {
    "A": "ALA",
    "C": "CYS",
    "D": "ASP",
    "E": "GLU",
    "F": "PHE",
    "G": "GLY",
    "H": "HIS",
    "I": "ILE",
    "K": "LYS",
    "L": "LEU",
    "M": "MET",
    "N": "ASN",
    "P": "PRO",
    "Q": "GLN",
    "R": "ARG",
    "S": "SER",
    "T": "THR",
    "V": "VAL",
    "W": "TRP",
    "Y": "TYR",
}

Combined list of PDBs used to train AlloPred, AlloSite, PASSER, and other ML-based allosteric site prediction packages.

In [2]:
# Table of proteins already used in existing training sets.
df_training = pd.read_csv(
    filepath_or_buffer='../output/Proteins_in_Training_Sets.csv',
)

In [3]:
# Size of the training table: rows, distinct PDB entries, distinct UniProt accessions.
training_row_count = len(df_training)
training_pdb_count = df_training['pdb_id'].nunique()
training_uniprot_count = df_training['uniprot_id'].nunique()

print('Number of Rows:             ', training_row_count)
print('Number of Unique PDB IDs:   ', training_pdb_count)
print('Number of Unique UniProt AC:', training_uniprot_count)

Number of Rows:              363
Number of Unique PDB IDs:    360
Number of Unique UniProt AC: 266


In [4]:
# ASD entries that come with a potential binding (active) site annotation.
df_asd_with_active_site = pd.read_csv(
    filepath_or_buffer='../output/ASD_with_Potential_Binding_Site.csv',
)

In [5]:
# Show the column labels of the ASD table that the following cells select from.
df_asd_with_active_site.columns

Index(['target_id', 'target_gene', 'organism', 'allosteric_pdb',
       'modulator_serial', 'modulator_alias', 'modulator_chain',
       'modulator_class', 'modulator_feature', 'modulator_name',
       'modulator_resi', 'function', 'position', 'pubmed_id', 'ref_title',
       'site_overlap', 'allosteric_site_residues', 'pdb_uniprot', 'sequence',
       'active_site_residues', 'catalytic_site', 'catalytic_site_resids',
       'Number_of_Binding_Site_Residues',
       'Number_of_Allosteric_Site_Residues'],
      dtype='object')

In [6]:
def get_chain(residues):
    """Return the set of chain identifiers found in a list of residue labels.

    Each label is a '-'-separated string; the chain identifier is the text
    before the first '-' (e.g. 'A-PHE-157' -> 'A').
    """
    chains = set()
    for label in residues:
        chain_id, _, _ = label.partition('-')
        chains.add(chain_id)
    return chains

# The residue lists are stored in the CSV as Python-literal strings.
# ast.literal_eval parses literals only, so (unlike eval) a malformed or
# malicious cell cannot execute arbitrary code.
df_asd_with_active_site['allosteric_site_chains'] = (
    df_asd_with_active_site['allosteric_site_residues']
    .apply(ast.literal_eval)
    .apply(get_chain)
)

# Compare the chains seen in the site residues with the recorded modulator chain.
df_asd_with_active_site[['pdb_uniprot', 'allosteric_pdb', 'allosteric_site_residues', 'modulator_chain', 'allosteric_site_chains']]

Unnamed: 0,pdb_uniprot,allosteric_pdb,allosteric_site_residues,modulator_chain,allosteric_site_chains
0,Q9K169,4UC5,"['B-THR-7', 'B-ILE-12', 'B-ILE-10', 'B-ASP-8',...",A,{B}
1,O15530,5MRD,"['A-PHE-157', 'A-TYR-156', 'A-LEU-155', 'A-ARG...",A,{A}
2,O15530,4XX9,"['A-TYR-156', 'A-ILE-118', 'A-ILE-119', 'A-ARG...",A,{A}
3,O15530,4AW0,"['A-LYS-76', 'A-LYS-115', 'A-ILE-118', 'A-ILE-...",A,{A}
4,O15530,4RQK,"['A-TYR-156', 'A-ILE-118', 'A-ILE-119', 'A-ARG...",A,{A}
...,...,...,...,...,...
2240,V5TDZ4,5Z46,"['A-GLU-63', 'A-ASP-40', 'A-GLU-64', 'A-ASP-65...",A,{A}
2241,P09874,6BHV,"['A-SER-864', 'A-GLY-876', 'A-PHE-869', 'A-SER...",A,{A}
2242,B0SN40,4OV4,"['A-LEU-75', 'A-THR-178', 'A-HIS-99', 'A-GLU-2...",A,{A}
2243,B0SN40,4OV9,"['A-LEU-75', 'A-THR-178', 'A-HIS-99', 'A-ASN-1...",A,{A}


In [7]:
# Size of the ASD table: rows, distinct PDB entries, distinct UniProt accessions.
asd_row_count = len(df_asd_with_active_site)
asd_pdb_count = df_asd_with_active_site['allosteric_pdb'].nunique()
asd_uniprot_count = df_asd_with_active_site['pdb_uniprot'].nunique()

print('Number of Rows:             ', asd_row_count)
print('Number of Unique PDB IDs:   ', asd_pdb_count)
print('Number of Unique UniProt AC:', asd_uniprot_count)

Number of Rows:              2245
Number of Unique PDB IDs:    2104
Number of Unique UniProt AC: 432


## Get UniRef Clusters

In [8]:
# Query each training-set accession only once.
training_accessions = df_training['uniprot_id'].drop_duplicates()
df_training_clusters = uniref50_clusters(training_accessions)

2it [00:11,  5.72s/it]


In [9]:
# Attach the UniRef50 cluster to every training row. The join key is named
# explicitly so the merge cannot silently start joining on any other column
# the two frames might come to share.
df_traning_merged = df_training.merge(df_training_clusters, how='left', on='uniprot_id')
df_traning_merged

Unnamed: 0,pdb_id,uniprot_id,asd90,allosite,passer_ensemble,allositepro,allopred,uniprot_id_from_pdb,num_uniprot,present,passer_automl,passer_rank,uniref50_cluster
0,11BG,P00669,,,,True,True,,,,True,True,UniRef50_P07998
1,1AO0,P00497,True,True,True,,,,,,,,UniRef50_P00497
2,1B86,P68871,,,,True,True,,,,,,UniRef50_P68871
3,1BJ4,P34896,,,,,,,,,True,True,UniRef50_P34896
4,1CE8,P00968,True,True,True,True,True,,,,True,True,UniRef50_P00968
...,...,...,...,...,...,...,...,...,...,...,...,...,...
358,4TPT,P53671,,,,,,,,,True,True,UniRef50_P53671
359,4TQC,P06730,,,,,,,,,True,True,UniRef50_P06730
360,4TVG,Q90EA1,,,,,,,,,True,True,UniRef50_P04585
361,4UUU,P35520,,,,,,,,,True,True,UniRef50_P35520


In [10]:
# Look up clusters for the distinct ASD accessions, then attach them to every ASD row.
asd_accessions = df_asd_with_active_site['pdb_uniprot'].drop_duplicates()
df_asd_clusters = uniref50_clusters(asd_accessions)
df_asd_merged = pd.merge(
    df_asd_with_active_site,
    df_asd_clusters,
    how='left',
    left_on='pdb_uniprot',
    right_on='uniprot_id',
)

3it [00:07,  2.36s/it]


In [11]:
# Rows for which no UniRef50 cluster was found.
df_asd_merged.loc[df_asd_merged['uniref50_cluster'].isna()]

Unnamed: 0,target_id,target_gene,organism,allosteric_pdb,modulator_serial,modulator_alias,modulator_chain,modulator_class,modulator_feature,modulator_name,...,pdb_uniprot,sequence,active_site_residues,catalytic_site,catalytic_site_resids,Number_of_Binding_Site_Residues,Number_of_Allosteric_Site_Residues,allosteric_site_chains,uniprot_id,uniref50_cluster


In [12]:
# ASD rows whose UniRef50 cluster also occurs among the training-set clusters.
cluster_in_training = df_asd_merged['uniref50_cluster'].isin(df_traning_merged['uniref50_cluster'])
df_unselected = df_asd_merged[cluster_in_training]

unselected_row_count = len(df_unselected)
unselected_pdb_count = df_unselected['allosteric_pdb'].nunique()
unselected_uniprot_count = df_unselected['pdb_uniprot'].nunique()

print('Number of Rows:             ', unselected_row_count)
print('Number of Unique PDB IDs:   ', unselected_pdb_count)
print('Number of Unique UniProt AC:', unselected_uniprot_count)

Number of Rows:              1825
Number of Unique PDB IDs:    1718
Number of Unique UniProt AC: 246


In [13]:
# ASD rows whose UniRef50 cluster does not occur among the training-set clusters.
cluster_seen_in_training = df_asd_merged['uniref50_cluster'].isin(df_traning_merged['uniref50_cluster'])
df_selected = df_asd_merged[~cluster_seen_in_training]

selected_row_count = len(df_selected)
selected_pdb_count = df_selected['allosteric_pdb'].nunique()
selected_uniprot_count = df_selected['pdb_uniprot'].nunique()

print('Number of Rows:             ', selected_row_count)
print('Number of Unique PDB IDs:   ', selected_pdb_count)
print('Number of Unique UniProt AC:', selected_uniprot_count)

Number of Rows:              420
Number of Unique PDB IDs:    388
Number of Unique UniProt AC: 186


## Download PDB files

In [14]:
# Download the `.pdb1` file of every selected PDB entry from RCSB, skipping
# entries that are already on disk.
# NOTE(review): `.pdb1` is presumably RCSB's first biological assembly - confirm.
os.makedirs('../data/pdb_downloaded/', exist_ok=True)

for pdb_id in tqdm(list(set(df_selected['allosteric_pdb']))):
    pdb_path = f'../data/pdb_downloaded/{pdb_id}.pdb'
    if os.path.exists(pdb_path):
        continue  # already downloaded on a previous run

    try:
        urllib.request.urlretrieve(f'https://files.rcsb.org/download/{pdb_id.lower()}.pdb1', pdb_path)
    except OSError as exc:
        # urllib's URLError / HTTPError / ContentTooShortError all derive from
        # OSError. Catching only these keeps KeyboardInterrupt and genuine
        # programming errors visible instead of swallowing them.
        if os.path.exists(pdb_path):
            # Remove a partially written file so the next run retries it
            # rather than treating it as a finished download.
            os.remove(pdb_path)
        print(f'{pdb_id}: download failed ({exc})')

100%|██████████| 388/388 [00:00<00:00, 55455.26it/s]


PDB ID 4V9C does not have a .pdb format file available, so it is excluded from the residue checks in the following cells.

In [15]:
# Build a table of residues (structure, model, chain, number, name) from every
# PDB file in the download directory. Note that the glob picks up all `.pdb`
# files present there, not only those belonging to df_selected.
parser = PDBParser()  # one parser is enough; it is reused for every file

residues = []
for pdb_file in tqdm(glob('../data/pdb_downloaded/*.pdb')):
    pdb_id = os.path.splitext(os.path.basename(pdb_file))[0]

    with warnings.catch_warnings(action="ignore"):
        structure = parser.get_structure(pdb_id, pdb_file)

    for residue in structure.get_residues():
        # Use distinct names for the id fields so the parsed `structure`
        # object is not overwritten while its residues are being iterated.
        structure_id, model_id, chain_id, (hetfield, res_id, _icode) = residue.get_full_id()
        # Keep only residues whose hetero field is blank
        # (presumably Biopython's marker for non-HETATM residues - confirm).
        if hetfield == ' ':
            residues.append([structure_id, model_id, chain_id, res_id, residue.get_resname()])

df_residues = pd.DataFrame(residues, columns=['structure', 'model', 'chain', 'res_id', 'res_name'])

100%|██████████| 391/391 [00:29<00:00, 13.36it/s]


In [16]:
# Expand each selected structure into one row per active-site residue, with
# the residue number (`res_id`) and the three-letter name expected from the
# reference sequence (`res_name`).
df_sub = df_selected[['allosteric_pdb', 'sequence', 'active_site_residues']].drop_duplicates()

# Exclude 4V9C by value. Dropping by df_selected's index labels raises KeyError
# whenever one of those labels was already removed by drop_duplicates() above.
df_sub = df_sub[df_sub['allosteric_pdb'] != '4V9C'].copy()

# The lists are stored as Python-literal strings; literal_eval parses them
# without executing arbitrary code the way eval would.
df_sub['active_site_residues'] = df_sub['active_site_residues'].apply(ast.literal_eval)
df_sub = df_sub.explode('active_site_residues')

# Positions are used as 1-based indices into the sequence (hence the `- 1`).
df_sub['res_name'] = df_sub.apply(lambda x: protein_letters_1to3[x['sequence'][x['active_site_residues'] - 1]], axis=1)

df_sub['res_id'] = df_sub['active_site_residues'].astype("Int64")
df_sub = (
    df_sub
    .drop(columns=['sequence', 'active_site_residues'])
    .rename(columns={'allosteric_pdb': 'structure'})
)

In [17]:
# Match expected active-site residues against the residues parsed from the
# PDB files; `_merge == 'left_only'` marks an expected residue with no match.
df_merge = pd.merge(
    df_sub,
    df_residues,
    how='left',
    on=['structure', 'res_id', 'res_name'],
    indicator=True,
).drop_duplicates()

has_no_match = df_merge['_merge'] == 'left_only'
mismatch_structures = df_merge.loc[has_no_match, 'structure'].unique()
len(mismatch_structures)

191

In [18]:
# Keep only structures in which every expected active-site residue was matched.
is_fully_matched = ~df_merge['structure'].isin(mismatch_structures)
df_merge_match = df_merge[is_fully_matched].copy()

# Label each residue as '<chain>-<res_id>-<res_name>'.
# NOTE(review): the 'allosteric_site_residues' labels shown earlier read
# '<chain>-<res_name>-<res_id>'; confirm the differing order is intended.
df_merge_match['active_site'] = (
    df_merge_match['chain']
    + '-' + df_merge_match['res_id'].astype(str)
    + '-' + df_merge_match['res_name']
)

# Collapse to one row per structure holding the set of its residue labels.
site_labels = df_merge_match[['structure', 'active_site']]
df_grouped = site_labels.groupby(['structure']).agg(set).reset_index()

In [19]:
# Default (inner) merge: only selected rows whose structure has an entry in df_grouped remain.
df_selected_no_mismatch = pd.merge(
    df_selected,
    df_grouped,
    left_on='allosteric_pdb',
    right_on='structure',
)

In [20]:
# Size of the final table: rows, distinct PDB entries, distinct UniProt accessions.
final_row_count = len(df_selected_no_mismatch)
final_pdb_count = df_selected_no_mismatch['allosteric_pdb'].nunique()
final_uniprot_count = df_selected_no_mismatch['pdb_uniprot'].nunique()

print('Number of Rows:             ', final_row_count)
print('Number of Unique PDB IDs:   ', final_pdb_count)
print('Number of Unique UniProt AC:', final_uniprot_count)

Number of Rows:              212
Number of Unique PDB IDs:    197
Number of Unique UniProt AC: 104


In [21]:
# Columns written to the test-set CSV, in output order.
test_set_columns = [
    'target_id', 'target_gene', 'organism', 'pdb_uniprot', 'allosteric_pdb',
    'modulator_serial', 'modulator_alias', 'modulator_chain',
    'modulator_class', 'modulator_feature', 'modulator_name',
    'modulator_resi', 'function', 'position', 'pubmed_id', 'ref_title',
    'site_overlap', 'allosteric_site_residues', 'active_site',
]
df_selected_no_mismatch[test_set_columns].to_csv('../output/Test_Set.csv', index=False)