In [1]:
import SPARQLWrapper
import tqdm
import itertools
import pandas as pd

In [2]:
%%time

# Connect to the public UniProt SPARQL endpoint
sparql = SPARQLWrapper.SPARQLWrapper("https://sparql.uniprot.org/sparql")

# Query: binding-site annotations whose ligand comment mentions "allosteric"
# (case-insensitive). Each row is a distinct combination of
#   ?uniprot_id - accession, i.e. the protein IRI with the uniprotkb: prefix stripped
#   ?begin/?end - faldo positions of the annotated range
#   ?comment    - the rdfs:comment attached to the ligand
# A plain string is used instead of an f-string: nothing is interpolated, so the
# SPARQL braces need no escaping. The previous `?type`/`?site` bindings were never
# selected or filtered on, so they are dropped; the DISTINCT projection is unchanged.
query_string = """
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX faldo: <http://biohackathon.org/resource/faldo#>
PREFIX uniprotkb: <http://purl.uniprot.org/uniprot/>
PREFIX up: <http://purl.uniprot.org/core/>
SELECT DISTINCT ?uniprot_id ?begin ?end ?comment
WHERE
{
  ?protein up:annotation ?annotation .
  BIND(substr(str(?protein), strlen(str(uniprotkb:))+1) AS ?uniprot_id)

  ?annotation a up:Binding_Site_Annotation .
  ?annotation up:range ?range .
  ?range faldo:begin/faldo:position ?begin .
  ?range faldo:end/faldo:position ?end .

  ?annotation up:ligand ?ligand .
  ?ligand rdfs:comment ?comment .
  FILTER(REGEX(?comment, "allosteric", "i"))
}
"""
sparql.setQuery(query_string)

# Request JSON so the converted response can be indexed as data["results"]["bindings"]
sparql.setReturnFormat(SPARQLWrapper.JSON)

# Run the query and parse the response.
# NOTE: this is slow; the recorded run of the original query took about six minutes of wall time.
data = sparql.query().convert()

CPU times: total: 3.42 s
Wall time: 6min 10s


In [21]:
# Flatten the SPARQL JSON bindings: each binding maps a variable name to a
# dict, of which only the 'value' entry is kept.
bindings = data["results"]["bindings"]
allosteric_site_annotations = [
    {variable: cell['value'] for variable, cell in row.items()}
    for row in bindings
]

# Build the table with a fixed column order matching the SELECT clause
df_allosteric_sites = pd.DataFrame(
    allosteric_site_annotations,
    columns=['uniprot_id', 'begin', 'end', 'comment'],
)

In [22]:
# Number of distinct UniProt accessions present in the annotation table
df_allosteric_sites['uniprot_id'].nunique()

65855

In [23]:
# Save the annotation table to CSV; index=False omits the DataFrame row index
df_allosteric_sites.to_csv('Allosteric_Site_Annotations_UniProt.csv', index=False)

In [24]:
# Show the annotation table as the cell output
df_allosteric_sites

Unnamed: 0,uniprot_id,begin,end,comment
0,C1F1E7,187,192,allosteric inhibitor
1,C1F1E7,147,149,allosteric inhibitor
2,C1F1E7,14,14,allosteric inhibitor
3,C1F1E7,223,223,allosteric inhibitor
4,Q1IKC9,187,192,allosteric inhibitor
...,...,...,...,...
271838,A0A5N6KQS5,545,549,allosteric activator; ligand shared between di...
271839,A0A5N6KQS5,676,676,allosteric activator; ligand shared between di...
271840,A0A5N6KQS5,754,754,allosteric activator; ligand shared between di...
271841,A0A5N6KQS5,590,592,allosteric activator; ligand shared between di...


In [26]:
# Count rows per distinct ligand comment, most frequent first
df_allosteric_sites.value_counts('comment')

comment
allosteric inhibitor                                                                  151102
allosteric activator; ligand shared between dimeric partners                          105141
allosteric activator                                                                   15137
allosteric activator; ligand shared between two neighboring subunits                     115
allosteric effector that controls substrate specificity                                  110
allosteric activator; ligand shared between 3 neighboring subunits of the tetramer        97
allosteric activator; ligand shared with subunit beta                                     40
allosteric effector                                                                       37
allosteric activator; ligand shared with subunit alpha                                    36
allosteric inhibitor; ligand shared between dimeric partners                              23
allosteric inhibitor; ligand shared between homodimeric partne