In [1]:
import pandas as pd

from pandas.io.json import json_normalize
from SPARQLWrapper import SPARQLWrapper, JSON

In [3]:
def query_wikidata(sparql_query: str, sparql_service_url: str) -> pd.DataFrame:
    """
    Run a SPARQL query against an endpoint and return the result bindings as a pandas DataFrame.

    Parameters
    ----------
    sparql_query : str
        The SPARQL query text to send to the endpoint.
    sparql_service_url : str
        URL of the SPARQL endpoint to query.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``results -> bindings`` in the JSON response.
        Nested binding fields are flattened into dot-separated column names
        (for example ``drug.value`` and ``drug.type``).

    Notes
    -----
    The response is indexed as ``response["results"]["bindings"]``, so a response
    without those keys is not handled here and will raise on lookup.
    """
    # create the connection to the endpoint and ask for a JSON response
    endpoint = SPARQLWrapper(sparql_service_url)
    endpoint.setQuery(sparql_query)
    endpoint.setReturnFormat(JSON)

    # send the query and decode the response into Python objects
    response = endpoint.query().convert()

    # pd.json_normalize is the supported top-level API (pandas >= 1.0);
    # the pandas.io.json.json_normalize location is deprecated.
    return pd.json_normalize(response["results"]["bindings"])

In [4]:
# SPARQL query: walks drug -> gene product -> gene -> disease, also fetches the
# gene's Entrez ID, and asks the label service for English labels.
# The LIMIT clause caps the result at 1000 rows.
# NOTE(review): ?gene_product is matched but not selected and there is no DISTINCT,
#   so the same (drug, gene, disease) combination can presumably appear in more than one row.
# NOTE(review): ?drug is not restricted to any item type; the first row of the sample
#   output in this notebook shows a protein label in the drug column - consider a type constraint.
# NOTE(review): there is no ORDER BY, so which 1000 rows are returned is presumably
#   not guaranteed to be stable between runs - confirm.
sparql_query = """SELECT ?drug ?drugLabel ?gene ?geneLabel ?entrez_id ?disease ?diseaseLabel WHERE {
      ?drug wdt:P129 ?gene_product .   # drug interacts with a gene_product 
      ?gene_product wdt:P702 ?gene .  # gene_product is encoded by a gene
      ?gene wdt:P2293 ?disease .    # gene is genetically associated with a disease 
      ?gene wdt:P351 ?entrez_id .  # get the entrez gene id for the gene 
      # add labels
      SERVICE wikibase:label {
            bd:serviceParam wikibase:language "en" .
      }
    }
    limit 1000
    """
# To query a different endpoint, change both the service URL below and the query above.
sparql_service_url = "https://query.wikidata.org/sparql"
# Send the query and keep the flattened bindings for the following cells.
result_table = query_wikidata(sparql_query, sparql_service_url)

In [5]:
# Sanity check: (number of rows, number of columns) of the query result.
result_table.shape

(1000, 17)

In [6]:
# Preview the first rows; each selected SPARQL variable is spread over several
# dotted columns such as `<name>.type` and `<name>.value`.
result_table.head()

Unnamed: 0,disease.type,disease.value,diseaseLabel.type,diseaseLabel.value,diseaseLabel.xml:lang,drug.type,drug.value,drugLabel.type,drugLabel.value,drugLabel.xml:lang,entrez_id.type,entrez_id.value,gene.type,gene.value,geneLabel.type,geneLabel.value,geneLabel.xml:lang
0,uri,http://www.wikidata.org/entity/Q55783469,literal,melanoma-pancreatic cancer syndrome,en,uri,http://www.wikidata.org/entity/Q21171873,literal,hypothetical protein CTL0003,en,literal,1029,uri,http://www.wikidata.org/entity/Q5009957,literal,CDKN2A,en
1,uri,http://www.wikidata.org/entity/Q3704732,literal,antithrombin III deficiency,en,uri,http://www.wikidata.org/entity/Q416516,literal,enoxaparin,en,literal,462,uri,http://www.wikidata.org/entity/Q14861061,literal,SERPINC1,en
2,uri,http://www.wikidata.org/entity/Q4795508,literal,aromatase excess syndrome,en,uri,http://www.wikidata.org/entity/Q194974,literal,letrozole,en,literal,1588,uri,http://www.wikidata.org/entity/Q14865015,literal,CYP19A1,en
3,uri,http://www.wikidata.org/entity/Q4795508,literal,aromatase excess syndrome,en,uri,http://www.wikidata.org/entity/Q241150,literal,aminoglutethimide,en,literal,1588,uri,http://www.wikidata.org/entity/Q14865015,literal,CYP19A1,en
4,uri,http://www.wikidata.org/entity/Q4795508,literal,aromatase excess syndrome,en,uri,http://www.wikidata.org/entity/Q418819,literal,exemestane,en,literal,1588,uri,http://www.wikidata.org/entity/Q14865015,literal,CYP19A1,en


In [8]:
# Keep only the human-readable label columns for drug, disease and gene.
simple_table = result_table[
    [
        "drugLabel.value",
        "diseaseLabel.value",
        "geneLabel.value",
    ]
]

In [9]:
# Preview the reduced table (label columns only).
simple_table.head()

Unnamed: 0,drugLabel.value,diseaseLabel.value,geneLabel.value
0,hypothetical protein CTL0003,melanoma-pancreatic cancer syndrome,CDKN2A
1,enoxaparin,antithrombin III deficiency,SERPINC1
2,letrozole,aromatase excess syndrome,CYP19A1
3,aminoglutethimide,aromatase excess syndrome,CYP19A1
4,exemestane,aromatase excess syndrome,CYP19A1


In [10]:
# Shorten the column names by removing the "Label.value" suffix
# (e.g. "drugLabel.value" becomes "drug").
simple_table = simple_table.rename(
    lambda name: name.replace("Label.value", ""),
    axis="columns",
)

In [11]:
# Preview the table again to confirm the shortened column names.
simple_table.head()

Unnamed: 0,drug,disease,gene
0,hypothetical protein CTL0003,melanoma-pancreatic cancer syndrome,CDKN2A
1,enoxaparin,antithrombin III deficiency,SERPINC1
2,letrozole,aromatase excess syndrome,CYP19A1
3,aminoglutethimide,aromatase excess syndrome,CYP19A1
4,exemestane,aromatase excess syndrome,CYP19A1


In [12]:
# Count the distinct genes linking each (drug, disease) pair.
# nunique() is used instead of size(): the query has no DISTINCT and does not
# select the gene product, so the same gene can occur in several rows of a
# group and size() would count those repeats as extra genes.
counts = simple_table.groupby(["drug", "disease"])["gene"].nunique()

In [13]:
# Preview the per-(drug, disease) counts; the result is indexed by drug and disease.
counts.head()

drug             disease                                             
(-)-pentazocine  amyotrophic lateral sclerosis type 16                   1
                 autosomal recessive distal spinal muscular atrophy 2    1
                 coronary artery disease                                 1
(RS)-methadone   coronary artery disease                                 1
(RS)-mexiletine  congenital myasthenic syndrome 16                       1
dtype: int64

In [14]:
# Turn the count Series into a one-column DataFrame named "gene_count".
# NOTE(review): this rebinds `counts` from a Series to a DataFrame, so re-running
# only this cell presumably fails (DataFrame has no to_frame) - re-run the groupby cell first.
counts = counts.to_frame(name="gene_count")

In [15]:
# Preview the counts as a DataFrame with the "gene_count" column.
counts.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,gene_count
drug,disease,Unnamed: 2_level_1
(-)-pentazocine,amyotrophic lateral sclerosis type 16,1
(-)-pentazocine,autosomal recessive distal spinal muscular atrophy 2,1
(-)-pentazocine,coronary artery disease,1
(RS)-methadone,coronary artery disease,1
(RS)-mexiletine,congenital myasthenic syndrome 16,1
