In [1]:
import requests
import pandas as pd

import re
import json

from io import StringIO
from functools import lru_cache

In [2]:
class MeSHQuery:
    """Small client for the NLM MeSH linked-data service (id.nlm.nih.gov).

    Given a descriptor unique ID or an exact descriptor label, ``query``
    returns the descriptor's ancestors, grouped by the tree number through
    which they are reached.

    Parameters
    ----------
    timeout : float or tuple, optional
        Timeout in seconds handed to every ``requests.get`` call so that a
        stalled connection cannot hang the notebook. Defaults to 30.
    """

    # Descriptor unique IDs are "D" followed by 6 digits, or by 9 digits for
    # more recently created records; both forms must bypass the label lookup.
    _UID_PATTERN = re.compile(r"D\d{6}(?:\d{3})?")

    def __init__(self, timeout=30):
        self._base = r"https://id.nlm.nih.gov"
        self._endp_sparql = r"/mesh/sparql"
        self._endp_descriptor = r"/mesh/lookup/descriptor"

        # The SPARQL endpoint is asked for CSV so pandas can parse the reply.
        self._headers = {"accept": "text/csv"}
        self._timeout = timeout

    # NOTE: lru_cache on a method keys entries on (self, argument), so the
    # cache keeps instances alive, and the cached array object itself is
    # handed back on a hit -- callers should treat the result as read-only.
    @lru_cache
    def _query_ancestors(self, unique_id):
        """Fetch all ancestors of the descriptor ``unique_id`` via SPARQL.

        Parameters
        ----------
        unique_id : str
            MeSH descriptor unique ID; interpolated as ``mesh:<unique_id>``.

        Returns
        -------
        numpy.ndarray
            Object array with one row per tree number of the descriptor.
            Each row holds two lists: the ancestor URIs and the ancestor
            labels, in the order returned by the query.

        Raises
        ------
        requests.HTTPError
            If the endpoint answers with a 4xx/5xx status.
        """
        rdf_prefixes = r"""
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX meshv: <http://id.nlm.nih.gov/mesh/vocab#>
PREFIX mesh: <http://id.nlm.nih.gov/mesh/>
PREFIX mesh2015: <http://id.nlm.nih.gov/mesh/2015/>
PREFIX mesh2016: <http://id.nlm.nih.gov/mesh/2016/>
PREFIX mesh2017: <http://id.nlm.nih.gov/mesh/2017/>
"""
        # `parentTreeNumber+` is a property path: one or more parent hops,
        # i.e. every ancestor tree number, not only the direct parent.
        ancestors_rdf = rf"""
            SELECT ?treeNum ?ancestor ?alabel
            FROM <http://id.nlm.nih.gov/mesh>

            WHERE {{
                mesh:{unique_id} meshv:treeNumber ?treeNum .
                ?treeNum meshv:parentTreeNumber+ ?ancestorTreeNum .
                ?ancestor meshv:treeNumber ?ancestorTreeNum .
                ?ancestor rdfs:label ?alabel
            }}

            ORDER BY ?treeNum ?ancestorTreeNum
        """
        params = {
            "query": rdf_prefixes + ancestors_rdf,
            "inference": "false",
            "offset": 0,
            "limit": 1000,
        }

        r = requests.get(
            self._base + self._endp_sparql,
            params=params,
            headers=self._headers,
            timeout=self._timeout,
        )
        # Fail loudly on an HTTP error instead of parsing an error page as CSV.
        r.raise_for_status()

        # One row per tree number; the remaining columns become lists.
        df = pd.read_csv(StringIO(r.text)).groupby("treeNum").agg(list)

        return df.to_numpy()

    @lru_cache
    def _lookup(self, descriptor):
        """Resolve an exact descriptor label to its unique ID.

        Parameters
        ----------
        descriptor : str
            Descriptor label, matched exactly against the current MeSH year.

        Returns
        -------
        str
            Last path segment of the first match's ``resource`` URI.

        Raises
        ------
        requests.HTTPError
            If the endpoint answers with a 4xx/5xx status.
        IndexError
            If no descriptor matches ``descriptor``.
        """
        params = {
            "label": descriptor,
            "match": "exact",
            "year": "current",
            "limit": 10
        }

        r = requests.get(
            self._base + self._endp_descriptor,
            params=params,
            timeout=self._timeout,
        )
        r.raise_for_status()

        matches = json.loads(r.text)
        if not matches:
            # Same exception type as indexing the empty list would raise,
            # but with a message that names the offending label.
            raise IndexError(f"no MeSH descriptor found for label {descriptor!r}")

        return matches[0]["resource"].split("/")[-1]

    def query(self, uid_or_label):
        """Return the ancestors of a descriptor given its unique ID or label.

        Parameters
        ----------
        uid_or_label : str
            Either a descriptor unique ID ("D" plus 6 or 9 digits) or an
            exact descriptor label. Surrounding whitespace is ignored.

        Returns
        -------
        numpy.ndarray
            The result of ``_query_ancestors`` for the resolved unique ID.

        Raises
        ------
        ValueError
            If ``uid_or_label`` is empty or only whitespace.
        """
        uid_or_label = uid_or_label.strip()
        if not uid_or_label:
            raise ValueError("uid_or_label must not be empty")

        if self._UID_PATTERN.fullmatch(uid_or_label):
            uid = uid_or_label
        else:
            uid = self._lookup(uid_or_label)

        return self._query_ancestors(uid)

# Build a client and request the ancestors of descriptor D001859. The argument
# is 7 characters long and matches the "D" + 6 digits check in query(), so it is
# used as the unique ID directly and no label lookup is made. The call goes
# through _query_ancestors, which issues an HTTP request to the SPARQL endpoint.
mq = MeSHQuery()
mq.query("D001859")

array([[list(['http://id.nlm.nih.gov/mesh/D009369', 'http://id.nlm.nih.gov/mesh/D009371']),
        list(['Neoplasms', 'Neoplasms by Site'])],
       [list(['http://id.nlm.nih.gov/mesh/D009140', 'http://id.nlm.nih.gov/mesh/D001847']),
        list(['Musculoskeletal Diseases', 'Bone Diseases'])]],
      dtype=object)

In [4]:
# Standalone SPARQL text for descriptor D001859: selects each tree number of the
# descriptor together with the URI and rdfs:label of every ancestor reached via
# one or more meshv:parentTreeNumber hops, ordered by tree number and then by
# ancestor tree number. Apart from indentation, this is the same prefix block
# and SELECT that MeSHQuery._query_ancestors assembles, with the ID hard-coded.
query = r"""
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX meshv: <http://id.nlm.nih.gov/mesh/vocab#>
PREFIX mesh: <http://id.nlm.nih.gov/mesh/>
PREFIX mesh2015: <http://id.nlm.nih.gov/mesh/2015/>
PREFIX mesh2016: <http://id.nlm.nih.gov/mesh/2016/>
PREFIX mesh2017: <http://id.nlm.nih.gov/mesh/2017/>

SELECT ?treeNum ?ancestor ?alabel
FROM <http://id.nlm.nih.gov/mesh>

WHERE {
   mesh:D001859 meshv:treeNumber ?treeNum .
   ?treeNum meshv:parentTreeNumber+ ?ancestorTreeNum .
   ?ancestor meshv:treeNumber ?ancestorTreeNum .
   ?ancestor rdfs:label ?alabel
}

ORDER BY ?treeNum ?ancestorTreeNum
"""