In [1]:
!pip install pandas pyarrow SPARQLWrapper rdflib

Collecting pyarrow
  Downloading pyarrow-16.1.0-cp39-cp39-macosx_10_15_x86_64.whl (28.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m28.4/28.4 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting SPARQLWrapper
  Downloading SPARQLWrapper-2.0.0-py3-none-any.whl (28 kB)
Collecting rdflib
  Downloading rdflib-7.0.0-py3-none-any.whl (531 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m531.9/531.9 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting isodate<0.7.0,>=0.6.0
  Downloading isodate-0.6.1-py2.py3-none-any.whl (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.7/41.7 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow, isodate, rdflib, SPARQLWrapper
Successfully installed SPARQLWrapper-2.0.0 isodate-0.6.1 pyarrow-16.1.0 rdflib-7.0.0


In [1]:
import pandas as pd #data manipulation and analysis library that provides data structures like DataFrames to work with structured data
import pyarrow #enables reading and writing of Parquet files
from SPARQLWrapper import SPARQLWrapper, JSON #execute SPARQL queries
import rdflib #querying of RDF data, including JSON-LD
import json #library for parsing, generating, and manipulating JSON
import subprocess # run shell commands
import os #interact with the operating system
import re  # Import the regular expression module


In [14]:
# Location of the curated files listing the most common marine microbial taxa.
# Set the SILVA_TAXA_DIR environment variable to run the notebook on another
# machine; when it is unset the original author's local path is used.
silva_taxa = os.environ.get('SILVA_TAXA_DIR', '/Users/gaetanemagali/OIH_rotation/datafiles')

In [15]:
def load_taxa(source):
    """Return the taxon names described by ``source`` as a list of strings.

    ``source`` may be:
      * a path to a directory - every non-hidden regular file in it is read
        (in sorted file-name order),
      * a path to a single file, or
      * any other iterable of taxon names (e.g. a list), used as given.

    Names are stripped of surrounding whitespace; empty entries, entries
    starting with '#', and duplicates are dropped (first occurrence wins).

    NOTE(review): files are assumed to be plain UTF-8 text with one taxon name
    per line. Confirm this against the curated files and adapt the parsing if
    they are CSV/TSV/Parquet instead.

    Raises:
        FileNotFoundError: ``source`` is a path that does not exist.
    """
    if isinstance(source, (str, os.PathLike)):
        if os.path.isdir(source):
            candidates = [
                os.path.join(source, name)
                for name in sorted(os.listdir(source))
                if not name.startswith('.')  # skip .DS_Store and other hidden files
            ]
            paths = [path for path in candidates if os.path.isfile(path)]
        elif os.path.isfile(source):
            paths = [source]
        else:
            raise FileNotFoundError(f'taxa source not found: {source}')
        raw_names = []
        for path in paths:
            with open(path, encoding='utf-8') as handle:
                raw_names.extend(handle)
    else:
        raw_names = list(source)

    taxa = []
    seen = set()
    for raw_name in raw_names:
        name = str(raw_name).strip()
        if not name or name.startswith('#') or name in seen:
            continue
        seen.add(name)
        taxa.append(name)
    return taxa


def sparql_literal(text):
    """Quote ``text`` as a SPARQL string literal, escaping backslashes and double quotes."""
    return '"' + text.replace('\\', '\\\\').replace('"', '\\"') + '"'


# Space-separated, double-quoted taxon names, ready to be placed inside a
# SPARQL `VALUES ?var { ... }` clause.
# `silva_taxa` is a filesystem path: iterating over it directly (as this cell
# used to do) walks the *characters* of the path, not the taxon names.
taxa_values = ' '.join(sparql_literal(taxon) for taxon in load_taxa(silva_taxa))

In [16]:
# URL of the SPARQL endpoint that serves the data to query
# (here: the ODIS graph, hosted on a Blazegraph namespace).
endpoint = (
    "http://graph.oceaninfohub.org/blazegraph/namespace/oih/sparql"
)

In [None]:
# SPARQL query that selects marine-microbe datasets from the graph.
#
# Vocabulary
#   dwc    = Darwin Core terms
#   schema = schema.org; the query looks at resources typed schema:Dataset
#   NOTE(review): the prefix IRIs below are the standard ones for these
#   vocabularies - confirm the graph stores schema.org terms with the https form.
#
# Template mechanics
#   The text is a str.format() template: {taxa_values} is the only substitution
#   slot and every literal SPARQL brace is written doubled ({{ and }}). The
#   template is formatted at the bottom of this cell, so `sparql_microbe_query`
#   is the final query text that can be handed to SPARQLWrapper as-is.
#   `taxa_values` must be a space-separated list of quoted names (the form a
#   VALUES clause expects).
#
# Matching rules
#   * ?scientificName must equal one of the taxa, compared case-insensitively
#     (LCASE on both sides; STR() drops any language tag/datatype first).
#   * ?habitat must be one of the marine terms and ?MaterialEntity must match the
#     microbial REGEX. Both are therefore required triple patterns: a FILTER on an
#     unbound OPTIONAL variable rejects the row anyway.
#   * Every other Darwin Core field is OPTIONAL: it is returned when present and
#     left unbound otherwise. Each OPTIONAL is its own group pattern - SPARQL does
#     not allow OPTIONAL inside a `;`-separated predicate list.
#   * The habitat terms are all lower case because they are compared against
#     LCASE(...) ("MPA" could never match).

SPARQL_MICROBE_QUERY_TEMPLATE = """
PREFIX schema: <https://schema.org/>
PREFIX dwc: <http://rs.tdwg.org/dwc/terms/>

SELECT ?dataset ?associatedTaxa ?class ?family ?genericName ?genus ?higherClassification ?kingdom ?order ?phylum ?scientificName ?superfamily ?taxonAttributes ?verbatimIdentification ?associatedSequences ?acceptedNameUsageID ?acceptedNameUsage ?GeologicalContext ?Occurrence ?habitat ?occurrenceRemarks ?occurrenceDetails ?MaterialEntity ?MaterialSample

WHERE {{
  VALUES ?taxa {{ {taxa_values} }}
  ?dataset a schema:Dataset ;
           dwc:scientificName ?scientificName ;
           dwc:habitat ?habitat ;
           dwc:MaterialEntity ?MaterialEntity .
  FILTER (
    LCASE(STR(?scientificName)) = LCASE(?taxa) &&
    LCASE(STR(?habitat)) IN ("marine", "pelagic", "oceanic", "maritime", "coastal", "seafaring", "littoral", "benthic", "abyssal", "planktonic", "bathyal", "epipelagic", "mesopelagic", "upwelling", "downwelling", "saltwater", "gyre", "mpa") &&
    REGEX(LCASE(STR(?MaterialEntity)), "microbe|bacteria|bacterium|bacillus|microflora|microbial|prokaryote|protist|archaea|microorganism")
  )
  OPTIONAL {{ ?dataset dwc:associatedTaxa ?associatedTaxa }}
  OPTIONAL {{ ?dataset dwc:class ?class }}
  OPTIONAL {{ ?dataset dwc:family ?family }}
  OPTIONAL {{ ?dataset dwc:genericName ?genericName }}
  OPTIONAL {{ ?dataset dwc:genus ?genus }}
  OPTIONAL {{ ?dataset dwc:higherClassification ?higherClassification }}
  OPTIONAL {{ ?dataset dwc:kingdom ?kingdom }}
  OPTIONAL {{ ?dataset dwc:order ?order }}
  OPTIONAL {{ ?dataset dwc:phylum ?phylum }}
  OPTIONAL {{ ?dataset dwc:superfamily ?superfamily }}
  OPTIONAL {{ ?dataset dwc:taxonAttributes ?taxonAttributes }}
  OPTIONAL {{ ?dataset dwc:verbatimIdentification ?verbatimIdentification }}
  OPTIONAL {{ ?dataset dwc:associatedSequences ?associatedSequences }}
  OPTIONAL {{ ?dataset dwc:acceptedNameUsageID ?acceptedNameUsageID }}
  OPTIONAL {{ ?dataset dwc:acceptedNameUsage ?acceptedNameUsage }}
  OPTIONAL {{ ?dataset dwc:GeologicalContext ?GeologicalContext }}
  OPTIONAL {{ ?dataset dwc:Occurrence ?Occurrence }}
  OPTIONAL {{ ?dataset dwc:occurrenceRemarks ?occurrenceRemarks }}
  OPTIONAL {{ ?dataset dwc:occurrenceDetails ?occurrenceDetails }}
  OPTIONAL {{ ?dataset dwc:MaterialSample ?MaterialSample }}
}}
"""

# Fill in the taxa list; this also collapses the doubled braces to single ones.
sparql_microbe_query = SPARQL_MICROBE_QUERY_TEMPLATE.format(taxa_values=taxa_values)


In [None]:
# Set up the SPARQLWrapper client: target the graph endpoint, load the query
# text and ask for the results in JSON.
# The endpoint URL is stored in `endpoint`; the name `sparql_endpoint` is not
# defined anywhere above this cell and raised NameError on a fresh kernel.
sparql = SPARQLWrapper(endpoint)
sparql.setQuery(sparql_microbe_query)
sparql.setReturnFormat(JSON)

In [None]:
# Step 1: query() submits the query held by `sparql` to the SPARQL endpoint.
# Step 2: convert() decodes the raw response according to the return format
#         configured on `sparql`.
_query_response = sparql.query()
sparql_microbe_query_results = _query_response.convert()