In [1]:
!pip install pandas pyarrow SPARQLWrapper rdflib

Collecting pyarrow
  Downloading pyarrow-16.1.0-cp39-cp39-macosx_10_15_x86_64.whl (28.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m28.4/28.4 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting SPARQLWrapper
  Downloading SPARQLWrapper-2.0.0-py3-none-any.whl (28 kB)
Collecting rdflib
  Downloading rdflib-7.0.0-py3-none-any.whl (531 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m531.9/531.9 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting isodate<0.7.0,>=0.6.0
  Downloading isodate-0.6.1-py2.py3-none-any.whl (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.7/41.7 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow, isodate, rdflib, SPARQLWrapper
Successfully installed SPARQLWrapper-2.0.0 isodate-0.6.1 pyarrow-16.1.0 rdflib-7.0.0


In [71]:
import pandas as pd #data manipulation and analysis library that provides data structures like DataFrames to work with structured data
import pyarrow #enables reading and writing of Parquet files
from SPARQLWrapper import SPARQLWrapper, JSON #execute SPARQL queries
import rdflib #querying of RDF data, including JSON-LD
import json #library for parsing, generating, and manipulating JSON
import subprocess # run shell commands
import os #interact with the operating system
import re  # Import the regular expression module


In [58]:
# Read the SILVA taxonomy file into a two-column DataFrame
tsv_file = '/Users/gaetanemagali/OIH_rotation/documentation/TAXONOMY_SILVA.tsv'

# Parallel lists: one identifier and one taxonomy path per retained line
identifiers = []
taxonomies = []

with open(tsv_file, 'r') as file:
    for line in file:
        # Cut only at the first run of whitespace; anything after it stays in the taxonomy path
        parts = line.strip().split(maxsplit=1)
        if len(parts) != 2:
            continue  # lines that do not yield both fields are skipped
        record_id, lineage = parts
        identifiers.append(record_id)
        taxonomies.append(lineage)

# Assemble the table from the two collected columns
silva_taxa = pd.DataFrame({'identifier': identifiers, 'taxonomy': taxonomies})

# Print the first rows to inspect the structure
print(silva_taxa.head())


               identifier                                           taxonomy
0         BD359735.3.2145  Eukaryota;SAR;Alveolata;Apicomplexa;Aconoidasi...
1      HL196872.5744.7271  Eukaryota;SAR;Stramenopiles;Labyrinthulomycete...
2         AY855839.1.1390  Bacteria;Proteobacteria;Alphaproteobacteria;Ri...
3         FW343016.1.1511  Bacteria;Firmicutes;Bacilli;Lactobacillales;Ca...
4  AY835431.189876.191345  Bacteria;Cyanobacteria;Cyanobacteriia;Chloropl...


In [60]:
# Make sure every 'taxonomy' entry is a plain string before it is split.
# Entries that are already lists (the split cell has been run) are left untouched:
# a blanket astype(str) would turn them into "['a', 'b']"-style text on a re-run.
silva_taxa['taxonomy'] = silva_taxa['taxonomy'].map(
    lambda entry: entry if isinstance(entry, list) else str(entry)
)

In [62]:
# Split each ';'-delimited taxonomy path into a list of individual taxonomic levels.
# Only string entries are split; entries that are already lists pass through unchanged,
# so re-running this cell does not wipe the column (Series.str.split returns NaN
# for values that are not strings).
silva_taxa['taxonomy'] = silva_taxa['taxonomy'].map(
    lambda path: path.split(';') if isinstance(path, str) else path
)

In [92]:
def clean_taxon(taxon):
    """Reduce a raw taxon label to its first word, lower-cased and letters only.

    The label is lower-cased and every character that is not a-z, whitespace
    or a hyphen is removed (this also drops brackets, quotes and digits).
    The first whitespace-delimited word of what remains is the result.

    Parameters
    ----------
    taxon : str
        Raw taxon label.

    Returns
    -------
    str
        The cleaned first word, or "" when nothing usable remains or when
        the first word contains a hyphen.
    """
    # Keep only lowercase letters, whitespace and hyphens
    letters_only = re.sub(r'[^a-z\s-]', '', taxon.lower())

    # split() drops leading/trailing whitespace and returns [] when only
    # whitespace survived (e.g. "1 2"), so that case returns "" instead of
    # failing on an out-of-range index.
    words = letters_only.split()
    if not words:
        return ""

    first_part = words[0]
    # Hyphenated names are rejected outright
    if '-' in first_part:
        return ""
    return first_part

In [93]:
# Gather every distinct, non-empty cleaned name across all taxonomy lists.
# NOTE(review): clean_taxon already strips digits, so the digit guard below
# never rejects anything as the notebook stands; it is kept as a safety net.
unique_taxa = {
    name
    for taxa_list in silva_taxa['taxonomy']
    for name in map(clean_taxon, taxa_list)
    if name and not re.search(r'\d', name)
}

In [94]:
# Turn the collected names into a list and order it alphabetically
unique_taxa = list(unique_taxa)
unique_taxa.sort()

In [95]:
# Write the cleaned taxa to a text file, one name per line
output_folder = '/Users/gaetanemagali/OIH_rotation/documentation'
output_file = os.path.join(output_folder, 'unique_taxa_cleaned.txt')

with open(output_file, 'w') as f:
    f.writelines(f"{taxon}\n" for taxon in unique_taxa)

In [4]:
# SPARQL endpoint where the data is stored (here, the ODIS graph)
endpoint_url = "http://graph.oceaninfohub.org/blazegraph/namespace/oih/sparql"
sparql = SPARQLWrapper(endpoint_url)

In [None]:
# Restrict the search to datasets whose dwc:scientificName is one of a given list of names.
# NOTE(review): this cell holds a raw SPARQL draft, not Python; it has to be wrapped in a
# string and sent through the SPARQLWrapper client before it can run in this notebook.
# NOTE(review): the dwc: prefix is not declared in this cell; the query needs a PREFIX line for it.
SELECT ?dataset ?scientificName WHERE {
  ?dataset a dwc:Dataset .
  ?dataset dwc:scientificName ?scientificName .
  # NOTE(review): SPARQL comments start with '#'; the /* ... */ placeholder below is not
  # valid SPARQL and must be replaced by the actual list of names.
  FILTER(?scientificName IN (/* List of microbial names from SILVA and WoRMS */))
}

In [None]:
# Declare the vocabularies used to scan through the metadata
PREFIX dwc: <http://rs.tdwg.org/dwc/terms/> # Darwin Core
PREFIX schema: <http://schema.org/> # schema.org

In [None]:
#For reference - triage of relevant DwC vocab

#dwc:acceptedNameUsage
 #dwc:acceptedNameUsageID
 #dwc:associatedSequences
 #dwc:associatedTaxa
 #dwc:bed
 #dwciri:behavior / dwc:behavior
 #dwc:class
 #dwc:degreeOfEstablishment
 #dwc:family
 #dwc:fieldNotes
 #dwc:fieldNumber
 #dwc:genericName
 #dwc:genus
 #dwc:GeologicalContext
 #dwc:habitat
 #dwc:higherClassification
 #dwc:identifiedBy
 #dwc:informationWithheld
 #dwc:kingdom
 #dc:language
 #dwc:MaterialEntity
 #dwc:MaterialSample (cross-links to New Pattern: Samples #376)
 #dwc:materialSampleID
 #dwc:nomenclaturalCode
 #dwc:Occurrence
 #dwc:occurrenceDetails
 #dwc:occurrenceRemarks
 #dwc:order
 #dwc:originalNameUsage
 #dwc:phylum
 #dwc:scientificName
 #dwc:superfamily
 #dwc:taxonAttributes
 #dwc:taxonID
 #dwc:verbatimIdentification
 #dwc:vernacularName
#Map to schema.org properties
#
 #dwc:accessRights
 #dwc:associatedMedia
 #dwc:associatedReferences
 #dcterms:bibliographicCitation
 #dwc:continent
 #dwc:country / dwc:countryCode
 #dwc:county
 #dwc:dataGeneralizations (to additional description or similar)
 #dwc:datasetID
 #dwc:datasetName
 #dwc:day
 #dwc:endDayOfYear
 #dwc:establishmentMeans
 #dwc:eventDate
 #dwc:eventID
 #dwc:eventRemarks (comment or description on schema:Event)
 #dwc:eventTime
 #dwc:GeologicalContext
 #dwc:higherGeography
 #dwciri:inDataset (subjectOf)
 #dwc:institutionCode
 #dwc:institutionID
 #dcterms:license
 #dwc:measurementMethod
 #dcterms:modified (sd properties)
 #dwc:month
 #dcterms:references
 #dwc:relatedResourceID
 #dcterms:rights
 #dcterms:rightsHolder
 #dwc:startDayOfYear
 #dc:type
 #dwc:year
#
#
##For Spatial Mapping to GeoJSON or schema.org spatial properties
##dwc:coordinatePrecision
 #dwc:coordinateUncertaintyInMeters
 #dwc:decimalLatitude
 #dwc:decimalLongitude
 #dwc:footprintSRS
 #dwc:footprintWKT
 #dwc:geodeticDatum
 #dwc:locality
 #dwc:locationRemarks
 #dwc:maximumDepthInMeters (xref Depth representation conventions #377)
 #dwc:maximumDistanceAboveSurfaceInMeters
 #dwc:maximumElevationInMeters
 #dwc:minimumDepthInMeters
 #dwc:minimumDistanceAboveSurfaceInMeters
 #dwc:minimumElevationInMeters
 #dwc:municipality
 #dwc:stateProvince
 #dwc:verbatimCoordinates
 #dwc:verbatimCoordinateSystem
 #dwc:verbatimDepth
 #dwc:verbatimElevation
 #dwc:verbatimEventDate
 #dwc:verbatimLatitude
 #dwc:verbatimLocality
 #dwc:verbatimLongitude
 #dwc:verbatimSRS
 #dwc:verticalDatum
 #dwc:waterBody


In [None]:
# Retrieve, for each taxon, the Darwin Core terms relevant to microbial taxonomy.

# Every property below is a required triple pattern (none is OPTIONAL), so a ?taxon is
# returned only when it carries all ten properties.
# NOTE(review): the dwc: prefix is not declared in this cell; the query needs a PREFIX line for it.
SELECT ?taxonID ?scientificName ?kingdom ?phylum ?class ?order ?family ?genus ?acceptedNameUsage ?higherClassification
WHERE {
  ?taxon dwc:taxonID ?taxonID;
         dwc:scientificName ?scientificName;
         dwc:kingdom ?kingdom;
         dwc:phylum ?phylum;
         dwc:class ?class;
         dwc:order ?order;
         dwc:family ?family;
         dwc:genus ?genus;
         dwc:acceptedNameUsage ?acceptedNameUsage;
         dwc:higherClassification ?higherClassification.
}
"""