# Samples

This notebook queries and visualizes information about samples collected at EMO-BON observatories.

In [None]:
from pysema import SPARQLConnection
import plotly.express as px
import plotly.graph_objects as go
import pandas as pd
from datetime import datetime

In [None]:
# Connect to EMO-BON Knowledge Graph
# `graphdb_url` is the repository URL handed to SPARQLConnection
# (presumably a SPARQL endpoint, going by the class name -- confirm against
# the pysema docs).  The resulting `conn` is the object the query cells in
# this notebook call `.query()` on.
graphdb_url = "https://emobon-kb.vliz.be/repositories/kgap"
conn = SPARQLConnection(graphdb_url)

## Total Sample Count

Get the total number of samples in the knowledge graph.

In [None]:
# SPARQL: count every resource typed as sosa:Sample.
query_count = """
PREFIX sosa: <http://www.w3.org/ns/sosa/>
PREFIX emobon: <http://www.embrc.eu/emobon/EmobonOntology#>

SELECT (COUNT(?sample) as ?total_samples)
WHERE {
  ?sample a sosa:Sample .
}
"""

count_df = conn.query(query_count)
if count_df.empty:
    # Nothing came back from the query, so there is no count to report.
    print("Unable to retrieve sample count")
else:
    # Read the aggregate from the first result row.
    total = count_df['total_samples'].iloc[0]
    print(f"Total samples in the knowledge graph: {total}")

## Samples by Observatory

Count the samples collected at each observatory and plot the 20 observatories with the most samples.

In [None]:
# Count samples per observatory and chart the 20 largest.
#
# The join goes sample -> feature (sosa:isSampleOf) -> observatory
# (sosa:hasSamplingFeature).  Results are grouped on the observatory IRI
# together with its optional schema:name: grouping on the name alone would
# merge every observatory that lacks a name into a single unlabeled group.
# Unnamed observatories fall back to their IRI as the label.
# COUNT(DISTINCT ...) keeps a sample that is linked through several features
# of the same observatory from being counted more than once.
query_by_observatory = """
PREFIX sosa: <http://www.w3.org/ns/sosa/>
PREFIX emobon: <http://www.embrc.eu/emobon/EmobonOntology#>
PREFIX schema: <http://schema.org/>

SELECT (COALESCE(?name, STR(?observatory)) AS ?observatory_name)
       (COUNT(DISTINCT ?sample) AS ?sample_count)
WHERE {
  ?sample a sosa:Sample .
  ?sample sosa:isSampleOf ?feature .
  ?observatory sosa:hasSamplingFeature ?feature .
  OPTIONAL { ?observatory schema:name ?name . }
}
GROUP BY ?observatory ?name
ORDER BY DESC(?sample_count)
LIMIT 20
"""

obs_samples_df = conn.query(query_by_observatory)

if not obs_samples_df.empty:
    # Make sure bar heights are numbers even if the client hands the
    # aggregate back as text (a string column would be plotted as categories).
    obs_samples_df['sample_count'] = pd.to_numeric(obs_samples_df['sample_count'])

    fig = px.bar(obs_samples_df, x='observatory_name', y='sample_count',
                 title='Top 20 Observatories by Number of Samples',
                 labels={'observatory_name': 'Observatory', 'sample_count': 'Number of Samples'},
                 template='plotly_white')
    fig.update_traces(marker_color='#2ca02c')
    fig.update_xaxes(tickangle=45)
    fig.show()
else:
    print("No sample data by observatory available")

## Sampling Events Over Time

Visualize when sampling events occurred.

In [None]:
# Count samples per calendar day of the sampling activity that produced them.
#
# sosa:resultTime values are bucketed by their first ten characters
# (the YYYY-MM-DD part of an ISO date/dateTime lexical form) so that samples
# taken at different times on the same day land in one point; grouping on the
# raw value would produce one point per distinct timestamp.
query_timeline = """
PREFIX sosa: <http://www.w3.org/ns/sosa/>
PREFIX emobon: <http://www.embrc.eu/emobon/EmobonOntology#>
PREFIX schema: <http://schema.org/>
PREFIX prov: <http://www.w3.org/ns/prov#>

SELECT ?date (COUNT(?sample) as ?sample_count)
WHERE {
  ?sample a sosa:Sample .
  ?sampling sosa:hasResult ?sample .
  ?sampling sosa:resultTime ?result_time .
  BIND(SUBSTR(STR(?result_time), 1, 10) AS ?date)
}
GROUP BY ?date
ORDER BY ?date
"""

timeline_df = conn.query(query_timeline)

if not timeline_df.empty and 'date' in timeline_df.columns:
    # Give the axes real dtypes: a datetime x-axis and numeric counts.
    # Values that do not parse as dates become NaT and are dropped rather
    # than being plotted as stray categories.
    timeline_df['date'] = pd.to_datetime(timeline_df['date'], errors='coerce')
    timeline_df['sample_count'] = pd.to_numeric(timeline_df['sample_count'])
    timeline_df = timeline_df.dropna(subset=['date']).sort_values('date')

# Re-check after cleaning: every row may have been dropped as unparseable.
if not timeline_df.empty and 'date' in timeline_df.columns:
    fig = px.line(timeline_df, x='date', y='sample_count',
                  title='Sampling Events Over Time',
                  labels={'date': 'Date', 'sample_count': 'Number of Samples'},
                  template='plotly_white')
    fig.update_traces(line_color='#ff7f0e')
    fig.show()
else:
    print("No temporal data available")

## Sample Types

Distribution of different sample types.

In [None]:
# Distribution of the additional rdf:type values carried by samples.
#
# ?sample is first restricted to sosa:Sample; without that triple the pattern
# `?sample a ?type` matches every typed resource in the graph and the chart
# would show the type distribution of the whole repository, not of samples.
# sosa:Sample itself is filtered out because every matched resource has it.
query_types = """
PREFIX sosa: <http://www.w3.org/ns/sosa/>
PREFIX emobon: <http://www.embrc.eu/emobon/EmobonOntology#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>

SELECT ?type (COUNT(?sample) as ?count)
WHERE {
  ?sample a sosa:Sample .
  ?sample a ?type .
  FILTER(?type != sosa:Sample)
}
GROUP BY ?type
ORDER BY DESC(?count)
LIMIT 10
"""

types_df = conn.query(query_types)

if not types_df.empty:
    # Pie slice sizes must be numeric; coerce in case the client returns the
    # aggregate as text.
    types_df['count'] = pd.to_numeric(types_df['count'])

    fig = px.pie(types_df, values='count', names='type',
                 title='Sample Type Distribution',
                 template='plotly_white')
    fig.show()
else:
    print("No sample type data available")

## Sample Details

View the label and description of individual samples (the query fetches up to 50 samples; the first 20 rows are displayed).

In [None]:
# Fetch the IRI, optional rdfs:label and optional schema:description of up to
# 50 samples.
#
# ORDER BY makes the LIMITed page reproducible: without it the endpoint is
# free to return any 50 matching rows, so the table could change between runs.
query_details = """
PREFIX sosa: <http://www.w3.org/ns/sosa/>
PREFIX emobon: <http://www.embrc.eu/emobon/EmobonOntology#>
PREFIX schema: <http://schema.org/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

SELECT DISTINCT ?sample ?label ?description
WHERE {
  ?sample a sosa:Sample .
  OPTIONAL { ?sample rdfs:label ?label . }
  OPTIONAL { ?sample schema:description ?description . }
}
ORDER BY ?sample
LIMIT 50
"""

details_df = conn.query(query_details)
# Last expression of the cell: the notebook renders the first 20 rows.
details_df.head(20)