# Quality Analysis

This notebook performs quality assessment and validation checks on the EMO-BON knowledge graph data.

In [2]:
import os

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from pandas import DataFrame
from sema.query import DefaultSparqlBuilder, GraphSource as KGSource, QueryResult

In [3]:
# Connect to the EMO-BON Knowledge Graph.
# The SPARQL endpoint defaults to the local "kgap" repository; set the
# GDB_ENDPOINT environment variable to point the notebook at another
# repository without editing this cell.
DEFAULT_GDB_ENDPOINT = "http://localhost:7200/repositories/kgap"
GDB_ENDPOINT = os.environ.get("GDB_ENDPOINT") or DEFAULT_GDB_ENDPOINT

# `graphdb_url` is kept as an alias because it is the name this cell
# originally exposed.
graphdb_url = GDB_ENDPOINT
GDB: KGSource = KGSource.build(graphdb_url)

## Missing Mandatory Properties

Check for observatories that are missing mandatory properties such as a name or location. The query below checks for a missing `schema:name`.

In [5]:
# SPARQL: select every emobon:Observatory that has no schema:name triple.
query_missing_names = """
PREFIX sosa: <http://www.w3.org/ns/sosa/>
PREFIX emobon: <http://www.embrc.eu/emobon/EmobonOntology#>
PREFIX schema: <http://schema.org/>

SELECT ?observatory
WHERE {
  ?observatory a emobon:Observatory .
  FILTER NOT EXISTS { ?observatory schema:name ?name . }
}
"""

# Keep the raw query result and the DataFrame under separate names so that
# `missing_names_df` always refers to a DataFrame, also when the cell is re-run.
missing_names_result: QueryResult = GDB.query(sparql=query_missing_names)
missing_names_df: DataFrame = missing_names_result.to_dataframe()

print(f"Observatories missing names: {len(missing_names_df)}")
if not missing_names_df.empty:
    # A bare expression inside an `if` block is never rendered by Jupyter;
    # `display` (available as a builtin in IPython kernels) shows it explicitly.
    display(missing_names_df.head(10))

Result: missing_names_df=<sema.query.query.SPARQLQueryResult object at 0x000001A6B3673A10>
Observatories missing names: 0


## Orphaned Samples

Find samples that cannot be traced to an observatory through their sampling event, and count the samples that can be traced for comparison.

In [10]:
# SPARQL: samples for which NO path
#   ?sample sosa:isResultOf / sampling:linkedToObservatory ?observatory
# exists.
query_orphaned_samples = """
PREFIX sosa: <http://www.w3.org/ns/sosa/>
PREFIX sampling: <https://data.emobon.embrc.eu/ns/sampling#>

SELECT ?sample
WHERE {
  ?sample a sosa:Sample .
  #?sample sosa:isResultOf ?SamplingActivity .
  #?SamplingActivity sampling:linkedToObservatory ?observatory .
  FILTER NOT EXISTS { ?sample sosa:isResultOf/sampling:linkedToObservatory ?observatory . }
}
"""

orphaned = GDB.query(sparql=query_orphaned_samples)
orphaned_df: DataFrame = orphaned.to_dataframe()
print(f"Orphaned samples (not linked to sampling features): {len(orphaned_df)}")
if not orphaned_df.empty:
    # A bare expression inside an `if` block is never rendered by Jupyter;
    # `display` (available as a builtin in IPython kernels) shows it explicitly.
    display(orphaned_df.head(10))

# SPARQL: the complementary set, i.e. samples for which the same property
# path DOES exist. Used as a cross-check against the orphan count above.
query_orphaned_samples_reverse = """
PREFIX sosa: <http://www.w3.org/ns/sosa/>
PREFIX sampling: <https://data.emobon.embrc.eu/ns/sampling#>

SELECT ?sample
WHERE {
  ?sample a sosa:Sample .
  #?sample sosa:isResultOf ?SamplingActivity .
  #?SamplingActivity sampling:linkedToObservatory ?observatory .
  FILTER EXISTS { ?sample sosa:isResultOf/sampling:linkedToObservatory ?observatory . }
}
"""

orphaned_reverse = GDB.query(sparql=query_orphaned_samples_reverse)
orphaned_reverse_df: DataFrame = orphaned_reverse.to_dataframe()
print(f"Samples linked to sampling features: {len(orphaned_reverse_df)}")
if not orphaned_reverse_df.empty:
    display(orphaned_reverse_df.head(10))

Result: orphaned=<sema.query.query.SPARQLQueryResult object at 0x000001A6B35D93D0>
Orphaned samples (not linked to sampling features): 2105
Result: orphaned_reverse=<sema.query.query.SPARQLQueryResult object at 0x000001A6B38C7D10>
Samples linked to sampling features: 2178


## Duplicate Detection

Check for potential duplicate observatories, i.e. observatories that share the same name and location.

In [None]:
# SPARQL: group observatories by (name, location) and keep only the groups
# that contain more than one observatory.
query_duplicates = """
PREFIX sosa: <http://www.w3.org/ns/sosa/>
PREFIX emobon: <http://www.embrc.eu/emobon/EmobonOntology#>
PREFIX schema: <http://schema.org/>

SELECT ?name ?location (COUNT(?observatory) as ?count)
WHERE {
  ?observatory a emobon:Observatory .
  ?observatory schema:name ?name .
  ?observatory schema:location ?location .
}
GROUP BY ?name ?location
HAVING (COUNT(?observatory) > 1)
ORDER BY DESC(?count)
"""

# `conn` was never defined in this notebook; query through the `GDB` source
# created in the connection cell, the same way the earlier cells do.
duplicates_df: DataFrame = GDB.query(sparql=query_duplicates).to_dataframe()
print(f"Potential duplicate observatory groups: {len(duplicates_df)}")
if not duplicates_df.empty:
    # A bare expression inside an `if` block is never rendered by Jupyter;
    # `display` (available as a builtin in IPython kernels) shows it explicitly.
    display(duplicates_df)

## Data Completeness Score

Calculate completeness percentage for key observatory properties.

In [None]:
# SPARQL: count all observatories and, per property, the observatories that
# carry it. Each OPTIONAL is correlated with ?observatory and re-binds it to a
# per-property variable, so COUNT(DISTINCT ...) only sees observatories that
# have the property. (Uncorrelated OPTIONALs would yield the same counts but
# force the store to build a cross product of all matches first.)
query_completeness = """
PREFIX sosa: <http://www.w3.org/ns/sosa/>
PREFIX emobon: <http://www.embrc.eu/emobon/EmobonOntology#>
PREFIX schema: <http://schema.org/>

SELECT
  (COUNT(DISTINCT ?observatory) as ?total)
  (COUNT(DISTINCT ?name_obs) as ?with_name)
  (COUNT(DISTINCT ?loc_obs) as ?with_location)
  (COUNT(DISTINCT ?desc_obs) as ?with_description)
  (COUNT(DISTINCT ?country_obs) as ?with_country)
WHERE {
  ?observatory a emobon:Observatory .
  OPTIONAL { ?observatory schema:name ?name . BIND(?observatory AS ?name_obs) }
  OPTIONAL { ?observatory schema:location ?loc . BIND(?observatory AS ?loc_obs) }
  OPTIONAL { ?observatory schema:description ?desc . BIND(?observatory AS ?desc_obs) }
  OPTIONAL { ?observatory schema:addressCountry ?country . BIND(?observatory AS ?country_obs) }
}
"""

# `conn` was never defined in this notebook; query through the `GDB` source
# created in the connection cell, the same way the earlier cells do.
completeness_df: DataFrame = GDB.query(sparql=query_completeness).to_dataframe()

if not completeness_df.empty:
    counts = completeness_df.iloc[0]
    # Coerce explicitly: the aggregate values may arrive as strings, and the
    # division below needs real numbers.
    total = int(counts['total'])
    if total > 0:
        # Result column -> human-readable property label.
        property_columns = {
            'Name': 'with_name',
            'Location': 'with_location',
            'Description': 'with_description',
            'Country': 'with_country',
        }
        metrics = {
            'Property': list(property_columns),
            'Completeness (%)': [
                (int(counts[column]) / total) * 100
                for column in property_columns.values()
            ],
        }

        metrics_df = pd.DataFrame(metrics)

        fig = px.bar(metrics_df, x='Property', y='Completeness (%)',
                     title='Data Completeness for Observatory Properties',
                     template='plotly_white',
                     color='Completeness (%)',
                     color_continuous_scale='RdYlGn',
                     range_color=[0, 100])
        fig.update_layout(showlegend=False)
        fig.show()

        print("\nCompleteness Summary:")
        print(metrics_df.to_string(index=False))
    else:
        print("No observatories found")
else:
    print("Unable to calculate completeness")

## Temporal Data Quality

Check for samples whose sampling activity has no timestamp (`sosa:resultTime`). Timestamp validity is not checked here.

In [None]:
# SPARQL: count the samples produced by a sampling (sosa:hasResult) that has
# no sosa:resultTime. DISTINCT makes the figure a number of samples rather
# than a number of (sampling, sample) pairs.
query_missing_dates = """
PREFIX sosa: <http://www.w3.org/ns/sosa/>
PREFIX emobon: <http://www.embrc.eu/emobon/EmobonOntology#>

SELECT (COUNT(DISTINCT ?sample) as ?samples_without_date)
WHERE {
  ?sample a sosa:Sample .
  ?sampling sosa:hasResult ?sample .
  FILTER NOT EXISTS { ?sampling sosa:resultTime ?date . }
}
"""

# `conn` was never defined in this notebook; query through the `GDB` source
# created in the connection cell, the same way the earlier cells do.
missing_dates_df: DataFrame = GDB.query(sparql=query_missing_dates).to_dataframe()
if not missing_dates_df.empty:
    count = missing_dates_df.iloc[0]['samples_without_date']
    print(f"Samples without sampling date: {count}")
else:
    print("No data available for temporal quality check")

## Class Instance Distribution

Overview of the number of instances for each major class.

In [None]:
# SPARQL: number of instances per class, restricted to the four classes
# listed in the FILTER.
query_class_counts = """
PREFIX sosa: <http://www.w3.org/ns/sosa/>
PREFIX emobon: <http://www.embrc.eu/emobon/EmobonOntology#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>

SELECT ?class (COUNT(?instance) as ?count)
WHERE {
  ?instance a ?class .
  FILTER(?class IN (emobon:Observatory, sosa:Sample, sosa:Sampling, sosa:FeatureOfInterest))
}
GROUP BY ?class
ORDER BY DESC(?count)
"""

# `conn` was never defined in this notebook; query through the `GDB` source
# created in the connection cell, the same way the earlier cells do.
class_counts_df: DataFrame = GDB.query(sparql=query_class_counts).to_dataframe()

if not class_counts_df.empty:
    # Render the counts as a two-column plotly table.
    fig = go.Figure(data=[go.Table(
        header=dict(values=['Class', 'Instance Count'],
                   fill_color='paleturquoise',
                   align='left'),
        cells=dict(values=[class_counts_df['class'], class_counts_df['count']],
                  fill_color='lavender',
                  align='left'))
    ])
    fig.update_layout(title='Instance Counts by Class')
    fig.show()
else:
    print("No class count data available")

## Quality Summary

Overall quality metrics summary.

In [None]:
# Print a static recap of the checks performed in this notebook.
# The report is assembled as a list of lines and written with a single print
# call; empty entries produce the blank separator lines.
banner = "=" * 50
checked_aspects = [
    "Missing mandatory properties (e.g., names, locations)",
    "Orphaned samples (samples not linked to features)",
    "Potential duplicates (same name and location)",
    "Data completeness across key properties",
    "Temporal data quality (missing timestamps)",
    "Distribution of class instances",
]
report_lines = [
    banner,
    "QUALITY ANALYSIS SUMMARY",
    banner,
    "",
    "This analysis checked the following quality aspects:",
    *[f"- {aspect}" for aspect in checked_aspects],
    "",
    "Review the results above to identify areas for improvement.",
]
print("\n".join(report_lines))