In [20]:
! pip install pandas
! pip install openpyxl

import pandas as pd
import rdflib
import hashlib
import numpy as np
import urllib.parse
from rdflib import Literal, Namespace, RDF, URIRef, Graph, RDFS, OWL
from rdflib.namespace import FOAF, XSD
from rdflib.plugins.sparql import prepareQuery
from pyspark.sql.functions import when, col, lit

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [21]:
import pandas as pd

# Define the file paths
files = ['data/PMP_dummy.xlsx']

# Load the data into Pandas dataframes.
# A file that cannot be read is reported and skipped (best effort), but the
# successful reads are the only ones appended to `dfs`.
dfs = []
for file in files:
    try:
        df = pd.read_excel(file, engine='openpyxl')
    except Exception as e:
        print(f"Error reading {file}: {e}")
    else:
        dfs.append(df)

# Fail with an explicit message when nothing could be loaded, instead of the
# unrelated IndexError that `dfs[0]` would raise on an empty list.
if not dfs:
    raise RuntimeError(f"None of the input files could be read: {files}")

# The first successfully loaded workbook is the poultry flock table.
barometer_dt_Poultry = dfs[0]



In [22]:

# Convert date columns to datetime and format as 'YYYY-MM-DD' (time part removed).
# `dt.strftime` leaves missing dates as NaN, whereas `.dt.date.astype(str)`
# would turn them into the literal string "NaT", which `pd.notna` cannot detect.
barometer_dt_Poultry['HatchDate'] = pd.to_datetime(barometer_dt_Poultry['HatchDate']).dt.strftime('%Y-%m-%d')
barometer_dt_Poultry['RemovalDate'] = pd.to_datetime(barometer_dt_Poultry['RemovalDate']).dt.strftime('%Y-%m-%d')


In [23]:
# Preview only the first rows rather than rendering the whole frame
barometer_dt_Poultry.head(10)

Unnamed: 0,VetId,HatchDate,FlockID,RemovalDate,FarmIdentification,PoultryFarmIdentification,GrowthCurve,Thinned,House
0,4856405,2019-09-05,261479,2019-10-31,2752546,38236,ALT,1,1
1,164846,2019-09-12,261895,2019-11-05,1646742,38423,ALT,0,6
2,164846,2019-11-22,266024,2020-01-15,1761167,39105,ALT,0,5
3,164846,2019-12-23,267678,2020-02-20,1977700,39488,ALT,1,1
4,164846,2019-12-27,268273,2020-02-06,2811332,40981,REG,1,2
5,164846,2020-03-31,273330,2020-05-13,2811332,40981,REG,1,2
6,164846,2020-04-06,273898,2020-05-20,1729635,38615,REG,1,3
7,487236,2020-04-07,273670,2020-05-20,1729635,38615,REG,1,2
8,487236,2020-04-08,274072,2020-05-20,6369989,38543,REG,1,4
9,487236,2020-04-21,274453,2020-06-03,2214740,39044,REG,1,3


In [24]:
# Distinct values of the FarmIdentification column
unique_farms = pd.unique(barometer_dt_Poultry['FarmIdentification'])
print(unique_farms)


[2752546 1646742 1761167 1977700 2811332 1729635 6369989 2214740  946937]


In [25]:
# Parse the existing ontology (RDF/XML) into a fresh in-memory graph
ontology_file = "ontology/LivestockHealthOntology_updated.rdf"
g = Graph()
g.parse(ontology_file, format="xml")

# Declare each namespace and register its prefix on the graph.
# Prefixes are bound in the order: lho, decide, agrovoc, inra, ncit.
LHO = Namespace("http://www.purl.org/decide/LiveStockHealthOnto/LHO#")
g.bind("lho", LHO)

DECIDE = Namespace("http://www.purl.org/decide#")
g.bind("decide", DECIDE)

AGROVOC = Namespace("http://aims.fao.org/aos/agrovoc/")
g.bind("agrovoc", AGROVOC)

INRA = Namespace("http://opendata.inra.fr/AnimalDiseasesOnto/")
g.bind("inra", INRA)

NCIT = Namespace("http://purl.obolibrary.org/obo/")
g.bind("ncit", NCIT)


In [26]:
# check if an individual exists in the graph
def individual_exists(graph, uri):
    """Report whether the pattern ``(uri, None, None)`` is contained in ``graph``.

    Args:
        graph: Container supporting the ``in`` operator on triples. With an
            rdflib ``Graph``, ``None`` matches any predicate/object, so the
            test is true when ``uri`` is the subject of at least one triple.
        uri: Subject to look for.

    Returns:
        The result of the membership test.
    """
    subject_pattern = (uri, None, None)
    return subject_pattern in graph

# Populate the graph from the flock table: one Flock individual per row, plus
# the House, PoultryFarm and Farm individuals it refers to, and the links
# Farm -> PoultryFarm -> House -> Flock.
for index, row in barometer_dt_Poultry.iterrows():
    flock_id = row['FlockID']
    house_id = row['House']
    p_farm_id = row['PoultryFarmIdentification']
    farm_id = row['FarmIdentification']

    # Construct URIs
    flock_uri = URIRef(LHO + f"FlockID_{flock_id}")
    house_uri = URIRef(LHO + f"House_{farm_id}_{house_id}")  # uniquely identify House per Farm
    p_farm_uri = URIRef(LHO + f"Farm_{p_farm_id}")
    farm_uri = URIRef(LHO + f"FarmID_{farm_id}")

    # --- Add Flock (type and ID only when the individual is new) ---
    if not individual_exists(g, flock_uri):
        g.add((flock_uri, RDF.type, LHO.CS28))  # Class: Flock
        g.add((flock_uri, LHO.CS32, Literal(str(flock_id), datatype=XSD.string)))

    # Flock data properties; a missing cell value is skipped.
    if pd.notna(row['VetId']):
        g.add((flock_uri, LHO.CS33, Literal(str(row['VetId']), datatype=XSD.string)))
    if pd.notna(row['HatchDate']):
        g.add((flock_uri, LHO.CS34, Literal(str(row['HatchDate']), datatype=XSD.string)))
    if pd.notna(row['RemovalDate']):
        g.add((flock_uri, LHO.CS35, Literal(str(row['RemovalDate']), datatype=XSD.string)))
    if pd.notna(row['GrowthCurve']):
        g.add((flock_uri, LHO.CS36, Literal(str(row['GrowthCurve']), datatype=XSD.string)))
    if pd.notna(row['Thinned']):
        g.add((flock_uri, LHO.CS37, Literal(str(int(row['Thinned'])), datatype=XSD.string)))

    # --- Add House if it doesn't exist ---
    if not individual_exists(g, house_uri):
        g.add((house_uri, RDF.type, LHO.CS00))  # Class: House
        g.add((house_uri, RDFS.label, Literal(f"House_{farm_id}_{house_id}", lang="en")))
        g.add((house_uri, LHO.CS45, Literal(str(house_id), datatype=XSD.string)))  # House number as data prop
        g.add((house_uri, RDFS.comment, Literal("House (like 1, 2, 3) appears in many farms, but it's contextual.That means Farm A might have House 1, and Farm B also might have its own House 1 — but they're not the same. so we model House individuals uniquely per Farm or PoultryFarm (e.g., House_Farm2214740_1). ", lang="en")))

    # Link: House → Flock. This must run for every row, not only when the
    # house is first created: the same house can hold several flocks over
    # time, and each of them needs its own link.
    g.add((house_uri, LHO.CS30, flock_uri))

    # --- Poultry Farm ID ---
    if not individual_exists(g, p_farm_uri):
        g.add((p_farm_uri, RDF.type, LHO.CS62))
        g.add((p_farm_uri, RDFS.label, Literal(f"PF_ID_{p_farm_id}", lang="en")))
        g.add((p_farm_uri, RDFS.comment, Literal("This is a Poultry Farm ID individual/instance", lang="en")))

    # --- General Farm ID ---
    if not individual_exists(g, farm_uri):
        g.add((farm_uri, RDF.type, LHO.FarmIdentification))
        g.add((farm_uri, RDFS.label, Literal(f"FarmID_{farm_id}", lang="en")))
        g.add((farm_uri, RDFS.comment, Literal("This is a general Farm ID individual/ instance", lang="en")))

    # --- Link Farms ---
    g.add((p_farm_uri, LHO.CS29, house_uri))     # Link: PoultryFarm → House
    g.add((farm_uri, LHO.CS63, p_farm_uri))      # Link: FarmID → PoultryFarmID



In [27]:
# Write the updated graph to disk as RDF/XML and report the target path
output_file = "output/LivestockHealthOntology_updated 1.3.rdf"
g.serialize(output_file, format="xml")
print(f"Ontology updated and saved to {output_file}")

Ontology updated and saved to output/LivestockHealthOntology_updated 1.3.rdf


In [28]:
# Fetch every literal-valued property of flock 261895, together with the
# property's rdfs:label (untagged or English) when one is available
query = """
PREFIX lho: <http://www.purl.org/decide/LiveStockHealthOnto/LHO#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

SELECT ?property ?label ?value
WHERE {
    lho:FlockID_261895 ?property ?value .
    FILTER (isLiteral(?value)) .

    OPTIONAL {
        ?property rdfs:label ?label .
        FILTER (lang(?label) = "" || lang(?label) = "en")
    }
}
"""

# Execute against the in-memory graph
results = g.query(query)

# Print "label: value"; when a property has no label, fall back to the part
# of its IRI after the last '#'
for binding in results:
    label = binding.label or binding.property.split("#")[-1]
    print(f"{label}: {binding.value}")

hasFlockID: 261895
hasVetID: 164846
hasHatchDate: 2019-09-12
hasRemovalDate: 2019-11-05
hasGrowthCurve: ALT
hasThinned: 0
