### Install and Import the necessary classes from the RDFlib library:

In [1]:
! pip install pandas
! pip install openpyxl

import pandas as pd
import rdflib
import hashlib
import numpy as np
from rdflib import Literal, Namespace, RDF, URIRef
from rdflib.namespace import FOAF, XSD
from rdflib import Graph, Namespace, RDF, RDFS, OWL
from rdflib.plugins.sparql import prepareQuery
from pyspark.sql.functions import when, col, lit




### Step 01: Define the file paths and Load the data into Pandas dataframes and Clean

In [2]:
# Define the file paths
files = ['DGZ/DECIDE_MTA_UGENT_14nov2022.xlsx', 
         'DGZ/DECIDE_MTA_UGENT_BAC_AERO_14nov2022.xlsx', 
         'DGZ/DECIDE_MTA_UGENTBAC_MYCO_14nov2022.xlsx']

# Load the data into Pandas dataframes
dfs = []
for file in files:
    df = pd.read_excel(file, engine='openpyxl')
    dfs.append(df)

barometer_dt_raw = dfs[0]
barometer_aero_cult_raw = dfs[1]
barometer_myco_cult_raw = dfs[2]


### Clean the Data 

In [3]:
import pandas as pd

# Data manipulation AEROBIC CULTURE results
barometer_aero_cult = barometer_aero_cult_raw \
    .rename(columns={"Dossiernummer": "Filenumber", 
                     "KIEMSTAAL IDENTIFICATIE": "Pathogen_identification", 
                     "KIEMSTAAL RESULTAAT": "Pathogen_result", 
                     "Staalnummer": "Samplenumber"}) \
    .assign(Parameter_code = "BAC_AERO", Result = "OK") \
    .filter(items=["Filenumber", "Pathogen_identification", "Pathogen_result", "Parameter_code", "Samplenumber", "Result"]) \
    .query('Pathogen_identification in ["Pasteurella multocida", "Mannheimia haemolytica", "Histophilus somni", "Mycoplasma bovis"]') \
    .drop_duplicates()

df_samples = pd.DataFrame([
    ("OK", "BAC_AERO", "Culture", "Pasteurella multocida"),
    ("OK", "BAC_AERO", "Culture", "Mannheimia haemolytica"),
    ("OK", "BAC_AERO", "Culture", "Histophilus somni"),
    ("OK", "BAC_MYCOPLASMA", "Culture", "Mycoplasma bovis")
], columns=["Result", "Parameter_code", "Diagnostic_test", "Pathogen_identification"])

In [4]:
# Data manipulation MYCOPLASMA CULTURE results
barometer_myco_cult = barometer_myco_cult_raw \
    .rename(columns={"Dossiernummer": "Filenumber", "KIEMSTAAL IDENTIFICATIE": "Pathogen_identification", 
                     "KIEMSTAAL RESULTAAT": "Mycoplasma_result", "Staalnummer": "Samplenumber"}) \
    .assign(Parameter_code="BAC_MYCOPLASMA", Result="OK") \
    .loc[barometer_myco_cult_raw["KIEMSTAAL IDENTIFICATIE"] == "Mycoplasma bovis"] \
    .drop_duplicates(subset=["Filenumber", "Pathogen_identification", "Mycoplasma_result", "Parameter_code", "Samplenumber", "Result"]) \
    [["Filenumber", "Pathogen_identification", "Mycoplasma_result", "Parameter_code", "Samplenumber", "Result"]]


#print(barometer_myco_cult)




In [5]:
# Data manipulation PCR results
barometer_dtt = barometer_dt_raw \
    .rename(columns={"Dossiernummer": "Filenumber", "Staalnummer": "Samplenumber", 
                     "Staaltype": "Sample_type", "PARAMETER_CODE": "Parameter_code",
                     "Onderzoek": "Pathogen", "Resultaat": "Result",
                     "Creatiedatum": "Date", "Postcode": "Postal_code",
                     "ANON_ID": "Farm_ID"}) \
    .assign(Country=np.where(barometer_dt_raw["PARAMETER_CODE"].isin(["BAC_AERO", "BAC_MYCOPLASMA"]), "Belgium", np.nan)) \
    .assign(Diagnostic_test=np.where(barometer_dt_raw["PARAMETER_CODE"].isin(["BAC_AERO", "BAC_MYCOPLASMA"]), "Culture", "PCR")) \
    .assign(Lab_reference="1") \
    .replace({"RU Broncho-alveolar lavage (BAL)": "BAL",
              "RU Anderen": "Unknown",
              "RU Swabs": "Swab",
              "RU Swab": "Swab",
              "RU Neusswab": "Swab",
              "RU Neusswabs": "Swab",
              "RU Kadaver": "Autopsy",
              "RU Organen": "Autopsy",
              np.nan: "Missing"}) \
    .assign(Breed=np.where(barometer_dt_raw["Bedrijfstype"] == "VCALF", "Veal",
                          np.where(barometer_dt_raw["MEAT"].isnull(), "Unknown",
                          np.where((barometer_dt_raw["MEAT"] / barometer_dt_raw["TOTAL"]) > 0.9, "Beef",
                          np.where((barometer_dt_raw["MILK"] / barometer_dt_raw["TOTAL"]) > 0.9, "Dairy",
                          "Mixed"))))) \
    [["Filenumber", "Pathogen", "Result", "Parameter_code", "Samplenumber", "Result", "Country", "Diagnostic_test", "Lab_reference", "Sample_type", "Postal_code", "Farm_ID", "Breed"]]

In [6]:
# Data manipulation PCR results
barometer_dtt = barometer_dt_raw.rename(columns={"Dossiernummer": "Filenumber",
                                                  "Staalnummer": "Samplenumber",
                                                  "Staaltype": "Sample_type",
                                                  "PARAMETER_CODE": "Parameter_code",
                                                  "Onderzoek": "Pathogen",
                                                  "Resultaat": "Result",
                                                  "Creatiedatum": "Date",
                                                  "Postcode": "Postal_code",
                                                  "ANON_ID": "Farm_ID"})
    
barometer_dtt["Country"] = np.where(barometer_dtt["Parameter_code"].isin(["BAC_AERO", "BAC_MYCOPLASMA"]), "Belgium", np.nan)
barometer_dtt["Diagnostic_test"] = np.where(barometer_dtt["Parameter_code"].isin(["BAC_AERO", "BAC_MYCOPLASMA"]), "Culture", "PCR")
barometer_dtt["Lab_reference"] = "1"

sample_type_mapping = {"RU Broncho-alveolar lavage (BAL)": "BAL",
                       "RU Anderen": "Unknown",
                       "RU Swabs": "Swab",
                       "RU Swab": "Swab",
                       "RU Neusswab": "Swab",
                       "RU Neusswabs": "Swab",
                       "RU Kadaver": "Autopsy",
                       "RU Organen": "Autopsy"}

barometer_dtt["Sample_type"] = barometer_dtt["Sample_type"].map(sample_type_mapping).fillna("Missing")

breed_mapping = {"VCALF": "Veal",
                 "MEAT": np.nan}
barometer_dtt["Breed"] = np.select([(barometer_dtt["Bedrijfstype"] == "VCALF"),
                                     (barometer_dtt["MEAT"].isnull()),
                                     ((barometer_dtt["MEAT"] / barometer_dtt["TOTAL"]) > 0.9),
                                     ((barometer_dtt["MILK"] / barometer_dtt["TOTAL"]) > 0.9)],
                                    ["Veal", "Unknown", "Beef", "Dairy"],
                                    default="Mixed")

pathogen_mapping = {"AD Pasteurella multocida Ag (PCR)": "Pasteurella multocida",
                    "AD Pasteurella multocida Ag pool (PCR)": "Pasteurella multocida",
                    "AD P. multocida Ag (PCR)": "Pasteurella multocida",
                    "AD P. multocida Ag pool (PCR)": "Pasteurella multocida",
                    "AD Mannheimia haemolytica Ag (PCR)": "Mannheimia haemolytica",
                    "AD Mannheimia haemolytica Ag pool (PCR)": "Mannheimia haemolytica",
                    "RU PI3 Ag (PCR)": "PI3",
                    "RU PI3 Ag pool (PCR)": "PI3",
                    "RU BRSV Ag (PCR)": "BRSV",
                    "RU BRSV Ag pool (PCR)": "BRSV",
                    "AD Histophilus somnus (PCR)": "Histophilus somni",
                    "AD Histophilus somnus Ag (PCR)": "Histophilus somni",
                    "AD Histophilus somnus Ag pool (PCR)": "Histophilus somni",
                    "AD Histophilus somni Ag (PCR)": "Histophilus somni",
                    "AD Histophilus somni Ag pool (PCR)": "Histophilus somni",
                    "RU Mycoplasma bovis (PCR)": "Mycoplasma bovis",
                    "RU Mycoplasma bovis Ag pool (PCR)": "Mycoplasma bovis",
                    "RU Mycoplasma bovis Ag (PCR)": "Mycoplasma bovis",
                    "AD Corona Ag (PCR)": "BCV",
                    "AD Corona Ag pool (PCR)": "BCV"}

# Create a new column 'Disease' based on the mapping between Pathogen and Disease
barometer_dtt["Disease"] = barometer_dtt["Pathogen"].replace(pathogen_mapping)

# Create a mapping between postal codes and provinces
province_map = [(1000, 1299, "Brussels"),
                (1300, 1499, "Walloon Brabant"),
                (1500, 1999, "Flemish Brabant"),
                (3000, 3499, "Antwerp"),
                (2000, 2999, "Limburg"),
                (5000, 5999, "Namur"),
                (6000, 6599, "Hainaut"),
                (7000, 7999, "Hainaut"),
                (6600, 6999, "Luxembourg"),
                (8000, 8999, "West Flanders")]

# Sort the province_map list by the first element of each tuple
province_map.sort(key=lambda x: x[0])

# Create a new column 'Province' based on the mapping between Postal_code and Province
barometer_dtt["Province"] = pd.cut(barometer_dtt["Postal_code"], 
                               bins=[p[0]-1 for p in province_map] + [max([p[1] for p in province_map])+1],
                               labels=[p[2] for p in province_map],
                               ordered=False)


# Select columns of interest and drop duplicates
barometer_dtt = barometer_dtt.loc[:, ["Filenumber", "Diagnostic_test", "Samplenumber", "Country", 
                                      "Lab_reference", "Sample_type", "Breed", "Parameter_code", 
                                      "Result", "Pathogen", "Date", "Postal_code", "Province", "Farm_ID"]]\
                             .drop_duplicates()

# Show the resulting dataframe
#print(barometer_dtt.head())


In [7]:
 #Join dataframes
barometer = pd.merge(barometer_dtt, df_samples, on=['Diagnostic_test', 'Result', 'Parameter_code'], how='left')
barometer = pd.merge(barometer, barometer_aero_cult, on=['Filenumber', 'Samplenumber', 'Result', 'Parameter_code', 'Pathogen_identification'], how='left')
barometer = pd.merge(barometer, barometer_myco_cult, on=['Filenumber', 'Samplenumber', 'Result', 'Parameter_code', 'Pathogen_identification'], how='left')

# Replace values in Pathogen column
barometer['Pathogen'] = np.where(barometer['Pathogen'] == 'Pasteurella multocida', 'PM', 
                                 np.where(barometer['Pathogen'] == 'Histophilus somni', 'HS',
                                          np.where(barometer['Pathogen'] == 'Mannheimia haemolytica', 'MH', 
                                                   np.where(barometer['Pathogen'] == 'Mycoplasma bovis', 'MB', barometer['Pathogen']))))

barometer['Pathogen'] = np.where(barometer['Pathogen_identification'] == 'Pasteurella multocida', 'PM', 
                                 np.where(barometer['Pathogen_identification'] == 'Histophilus somni', 'HS',
                                          np.where(barometer['Pathogen_identification'] == 'Mannheimia haemolytica', 'MH', 
                                                   np.where(barometer['Pathogen_identification'] == 'Mycoplasma bovis', 'MB', barometer['Pathogen']))))

# Replace values in Result column
conditions = [barometer['Result'].isin(["Twijfelachtig (PCR)", "POSITIEF", "GEDETECTEERD", "GEDETECTEERD (sterk)", "GEDETECTEERD (zwak)", "GEDETECTEERD (matig)", "GEDETECTEERD (zeer sterk)", "GEDETECTEERD (zeer zwak)"]),
              barometer['Result'].isin(["negatief", "Niet gedetecteerd"]),
              barometer['Result'].isin(["NI", "niet interpreteerbaar", "Inhibitie"]),
              (barometer['Parameter_code'] == 'BAC_AERO') & (barometer['Pathogen_result'].isnull()),
              (barometer['Parameter_code'] == 'BAC_AERO') & (barometer['Pathogen_result'].notnull()),
              (barometer['Parameter_code'] == 'BAC_MYCOPLASMA') & (barometer['Mycoplasma_result'].isnull()),
              (barometer['Parameter_code'] == 'BAC_MYCOPLASMA') & (barometer['Mycoplasma_result'] == 'neg'),
              (barometer['Parameter_code'] == 'BAC_MYCOPLASMA') & (barometer['Mycoplasma_result'].str.contains('POS'))]

choices = [1, 0, None, 0, 1, None, 0, 1]

barometer['Result'] = np.select(conditions, choices, default=None)
#print(barometer.head())

### Step 02: Create an RDF graph and namespaces.

In [8]:
g = rdflib.Graph()
onto = Namespace("http://www.purl.org/decide/LivestockHealthOnto")
g.bind('onto', onto)
xsd = Namespace('http://www.w3.org/2001/XMLSchema#')
g.bind('xsd', xsd)

###  Step 03: Iterate over the Panda DataFrame and map to ontology properties:

In [9]:
# iterate over each row in the dataframe and
for _, row in barometer.iterrows():

# Generate anonymized values for file number and sample number
    FileNumber = hashlib.sha256(str(row.iloc[0]).encode()).hexdigest()
    SampleNumber = hashlib.sha256(str(row.iloc[2]).encode()).hexdigest()
    
    CattleSample = onto[f'CattleSample{row[0]}']
    g.add((CattleSample, RDF.type, onto.CattleSample))
        # Add anonymized values to the RDF graph
    g.add((CattleSample, onto.hasFileNumber, Literal(FileNumber, datatype=XSD.string)))
    g.add((CattleSample, onto.hasSampleNumber, Literal(SampleNumber, datatype=XSD.string)))
    g.add((CattleSample, onto.hasDiagnosticTest, Literal(row[1], datatype=XSD.string)))
    g.add((CattleSample, onto.hasCountry, Literal(row[3], datatype=XSD.string)))
    g.add((CattleSample, onto.hasLabReference, Literal(row[4], datatype=XSD.string)))
    g.add((CattleSample, onto.hasSampleType, Literal(row[5], datatype=XSD.string)))
    g.add((CattleSample, onto.hasBreed, Literal(row[6], datatype=XSD.string)))
    g.add((CattleSample, onto.hasParameterCode, Literal(row[7], datatype=XSD.string)))
    g.add((CattleSample, onto.hasResult, Literal(row[8], datatype=XSD.string)))
    g.add((CattleSample, onto.hasPathogen, Literal(row[9], datatype=XSD.string)))
    g.add((CattleSample, onto.hasDate, Literal(row[10], datatype=XSD.string)))
    g.add((CattleSample, onto.hasPostalCode, Literal(row[11], datatype=XSD.string)))
    g.add((CattleSample, onto.hasProvince, Literal(row[12], datatype=XSD.string)))
    g.add((CattleSample, onto.hasFarmID, Literal(row[13], datatype=XSD.string)))
    g.add((CattleSample, onto.hasPathogenIdentification, Literal(row[14], datatype=XSD.string)))
    g.add((CattleSample, onto.hasPathogenResult, Literal(row[15], datatype=XSD.string)))
    g.add((CattleSample, onto.hasMicoplasmaResult, Literal(row[16], datatype=XSD.string)))

# output RDF graph to file (replace with your desired filename)
g.serialize(destination='output/RDFoutputCattleSampleAnomizedElena.ttl', format='turtle')


<Graph identifier=N3cd53be78f604910965acb2d5bf77808 (<class 'rdflib.graph.Graph'>)>

### Step 4: Load the RDF data and ontology into a Panda DataFrame: 

In [10]:
# Define the original path
path_to_RDF = "output/RDFoutputCattleSampleAnomizedElena.ttl"

# Try to parse the file and catch any errors
try:
    # Create a new graph
    g = Graph()

    # Parse the RDF file in Turtle format
    g.parse(path_to_RDF, format='ttl')

    # Parse the ontology file in OWL format and add it to the graph
    path_to_ontology = "Ontology/LivestockHealthOnto1.0Elena.owl"
    g.parse(path_to_ontology, format="xml")
    
except Exception as e:
    # Print the error message
    print(f"An error occurred while parsing the RDF file: {e}")

In [11]:
#Use RDFS or OWL reasoning to infer additional knowledge
g.bind('rdfs', RDFS)
g.bind('owl', OWL)
g.bind('onto', Namespace("http://www.purl.org/decide/LivestockHealthOnto"))

### Step 05: Query the data from updated ontology 

In [12]:
# define the SPARQL query and Query the data from updated ontology
query = """
PREFIX onto: <http://www.purl.org/decide/LivestockHealthOnto#>
SELECT ?FileNumber ?DiagnosticTest ?SampleNumber ?Breed ?LabReference  ?SampleType ?Result ?Pathogen ?PostalCode ?Province ?PathogenIdentification ?PathogenResult ?MycoplasmaResult
WHERE {
  ?CattleSample onto:hasFileNumber ?FileNumber . 
  ?CattleSample onto:hasDiagnosticTest ?DiagnosticTest .
  ?CattleSample onto:hasSampleNumber ?SampleNumber .
  ?CattleSample onto:hasBreed ?Breed .
  ?CattleSample onto:hasLabReference ?LabReference .
  ?CattleSample onto:hasSampleType ?SampleType .
  ?CattleSample onto:hasResult ?Result .
  ?CattleSample onto:hasPathogen ?Pathogen .
  ?CattleSample onto:hasPostalCode ?PostalCode .
  ?CattleSample onto:hasProvince ?Province .
  ?CattleSample onto:hasPathogenIdentification ?PathogenIdentification .
  ?CattleSample onto:hasPathogenResult ?PathogenResult .
  ?CattleSample onto:hasMicoplasmaResult ?MycoplasmaResult .

}
"""

# execute the query and retrieve the results
results = g.query(query)

# convert the results to a Pandas dataframe
data = []
for row in results:
    data.append(list(row))
df = pd.DataFrame(data, columns=["FileNumber", "DiagnosticTest", "SampleNumber", "Breed", "LabReference", "SampleType", "Result", "Pathogen","PostalCode","Province","PathogenIdentification", "PathogenResult", "MycoplasmaResult"])

# display the dataframe
df.head()

Unnamed: 0,FileNumber,DiagnosticTest,SampleNumber,Breed,LabReference,SampleType,Result,Pathogen,PostalCode,Province,PathogenIdentification,PathogenResult,MycoplasmaResult
0,3c0000b7a7428b2268565718a7396382c89329cd756fd1...,PCR,897ece52d4952058da8da05b9137eccfd3ba2aed427e3c...,Mixed,1,BAL,0.0,AD Corona Ag (PCR),2910,Limburg,,,
1,3c0000b7a7428b2268565718a7396382c89329cd756fd1...,PCR,897ece52d4952058da8da05b9137eccfd3ba2aed427e3c...,Mixed,1,BAL,1.0,AD Corona Ag (PCR),2910,Limburg,,,
2,3c0000b7a7428b2268565718a7396382c89329cd756fd1...,PCR,897ece52d4952058da8da05b9137eccfd3ba2aed427e3c...,Mixed,1,BAL,,AD Corona Ag (PCR),2910,Limburg,,,
3,3c0000b7a7428b2268565718a7396382c89329cd756fd1...,PCR,897ece52d4952058da8da05b9137eccfd3ba2aed427e3c...,Mixed,1,BAL,0.0,AD Histophilus somnus (PCR),2910,Limburg,,,
4,3c0000b7a7428b2268565718a7396382c89329cd756fd1...,PCR,897ece52d4952058da8da05b9137eccfd3ba2aed427e3c...,Mixed,1,BAL,1.0,AD Histophilus somnus (PCR),2910,Limburg,,,


In [13]:
# define the SPARQL query 02
query = """
PREFIX onto: <http://www.purl.org/decide/LivestockHealthOnto>
SELECT ?FileNumber ?DiagnosticTest ?SampleNumber ?Breed ?LabReference ?SampleType ?Result ?Pathogen ?PostalCode ?Province ?PathogenIdentification ?PathogenResult ?MycoplasmaResult
WHERE {
  ?CattleSample onto:hasFileNumber ?FileNumber . 
  ?CattleSample onto:hasDiagnosticTest ?DiagnosticTest .
  ?CattleSample onto:hasSampleNumber ?SampleNumber .
  ?CattleSample onto:hasBreed ?Breed .
  FILTER (?Breed = "Beef")
  ?CattleSample onto:hasLabReference ?LabReference .
  ?CattleSample onto:hasSampleType ?SampleType .
  ?CattleSample onto:hasResult ?Result .
  ?CattleSample onto:hasPathogen ?Pathogen .
  ?CattleSample onto:hasPostalCode ?PostalCode .
  ?CattleSample onto:hasProvince ?Province .
  ?CattleSample onto:hasPathogenIdentification ?PathogenIdentification .
  ?CattleSample onto:hasPathogenResult ?PathogenResult .
  ?CattleSample onto:hasMicoplasmaResult ?MycoplasmaResult .
}
"""

# execute the query and retrieve the results
results = g.query(query)

# convert the results to a Pandas dataframe
data = []
for row in results:
    data.append(list(row))
df = pd.DataFrame(data, columns=["FileNumber", "DiagnosticTest", "SampleNumber", "Breed", "LabReference", "SampleType", "Result", "Pathogen","PostalCode","Province","PathogenIdentification", "PathogenResult", "MycoplasmaResult"])

# display the dataframe
df.head()

Unnamed: 0,FileNumber,DiagnosticTest,SampleNumber,Breed,LabReference,SampleType,Result,Pathogen,PostalCode,Province,PathogenIdentification,PathogenResult,MycoplasmaResult
0,952fe97b86661dc65d3c1bae7e87dcac57f0cd96cc92b2...,PCR,22a35575c079e307143b9fcfb252d27eefea042fbbc958...,Beef,1,BAL,0,AD Corona Ag (PCR),8700,West Flanders,,,
1,952fe97b86661dc65d3c1bae7e87dcac57f0cd96cc92b2...,PCR,6e116f0e1b58751c4ad0d6144618e77ffe8f1792437291...,Beef,1,BAL,0,AD Corona Ag (PCR),8700,West Flanders,,,
2,952fe97b86661dc65d3c1bae7e87dcac57f0cd96cc92b2...,PCR,f81c48f81ec2fd83f7485884ec165b48241202fd2430b3...,Beef,1,BAL,0,AD Corona Ag (PCR),8700,West Flanders,,,
3,952fe97b86661dc65d3c1bae7e87dcac57f0cd96cc92b2...,PCR,22a35575c079e307143b9fcfb252d27eefea042fbbc958...,Beef,1,BAL,1,AD Corona Ag (PCR),8700,West Flanders,,,
4,952fe97b86661dc65d3c1bae7e87dcac57f0cd96cc92b2...,PCR,6e116f0e1b58751c4ad0d6144618e77ffe8f1792437291...,Beef,1,BAL,1,AD Corona Ag (PCR),8700,West Flanders,,,


In [14]:
# define the SPARQL query 01
query = """
PREFIX onto: <http://www.purl.org/decide/LivestockHealthOnto>
SELECT ?FileNumber ?DiagnosticTest ?SampleNumber ?Breed ?LabReference ?SampleType ?Result ?Pathogen ?PostalCode ?Province ?PathogenIdentification ?PathogenResult ?MycoplasmaResult
WHERE {
  ?CattleSample onto:hasFileNumber ?FileNumber . 
  ?CattleSample onto:hasDiagnosticTest ?DiagnosticTest .
  ?CattleSample onto:hasSampleNumber ?SampleNumber .
  ?CattleSample onto:hasBreed ?Breed .
  ?CattleSample onto:hasLabReference ?LabReference .
  ?CattleSample onto:hasSampleType ?SampleType .
  ?CattleSample onto:hasResult ?Result .
  ?CattleSample onto:hasPathogen ?Pathogen .
  ?CattleSample onto:hasPostalCode ?PostalCode .
  ?CattleSample onto:hasProvince ?Province .
  ?CattleSample onto:hasPathogenIdentification ?PathogenIdentification .
  ?CattleSample onto:hasPathogenResult ?PathogenResult .
  FILTER (?PathogenResult = "POS++")
  ?CattleSample onto:hasMicoplasmaResult ?MycoplasmaResult .
  FILTER (?MycoplasmaResult = "POS++")
}
"""
# execute the query and retrieve the results
results = g.query(query)

# convert the results to a Pandas dataframe
data = []
for row in results:
    data.append(list(row))
df = pd.DataFrame(data, columns=["FileNumber", "DiagnosticTest", "SampleNumber", "Breed", "LabReference", "SampleType", "Result", "Pathogen","PostalCode","Province","PathogenIdentification", "PathogenResult", "MycoplasmaResult"])

# display the dataframe
df.head()

Unnamed: 0,FileNumber,DiagnosticTest,SampleNumber,Breed,LabReference,SampleType,Result,Pathogen,PostalCode,Province,PathogenIdentification,PathogenResult,MycoplasmaResult
0,381ab40916b42502cd14b9de7201d798d31c35255ccd2a...,Culture,854116431e19bdb205ca2a76204cda6aba4069bb500bdc...,Beef,1,Autopsy,0,AD Corona Ag (PCR),9800,,Histophilus somni,POS++,POS++
1,381ab40916b42502cd14b9de7201d798d31c35255ccd2a...,Culture,854116431e19bdb205ca2a76204cda6aba4069bb500bdc...,Beef,1,Autopsy,1,AD Corona Ag (PCR),9800,,Histophilus somni,POS++,POS++
2,381ab40916b42502cd14b9de7201d798d31c35255ccd2a...,Culture,854116431e19bdb205ca2a76204cda6aba4069bb500bdc...,Beef,1,Autopsy,0,AD Corona Ag (PCR),9800,,Mannheimia haemolytica,POS++,POS++
3,381ab40916b42502cd14b9de7201d798d31c35255ccd2a...,Culture,854116431e19bdb205ca2a76204cda6aba4069bb500bdc...,Beef,1,Autopsy,1,AD Corona Ag (PCR),9800,,Mannheimia haemolytica,POS++,POS++
4,381ab40916b42502cd14b9de7201d798d31c35255ccd2a...,Culture,854116431e19bdb205ca2a76204cda6aba4069bb500bdc...,Beef,1,Autopsy,0,AD Corona Ag (PCR),9800,,Mycoplasma bovis,POS++,POS++


In [15]:
# define the SPARQL query with bound variables
query = prepareQuery(
    """
    PREFIX onto: <http://www.purl.org/decide/LivestockHealthOnto>
    SELECT ?FileNumber ?DiagnosticTest ?SampleNumber ?Breed ?LabReference ?SampleType ?Result ?Pathogen ?PostalCode ?Province ?PathogenIdentification ?PathogenResult ?MycoplasmaResult
    WHERE {
        ?CattleSample onto:hasFileNumber ?FileNumber .
        ?CattleSample onto:hasDiagnosticTest ?DiagnosticTest .
        ?CattleSample onto:hasSampleNumber ?SampleNumber .
        ?CattleSample onto:hasBreed ?Breed .
        ?CattleSample onto:hasLabReference ?LabReference .
        ?CattleSample onto:hasSampleType ?SampleType .
        ?CattleSample onto:hasResult ?Result .
        ?CattleSample onto:hasPathogen ?Pathogen .
        ?CattleSample onto:hasPostalCode ?PostalCode .
        ?CattleSample onto:hasProvince ?Province .
        ?CattleSample onto:hasPathogenIdentification ?PathogenIdentification .
        ?CattleSample onto:hasPathogenResult ?PathogenResult .
        ?CattleSample onto:hasMicoplasmaResult ?MycoplasmaResult .
    }
    """
)

# execute the query and retrieve the results
results = g.query(query)

# convert the results to a Pandas dataframe
data = []
for row in results:
    data.append(list(row))
df = pd.DataFrame(
    data,
    columns=[
        "FileNumber",
        "DiagnosticTest",
        "SampleNumber",
        "Breed",
        "LabReference",
        "SampleType",
        "Result",
        "Pathogen",
        "PostalCode",
        "Province",
        "PathogenIdentification",
        "PathogenResult",
        "MycoplasmaResult",
    ],
)

# display the dataframe
df.head()

Unnamed: 0,FileNumber,DiagnosticTest,SampleNumber,Breed,LabReference,SampleType,Result,Pathogen,PostalCode,Province,PathogenIdentification,PathogenResult,MycoplasmaResult
0,3c0000b7a7428b2268565718a7396382c89329cd756fd1...,PCR,897ece52d4952058da8da05b9137eccfd3ba2aed427e3c...,Mixed,1,BAL,0.0,AD Corona Ag (PCR),2910,Limburg,,,
1,3c0000b7a7428b2268565718a7396382c89329cd756fd1...,PCR,897ece52d4952058da8da05b9137eccfd3ba2aed427e3c...,Mixed,1,BAL,1.0,AD Corona Ag (PCR),2910,Limburg,,,
2,3c0000b7a7428b2268565718a7396382c89329cd756fd1...,PCR,897ece52d4952058da8da05b9137eccfd3ba2aed427e3c...,Mixed,1,BAL,,AD Corona Ag (PCR),2910,Limburg,,,
3,3c0000b7a7428b2268565718a7396382c89329cd756fd1...,PCR,897ece52d4952058da8da05b9137eccfd3ba2aed427e3c...,Mixed,1,BAL,0.0,AD Histophilus somnus (PCR),2910,Limburg,,,
4,3c0000b7a7428b2268565718a7396382c89329cd756fd1...,PCR,897ece52d4952058da8da05b9137eccfd3ba2aed427e3c...,Mixed,1,BAL,1.0,AD Histophilus somnus (PCR),2910,Limburg,,,


In [None]:
query = prepareQuery(
    """
PREFIX onto: <http://www.purl.org/decide/LivestockHealthOnto>

SELECT ?CattleSample ?FileNumber ?DiagnosticTest ?SampleNumber ?Breed ?LabReference ?SampleType ?Result ?Pathogen ?PostalCode ?Province ?PathogenIdentification ?PathogenResult ?MycoplasmaResult
WHERE {
  ?CattleSample onto:hasFileNumber ?FileNumber .
  ?CattleSample onto:hasDiagnosticTest ?DiagnosticTest .
  ?CattleSample onto:hasSampleNumber ?SampleNumber .
  ?CattleSample onto:hasBreed ?Breed .
  ?CattleSample onto:hasLabReference ?LabReference .
  ?CattleSample onto:hasSampleType ?SampleType .
  ?CattleSample onto:hasResult ?Result .
  ?CattleSample onto:hasPathogen ?Pathogen .
  ?CattleSample onto:hasPostalCode ?PostalCode .
  ?CattleSample onto:hasProvince ?Province .
  ?CattleSample onto:hasPathogenIdentification ?PathogenIdentification .
  ?CattleSample onto:hasPathogenResult ?PathogenResult .
  ?CattleSample onto:hasMicoplasmaResult ?MycoplasmaResult .

  FILTER(?DiagnosticTest = "your_specific_value" && ?Result = "positive")
}
 """

In [None]:
#query 2 modification:
PREFIX onto: <http://www.purl.org/decide/LivestockHealthOnto#>

SELECT ?FileNumber ?DiagnosticTest ?SampleNumber ?Breed ?LabReference ?SampleType ?Result ?Pathogen ?PostalCode ?Province ?PathogenIdentification ?PathogenResult ?MycoplasmaResult
WHERE {
  ?CattleSample onto:hasFileNumber ?FileNumber . 
  ?CattleSample onto:hasDiagnosticTest ?DiagnosticTest .
  FILTER (?DiagnosticTest = "PCR")
  ?CattleSample onto:hasSampleNumber ?SampleNumber .
  ?CattleSample onto:hasBreed ?Breed .
  FILTER (?Breed = "Beef")
  ?CattleSample onto:hasLabReference ?LabReference .
  ?CattleSample onto:hasSampleType ?SampleType .
  ?CattleSample onto:hasResult ?Result .
  FILTER (?Result = "positive")
  ?CattleSample onto:hasPathogen ?Pathogen .
  ?CattleSample onto:hasPostalCode ?PostalCode .
  ?CattleSample onto:hasProvince ?Province .
  OPTIONAL {
    ?CattleSample onto:hasPathogenIdentification ?PathogenIdentification .
  }
  ?CattleSample onto:hasPathogenResult ?PathogenResult .
  ?CattleSample onto:hasMicoplasmaResult ?MycoplasmaResult .
  FILTER (!BOUND(?PathogenIdentification))
}
This query includes an OPTIONAL block for the hasPathogenIdentification property, which means it will retrieve CattleSamples where the hasPathogenIdentification property is not present. By using FILTER (!BOUND(?PathogenIdentification)), you ensure that only CattleSamples without a hasPathogenIdentification property will be included in the results.

By executing this modified query, you should retrieve the desired results with the "hasPathogenIdentification" property excluded for CattleSamples of the "Beef" breed and "PCR" diagnostic test.






