In [1]:
! pip install pandas
! pip install openpyxl

import pandas as pd
import rdflib
import hashlib
import numpy as np
from datetime import datetime
from pandas.api.types import CategoricalDtype
from rdflib import Literal, Namespace, RDF, URIRef
from rdflib.namespace import FOAF, XSD
from rdflib import Graph, Namespace, RDF, RDFS, OWL
from rdflib.plugins.sparql import prepareQuery
from pyspark.sql.functions import when, col, lit



### Step01: Define The file paths and load Data into Pandas dataframes and Clean


In [2]:
# Define the file paths
files = ['Data/Pathosen/AllBovineRespiratory_NegativesIncluded.csv']

# Load the data into Pandas dataframes
dfs = []
for file in files:
    df = pd.read_csv(file)
    dfs.append(df)

barometer_dt_raw = dfs[0]

In [3]:
# Rename columns
barometer_dt_raw.rename(columns={
    'sample_id': 'FileNumber',
    'farm_id': 'FarmID',
    'created': 'Date'
}, inplace=True)

# Mutate new columns
barometer_dt_raw['LabReference'] = '4'
barometer_dt_raw['DiagnosticTest'] = 'NPS'
barometer_dt_raw['Breed'] = 'Unknown'
barometer_dt_raw['Province'] = pd.NA

# Map values for Country column
country_mapping = {
    'BE': 'Belgium',
    'NL': 'The Netherlands'
}
barometer_dt_raw['Country'] = barometer_dt_raw['country'].map(country_mapping)

# Map values for Sample_type column
sample_type_mapping = {
    'balFluid': 'BAL',
    'noseSwab': 'Swab'
}
barometer_dt_raw['SampleType'] = barometer_dt_raw['type'].map(sample_type_mapping).fillna('Other')
barometer_dt_raw['Province'].fillna('Unknown', inplace=True)

# Fill missing values in pathogens column with empty string
barometer_dt_raw['pathogens'].fillna('', inplace=True)

# Create new columns for pathogens
barometer_dt_raw['HS'] = barometer_dt_raw['pathogens'].str.contains('Histophilus somni').astype(int)
barometer_dt_raw['MH'] = barometer_dt_raw['pathogens'].str.contains('Mannheimia haemolytica').astype(int)
barometer_dt_raw['PM'] = barometer_dt_raw['pathogens'].str.contains('Pasteurella multocida').astype(int)
barometer_dt_raw['BCV'] = barometer_dt_raw['pathogens'].str.contains('Bovine coronavirus').astype(int)
barometer_dt_raw['MB'] = barometer_dt_raw['pathogens'].str.contains('Mycoplasmopsis bovis').astype(int)
barometer_dt_raw['PI3'] = barometer_dt_raw['pathogens'].str.contains('Bovine respirovirus 3').astype(int)
barometer_dt_raw['BRSV'] = barometer_dt_raw['pathogens'].str.contains('Bovine orthopneumovirus').astype(int)

# Select desired columns
barometer_dt = barometer_dt_raw[['FileNumber', 'LabReference', 'Country', 'Breed', 'Province', 'FarmID',
                                 'DiagnosticTest', 'SampleType', 'PM', 'MH', 'HS', 'MB', 'BRSV', 'PI3', 'BCV', 'Date']]

# Drop duplicates
barometer_dt.drop_duplicates(inplace=True)

# Convert Filenumber and Farm_ID to SHA256 hash
barometer_dt['FileNumber'] = barometer_dt['FileNumber'].apply(lambda x: hashlib.sha256(str(x).encode()).hexdigest())
barometer_dt['FarmID'] = barometer_dt['FarmID'].apply(lambda x: hashlib.sha256(str(x).encode()).hexdigest())




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  barometer_dt.drop_duplicates(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  barometer_dt['FileNumber'] = barometer_dt['FileNumber'].apply(lambda x: hashlib.sha256(str(x).encode()).hexdigest())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  barometer_dt['FarmID'] = barometer_dt['FarmID'].apply(lambda x: hashlib.sha256(str(x).encode()).hexdig

In [4]:
# Convert Date column to datetime
barometer_dt['Date'] = pd.to_datetime(barometer_dt['Date'])

# Floor date to 1st of month
barometer_dt['Floored_date'] = barometer_dt['Date'].dt.to_period('M').dt.to_timestamp()

# Aggregate data based on farm_ID & month
barometer_groupby = barometer_dt.groupby(['LabReference', 'Country', 'Breed', 'Floored_date', 'Province', 'FarmID', 'DiagnosticTest', 'SampleType'])[['PM', 'MH', 'HS', 'MB', 'BRSV', 'PI3', 'BCV']].max(min_count=1)

# Convert to long
barometer_long = barometer_groupby.reset_index().melt(id_vars=['LabReference', 'Country', 'Breed', 'Floored_date', 'Province', 'FarmID', 'DiagnosticTest', 'SampleType'],
                                                      var_name='Pathogen', value_name='Result')

# Convert Floored_date back to datetime (for consistency)
barometer_long['Floored_date'] = pd.to_datetime(barometer_long['Floored_date'])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  barometer_dt['Date'] = pd.to_datetime(barometer_dt['Date'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  barometer_dt['Floored_date'] = barometer_dt['Date'].dt.to_period('M').dt.to_timestamp()


### Step 02: Create RDF graph and namespaces 

In [5]:
g = rdflib.Graph()
onto = Namespace("http://www.purl.org/decide/LivestockHealthOnto")
g.bind('onto', onto)
xsd = Namespace('http://www.w3.org/2001/XMLSchema#')
g.bind('xsd', xsd)

###  Step 03: Iterate over the Panda DataFrame and map to ontology properties:

In [6]:
# Iterate through the rows of the barometer_long dataframe and create RDF triples
for index, row in barometer_long.iterrows():
    # Create a URI for the CattleSample based on the index
    CattleSample = onto[f"CattleSample_{index}"]
    
    # Add triples for each attribute in the row
    g.add((CattleSample, onto.hasDiagnosticTest, Literal(row['DiagnosticTest'], datatype=XSD.string)))
    g.add((CattleSample, onto.hasCountry, Literal(row['Country'], datatype=XSD.string)))
    g.add((CattleSample, onto.hasBreed, Literal(row['Breed'], datatype=XSD.string)))
    g.add((CattleSample, onto.hasDate, Literal(row['Floored_date'], datatype=XSD.string)))
    g.add((CattleSample, onto.hasProvince, Literal(row['Province'], datatype=XSD.string)))
    g.add((CattleSample, onto.hasFarmIdentification, Literal(row['FarmID'], datatype=XSD.string)))
    g.add((CattleSample, onto.hasSampleType, Literal(row['SampleType'], datatype=XSD.string)))
    g.add((CattleSample, onto.hasPathogen, Literal(row['Pathogen'], datatype=XSD.string)))
    g.add((CattleSample, onto.hasResult, Literal(row['Result'], datatype=XSD.string)))
    g.add((CattleSample, onto.hasLabreference, Literal(row['LabReference'], datatype=XSD.string)))
    
g.serialize(destination='output/RDFoutputCattleSamplePathosen.ttl', format='turtle')


<Graph identifier=Naf158a00acc046e4a6fe4694c3ac3d67 (<class 'rdflib.graph.Graph'>)>

### Step 4: Load the RDF data and ontology into a Panda DataFrame: 

In [7]:
# Define the original path
path_to_RDF = "output/RDFoutputCattleSamplePathosen.ttl"

# Try to parse the file and catch any errors
try:
    # Create a new graph
    g = Graph()

    # Parse the RDF file in Turtle format
    g.parse(path_to_RDF, format='ttl')

    # Parse the ontology file in OWL format and add it to the graph
    path_to_ontology = "Ontology/LivestockHealthOnto1.0.owl"
    g.parse(path_to_ontology, format="xml")
    
except Exception as e:
    # Print the error message
    print(f"An error occurred while parsing the RDF file: {e}")

In [8]:
#Use RDFS or OWL reasoning to infer additional knowledge
g.bind('rdfs', RDFS)
g.bind('owl', OWL)
g.bind('onto', Namespace("http://www.purl.org/decide/LivestockHealthOnto"))

### Step 05: Query the data from updated ontology 

In [9]:
# Define the SPARQL query and Query the data from the updated ontology (Simple Query)
query = """
PREFIX onto: <http://www.purl.org/decide/LivestockHealthOnto>
SELECT ?FarmIdentification ?DiagnosticTest ?SampleType ?Date ?Breed ?LabReference ?Pathogen ?Country ?Province ?Result
WHERE {
  ?CattleSample onto:hasFarmIdentification ?FarmIdentification .
  ?CattleSample onto:hasDiagnosticTest ?DiagnosticTest .
  ?CattleSample onto:hasSampleType ?SampleType .
  ?CattleSample onto:hasDate ?Date .
  ?CattleSample onto:hasBreed ?Breed .
  ?CattleSample onto:hasLabreference ?LabReference .
  ?CattleSample onto:hasPathogen ?Pathogen .
  ?CattleSample onto:hasCountry ?Country .
  ?CattleSample onto:hasProvince ?Province .
  ?CattleSample onto:hasResult ?Result .
  }
"""

# execute the query and retrieve the results
results = g.query(query)

# convert the results to a Pandas dataframe
data = []
for row in results:
    data.append(list(row))
df = pd.DataFrame(data, columns=["FarmIdentification","DiagnosticTest","SampleType", "Date", "Breed","LabReference","Pathogen", "Country","Province","Result"])

# display the dataframe
df.head()


Unnamed: 0,FarmIdentification,DiagnosticTest,SampleType,Date,Breed,LabReference,Pathogen,Country,Province,Result
0,1cdabc0d91a051251a6e7b1bd972f6cb63daafcee9a825...,NPS,BAL,2020-09-01T00:00:00,Unknown,4,PM,Belgium,Unknown,0
1,7880bd7fe5dcd28637eade8b95c42961c6c5acffa946a6...,NPS,BAL,2020-10-01T00:00:00,Unknown,4,PM,Belgium,Unknown,0
2,a0119e19d7f710c158729bc4153a6996e891b54664262b...,NPS,BAL,2021-02-01T00:00:00,Unknown,4,PM,Belgium,Unknown,0
3,10817df0e1576683281c68a35157e456c39ec64bd3887c...,NPS,BAL,2021-04-01T00:00:00,Unknown,4,MH,Belgium,Unknown,0
4,2021dc8bd93e4492bf258d9d18eabf53579b19c8c97bbc...,NPS,BAL,2021-04-01T00:00:00,Unknown,4,MH,Belgium,Unknown,0


In [10]:
# This SPARQL query shows the data over filter if SampleType is Autopsy, diagnostic Test is PCR and Breed is Dairy 
# Define the SPARQL query and Query the data from the updated ontology (Simple Query)
query = """
PREFIX onto: <http://www.purl.org/decide/LivestockHealthOnto>
SELECT ?FarmIdentification ?DiagnosticTest ?SampleType ?Date ?Breed ?LabReference ?Pathogen ?Country ?Province ?Result
WHERE {
  ?CattleSample onto:hasFarmIdentification ?FarmIdentification .
  ?CattleSample onto:hasDiagnosticTest ?DiagnosticTest .
  ?CattleSample onto:hasSampleType ?SampleType .
  FILTER (?SampleType = "Swab")
  ?CattleSample onto:hasDate ?Date .
  ?CattleSample onto:hasBreed ?Breed .
  ?CattleSample onto:hasLabreference ?LabReference .
  ?CattleSample onto:hasPathogen ?Pathogen .
  FILTER (?Pathogen = "PM")
  ?CattleSample onto:hasCountry ?Country .
  ?CattleSample onto:hasProvince ?Province .
  ?CattleSample onto:hasResult ?Result .
  }
"""

# execute the query and retrieve the results
results = g.query(query)

# convert the results to a Pandas dataframe
data = []
for row in results:
    data.append(list(row))
df = pd.DataFrame(data, columns=["FarmIdentification","DiagnosticTest","SampleType", "Date", "Breed","LabReference","Pathogen", "Country","Province","Result"])

# display the dataframe
df.head()


Unnamed: 0,FarmIdentification,DiagnosticTest,SampleType,Date,Breed,LabReference,Pathogen,Country,Province,Result
0,bebcee2abff6b6f2e04afac2869e7edec0bf441979a6b1...,NPS,Swab,2021-03-01T00:00:00,Unknown,4,PM,Belgium,Unknown,0
1,28b51b9809bf29259b60ea6c65ba866c148e19c46aa935...,NPS,Swab,2021-09-01T00:00:00,Unknown,4,PM,Belgium,Unknown,0
2,6103eaafbedc45c707252162c622c99d1377566a56c55d...,NPS,Swab,2021-09-01T00:00:00,Unknown,4,PM,Belgium,Unknown,0
3,b28bb581f28c6301ef37c4f38ca420fccc0ad1870a1d45...,NPS,Swab,2021-11-01T00:00:00,Unknown,4,PM,Belgium,Unknown,0
4,2ced184d8477465987593807f31360e94b539aa41f515e...,NPS,Swab,2020-11-01T00:00:00,Unknown,4,PM,Belgium,Unknown,0
