### Install the dependencies and import the required libraries (pandas, NumPy, RDFLib):

In [1]:
! pip install pandas
! pip install openpyxl

import pandas as pd
import rdflib
import hashlib
import numpy as np
from rdflib import Literal, Namespace, RDF, URIRef
from rdflib.namespace import FOAF, XSD
from rdflib import Graph, Namespace, RDF, RDFS, OWL
from rdflib.plugins.sparql import prepareQuery
from pyspark.sql.functions import when, col, lit
from pandas.tseries.offsets import MonthBegin




### Step 01: Define the file paths, load the data into pandas DataFrames, and clean it

In [2]:
# Source workbooks, listed in year order (2021 first, then 2022)
files = [
    'Data/Ireland/Jade_2021_Final_Anonymised_data_Only_2023-04-20.v2.xlsx',
    'Data/Ireland/Jade_2022_Final_Anonymised_data_Only_2023-04-21.xlsx',
]

# Read each workbook into its own DataFrame, keeping the order of `files`
dfs = [pd.read_excel(workbook_path, engine='openpyxl') for workbook_path in files]

# Positional unpacking: the first frame is the 2021 extract, the second is 2022
barometer_dt_raw_2021, barometer_dt_raw_2022 = dfs



### Clean the Data 

In [3]:
# Stack the two yearly extracts into one frame; ignore_index renumbers the rows from 0
yearly_frames = [barometer_dt_raw_2021, barometer_dt_raw_2022]
barometer_dt_combined = pd.concat(yearly_frames, ignore_index=True)


In [4]:

# Three successive row filters: body system -> sample matrix -> laboratory test.

# NOTE(review): pandas' Excel reader turns the text 'NA' into NaN by default; if that
# applies to SYSTEM, the 'NA' entry below matches nothing — confirm against the workbooks.
systems_kept = ['Respiratory', 'NA']
conditions_system = barometer_dt_combined['SYSTEM'].isin(systems_kept)
barometer_dt_filter = barometer_dt_combined.loc[conditions_system]

# Sample matrices retained for the analysis
matrices_kept = [
    'Pleural Fluid', 'Tissue swab', 'Tonsil', 'Lymph Node - Multiple', 'Trachea',
    'Thoracic Fluid', 'Lung', 'Swab', 'Culture', 'Thymus', 'Part Carcass',
    'Nasal Swab', 'Nasal Fluid', 'Tissue-Pool', 'Tissue (VTM)', 'Carcass',
    'Lymph Node', 'Pooled swab', 'Misc.',
]
conditions_aliquot_matrix = barometer_dt_filter['ALIQUOTMATRIXTYPE'].isin(matrices_kept)
barometer_dt_filter2 = barometer_dt_filter.loc[conditions_aliquot_matrix]

# Laboratory tests retained for the analysis
tests_kept = [
    "PI3V PCR", "PCR M. haemolytica - ARVL", "Mycoplasma bovis (PCR)",
    "PCR H. somni - ARVL", "PCR P. multocida - ARVL", "Miscellaneous Test",
    "Routine Culture", "PCR M. bovis - ARVL", "BRSV PCR",
    "Culture Growth", "PCR BoCoV",
]
conditions_test = barometer_dt_filter2['TEST'].isin(tests_kept)
barometer_dt_filter3 = barometer_dt_filter2.loc[conditions_test]


In [5]:
# Data manipulation: harmonise column names, derive categorical fields, keep the
# reporting columns, drop duplicate rows, then replace identifiers by SHA-256 digests.

# Source column -> harmonised column name
column_renames = {
    'SDGa': 'FileNumber',
    'SAMPLEa': 'SampleNumber',
    'HERD_NOa': 'FarmID',
    'DELIVERY_DATE': 'Date',
    'Herd.Type': 'Breed',
}

# ALIQUOTMATRIXTYPE -> sample type (values not listed here become NaN via Series.map)
sample_type_by_matrix = {
    'Carcass': 'Autopsy',
    'Lung': 'Autopsy',
    'Thymus': 'Autopsy',
    'Lymph Node - Multiple': 'Autopsy',
    'Tissue-Pool': 'Autopsy',
    'Lymph Node': 'Autopsy',
    'Tissue (VTM)': 'Autopsy',
    'Part Carcass': 'Autopsy',
    'Swab': 'Swab',
    'Nasal Swab': 'Swab',
    'Pooled swab': 'Swab',
    'Nasal Fluid': 'Swab',
    'Trachea': 'Unknown',
    'Thoracic Fluid': 'Unknown',
    'Culture': 'Unknown',
    'Fluid': 'Unknown',
    'Misc.': 'Unknown',
    'Pleural Fluid': 'Unknown',
}

# TEST -> diagnostic method (values not listed here become NaN)
method_by_test = {
    'PI3V PCR': 'PCR',
    'PCR M. haemolytica - ARVL': 'PCR',
    'Mycoplasma bovis (PCR)': 'PCR',
    'PCR H. somni - ARVL': 'PCR',
    'PCR M. bovis - ARVL': 'PCR',
    'BRSV PCR': 'PCR',
    'PCR BoCoV': 'PCR',
    'PCR P. multocida - ARVL': 'PCR',
    'Routine Culture': 'Culture',
    'Culture Growth': 'Culture',
}

# Herd type -> breed label (values not listed here become NaN)
breed_by_herd_type = {
    'BEEF': 'Beef',
    'DAIRY': 'Dairy',
    'SUCKLER': 'Suckler',
    'OTHER': 'Unknown',
}

# TEST -> pathogen code; only the PCR tests are listed, so every other test yields NaN
pathogen_by_test = {
    'PCR P. multocida - ARVL': 'PM',
    'PCR M. haemolytica - ARVL': 'MH',
    'PCR H. somni - ARVL': 'HS',
    'Mycoplasma bovis (PCR)': 'MB',
    'PCR M. bovis - ARVL': 'MB',
    'PI3V PCR': 'PI3',
    'PCR BoCoV': 'BCV',
    'BRSV PCR': 'BRSV',
}

# Columns carried forward (DataFrame.filter keeps only those that exist)
kept_columns = [
    'FileNumber', 'SampleNumber', 'DiagnosticTest', 'Country',
    'LabReference', 'SampleType', 'Breed', 'Pathogen', 'Date',
    'Province', 'RESULT', 'RESULTNAME', 'AGENT', 'FarmID',
]


def _sha256_hex(value):
    """Return the hexadecimal SHA-256 digest of ``str(value)``."""
    return hashlib.sha256(str(value).encode()).hexdigest()


barometer_renamed = barometer_dt_filter3.rename(columns=column_renames)

barometer_derived = barometer_renamed.assign(
    Country='Ireland',
    LabReference='5',
    SampleType=lambda frame: frame['ALIQUOTMATRIXTYPE'].map(sample_type_by_matrix),
    DiagnosticTest=lambda frame: frame['TEST'].map(method_by_test),
    Breed=lambda frame: frame['Breed'].map(breed_by_herd_type),
    Province=lambda frame: frame['County'],
    Pathogen=lambda frame: frame['TEST'].map(pathogen_by_test),
)

barometer_unique = barometer_derived.filter(items=kept_columns).drop_duplicates()

# Identifiers are hashed only after de-duplication
barometer_dt = barometer_unique.assign(
    FileNumber=lambda frame: frame['FileNumber'].apply(_sha256_hex),
    SampleNumber=lambda frame: frame['SampleNumber'].apply(_sha256_hex),
    FarmID=lambda frame: frame['FarmID'].apply(_sha256_hex),
)



In [6]:
# Add extra rows for culture (& MALDI & NGS?)
# Every record is repeated once per culture pathogen so that a culture test, which
# has no pathogen of its own, can be scored separately for HS, MH and PM.
pathogens = ['HS', 'MH', 'PM']

# Placeholder value per record: 0 on culture rows, missing on every other row
culture_placeholder = barometer_dt.apply(
    lambda record: 0 if record['DiagnosticTest'] == 'Culture' else None, axis=1
)
for pathogen_code in pathogens:
    barometer_dt[pathogen_code] = culture_placeholder

# Wide -> long: one output row per (record, culture pathogen)
barometer_dt_culture_wide = pd.melt(
    barometer_dt,
    id_vars=[
        'FileNumber', 'SampleNumber', 'DiagnosticTest', 'Country', 'LabReference',
        'SampleType', 'Breed', 'Pathogen', 'Date', 'Province', 'RESULT', 'RESULTNAME',
        'AGENT', 'FarmID',
    ],
    value_vars=pathogens,
    var_name='Pathogen_culture',
    value_name='Result_culture',
)

# Rows without a pathogen take the culture pathogen of their melted copy.
# Pathogen is produced by Series.map, so an unmapped test is NaN rather than the
# text 'Missing'; comparing only against 'Missing' never matched and left every
# culture row without a pathogen. Both spellings of "missing" are handled here.
pathogen_missing = (
    barometer_dt_culture_wide['Pathogen'].isna()
    | barometer_dt_culture_wide['Pathogen'].eq('Missing')
)
barometer_dt_culture_wide['Pathogen'] = barometer_dt_culture_wide['Pathogen'].where(
    ~pathogen_missing, barometer_dt_culture_wide['Pathogen_culture']
)


# Create binary results PCR & culture
# RESULT texts that count as a positive PCR
_PCR_POSITIVE_RESULTS = (
    "Positive", "Weak Positive", "Mycoplasma bovis PCR Positive", "Strong Positive",
)

# RESULT texts that count as a negative PCR
_PCR_NEGATIVE_RESULTS = (
    "No Pathogen detected", "Negative", "Sterile", "No Significant Growth",
    "No CT", "Mycoplasma bovis PCR Negative", "Mixed Non-Significant Bacterial Growth",
    "No Significant Growth @48hrs", "No Growth", "No Pathogen detectedn",
    "No RNA detected", "No DNA detected", "No Virus Detected", "Not Detected",
)

# Pathogen code -> RESULT texts that mean this pathogen was isolated in culture
_CULTURE_ISOLATES = {
    'MH': ("Mannheimia haemolytica",),
    'PM': ("Pasteurella multocida", "P. multocida"),
    'HS': ("Histophilus somni", "Histophilus somnus", "Histophilus somnii"),
}


def calculate_result(row):
    """Translate one laboratory record into a binary result.

    Parameters
    ----------
    row : mapping
        Any object indexable by column name (a DataFrame row or a plain dict)
        providing 'DiagnosticTest', 'RESULT' and, for culture rows, 'Pathogen'.

    Returns
    -------
    int or None
        PCR: 1 for a positive RESULT, 0 for a negative RESULT, None for any
        other RESULT text.
        Culture: 1 when RESULT names the isolate belonging to the row's
        Pathogen, otherwise 0.
        Any other DiagnosticTest: None.
    """
    test_method = row['DiagnosticTest']
    outcome = row['RESULT']

    if test_method == 'PCR':
        if outcome in _PCR_POSITIVE_RESULTS:
            return 1
        if outcome in _PCR_NEGATIVE_RESULTS:
            return 0
        return None

    if test_method == 'Culture':
        # A culture that did not yield the pathogen under consideration is negative.
        # Previously such rows for MH/PM/HS fell through to None because the
        # `else: return 0` branch was attached to the pathogen check, not to the
        # isolate check.
        matching_isolates = _CULTURE_ISOLATES.get(row['Pathogen'], ())
        return 1 if outcome in matching_isolates else 0

    return None

# Score every melted row, keep the reporting columns and drop exact duplicates
reporting_columns = [
    'FileNumber', 'SampleNumber', 'DiagnosticTest', 'Country',
    'LabReference', 'SampleType', 'Breed', 'Pathogen', 'Result', 'Date',
    'Province', 'RESULT', 'RESULTNAME', 'AGENT', 'FarmID',
]

barometer_scored = barometer_dt_culture_wide.assign(
    Result=lambda frame: frame.apply(calculate_result, axis=1)
)
barometer_scored_unique = barometer_scored.filter(items=reporting_columns).drop_duplicates()

# Three additional columns (note the different spellings) holding SHA-256 digests of
# FileNumber / SampleNumber / FarmID.
# NOTE(review): those source columns were already hashed when barometer_dt was built,
# so these are digests of digests — confirm whether the extra columns are needed.
barometer_results = barometer_scored_unique.assign(
    Filenumber=lambda frame: frame['FileNumber'].apply(
        lambda value: hashlib.sha256(str(value).encode()).hexdigest()
    ),
    Samplenumber=lambda frame: frame['SampleNumber'].apply(
        lambda value: hashlib.sha256(str(value).encode()).hexdigest()
    ),
    Farm_ID=lambda frame: frame['FarmID'].apply(
        lambda value: hashlib.sha256(str(value).encode()).hexdigest()
    ),
)

In [7]:

# Truncate each date to the first day of its month and store it as a plain datetime.date
month_start = pd.to_datetime(barometer_results['Date']).dt.to_period('M').dt.to_timestamp()
barometer_results['Floored_date'] = month_start.dt.date





In [8]:


def _max_ignoring_missing(values):
    """Return the largest non-missing value, or NaN when every value is missing."""
    present = values.dropna()
    return np.nan if present.empty else max(present)


# One row per lab / country / breed / month / province / farm / test / sample type / pathogen.
# With pandas' default groupby behaviour (dropna=True), rows holding NaN in any of
# these keys are left out of the result.
group_keys = [
    'LabReference', 'Country', 'Breed', 'Floored_date', 'Province',
    'FarmID', 'DiagnosticTest', 'SampleType', 'Pathogen',
]

barometer_groupby = (
    barometer_results
    .groupby(group_keys)
    .agg(Result=('Result', _max_ignoring_missing))
    .reset_index()
)



### Step 02: Create an RDF graph and namespaces.

In [9]:
# Empty graph that the following step fills with triples
g = Graph()

# Namespaces used when building terms.
# NOTE(review): the ontology IRI has no trailing '#' or '/', so `onto.hasX` expands to
# '...LivestockHealthOntohasX' — confirm this matches the IRIs declared in the ontology file.
onto = Namespace("http://www.purl.org/decide/LivestockHealthOnto")
xsd = Namespace('http://www.w3.org/2001/XMLSchema#')

# Register the prefixes on the graph
for ns_prefix, ns_uri in (('onto', onto), ('xsd', xsd)):
    g.bind(ns_prefix, ns_uri)

### Step 03: Iterate over the pandas DataFrame and map each column to an ontology property:

In [10]:
# Ontology property name -> column of barometer_groupby that supplies its value
property_columns = {
    'hasDiagnosticTest': 'DiagnosticTest',
    'hasCountry': 'Country',
    'hasBreed': 'Breed',
    'hasDate': 'Floored_date',
    'hasProvince': 'Province',
    'hasFarmIdentification': 'FarmID',
    'hasSampleType': 'SampleType',
    'hasPathogen': 'Pathogen',
    'hasResult': 'Result',
    'hasLabreference': 'LabReference',
}

# One CattleSample resource per aggregated row, named after the row index.
# Every value — including the date and the numeric result — is written as an xsd:string literal.
for index, row in barometer_groupby.iterrows():
    CattleSample = onto[f"CattleSample_{index}"]
    for property_name, column_name in property_columns.items():
        g.add((CattleSample, onto[property_name], Literal(row[column_name], datatype=XSD.string)))

# Write the graph to disk as Turtle
g.serialize(destination='output/RDFoutputCattleSampleArsia.ttl', format='turtle')




<Graph identifier=N3765b2d474c244e4b4609355e2da7056 (<class 'rdflib.graph.Graph'>)>

### Step 04: Load the RDF data and the ontology into a single RDF graph:

In [11]:
# Files merged into one graph: the generated Turtle data and the OWL ontology
path_to_RDF = "output/RDFoutputCattleSampleArsia.ttl"
path_to_ontology = "Ontology/LivestockHealthOnto1.0.owl"

# Start from a fresh graph so that only what is parsed below is queried later
g = Graph()

# Tracks the file currently being parsed so a failure is reported against the right
# source (the message used to blame the RDF file even when the ontology failed).
current_source = path_to_RDF
try:
    # Parse the RDF file in Turtle format
    g.parse(path_to_RDF, format='ttl')

    # Parse the ontology file (RDF/XML) and add it to the same graph
    current_source = path_to_ontology
    g.parse(path_to_ontology, format="xml")

except Exception as e:
    # Best effort: report the problem and carry on with whatever was loaded
    print(f"An error occurred while parsing '{current_source}': {e}")

In [12]:
# Register the rdfs / owl / onto prefixes on the merged graph.
# This only binds prefix names; no RDFS or OWL reasoning is executed in this cell.
for ns_prefix, ns_uri in (
    ('rdfs', RDFS),
    ('owl', OWL),
    ('onto', Namespace("http://www.purl.org/decide/LivestockHealthOnto")),
):
    g.bind(ns_prefix, ns_uri)

### Step 05: Query the data from updated ontology 

In [13]:
# Simple SPARQL query: autopsy samples from dairy herds with a "1.0" result for pathogen PM
query = """
PREFIX onto: <http://www.purl.org/decide/LivestockHealthOnto>
SELECT ?FarmIdentification ?DiagnosticTest ?SampleType ?Date ?Breed ?LabReference ?Pathogen ?Country ?Province ?Result
WHERE {
  ?CattleSample onto:hasFarmIdentification ?FarmIdentification .
  ?CattleSample onto:hasDiagnosticTest ?DiagnosticTest .
  ?CattleSample onto:hasSampleType ?SampleType .
   FILTER (?SampleType = "Autopsy")
  ?CattleSample onto:hasDate ?Date .
  ?CattleSample onto:hasBreed ?Breed .
   FILTER (?Breed = "Dairy")
  ?CattleSample onto:hasLabreference ?LabReference .
  ?CattleSample onto:hasPathogen ?Pathogen .
  FILTER (?Pathogen = "PM")
  ?CattleSample onto:hasCountry ?Country .
  ?CattleSample onto:hasProvince ?Province .
  ?CattleSample onto:hasResult ?Result .
  FILTER (?Result = "1.0")
  }
"""

# Run the query against the merged graph
results = g.query(query)

# Column labels, in the same order as the SELECT clause
query_columns = [
    "FarmIdentification", "DiagnosticTest", "SampleType", "Date", "Breed",
    "LabReference", "Pathogen", "Country", "Province", "Result",
]

# One list per solution row, then a single DataFrame construction
data = [list(solution) for solution in results]
df = pd.DataFrame(data, columns=query_columns)

# Show at most the first 100 rows
df.head(100)

Unnamed: 0,FarmIdentification,DiagnosticTest,SampleType,Date,Breed,LabReference,Pathogen,Country,Province,Result
0,a997ebbf5c8f84f61029c130c5112509314cbb987fbdf5...,PCR,Autopsy,2021-02-01,Dairy,5,PM,Ireland,Clare,1.0
1,5121cc6d2d455f0b6522493f0ee6dfb4bebebc614744b9...,PCR,Autopsy,2021-03-01,Dairy,5,PM,Ireland,Cork,1.0
2,89e631d32cf3f23d83c821f4e532ce00bb92d955747c36...,PCR,Autopsy,2021-03-01,Dairy,5,PM,Ireland,Cork,1.0
3,59ebba7789896f4e0cb3088c500d751c46a0d81c9ec185...,PCR,Autopsy,2021-03-01,Dairy,5,PM,Ireland,Limerick,1.0
4,1a2d11d24f5ad25b1be8aaa5c816ef821ddb1cd33786b2...,PCR,Autopsy,2021-04-01,Dairy,5,PM,Ireland,Cork,1.0
5,6e65265e6c35f6926570695ae428c5ed6938e292ef8421...,PCR,Autopsy,2021-05-01,Dairy,5,PM,Ireland,Cork,1.0
6,14b96018c8f25ab7e8d18db820f6254708c4a8585e35ba...,PCR,Autopsy,2021-07-01,Dairy,5,PM,Ireland,Cork,1.0
7,f1e24da7f2edbe4549bfcaf3763c68f84b02eab014ecac...,PCR,Autopsy,2021-07-01,Dairy,5,PM,Ireland,Cork,1.0
8,7a919a8e9673b3ddf847962a33ad805d2480febb7230b9...,PCR,Autopsy,2021-08-01,Dairy,5,PM,Ireland,Limerick,1.0
9,eac37fb85b25a9612af5d35ae05da8aea51983033aff82...,PCR,Autopsy,2021-09-01,Dairy,5,PM,Ireland,Cork,1.0
