##Install and Import the necessary classes from the RDFlib library:

In [1]:
! pip install rdflib
! pip install owlrl
from rdflib import Graph, Literal, Namespace, RDF, URIRef
from rdflib.namespace import FOAF, XSD
from rdflib import Graph, Namespace, RDF, RDFS, OWL
from rdflib.plugins.sparql import prepareQuery
import pandas as pd
import rdflib
import pyspark 
import os
import pandas as pd
from pyspark.sql.functions import col,lit
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, isnan




### Step1: Create a SparkSession and read the Excel file into a PySpark DataFrame:

In [2]:
spark = SparkSession.builder.appName('DataCleaning').getOrCreate()

In [3]:
# Define the file paths
files = ['DGZ/DECIDE_MTA_UGENT_14nov2022.xlsx', 
         'DGZ/DECIDE_MTA_UGENT_BAC_AERO_14nov2022.xlsx', 
         'DGZ/DECIDE_MTA_UGENTBAC_MYCO_14nov2022.xlsx']

# Load the data into Spark dataframes
dfs = []
for file in files:
    df = spark.read.format('com.crealytics.spark.excel') \
                .option('header', 'true') \
                .option('inferSchema', 'true') \
                .load(file)
    dfs.append(df)

barometer_dt_raw = dfs[0]
barometer_aero_cult_raw = dfs[1]
barometer_myco_cult_raw = dfs[2]
#barometer_aero_cult_raw .show()


#### Step 2: Create an RDF graph and namespaces:

In [4]:
g = rdflib.Graph()
onto = Namespace("http://example.com/animal_health#")
g.bind('onto', onto)
xsd = Namespace('http://www.w3.org/2001/XMLSchema#')
g.bind('xsd', xsd)

### Step 03: Iterate over the PySpark DataFrame and map to ontology properties:

In [5]:
# Data manipulation AEROBIC CULTURE results
barometer_aero_cult = barometer_aero_cult_raw \
    .withColumnRenamed("Dossiernummer", "Filenumber") \
    .withColumnRenamed("KIEMSTAAL IDENTIFICATIE", "Pathogen_identification") \
    .withColumnRenamed("KIEMSTAAL RESULTAAT", "Pathogen_result") \
    .withColumnRenamed("Staalnummer", "Samplenumber") \
    .withColumn("Parameter_code", lit("BAC_AERO")) \
    .withColumn("Result", lit("OK")) \
    .select("Filenumber", "Pathogen_identification", "Pathogen_result", "Parameter_code", "Samplenumber", "Result") \
    .filter(col("Pathogen_identification").isin("Pasteurella multocida", "Mannheimia haemolytica", "Histophilus somni", "Mycoplasma bovis")) \
    .distinct()


df_samples = spark.createDataFrame([
  ("OK", "BAC_AERO", "Culture", "Pasteurella multocida"),
  ("OK", "BAC_AERO", "Culture", "Mannheimia haemolytica"),
  ("OK", "BAC_AERO", "Culture", "Histophilus somni"),
  ("OK", "BAC_MYCOPLASMA", "Culture", "Mycoplasma bovis")
], ["Result", "Parameter_code", "Diagnostic_test", "Pathogen_identification"])
 
#barometer_aero_cult.show()


### Data manipulation MYCOPLASMA CULTURE results

In [6]:
# Data manipulation MYCOPLASMA CULTURE results
barometer_myco_cult = barometer_myco_cult_raw \
    .withColumnRenamed("Dossiernummer", "Filenumber") \
    .withColumnRenamed("KIEMSTAAL IDENTIFICATIE", "Pathogen_identification") \
    .withColumnRenamed("KIEMSTAAL RESULTAAT", "Mycoplasma_result") \
    .withColumnRenamed("Staalnummer", "Samplenumber") \
    .withColumn("Parameter_code", lit("BAC_MYCOPLASMA")) \
    .withColumn("Result", lit("OK")) \
    .select("Filenumber", "Pathogen_identification", "Mycoplasma_result", "Parameter_code", "Samplenumber", "Result") \
    .filter(col("Pathogen_identification").isin("Mycoplasma bovis")) \
    .distinct()

#barometer_myco_cult.show()


### Data manipulation PCR results

In [7]:
# Data manipulation PCR results
barometer_dtt = barometer_dt_raw \
    .withColumnRenamed("Dossiernummer", "Filenumber")\
    .withColumnRenamed("Staalnummer", "Samplenumber")\
    .withColumnRenamed("Staaltype", "Sample_type") \
    .withColumnRenamed("PARAMETER_CODE", "Parameter_code")\
    .withColumnRenamed("Onderzoek", "Pathogen")\
    .withColumnRenamed("Resultaat", "Result")\
    .withColumnRenamed("Creatiedatum", "Date")\
    .withColumnRenamed("Postcode", "Postal_code")\
    .withColumnRenamed("ANON_ID", "Farm_ID")\
    .withColumn("Country", when(col("Parameter_code").isin("BAC_AERO", "BAC_MYCOPLASMA"), "Belgium")) \
    .withColumn("Diagnostic_test", when(col("Parameter_code").isin("BAC_AERO", "BAC_MYCOPLASMA"), "Culture").otherwise("PCR")) \
    .withColumn("Lab_reference", lit("1"))\
    .withColumn("Sample_type", when(col("Sample_type") == "RU Broncho-alveolar lavage (BAL)", "BAL")
        .when(col("Sample_type") == "RU Anderen", "Unknown")
        .when(col("Sample_type").isin("RU Swabs", "RU Swab", "RU Neusswab", "RU Neusswabs"), "Swab")
        .when(col("Sample_type").isin("RU Kadaver", "RU Organen"), "Autopsy")
        .otherwise("Missing")) \
    .withColumn("Breed", when(col("Bedrijfstype") == "VCALF", "Veal")
        .when(col("MEAT").isNull(), "Unknown")
        .when((col("MEAT") / col("TOTAL")) > 0.9, "Beef")
        .when((col("MILK") / col("TOTAL")) > 0.9, "Dairy")
        .otherwise("Mixed")) \
    .withColumn("Pathogen",
        when(col("Pathogen").isin(
            "AD Pasteurella multocida Ag (PCR)",
            "AD Pasteurella multocida Ag pool (PCR)",
            "AD P. multocida Ag (PCR)"
            "AD P. multocida Ag pool (PCR)"),"Pasteurella multocida")
               .when(col("Pathogen").isin(
                        "AD Mannheimia haemolytica Ag (PCR)",
                        "AD Mannheimia haemolytica Ag pool (PCR)"), "Mannheimia haemolytica")
               .when(col("Pathogen").isin(
                        "RU PI3 Ag (PCR)",
                        "RU PI3 Ag pool (PCR)"), "PI3")
               .when(col("Pathogen").isin(
                        "RU BRSV Ag (PCR)",
                        "RU BRSV Ag pool (PCR)"), "BRSV")
               .when(col("Pathogen").isin(
                        "AD Histophilus somnus (PCR)",
                        "AD Histophilus somnus Ag (PCR)",
                        "AD Histophilus somnus Ag pool (PCR)",
                        "AD Histophilus somni Ag (PCR)",
                    "AD Histophilus somni Ag pool (PCR)"), "Histophilus somni")
           .when(col("Pathogen").isin(
                    "RU Mycoplasma bovis (PCR)",
                    "RU Mycoplasma bovis Ag pool (PCR)",
                    "RU Mycoplasma bovis Ag (PCR)"), "Mycoplasma bovis")
           .when(col("Pathogen").isin(
                    "AD Corona Ag (PCR)", "AD Corona Ag pool (PCR)"), "BCV")) \
.withColumn("Province", 
                   when(col("Postal_code").between(1000, 1299), "Brussels") \
                   .when(col("Postal_code").between(1300, 1499), "Walloon Brabant") \
                   .when(col("Postal_code").between(1500, 1999), "Flemish Brabant") \
                   .when(col("Postal_code").between(3000, 3499), "Antwerp") \
                   .when(col("Postal_code").between(2000, 2999), "Limburg") \
                   .when(col("Postal_code").between(5000, 5999), "Namur") \
                   .when(col("Postal_code").between(6000, 6599), "Hainaut") \
                   .when(col("Postal_code").between(7000, 7999), "Hainaut") \
                   .when(col("Postal_code").between(6600, 6999), "Luxembourg") \
                   .when(col("Postal_code").between( 8000, 8999), "West Flanders") \
                   .otherwise("East Flanders"))
            
barometer_dtt= barometer_dtt.select("Filenumber", "Diagnostic_test", "Samplenumber", "Country", "Lab_reference", "Sample_type", "Breed", "Parameter_code", "Result", "Pathogen", "Date", "Postal_code", "Province", "Farm_ID") \
    .distinct() 

#barometer_dtt.show()

### All three joins and clean file

In [8]:


barometer = barometer_dtt.join(df_samples, ['Diagnostic_test', 'Result', 'Parameter_code'], 'left') \
                       .join(barometer_aero_cult, ['Filenumber', 'Samplenumber', 'Result', 'Parameter_code', 'Pathogen_identification'], 'left') \
                       .join(barometer_myco_cult, ['Filenumber', 'Samplenumber', 'Result', 'Parameter_code', 'Pathogen_identification'], 'left') \
                       .withColumn('Pathogen', when(col('Pathogen') == 'Pasteurella multocida', 'PM') \
                                             .when(col('Pathogen') == 'Histophilus somni', 'HS') \
                                             .when(col('Pathogen') == 'Mannheimia haemolytica', 'MH') \
                                             .when(col('Pathogen') == 'Mycoplasma bovis', 'MB') \
                                             .otherwise(col('Pathogen'))) \
                       .withColumn('Pathogen', when(col('Pathogen_identification') == 'Pasteurella multocida', 'PM') \
                                             .when(col('Pathogen_identification') == 'Histophilus somni', 'HS') \
                                             .when(col('Pathogen_identification') == 'Mannheimia haemolytica', 'MH') \
                                             .when(col('Pathogen_identification') == 'Mycoplasma bovis', 'MB') \
                                             .otherwise(col('Pathogen'))) \
                       .withColumn('Result', when(col('Result').isin(["Twijfelachtig (PCR)", "POSITIEF", "GEDETECTEERD", "GEDETECTEERD (sterk)", "GEDETECTEERD (zwak)", "GEDETECTEERD (matig)", "GEDETECTEERD (zeer sterk)", "GEDETECTEERD (zeer zwak)"]), 1) \
                                             .when(col('Result').isin(["negatief", "Niet gedetecteerd"]), 0) \
                                             .when(col('Result').isin(["NI", "niet interpreteerbaar", "Inhibitie"]), None) \
                                             .when((col('Parameter_code') == 'BAC_AERO') & (col('Pathogen_result').isNull()), 0) \
                                             .when((col('Parameter_code') == 'BAC_AERO') & (col('Pathogen_result').isNotNull()), 1) \
                                             .when((col('Parameter_code') == 'BAC_MYCOPLASMA') & (col('Mycoplasma_result').isNull()), None) \
                                             .when((col('Parameter_code') == 'BAC_MYCOPLASMA') & (col('Mycoplasma_result') == 'neg'), 0) \
                                             .when((col('Parameter_code') == 'BAC_MYCOPLASMA') & (col('Mycoplasma_result').rlike('POS')), 1) \
                                             .otherwise(None))


In [9]:
barometer.show(50)

barometer.write.format("com.crealytics.spark.excel") \
    .option("header", "true") \
    .mode("overwrite") \
    .save("DGZ/barometerdata.xlsx")

+------------+----------------+------+--------------------+-----------------------+---------------+-------+-------------+-----------+-----+--------+-------------------+-----------+---------------+-------------+---------------+-----------------+
|  Filenumber|    Samplenumber|Result|      Parameter_code|Pathogen_identification|Diagnostic_test|Country|Lab_reference|Sample_type|Breed|Pathogen|               Date|Postal_code|       Province|      Farm_ID|Pathogen_result|Mycoplasma_result|
+------------+----------------+------+--------------------+-----------------------+---------------+-------+-------------+-----------+-----+--------+-------------------+-----------+---------------+-------------+---------------+-----------------+
|TO-17-116224|TO-17-116224-001|     0|      BO_VIR_RSB_PCR|                   null|            PCR|   null|            1|        BAL|Mixed|    BRSV|2017-05-05 14:40:55|       9860|  East Flanders| BOVBE40_1019|           null|             null|
|TO-16-291154|TO-16-

In [None]:
sc = SparkContext.getOrCreate();

spark = SparkSession(sc)
BarometerExampleData = spark.read \
    .format('com.crealytics.spark.excel') \
    .option('header', 'true') \
    .option('inferSchema', 'true') \
    .load('BarometerExampleData.xlsx')

BarometerExampleData.show()

###Step 03: Iterate over the PySpark DataFrame and map to ontology properties:

In [13]:

for row in BarometerExampleData.collect():
    CattleSample = onto[f'CattleSample{row[0]}']
    g.add((CattleSample, RDF.type, onto.CattleSample))
    g.add((CattleSample, onto.hasFileNumber, Literal(row[0], datatype=XSD.string)))
    g.add((CattleSample, onto.hasSampleNumber, Literal(row[1], datatype=XSD.string)))
    g.add((CattleSample, onto.hasResult, Literal(row[2], datatype=XSD.string)))
    g.add((CattleSample, onto.hasParametercode, Literal(row[3], datatype=XSD.string)))
    g.add((CattleSample, onto.hasPathogenIdentification, Literal(row[4], datatype=XSD.string)))
    g.add((CattleSample, onto.hasDiagnosticTest, Literal(row[5], datatype=XSD.string)))
    g.add((CattleSample, onto.hasCountry, Literal(row[6], datatype=XSD.string)))
    g.add((CattleSample, onto.hasLabRefference, Literal(row[7], datatype=XSD.string)))
    g.add((CattleSample, onto.hasSampleType, Literal(row[8], datatype=XSD.string)))
    g.add((CattleSample, onto.hasBreed, Literal(row[9], datatype=XSD.string)))
    g.add((CattleSample, onto.hasPathogen, Literal(row[10], datatype=XSD.string)))
    g.add((CattleSample, onto.hasDate, Literal(row[11], datatype=XSD.string)))
    g.add((CattleSample, onto.hasPostalCode, Literal(row[12], datatype=XSD.string)))
    g.add((CattleSample, onto.hasProvince, Literal(row[13], datatype=XSD.string)))
    g.add((CattleSample, onto.hasFarmIdentification, Literal(row[14], datatype=XSD.string)))
    g.add((CattleSample, onto.hasPathogenResult, Literal(row[15], datatype=XSD.string)))
    g.add((CattleSample, onto.hasMicoplasmaResult, Literal(row[16], datatype=XSD.string)))
 # print RDF graph (for testing)
    
print(g.serialize(format='turtle'))

# output RDF graph to file (replace with your desired filename)
g.serialize('output/RDFoutputCattleSample.ttl', format='turtle')


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



<Graph identifier=N54427de2c79c4c7a95c6f822030c62bb (<class 'rdflib.graph.Graph'>)>

In [16]:
g = Graph()

# Define the original Path
path_to_RDF = "output/RDFoutputCattleSample.ttl"
g.parse(path_to_RDF, format='xml')

# Secondly Load ontology into the same graph
path_to_ontology = "LivestockHealthOnto1.0.owl"
g.parse(path_to_ontology, format="xml")


SAXParseException: file:///C:/Users/saban/Saba%20PhD%20Work/CattleUseCaseOntology%28Decide%29/output/RDFoutputCattleSample.ttl:1:0: not well-formed (invalid token)