## Install and import the required libraries (RDFLib, owlrl, PySpark, pandas):

In [None]:

! pip install rdflib
! pip install owlrl
from rdflib import Graph, Literal, Namespace, RDF, URIRef
from rdflib.namespace import FOAF, XSD
from rdflib import Graph, Namespace, RDF, RDFS, OWL
from rdflib.plugins.sparql import prepareQuery
import pandas as pd
import rdflib
import pyspark 
import os
import pandas as pd
from pyspark.sql.functions import col,lit
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.sql import SparkSession
from pyspark.sql.functions import when

### Step 1: Create a SparkSession and read the Excel files into PySpark DataFrames:

In [None]:
# Obtain the Spark session for this notebook under the application name
# 'DataCleaning'; getOrCreate() hands back an already-running session if
# there is one instead of starting a second.
spark = (
    SparkSession.builder
    .appName('DataCleaning')
    .getOrCreate()
)

In [None]:
# Excel workbooks to ingest, in the order they are unpacked below.
files = [
    'DGZ/DECIDE_MTA_UGENT_14nov2022.xlsx',
    'DGZ/DECIDE_MTA_UGENT_BAC_AERO_14nov2022.xlsx',
    'DGZ/DECIDE_MTA_UGENTBAC_MYCO_14nov2022.xlsx',
]

# Read each workbook through the spark-excel data source, using the first
# row as the header and letting Spark infer the column types.
dfs = [
    spark.read.format('com.crealytics.spark.excel')
    .option('header', 'true')
    .option('inferSchema', 'true')
    .load(workbook_path)
    for workbook_path in files
]

# Positional unpacking mirrors the order of `files`.
barometer_dt_raw, barometer_aero_cult_raw, barometer_myco_cult_raw = dfs
barometer_aero_cult_raw.show()


### Step 2: Create an RDF graph and namespaces:

In [None]:
# Namespaces used in the graph: the project ontology and XML Schema datatypes.
onto = Namespace('http://www.semanticweb.org/admin/ontologies/2023/1/PersonTestOntology#')
xsd = Namespace('http://www.w3.org/2001/XMLSchema#')

# Fresh, empty RDF graph with both prefixes registered.
g = Graph()
g.bind('onto', onto)
g.bind('xsd', xsd)

### Step 3: Iterate over the PySpark DataFrame and map to ontology properties:

In [None]:
# Aerobic culture results: translate the Dutch column headers, stamp every row
# with a constant Parameter_code and Result, keep only the pathogens listed
# below, and drop duplicate rows.
aero_pathogens = [
    "Pasteurella multocida",
    "Mannheimia haemolytica",
    "Histophilus somni",
    "Mycoplasma bovis",
]

barometer_aero_cult = (
    barometer_aero_cult_raw
    .withColumnRenamed("Dossiernummer", "Filenumber")
    .withColumnRenamed("KIEMSTAAL IDENTIFICATIE", "Pathogen_identification")
    .withColumnRenamed("KIEMSTAAL RESULTAAT", "Pathogen_result")
    .withColumnRenamed("Staalnummer", "Samplenumber")
    .withColumn("Parameter_code", lit("BAC_AERO"))
    .withColumn("Result", lit("OK"))
    .select(
        "Filenumber",
        "Pathogen_identification",
        "Pathogen_result",
        "Parameter_code",
        "Samplenumber",
        "Result",
    )
    .filter(col("Pathogen_identification").isin(*aero_pathogens))
    .distinct()
)

# Small lookup table pairing each cultured pathogen with its parameter code.
sample_columns = ["Result", "Parameter_code", "Diagnostic_test", "Pathogen_identification"]
sample_rows = [
    ("OK", "BAC_AERO", "Culture", "Pasteurella multocida"),
    ("OK", "BAC_AERO", "Culture", "Mannheimia haemolytica"),
    ("OK", "BAC_AERO", "Culture", "Histophilus somni"),
    ("OK", "BAC_MYCOPLASMA", "Culture", "Mycoplasma bovis"),
]
df_samples = spark.createDataFrame(sample_rows, sample_columns)

barometer_aero_cult.show()


### Data manipulation MYCOPLASMA CULTURE results

In [None]:
# Mycoplasma culture results: translate the Dutch column headers, stamp every
# row with a constant Parameter_code and Result, keep only Mycoplasma bovis
# rows, and drop duplicates.
myco_columns = [
    "Filenumber",
    "Pathogen_identification",
    "Mycoplasma_result",
    "Parameter_code",
    "Samplenumber",
    "Result",
]

barometer_myco_cult = (
    barometer_myco_cult_raw
    .withColumnRenamed("Dossiernummer", "Filenumber")
    .withColumnRenamed("KIEMSTAAL IDENTIFICATIE", "Pathogen_identification")
    .withColumnRenamed("KIEMSTAAL RESULTAAT", "Mycoplasma_result")
    .withColumnRenamed("Staalnummer", "Samplenumber")
    .withColumn("Parameter_code", lit("BAC_MYCOPLASMA"))
    .withColumn("Result", lit("OK"))
    .select(*myco_columns)
    .filter(col("Pathogen_identification").isin("Mycoplasma bovis"))
    .distinct()
)

barometer_myco_cult.show()


In [None]:


# Clean the main DGZ results table: translate the Dutch column headers, derive
# Country / Diagnostic_test / Lab_reference / Breed / Province, and collapse
# the raw sample-type and assay labels into short category names.
barometer_dt = (
    barometer_dt_raw
    .withColumnRenamed("Dossiernummer", "Filenumber")
    .withColumnRenamed("Staalnummer", "Samplenumber")
    .withColumnRenamed("Staaltype", "Sample_type")
    .withColumnRenamed("PARAMETER_CODE", "Parameter_code")
    .withColumnRenamed("Onderzoek", "Pathogen")
    .withColumnRenamed("Resultaat", "Result")
    .withColumnRenamed("Creatiedatum", "Date")
    .withColumnRenamed("Postcode", "Postal_code")
    .withColumnRenamed("ANON_ID", "Farm_ID")
    # Country is only filled for the two culture parameter codes; this when()
    # has no otherwise(), so every other row gets null.
    .withColumn("Country", when(col("Parameter_code").isin("BAC_AERO", "BAC_MYCOPLASMA"), "Belgium"))
    .withColumn("Diagnostic_test", when(col("Parameter_code").isin("BAC_AERO", "BAC_MYCOPLASMA"), "Culture").otherwise("PCR"))
    # withColumn() requires a Column, so the constant must be wrapped in lit().
    .withColumn("Lab_reference", lit("1"))
    .withColumn(
        "Sample_type",
        when(col("Sample_type") == "RU Broncho-alveolar lavage (BAL)", "BAL")
        .when(col("Sample_type") == "RU Anderen", "Unknown")
        .when(col("Sample_type").isin("RU Swabs", "RU Swab", "RU Neusswab", "RU Neusswabs"), "Swab")
        .when(col("Sample_type").isin("RU Kadaver", "RU Organen"), "Autopsy")
        .otherwise("Missing"),
    )
    # Branch order matters: veal farms first, then rows without a MEAT count,
    # then the >90 % meat / >90 % milk share tests.
    .withColumn(
        "Breed",
        when(col("Bedrijfstype") == "VCALF", "Veal")
        .when(col("MEAT").isNull(), "Unknown")
        .when((col("MEAT") / col("TOTAL")) > 0.9, "Beef")
        .when((col("MILK") / col("TOTAL")) > 0.9, "Dairy")
        .otherwise("Mixed"),
    )
    # Map assay labels to pathogen names; labels not listed become null.
    .withColumn(
        "Pathogen",
        when(col("Pathogen").isin(
            "AD Pasteurella multocida Ag (PCR)",
            "AD Pasteurella multocida Ag pool (PCR)",
            # The comma after the next literal was missing, which silently
            # concatenated the two "P. multocida" labels into one string.
            "AD P. multocida Ag (PCR)",
            "AD P. multocida Ag pool (PCR)"), "Pasteurella multocida")
        .when(col("Pathogen").isin(
            "AD Mannheimia haemolytica Ag (PCR)",
            "AD Mannheimia haemolytica Ag pool (PCR)"), "Mannheimia haemolytica")
        .when(col("Pathogen").isin(
            "RU PI3 Ag (PCR)",
            "RU PI3 Ag pool (PCR)"), "PI3")
        .when(col("Pathogen").isin(
            "RU BRSV Ag (PCR)",
            "RU BRSV Ag pool (PCR)"), "BRSV")
        .when(col("Pathogen").isin(
            "AD Histophilus somnus (PCR)",
            "AD Histophilus somnus Ag (PCR)",
            "AD Histophilus somnus Ag pool (PCR)",
            "AD Histophilus somni Ag (PCR)",
            "AD Histophilus somni Ag pool (PCR)"), "Histophilus somni")
        .when(col("Pathogen").isin(
            "RU Mycoplasma bovis (PCR)",
            "RU Mycoplasma bovis Ag pool (PCR)",
            "RU Mycoplasma bovis Ag (PCR)"), "Mycoplasma bovis")
        .when(col("Pathogen").isin(
            "AD Corona Ag (PCR)", "AD Corona Ag pool (PCR)"), "BCV"),
    )
    # Column.between() is inclusive on both ends; there is no free-standing
    # between() function in scope.
    # NOTE(review): the ranges are kept exactly as written. They appear to
    # differ from the official Belgian postcode layout (2000-2999 Antwerp,
    # 3000-3499 Flemish Brabant, 3500-3999 Limburg, 4000-4999 Liege), and
    # everything unmatched, including null postcodes, falls into
    # "East Flanders" - TODO confirm before relying on Province.
    .withColumn(
        "Province",
        when(col("Postal_code").between(1000, 1299), "Brussels")
        .when(col("Postal_code").between(1300, 1499), "Walloon Brabant")
        .when(col("Postal_code").between(1500, 1999), "Flemish Brabant")
        .when(col("Postal_code").between(3000, 3499), "Antwerp")
        .when(col("Postal_code").between(2000, 2999), "Limburg")
        .when(col("Postal_code").between(5000, 5999), "Namur")
        .when(col("Postal_code").between(6000, 6599), "Hainaut")
        .when(col("Postal_code").between(7000, 7999), "Hainaut")
        .when(col("Postal_code").between(6600, 6999), "Luxembourg")
        .when(col("Postal_code").between(8000, 8999), "West Flanders")
        .otherwise("East Flanders"),
    )
)

# Keep the cleaned columns only. The names must match exactly: the stray
# leading/trailing spaces previously used here referred to no existing column.
barometer_dt = barometer_dt.select(
    "Filenumber", "Diagnostic_test", "Samplenumber", "Country", "Lab_reference",
    "Sample_type", "Breed", "Parameter_code", "Result", "Pathogen", "Date",
    "Postal_code", "Province", "Farm_ID",
).distinct()

            
            
            

In [None]:
# Translate the Dutch column headers of the raw DGZ results table to English.
barometer_dt = (
    barometer_dt_raw
    .withColumnRenamed('Dossiernummer', 'Filenumber')
    .withColumnRenamed('Staalnummer', 'Samplenumber')
    .withColumnRenamed('Staaltype', 'Sample_type')
    .withColumnRenamed('PARAMETER_CODE', 'Parameter_code')
    .withColumnRenamed('Onderzoek', 'Pathogen')
    .withColumnRenamed('Resultaat', 'Result')
    .withColumnRenamed('Creatiedatum', 'Date')
    .withColumnRenamed('Postcode', 'Postal_code')
    .withColumnRenamed('ANON_ID', 'Farm_ID')
)

# Keep the renamed columns plus Bedrijfstype, MEAT, MILK and TOTAL: the cell
# that builds barometer_dtt reads those four to derive Breed, so dropping them
# here makes that cell fail on unresolved columns.
# NOTE(review): assumes the raw workbook carries these four columns under
# exactly these names - confirm against the Excel header.
barometer_dt = barometer_dt.select(
    "Filenumber", "Samplenumber", "Sample_type", "Parameter_code", "Pathogen",
    "Result", "Date", "Postal_code", "Farm_ID",
    "Bedrijfstype", "MEAT", "MILK", "TOTAL",
).distinct()
barometer_dt.show()
# Add columns











In [None]:
# Derive the analysis columns (Country, Diagnostic_test, Lab_reference, Breed,
# Province) from the renamed results table and collapse the raw sample-type
# and assay labels into short category names.
# Breed reads Bedrijfstype, MEAT, MILK and TOTAL, so barometer_dt must still
# carry those columns at this point.
barometer_dtt = (
    barometer_dt
    # Country is only filled for the two culture parameter codes; this when()
    # has no otherwise(), so every other row gets null.
    .withColumn("Country", when(col("Parameter_code").isin("BAC_AERO", "BAC_MYCOPLASMA"), "Belgium"))
    .withColumn("Diagnostic_test", when(col("Parameter_code").isin("BAC_AERO", "BAC_MYCOPLASMA"), "Culture").otherwise("PCR"))
    # withColumn() requires a Column, so the constant must be wrapped in lit().
    .withColumn("Lab_reference", lit("1"))
    .withColumn(
        "Sample_type",
        when(col("Sample_type") == "RU Broncho-alveolar lavage (BAL)", "BAL")
        .when(col("Sample_type") == "RU Anderen", "Unknown")
        .when(col("Sample_type").isin("RU Swabs", "RU Swab", "RU Neusswab", "RU Neusswabs"), "Swab")
        .when(col("Sample_type").isin("RU Kadaver", "RU Organen"), "Autopsy")
        .otherwise("Missing"),
    )
    # Branch order matters: veal farms first, then rows without a MEAT count,
    # then the >90 % meat / >90 % milk share tests.
    .withColumn(
        "Breed",
        when(col("Bedrijfstype") == "VCALF", "Veal")
        .when(col("MEAT").isNull(), "Unknown")
        .when((col("MEAT") / col("TOTAL")) > 0.9, "Beef")
        .when((col("MILK") / col("TOTAL")) > 0.9, "Dairy")
        .otherwise("Mixed"),
    )
    # Map assay labels to pathogen names; labels not listed become null.
    .withColumn(
        "Pathogen",
        when(col("Pathogen").isin(
            "AD Pasteurella multocida Ag (PCR)",
            "AD Pasteurella multocida Ag pool (PCR)",
            # The comma after the next literal was missing, which silently
            # concatenated the two "P. multocida" labels into one string.
            "AD P. multocida Ag (PCR)",
            "AD P. multocida Ag pool (PCR)"), "Pasteurella multocida")
        .when(col("Pathogen").isin(
            "AD Mannheimia haemolytica Ag (PCR)",
            "AD Mannheimia haemolytica Ag pool (PCR)"), "Mannheimia haemolytica")
        .when(col("Pathogen").isin(
            "RU PI3 Ag (PCR)",
            "RU PI3 Ag pool (PCR)"), "PI3")
        .when(col("Pathogen").isin(
            "RU BRSV Ag (PCR)",
            "RU BRSV Ag pool (PCR)"), "BRSV")
        .when(col("Pathogen").isin(
            "AD Histophilus somnus (PCR)",
            "AD Histophilus somnus Ag (PCR)",
            "AD Histophilus somnus Ag pool (PCR)",
            "AD Histophilus somni Ag (PCR)",
            "AD Histophilus somni Ag pool (PCR)"), "Histophilus somni")
        .when(col("Pathogen").isin(
            "RU Mycoplasma bovis (PCR)",
            "RU Mycoplasma bovis Ag pool (PCR)",
            "RU Mycoplasma bovis Ag (PCR)"), "Mycoplasma bovis")
        .when(col("Pathogen").isin(
            "AD Corona Ag (PCR)", "AD Corona Ag pool (PCR)"), "BCV"),
    )
    # Column.between() is inclusive on both ends; there is no free-standing
    # between() function in scope.
    # NOTE(review): the ranges are kept exactly as written. They appear to
    # differ from the official Belgian postcode layout (2000-2999 Antwerp,
    # 3000-3499 Flemish Brabant, 3500-3999 Limburg, 4000-4999 Liege), and
    # everything unmatched, including null postcodes, falls into
    # "East Flanders" - TODO confirm before relying on Province.
    .withColumn(
        "Province",
        when(col("Postal_code").between(1000, 1299), "Brussels")
        .when(col("Postal_code").between(1300, 1499), "Walloon Brabant")
        .when(col("Postal_code").between(1500, 1999), "Flemish Brabant")
        .when(col("Postal_code").between(3000, 3499), "Antwerp")
        .when(col("Postal_code").between(2000, 2999), "Limburg")
        .when(col("Postal_code").between(5000, 5999), "Namur")
        .when(col("Postal_code").between(6000, 6599), "Hainaut")
        .when(col("Postal_code").between(7000, 7999), "Hainaut")
        .when(col("Postal_code").between(6600, 6999), "Luxembourg")
        .when(col("Postal_code").between(8000, 8999), "West Flanders")
        .otherwise("East Flanders"),
    )
)

# Keep the cleaned columns only. The names must match exactly: the stray
# leading/trailing spaces previously used here referred to no existing column.
barometer_dtt = barometer_dtt.select(
    "Filenumber", "Diagnostic_test", "Samplenumber", "Country", "Lab_reference",
    "Sample_type", "Breed", "Parameter_code", "Result", "Pathogen", "Date",
    "Postal_code", "Province", "Farm_ID",
).distinct()