In [None]:
import pandas as pd

! pip install pandas
! pip install openpyxl
import rdflib
import hashlib
import numpy as np
import urllib.parse
from rdflib import Literal, Namespace, RDF, URIRef, Graph, RDFS, OWL
from rdflib.namespace import FOAF, XSD
from rdflib.plugins.sparql import prepareQuery
from pyspark.sql.functions import when, col, lit

In [None]:
# Excel workbooks to ingest (a single workbook under data/PigData/Lab4).
files = ['data/PigData/Lab4/Copia de Base Datos DECIDE-MYCO HYOPNEUMONIAE.xlsx']

# Read every workbook with the openpyxl engine, one DataFrame per path.
dfs = [pd.read_excel(path, engine='openpyxl') for path in files]

# The first workbook is the frame the following cells work on.
df = dfs[0]


In [None]:
# Preview the first rows only; the full frame is not needed to check the load.
df.head()

In [None]:
# Build the cleaned frame from the untouched Excel data (`dfs[0]`) instead of
# from `df` itself. `rename` returns a new frame, so `dfs[0]` is never mutated
# and this cell can be re-run safely: re-applying the mappings below to an
# already-mapped `df` would turn every value into "Missing".
df = dfs[0].rename(columns={
    'Identificación': 'FileNumber',
    'Fecha': 'Date',
    'Provincia': 'Province',
    'Patógeno': 'Pathogen',
    'Resultado': 'Result',
    'Edad': 'Age',  # Age of animal
    'Tipo Edad': 'AgeUnit',  # Unit used to measure the age of animals (e.g. weeks, months, years)
    'Material': 'SampleType',
    'Animal': 'BreedType',
    'Categoría': 'FarmingPeriod',  # Productive stage of the animals (fattening, nursery, weaned pigs, etc.)
})

# Constant columns: every row gets the same diagnostic test, country, breed
# and lab reference.
df['DiagnosticTest'] = 'PCR'
df['Country'] = 'Spain'
df['Breed'] = 'Meat'
df["Lab_Reference"] = "1"

# Encode the test result as "0" (NEG) / "1" (POS); anything else is "Missing".
resultMapping = {
    "NEG": "0",
    "POS": "1",
}
df["Result"] = df["Result"].map(resultMapping).fillna("Missing")

# Collapse the spelling variants of the pathogen onto one identifier.
# The identifier's spelling is kept as-is because the SPARQL query at the end
# of the notebook filters on exactly this name.
PathogenMapping = {
    "M. hyopneumoniae": "MaycoplasmaHyopneumoniae",
    "Mycoplasma hyopneumoniae antistoffen" : "MaycoplasmaHyopneumoniae",
    "Mycoplasma hyopneumoniae" : "MaycoplasmaHyopneumoniae",
}
df["Pathogen"] = df["Pathogen"].map(PathogenMapping).fillna("Missing")

# Reduce the free-text sample material to Tissue / Swab / Fluid.
SampleTypeMapping = {
    "Visceras": "Tissue",
    "Pulmones": "Tissue",
    "Pulmon y ganglios": "Tissue",
    "Pulmon": "Tissue",
    "Raspados nasales": "Swab",
    "Hisopos nasales": "Swab",
    "Fluido oral": "Fluid",
    "Pulmon e hisopo": "Tissue",
    "Hisopos vias respiratorias": "Swab",
    "Lechon": "Tissue",
    "E-Vaginales": "Swab",
    "E-Escobillón de saliva": "Swab",
    "T-Pulmón":   "Tissue",
    "F-Fluido Oral":  "Fluid",
    "T-Pulmón y linfonodo":   "Tissue",
}
df["SampleType"] = df["SampleType"].map(SampleTypeMapping).fillna("Missing")

# Translate the Spanish animal category into the English labels used downstream.
# NOTE(review): the first three entries were previously inverted
# (English -> Spanish), so Spanish source values could never match them.
# Confirm the exact spellings against the values in the 'Animal' column.
breedTypeMapping = {
    'Hembra': 'Female',
    'Lechón': 'piglet',
    'Engorde': 'fatteningPig',
    'Cerdos': 'AdultPigs',  # Assuming "Cerdos" refers to adult pigs
    'Cerdas jóvenes': 'FemalePigs',
}
# `.fillna` after `.map` already covers both unmapped and originally-null values.
df["BreedType"] = df["BreedType"].map(breedTypeMapping).fillna("Missing")

# Remaining categorical columns: only replace nulls, keep the source values.
for column in ('AgeUnit', 'Province', 'FarmingPeriod'):
    df[column] = df[column].fillna('Missing')

# Parse the date column, then re-format it as a 'DD-MM-YYYY' string.
df['Date'] = pd.to_datetime(df['Date'], format='%Y-%m-%d').dt.strftime('%d-%m-%Y')

# Show a preview of the cleaned frame.
df.head()

In [None]:
# Namespaces referenced while building the graph.
xsd = Namespace('http://www.w3.org/2001/XMLSchema#')
# Custom namespace for the ontology's classes and properties.
LHO = Namespace("https://www.purl.org/decide/LiveStockHealthOnto/LHO#")
SKOS = Namespace("http://www.w3.org/2004/02/skos/core#")
decide = Namespace("http://www.purl.org/decide#")
ncit = Namespace("http://purl.obolibrary.org/obo/NCIT_C25464")
agrovoc = Namespace("http://aims.fao.org/aos/agrovoc")

# RDF/XML file whose triples are loaded into the graph before new ones are added.
ontology_file = "output/RDFoutputPigSampleLab3.rdf"

g = Graph()
g.bind('xsd', xsd)
g.parse(ontology_file, format="xml")

# Bind the remaining prefixes after parsing, as before, so these bindings
# take precedence over any prefixes declared in the parsed file.
for prefix, namespace in (('LHO', LHO), ('skos', SKOS)):
    g.bind(prefix, namespace)

In [None]:
# Base IRI for the individuals minted below.
# NOTE(review): this is the http:// form, while the `LHO` namespace used for
# classes and properties is https:// -- confirm which one the ontology expects.
INDIVIDUAL_BASE = "http://www.purl.org/decide/LiveStockHealthOnto/LHO#"

# rdfs:comment text per value. Lookup tables replace the former if/elif chains,
# whose single `g.add` sat inside the last branch and therefore almost never ran.
DIAGNOSTIC_TEST_DESCRIPTIONS = {
    "PCR": "An individual representing DNA/RNA amplification for rapid pathogen detection.",
    "Culture": "An individual representing a Growing live microorganisms for identification.",
}
# Keys are the values produced by SampleTypeMapping in the cleaning cell.
SAMPLE_TYPE_DESCRIPTIONS = {
    "Tissue": "An individual representing a pig sample obtained from tissue.",
    "Swab": "An individual representing a pig sample obtained through swabbing.",
    "Fluid": "An individual representing a pig sample obtained from fluid.",
    "Missing": "An individual representing a missing or unspecified pig sample type.",
}
# Keys are the values produced by resultMapping in the cleaning cell.
RESULT_DESCRIPTIONS = {
    "1": "An individual representing a Positive test result",
    "0": "An individual representing a negative test result",
    "Missing": "An individual representing an unknown or missing test result.",
}


def _has_value(row, column):
    """Return True when `column` exists in `row` and holds a non-null, non-blank value.

    Plain truthiness is not enough here: NaN is truthy (it would mint an
    individual called "nan") and a numeric 0 is falsy (it would be dropped).
    """
    if column not in row:
        return False
    value = row[column]
    return bool(pd.notna(value)) and str(value).strip() != ""


def _individual(local_name):
    """Return the URIRef of the individual `local_name` under INDIVIDUAL_BASE."""
    return URIRef(f"{INDIVIDUAL_BASE}{local_name}")


def _describe(subject, text):
    """Attach `text` to `subject` as an English rdfs:comment in graph `g`."""
    g.add((subject, RDFS.comment, Literal(text, lang="en")))


for index, row in df.iterrows():
    # One PigSample individual per row, keyed by the DataFrame index.
    PigSample_uri = _individual(f"Lab4PigSample_{index}")
    g.add((PigSample_uri, RDF.type, LHO.PigSample))
    _describe(PigSample_uri, "An individual representing a sample from a Pig and Piglets.")

    if _has_value(row, "Pathogen"):
        Pathogen_uri = _individual(row["Pathogen"])
        g.add((PigSample_uri, LHO.hasPathogen, Pathogen_uri))
        g.add((Pathogen_uri, RDF.type, LHO.Pathogen))
        _describe(Pathogen_uri, "An individual representing Pig pathogen.")

    if _has_value(row, "Breed"):
        Breed_uri = _individual(row["Breed"])
        g.add((PigSample_uri, LHO.hasBreed, Breed_uri))
        g.add((Breed_uri, RDF.type, LHO.Breed))
        _describe(Breed_uri, "An individual representing a Piglet breed which is Meat.")

    if _has_value(row, "DiagnosticTest"):
        DiagnosticTest = row["DiagnosticTest"]
        DiagnosticTest_uri = _individual(DiagnosticTest)
        g.add((PigSample_uri, LHO.hasDiagnosticTest, DiagnosticTest_uri))
        g.add((DiagnosticTest_uri, RDF.type, LHO.DiagnosticTest))
        description = DIAGNOSTIC_TEST_DESCRIPTIONS.get(DiagnosticTest)
        if description is not None:
            _describe(DiagnosticTest_uri, description)

    if _has_value(row, "Country"):
        country_name = str(row["Country"]).strip()
        # Percent-encode so names with spaces or accents still give a valid IRI.
        Country_uri = _individual(urllib.parse.quote(country_name))
        g.add((PigSample_uri, LHO.hasCountry, Country_uri))
        g.add((Country_uri, RDF.type, URIRef("http://purl.obolibrary.org/obo/NCIT_C25464")))
        # Label with the row's own country instead of a hard-coded "Spain".
        g.add((Country_uri, RDFS.label, Literal(country_name)))
        _describe(Country_uri, "An individual representing different Countries.")

    if _has_value(row, "Province"):
        Province_uri = _individual(urllib.parse.quote(str(row["Province"]).strip()))
        g.add((PigSample_uri, LHO.hasProvince, Province_uri))
        g.add((Province_uri, RDF.type, LHO.Province))
        _describe(Province_uri, "An individual representing different Province.")

    # NOTE(review): AgeUnit is cleaned upstream but not exported; the original
    # code read it into `Age` by mistake and then discarded it.
    if _has_value(row, "Age"):
        Age_uri = _individual(row["Age"])
        g.add((PigSample_uri, LHO.hasAge, Age_uri))
        g.add((Age_uri, RDF.type, LHO.Age))
        _describe(Age_uri, "An individual representing Age .")

    if _has_value(row, "SampleType"):
        SampleType = row["SampleType"]
        SampleType_uri = _individual(SampleType)
        g.add((PigSample_uri, LHO.hasSampleType, SampleType_uri))
        g.add((SampleType_uri, RDF.type, URIRef("http://www.purl.org/decide#SampleType")))
        description = SAMPLE_TYPE_DESCRIPTIONS.get(SampleType)
        if description is not None:
            _describe(SampleType_uri, description)

    if _has_value(row, "Result"):
        SampleResult = str(row["Result"])
        Result_uri = _individual(SampleResult)
        g.add((PigSample_uri, LHO.hasResult, Result_uri))
        g.add((Result_uri, RDF.type, decide.SampleResult))
        description = RESULT_DESCRIPTIONS.get(SampleResult)
        if description is not None:
            # The comment belongs on the result individual, not on the country.
            _describe(Result_uri, description)

    # NOTE(review): Lab_Reference "1" and Result "1" both resolve to the same
    # individual (<...LHO#1>); confirm whether that sharing is intended.
    if _has_value(row, "Lab_Reference"):
        LabReference_uri = _individual(row["Lab_Reference"])
        g.add((PigSample_uri, LHO.hasLabReference, LabReference_uri))
        g.add((LabReference_uri, RDF.type, LHO.LabReference))

    if _has_value(row, "Date"):
        # Keep only the date part in case the value carries a time component.
        Date = str(row["Date"]).strip().split()[0]
        Date_uri = _individual(Date)
        g.add((PigSample_uri, LHO.hasDate, Date_uri))
        g.add((Date_uri, RDF.type, LHO.Date))

# Serialize the RDF graph to a file
rdf_output_file = "output/RDFoutputPigSampleLab4.rdf"
g.serialize(rdf_output_file, format="xml")



    
    

In [None]:
# Serialize the RDF graph to Turtle format and print it.
turtle_data = g.serialize(format="turtle")
# rdflib < 6 returns bytes here; decode so the output prints as text, not b'...'.
if isinstance(turtle_data, bytes):
    turtle_data = turtle_data.decode("utf-8")
print(turtle_data)

# Query

In [None]:
# The `LHO:` and `rdf:` prefixes are deliberately NOT declared in the query
# text: they are supplied through `initNs` from the same Namespace objects that
# were used to build the graph. The previous hard-coded `http://` LHO prefix
# did not match the `https://` namespace of the classes and properties in the
# graph, so the query could not match the samples created above.
# The pathogen *individual* is minted under the http:// base, so it is matched
# by its full IRI.
query = """
PREFIX decide: <http://www.purl.org/decide#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>

SELECT
  (strafter(str(?Sample), "#") AS ?SampleName)
  (strafter(str(?Pathogen), "#") AS ?PathogenName)
  (strafter(str(?Breed), "#") AS ?BreedName)
  (strafter(str(?SampleType), "#") AS ?SampleTypeName)
  (strafter(str(?LabReference), "#") AS ?LabReferenceName)
  (strafter(str(?DiagnosticTest), "#") AS ?DiagnosticTestName)
  (strafter(str(?Country), "#") AS ?CountryName)
  (strafter(str(?SampleResult), "#") AS ?SampleResultName)

WHERE {
  ?Sample rdf:type LHO:PigSample .
  ?Sample LHO:hasPathogen ?Pathogen .
  FILTER (?Pathogen = <http://www.purl.org/decide/LiveStockHealthOnto/LHO#MaycoplasmaHyopneumoniae>)
  ?Sample LHO:hasBreed ?Breed .
  ?Sample LHO:hasSampleType ?SampleType .
  ?Sample LHO:hasLabReference ?LabReference .
  ?Sample LHO:hasDiagnosticTest ?DiagnosticTest .
  ?Sample LHO:hasCountry ?Country .
  ?Sample LHO:hasResult ?SampleResult .

}
"""

# Execute the query. Passing `initNs` replaces the graph's default prefix
# bindings, so `rdf` has to be listed explicitly alongside `LHO`.
results = g.query(query, initNs={"rdf": RDF, "LHO": LHO})

# Convert the results to a Pandas dataframe (one list of values per result row).
data = [list(row) for row in results]
# NOTE: this rebinds `df` from the cleaned input frame to the query results;
# re-run the cleaning cell before re-running the graph-building cell.
df = pd.DataFrame(data, columns=["Sample", "Pathogen", "Breed", "SampleType", "LabReference", "DiagnosticTest", "Country","SampleResult"])

# Display the dataframe
df.head(20)