In [None]:
import pandas as pd

! pip install pandas
! pip install openpyxl
import rdflib
import hashlib
import numpy as np
import urllib.parse
from rdflib import Literal, Namespace, RDF, URIRef, Graph, RDFS, OWL
from rdflib.namespace import FOAF, XSD
from rdflib.plugins.sparql import prepareQuery
from pyspark.sql.functions import when, col, lit

In [43]:
# Define the file paths
files = ['data/PigData/Lab3/Copia de Base Datos DECIDE-INFLUENZA.xlsx']

# Load the data into Pandas dataframes
dfs = []
for file in files:
    df = pd.read_excel(file, engine='openpyxl')
    dfs.append(df)

df = dfs[0]


In [None]:
df

In [45]:
df = df.rename(columns={
    'ID': 'SampleNumber',
    'PROVINCIA': 'Province',
    'FECHA': 'Date',
    'PATOGENO': 'Pathogen',
    'RESULTADO': 'Result',
    'MUESTRA': 'SampleType',
    'EDAD': 'Age', #Age of animal
    'GRANJA': 'FarmType',
})

# Set DiagnosticTest to "PCR" Country to "Spain", Breed to "Meat", and LabReference to "1".
df['DiagnosticTest'] = 'PCR'
df['Country'] = 'Spain'
df['Breed'] = 'Meat'
df["Lab_Reference"] = "3"


# Add Result mapping for NEG = 0 and POS =1
resultMapping = {
    "Positivo": "1",
    "Negativo": "0",
}
df["Result"] = df["Result"].map(resultMapping).fillna("Missing")

PathogenMapping = {
    "M. hyopneumoniae": "MaycoplasmaHyopneumoniae",
    "Mycoplasma hyopneumoniae antistoffen" : "MaycoplasmaHyopneumoniae",
    "Mycoplasma hyopneumoniae" : "MaycoplasmaHyopneumoniae",
    "Influenza" : "SwineInfluenza",
    
}

# SampleTypeMapping for the additional sample types
SampleTypeMapping = {
    "Pulmón": "Tissue",
    "Pulmon": "Tissue",
    "Pulmones": "Tissue",
    "Lechón": "Tissue",
    "Lechon": "Tissue",
    "Lechones": "Tissue",
    "Lenguas lechones": "Tissue",
    "Lenguas": "Tissue",
    "Fluido oral": "Fluid",
    "Fluidos orales": "Fluid",
    "Pulmón/ Fluido oral": "Mixed",
    "Heces": "Feces",
    "Raspado traqueal": "Swab",
    "Hisopo/ Raspado traqueal/ Pulmón": "Mixed",
    "Intestino": "Tissue",
    "Hisopo": "Swab",
    "Hisopos": "Swab",
    "Hisopos nasales": "Swab",
    "Raspado traqueal/ Pulmón": "Mixed",
    "Fluido oral/ Pulmón": "Mixed",
    "Heces/intestino": "Feces",
    "Hisopo rectal": "Swab",
    "Porción intestinal": "Tissue",
    "Exudado": "Fluid",
    "Pulmon y ganglios ": "Tissue",
    "Líquido pericárdico": "Fluid",
    "Raspado traqueal/ Fluido oral": "Mixed",
    "Heces/hisopo rectal": "Feces",
    "Hisopo nasal": "Swab",
    "Hisopo nasal/ Pulmón/ Fluido Oral": "Mixed",
    " Lechones/hisopos nasales": "Mixed",
    "Órgano": "Tissue",
    "Hisopo intestinal": "Swab",
    "Feto": "Tissue",
    "Fetos": "Tissue",
    "Cabeza, cuatro patas y pulmon": "Swab",
    "Hisopo pulmonar": "Swab",
    "Heces/hisopos": "Feces",
    "Ganglio": "Tissue",
    "Intestino/heces/hisopo": "Mixed",
    "Pulmón/ Fluido Oral": "Mixed",
    "Pulmón / Fluido Oral": "Mixed",
    "Pulmón /gánglios e hisopos": "Mixed",
    "Hisopo fecal": "Swab",
    "Hisopos de vias respiratorias": "Swab",
    "Cabeza": "Tissue",
    "Feto y placenta": "Tissue",
    "Lenguas y cordones umbilicales": "Tissue",
    "Pulmnes + 1 lechon": "Tissue",
    "Sueros": "Blood",
}

# Map the SampleType column using the defined mapping
df["SampleType"] = df["SampleType"].map(SampleTypeMapping).fillna("Missing")

FarmTypeMapping = {
    "Granja de madres": "BreedingFarm",
    "Cebo" : "FeedLot",
    "Cebadero" : "FeedLot",
    
}

df["FarmType"] = df["FarmType"].map(FarmTypeMapping).fillna("Missing")

df["Pathogen"] = df["Pathogen"].map(PathogenMapping).fillna("Missing")

df['Province'] = df['Province'].fillna('Missing')

# Convert 'FECHA' column to datetime format
df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y')

# Format 'FECHA' column as 'DD-MM-YYYY'
df['Date'] = df['Date'].dt.strftime('%d-%m-%Y')


In [46]:
#Cleaning Age Data
data = {
    'Age': ['9-10 semanas', '2 Meses', 'Cerda', '100 Kg', 'Lechon', '50 Kg', '4-5 meses', '60 Kg', '6 meses',
             '4 meses', '6 meses', '5 meses', '4-5meses', '30-40 Kg', '3 meses', '20 Kg', 'Lechon', '3.5 meses', '30 Kg', '6 meses', '5 meses',
             'Lechon', '48 dias', '8-9 semanas', '45-50 dias', '20 dias postdestete', 'Cebo']
}

df_cleaned_age = pd.DataFrame(data)

# Function to clean and map age data
def clean_and_map_age(row):
    if 'semanas' in row['Age']:
        # Extract the whole age string and keep 'semanas' in Unit
        age_string = row['Age'].replace('semanas', '').strip()
        return pd.Series([age_string, 'weeks'], index=['AgeValue', 'Unit'])

    elif 'Meses' in row['Age']:
        # Extract numeric value for months
        return pd.Series([float(row['Age'].split()[0]), 'months'], index=['AgeValue', 'Unit'])
    
    elif 'meses' in row['Age']:
        # Extract the whole age string and remove 'meses'
        age_string = row['Age'].replace('meses', '').strip()
        return pd.Series([age_string, 'months'], index=['AgeValue', 'Unit'])
    
    elif 'Kg' in row['Age']:
        # Extract numeric value for weight and set 'Kg' in Unit
        weight_string = row['Age'].replace('Kg', '').strip()
        return pd.Series(['Missing', 'Missing', weight_string, 'Kg'], index=['AgeValue', 'AgeUnit', 'WeightValue', 'WeightUnit'])

    elif 'dias' in row['Age']:
        # Extract the whole age string and keep 'semanas' in Unit

        age_string = row['Age'].replace('postdestete', '').replace('dias', '').strip()
        return pd.Series([age_string, 'days'], index=['AgeValue', 'Unit'])
    
    else:
        # Handle other cases as needed
        return pd.Series(['Missing', 'Missing'], index=['AgeValue', 'Unit'])

# Apply the cleaning function and join the result with the original DataFrame
df_cleaned_age = df_cleaned_age.join(df_cleaned_age.apply(clean_and_map_age, axis=1))

# Merge the cleaned age data with the existing DataFrame based on the index or a common column
df = pd.merge(df, df_cleaned_age[['AgeValue', 'Unit', 'WeightValue', 'WeightUnit']], left_index=True, right_index=True)

# Drop the original 'Age' column
df.drop('Age', axis=1, inplace=True)

# Rename columns to avoid suffixes
df.rename(columns={'AgeValue': 'Age', 'Unit': 'AgeUnit', 'WeightValue': 'Weight', 'WeightUnit': 'WeightUnit'}, inplace=True)




In [None]:
df

In [48]:
g = rdflib.Graph()
xsd = Namespace('http://www.w3.org/2001/XMLSchema#')
g.bind('xsd', xsd)

ontology_file = "output/RDFoutputPigSampleLab2.rdf"
g.parse(ontology_file, format="xml")

# Define your custom namespace for your ontology's properties
LHO = Namespace("https://www.purl.org/decide/LiveStockHealthOnto/LHO#")
g.bind('LHO', LHO)

SKOS = Namespace("http://www.w3.org/2004/02/skos/core#")
g.bind('skos', SKOS)

decide=  Namespace("http://www.purl.org/decide#")

ncit = Namespace("http://purl.obolibrary.org/obo/NCIT_C25464")

agrovoc = Namespace ("http://aims.fao.org/aos/agrovoc")

In [49]:
for index, row in df.iterrows():
    Pathogen = row["Pathogen"]
    Breed = row["Breed"]
    Province = row["Province"]
    SampleNumber = row["SampleNumber"]
    Result = row["Result"]
    DiagnosticTest = row["DiagnosticTest"]
    Age= row["Age"]
    

    
    # Create a unique URI for each sample based on the row index
    PigSample_uri = URIRef(f"http://www.purl.org/decide/LiveStockHealthOnto/LHO#Lab3PigSample_{index}")
    
    # Add sample type assertion
    g.add((PigSample_uri, RDF.type, LHO.PigSample))
    description = "An individual representing a sample from a Pig and Piglets."
    g.add((PigSample_uri, RDFS.comment, Literal(description, lang="en")))
    
    if "Pathogen" in row and row["Pathogen"]:
        Pathogen = row["Pathogen"]
        Pathogen_uri = URIRef(f"http://www.purl.org/decide/LiveStockHealthOnto/LHO#{Pathogen}")
        g.add((PigSample_uri, LHO.hasPathogen, Pathogen_uri))
        g.add((Pathogen_uri, RDF.type, decide.Pathogen))
        description = "An individual representing Pig pathogen."
        g.add((Pathogen_uri, RDFS.comment, Literal(description, lang="en")))
        
    if "Breed" in row and row["Breed"]:
        Breed = row["Breed"]
        Breed_uri = URIRef(f"http://www.purl.org/decide/LiveStockHealthOnto/LHO#{Breed}")
        g.add((PigSample_uri, LHO.hasBreed, Breed_uri))
        g.add((Breed_uri, RDF.type, LHO.Breed))
        description = "An individual representing a Piglet breed which is Meat."
        g.add((Breed_uri, RDFS.comment, Literal(description, lang="en")))
        
    if "FarmType" in row and row["FarmType"]:
        FarmType = row["FarmType"]
        FarmType_uri = URIRef(f"http://www.purl.org/decide/LiveStockHealthOnto/LHO#{FarmType}")
        g.add((PigSample_uri, LHO.hasFarmType, FarmType_uri))
        g.add((FarmType_uri, RDF.type, LHO.FarmType))
        description = "An individual describes the type of farm or facility from which the samples were collected.Breeding Farm: If the farm is primarily focused on breeding animals, Feedlot : If the farm is a feedlot or facility where animals are raised for fattening, Mixed Farm: If the farm has a combination of activities, such as breeding and fattening"
        g.add((FarmType_uri, RDFS.comment, Literal(description, lang="en")))
        
    if "DiagnosticTest" in row and row["DiagnosticTest"]:
        DiagnosticTest= row["DiagnosticTest"]
        DiagnosticTest_uri = URIRef(f"http://www.purl.org/decide/LiveStockHealthOnto/LHO#{DiagnosticTest}")
        g.add((PigSample_uri, LHO.hasDiagnosticTest, DiagnosticTest_uri))
        g.add((DiagnosticTest_uri, RDF.type, LHO.DiagnosticTest))
    if DiagnosticTest == "PCR":
        description = "An individual representing DNA/RNA amplification for rapid pathogen detection."
        
    if "Province" in row and row["Province"]:
        Province = row["Province"].strip()
        Province = urllib.parse.quote(Province)
        Province_uri = URIRef(f"http://www.purl.org/decide/LiveStockHealthOnto/LHO#{Province}")
        g.add((PigSample_uri, LHO.hasProvince, Province_uri))
        g.add((Province_uri, RDF.type, LHO.Province))
        description = "An individual representing different Province."
        g.add((Province_uri, RDFS.comment, Literal(description, lang="en")))
    
    if "Age" in row and row["Age"]:
        Age = row["Age"]
        Age_uri = URIRef(f"http://www.purl.org/decide/LiveStockHealthOnto/LHO#{Age}")
        g.add((PigSample_uri, LHO.hasAge, Age_uri))
        g.add((Age_uri, RDF.type, LHO.Age))
        description = "An individual representing Age ."
        g.add((Age_uri, RDFS.comment, Literal(description, lang="en")))
           
    if "Result" in row and row["Result"]:
        SampleResult = row["Result"]
    # Create a unique URI for the Result based on the value
        Result_uri = URIRef(f"http://www.purl.org/decide/LiveStockHealthOnto/LHO#{SampleResult}")
        g.add((PigSample_uri, LHO.hasSampleResult, Result_uri))
        g.add((Result_uri, RDF.type, decide.SampleResult))
    if Result == "1.0":
        description = "An individual representing a Positive test result"
    elif Result == "0.0":
        description = "An individual representing a negative test result"
    elif Result == "missing":
        description = "n individual representing an unknown or missing test result."
        g.add((Country_uri, RDFS.comment, Literal(description, lang="en")))
        
    if "Lab_Reference" in row and row["Lab_Reference"]:
        LabReference = row["Lab_Reference"]
    # Create a unique URI for the LabReference based on the value
        LabReference_uri = URIRef(f"http://www.purl.org/decide/LiveStockHealthOnto/LHO#{LabReference}")
        g.add((PigSample_uri, LHO.hasLabReference, LabReference_uri))
        g.add((LabReference_uri, RDF.type, LHO.LabReference)) 
        
    if "SampleType" in row and row["SampleType"]:
        SampleType = row["SampleType"]
        SampleType_uri = URIRef(f"http://www.purl.org/decide/LiveStockHealthOnto/LHO#{SampleType}")
        g.add((PigSample_uri, LHO.hasSampleType, SampleType_uri))
        g.add((SampleType_uri, RDF.type, decide.SampleType))
        description = "An individual representing a Piglet Sampling Method for diagnosing Disease."
        g.add((Breed_uri, RDFS.comment, Literal(description, lang="en")))   
    
    if "Age" in row and row["Age"]:
        AgeValue = row["Age"]
        AgeUnit = row["AgeUnit"] if "AgeUnit" in row and row["AgeUnit"] else "Unknown"

        # Create a unique URI for the Age
        Age_uri = URIRef(f"http://www.purl.org/decide/LiveStockHealthOnto/LHO#{AgeValue}_{AgeUnit}")
        g.add((PigSample_uri, LHO.hasAge, Age_uri))
        g.add((Age_uri, RDF.type, LHO.Age))
        g.add((Age_uri, LHO.hasAgeUnit, URIRef(f"http://www.purl.org/decide/LiveStockHealthOnto/LHO#{AgeUnit}")))
        g.add((Age_uri, LHO.hasAgeValue, URIRef(f"http://www.purl.org/decide/LiveStockHealthOnto/LHO#{AgeValue}")))
        
    if "Weight" in row and row["Weight"]:
        WeightValue = row["Weight"]
        WeightUnit = row["WeightUnit"] if "WeightUnit" in row and row["WeightUnit"] else "Unknown"

        # Create a unique URI for the Weight
        Weight_uri = URIRef(f"http://www.purl.org/decide/LiveStockHealthOnto/LHO#{WeightValue}_{WeightUnit}")
        g.add((PigSample_uri, LHO.hasWeight, Weight_uri))
        g.add((Weight_uri, RDF.type, LHO.Weight))
        g.add((Weight_uri, LHO.hasWeightUnit, URIRef(f"http://www.purl.org/decide/LiveStockHealthOnto/LHO#{WeightUnit}")))
        g.add((Weight_uri, LHO.hasWeightValue, URIRef(f"http://www.purl.org/decide/LiveStockHealthOnto/LHO#{WeightValue}")))


    if "Date" in row and not pd.isnull(row["Date"]):  # Check for NaN or None values
        Date = str(row["Date"]).strip()
    
    # Split the datetime string and take the date part
        Date = Date.split()[0]
    
    # Create a unique URI for the Date based on the value (use the Date directly)
        Date_uri = URIRef(f"http://www.purl.org/decide/LiveStockHealthOnto/LHO#{Date}")
    
    # Add Date assertion
        g.add((PigSample_uri, LHO.hasDate, Date_uri))
        g.add((Date_uri, RDF.type, LHO.Date))
        
    

# Serialize the RDF graph to a file
rdf_output_file = "output/RDFoutputPigSampleLab3.rdf"
g.serialize(rdf_output_file, format="xml")



    
    

<Graph identifier=Nfd718a2b1c884d9b9cfb682edd2a0f44 (<class 'rdflib.graph.Graph'>)>

In [None]:
# Serialize the RDF graph to Turtle format and print it
turtle_data = g.serialize(format="turtle")
print(turtle_data)

# Query

In [51]:
query = """
PREFIX decide: <http://www.purl.org/decide#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX LHO: <https://www.purl.org/decide/LiveStockHealthOnto/LHO#>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>

SELECT
  (strafter(str(?Sample), "#") AS ?SampleName)
  (strafter(str(?FarmType), "#") AS ?FarmTypeName)
  (strafter(str(?Pathogen), "#") AS ?PathogenName)
  (strafter(str(?Breed), "#") AS ?BreedName)
  (strafter(str(?LabReference), "#") AS ?LabReferenceName)
  (strafter(str(?SampleType), "#") AS ?SampleTypeName)
  (strafter(str(?DiagnosticTest), "#") AS ?DiagnosticTestName)
  (strafter(str(?Age), "#") AS ?AgeName)
  (strafter(str(?SampleResult), "#") AS ?SampleResultName)

WHERE {
  ?Sample rdf:type LHO:PigSample .
  ?Sample LHO:hasFarmType ?FarmType.
  ?Sample LHO:hasPathogen ?Pathogen .
  ?Sample LHO:hasBreed ?Breed .
  ?Sample LHO:hasLabReference ?LabReference .
  ?Sample LHO:hasSampleType ?SampleType .
  ?Sample LHO:hasDiagnosticTest ?DiagnosticTest .
  ?Sample LHO:hasAge ?Age.
  ?Sample LHO:hasSampleResult ?SampleResult .
  
}
"""

# Execute the query and retrieve the results
results = g.query(query)

# Convert the results to a Pandas dataframe
data = []
for row in results:
    data.append(list(row))
df = pd.DataFrame(data, columns=["Sample", "FarmType", "Pathogen", "Breed","LabReference", "SampleType", "DiagnosticTest", "Age", "SampleResult"])

# Display the dataframe
df.head(100)

Unnamed: 0,Sample,FarmType,Pathogen,Breed,LabReference,SampleType,DiagnosticTest,Age,SampleResult
0,Lab3PigSample_0,BreedingFarm,SwineInfluenza,Meat,3,Swab,PCR,9-10,0
1,Lab3PigSample_0,BreedingFarm,SwineInfluenza,Meat,3,Swab,PCR,9-10_weeks,0
2,Lab3PigSample_1,FeedLot,SwineInfluenza,Meat,3,Tissue,PCR,2.0,0
3,Lab3PigSample_1,FeedLot,SwineInfluenza,Meat,3,Tissue,PCR,2.0_months,0
4,Lab3PigSample_2,FeedLot,SwineInfluenza,Meat,3,Fluid,PCR,Missing,1
5,Lab3PigSample_2,FeedLot,SwineInfluenza,Meat,3,Fluid,PCR,Missing_Missing,1
6,Lab3PigSample_3,FeedLot,SwineInfluenza,Meat,3,Tissue,PCR,Missing,0
7,Lab3PigSample_3,FeedLot,SwineInfluenza,Meat,3,Tissue,PCR,Missing_nan,0
8,Lab3PigSample_4,FeedLot,SwineInfluenza,Meat,3,Tissue,PCR,Missing,0
9,Lab3PigSample_4,FeedLot,SwineInfluenza,Meat,3,Tissue,PCR,Missing_Missing,0


In [None]:
query = """
PREFIX decide: <http://www.purl.org/decide#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX LHO: <https://www.purl.org/decide/LiveStockHealthOnto/LHO#>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>

SELECT
  (strafter(str(?Sample), "#") AS ?SampleName)
  (strafter(str(?FarmType), "#") AS ?FarmTypeName)
  (strafter(str(?Pathogen), "#") AS ?PathogenName)
  (strafter(str(?Breed), "#") AS ?BreedName)
  (strafter(str(?LabReference), "#") AS ?LabReferenceName)
  (strafter(str(?SampleType), "#") AS ?SampleTypeName)
  (strafter(str(?DiagnosticTest), "#") AS ?DiagnosticTestName)
  (strafter(str(?Weight), "#") AS ?WeightName)
  (strafter(str(?SampleResult), "#") AS ?SampleResultName)

WHERE {
  ?Sample rdf:type LHO:PigSample .
  ?Sample LHO:hasFarmType ?FarmType.
  ?Sample LHO:hasPathogen ?Pathogen .
  ?Sample LHO:hasBreed ?Breed .
  ?Sample LHO:hasLabReference ?LabReference .
  ?Sample LHO:hasSampleType ?SampleType .
  ?Sample LHO:hasDiagnosticTest ?DiagnosticTest .
  ?Sample LHO:hasWeight ?Weight.
  FILTER (?Weight = LHO:100-Kg)
  ?Sample LHO:hasSampleResult ?SampleResult .
  
}
"""

# Execute the query and retrieve the results
results = g.query(query)

# Convert the results to a Pandas dataframe
data = []
for row in results:
    data.append(list(row))
df = pd.DataFrame(data, columns=["Sample", "FarmType", "Pathogen", "Breed","LabReference", "SampleType", "DiagnosticTest", "Weight", "SampleResult"])

# Display the dataframe
df.head(100)