In [None]:
import pandas as pd
! pip install pandas
! pip install openpyxl
import rdflib
import hashlib
import numpy as np
import urllib.parse
from rdflib import Literal, Namespace, RDF, URIRef, Graph, RDFS, OWL
from rdflib.namespace import FOAF, XSD
from rdflib.plugins.sparql import prepareQuery
from pyspark.sql.functions import when, col, lit



In [None]:
# Workbook(s) holding the Lab 5 pig results.
files = ['data/PigData/Lab5/DECIDE Pig12102023.xlsx']

# Read each workbook into its own DataFrame (openpyxl handles the .xlsx format).
dfs = [pd.read_excel(file, engine='openpyxl') for file in files]

# Raw column headers -> English column names used by the rest of the notebook.
column_translation = {
    'date': 'Date',
    'inzendnummer': 'SubmissionNumber',
    'farm': 'FarmIdentification',
    'sc': 'Sample',
    'methode': 'Method',
    'ME_DESCRIPTION': 'MethodDescription',
    'parameter': 'Pathogen',
    'PA_DESCRIPTION': 'PathogenDescription',
    'uitslag': 'Result',
    'uit': 'ResultPN',
    'II_REDEN_INZENDING': 'Reason',
    'MATERIAAL_ONDERZOEK': 'Materials',
    'year': 'Year',
    'project': 'ProjectCode',
}

# Only the first workbook is used from here on.
df = dfs[0].rename(columns=column_translation)





In [None]:
# Display the loaded frame with its translated column names.
df

In [105]:

# Cut the 'Date' text into fixed-width pieces:
#   characters 0-1 -> DateDay, 2-4 -> DateMonth, 5-9 -> DateYear.
# The original 'Date' column is kept because later cells still read it.
df['DateDay'] = df['Date'].str.slice(0, 2)
df['DateMonth'] = df['Date'].str.slice(2, 5)
df['DateYear'] = df['Date'].str.slice(5, 10)


# Pathogen codes that count as respiratory; every other row is left out of df_respiratory.
respiratory_pathogens = [
    "App_Indirect",
    "M_Hyopneum_Antistof",
    "PRRS_Virus",
    "PRRS_Antistoffen",
    "Influenza_A_Antistof",
    "Influenza_Virus",
    "M_Hyopneumoniae",
    "App_ApxIV",
    "App_2",
    "App_5",
    "App_2_9",
    "PRRSv_Spe_Seq_P663",
    "A_Pleuropneumoniae",
    "PRRS_Eu",
    "PRRS_Am",
    "pH1N1",
    "Porcine_H1N2",
    "H1N1 2012",
    "Porcine_H3N2",
    "PRRSv_Lichaams_W1733",
    "Mycoplasma",
    "PRRSv_Spe_ORF_P706",
    "PRRSv_SerP_Seq_W1628",
    "PRRSv_We_ORF_P707",
    "PRRSv_Ser61_ORF_P708",
    "PRRS",
]

# Boolean row mask: True where the row's pathogen is one of the codes above.
is_respiratory = df['Pathogen'].isin(respiratory_pathogens)
df_respiratory = df[is_respiratory]


# Remove typing and confirmation samples.
#
# Filter on the 'MethodDescription' column itself. Masking the whole frame with
# DataFrame.isin(<DataFrame>) and then calling .dropna() without a subset blanks the
# matched cells to NaN and then discards *every* row that has a missing value in any
# column, which silently loses unrelated rows and can upcast numeric columns to float.
# NOTE(review): rows with gaps in other columns are now retained at this step; rows
# without a 'ResultPN' are still dropped when results_df is built. Confirm that
# downstream IRIs built from numeric columns (e.g. the result value) keep the
# expected text form.
respi_typing_confirm = ["PCR subtypering", "Typering", "confirmatie ELISA", "Sequentie", "Confirmatie ELISA", "confirmatie PCR"]
is_typing_or_confirmation = df_respiratory['MethodDescription'].isin(respi_typing_confirm)

# The excluded rows stay available for inspection.
df_respi_typing_confirm = df_respiratory[is_typing_or_confirmation]

# .copy() gives an independent frame so later column assignments do not act on a view.
df_respiratory = df_respiratory[~is_typing_or_confirmation].copy()

# Only send-in samples: drop submissions whose 'Reason' is one of the categories below.
#
# Filter on the 'Reason' column directly. Masking the whole frame with
# DataFrame.isin(<DataFrame>) followed by a blanket .dropna() also removes every row
# that has a missing value in any unrelated column.
# NOTE(review): rows with gaps in other columns are now retained at this step; rows
# without a 'ResultPN' are still dropped when results_df is built.
respi_reasons = ["Bedrijfsbegeleiding","Export", "Herregistratie", "K.I.", "Pilotonderzoek GD", "Slachtlijn", "SPF"]
is_excluded_reason = df_respiratory['Reason'].isin(respi_reasons)

# The excluded rows stay available for inspection.
df_respi_reasons = df_respiratory[is_excluded_reason]

# .copy() gives an independent frame so the column added next does not act on a view.
df_respiratory = df_respiratory[~is_excluded_reason].copy()

# Column with aggregated pathogen: each specific pathogen code is collapsed into its group.
# Codes that are not listed in any group get NaN in 'AggrPath'.
aggregated_pathogen_groups = {
    'App': ['App_2', 'App_2_9', 'App_5', 'App_ApxIV', 'App_Indirect'],
    'SI': ['H1N1 2012', 'Influenza_A_Antistof', 'Influenza_Virus'],
    'Mhyo': ['M_Hyopneum_Antistof', 'M_Hyopneumoniae', 'Mycoplasma'],
    'PRRS': [
        'PRRS', 'PRRS_Am', 'PRRS_Antistoffen', 'PRRS_Eu', 'PRRS_Virus',
        'PRRSv_Lichaams_W1733', 'PRRSv_Ser61_ORF_P708', 'PRRSv_SerP_Seq_W1628',
        'PRRSv_Spe_ORF_P706', 'PRRSv_Spe_Seq_P663', 'PRRSv_We_ORF_P707',
    ],
}

# Invert the grouping into a flat {pathogen code: group} lookup for Series.map.
pathogen_to_group = {
    pathogen: group
    for group, members in aggregated_pathogen_groups.items()
    for pathogen in members
}

df_respiratory['AggrPath'] = df_respiratory['Pathogen'].map(pathogen_to_group)

# Results dataframe: keep the columns used downstream and discard rows without a result.
result_columns = [
    'SubmissionNumber', 'FarmIdentification', 'Date', 'DateMonth', 'DateYear',
    'Materials', 'Reason', 'Method', 'Pathogen', 'ResultPN',
]
results_df = df_respiratory[result_columns].dropna(subset=['ResultPN'])
# Optional: uncomment to add an 'Output' column that labels each result as Positive or Negative.
#results_df['Output'] = results_df['ResultPN'].apply(lambda x: f"{x} Positive" if x == 1 else f"{x} Negative")





In [None]:
# Display the filtered results frame.
results_df

In [None]:
# Collect the distinct values of every column, keyed by column name, and print them.
unique_values = {
    column: results_df[column].unique()
    for column in results_df.columns
}

print(unique_values)

In [None]:
# Rename columns to the names used by the LHO ontology classes.
lho_column_names = {
    'SubmissionNumber': 'SampleNumber',
    'Materials': 'SampleType',
    'Reason': 'EventType',
    'Method': 'DiagnosticTest',
    'ResultPN': 'SampleResult',
}
results_df = results_df.rename(columns=lho_column_names)



# Diagnostic Test mapping.
# Every lab method code starts with its test family followed by an underscore
# (e.g. 'PCR_06_Ser' -> 'PCR'), so the mapping is derived from that prefix.
_diagnostic_method_codes = [
    'ELISA_08',
    'ELISA_02',
    'ELISA_09',
    'ELISA_07',
    'HAR_03_T_<9-9216',
    'ELISA_02_Spe',
    'ELISA_05',
    'ELISA_04',
    'HAR_08_S',
    'PCR_03_WE_Ext',
    'ELISA_07_Indirect',
    'PCR_06_Ser',
    'PCR_06_SPK',
    'PCR_06_Ser_pool',
    'PCR_06_probe',
    'PCR_06_Spoed_Pool',
    'ELISA_05_Indirect',
    'PCR_03_WE',
    'PCR_06_Spoed_Ind',
    'PCR_06_WE',
    'PCR_06_Ser_pool_6-10',
    'PCR_06_SPK_SWS',
    'PCR_06_Buikvocht',
    'CBR_02_S',
    'PCR_06_Lichaamsvl',
]

DiagnosticTest_Mapping = {
    code: code.split('_', 1)[0]
    for code in _diagnostic_method_codes
}


# Sample type mapping: material description -> English sample type name.
Sampletype_Mapping = dict([
    ('Bloed', 'Blood'),
    ('Diverse Materialen', 'VariousMaterials'),
    ('Bloed (Varken) Pool', 'Blood'),
    ('Bloed slachtlijn', 'Blood'),
    ('Sectie Zoogdieren', 'SectionMammals'),
    ('SERUM', 'Serum'),
    ('Faeces', 'Faeces'),
])

# Event type mapping: submission reason -> English event type name.
_event_reasons = ('GD Labonderzoek', 'Klachten', 'Verwerper')
_event_types = ('LaboratoryInvestigation', 'Complaints', 'Rejector')
EventType_Mapping = dict(zip(_event_reasons, _event_types))

# Translate pathogen codes to the English names used as individual names in the graph.
#
# The target names are held in constants so every code of one pathogen resolves to
# exactly the same name. 'PRRS_Eu' previously mapped to a misspelled
# '...Syndrom', which produced a second, unrelated PRRS individual.
# NOTE(review): 'MaycoplasmaHyopneumoniae' looks like a misspelling of "Mycoplasma";
# it is left unchanged because it is used consistently and may match an existing
# individual name — confirm against the ontology before renaming.
# Pathogen codes missing from this mapping become NaN when the column is mapped.
_PRRS_NAME = 'PorcineReproductiveAndRespiratorySyndrome'
_APP_NAME = 'ActinobacillusPleuropneumoniae'
_MHYO_NAME = 'MaycoplasmaHyopneumoniae'

Pathogen_Mapping = {
    'PRRS_Antistoffen': _PRRS_NAME,
    'Influenza_A_Antistof': 'SwineInfluenza',
    'App_ApxIV': _APP_NAME,
    'pH1N1': 'H1N1',
    'Porcine_H3N2': 'H3N2',
    'App_Indirect': _APP_NAME,
    'PRRS_Am': _PRRS_NAME,
    'Porcine_H1N2': 'H1N2',
    'M_Hyopneumoniae': _MHYO_NAME,
    'M_Hyopneum_Antistof': _MHYO_NAME,
    'PRRS_Virus': _PRRS_NAME,
    'App_5': _APP_NAME,
    'H1N1 2012': 'H1N1',
    'App_2': _APP_NAME,
    'PRRS_Eu': _PRRS_NAME,
    'App_2_9': _APP_NAME,
}

# Translate each coded column through its lookup table.
# Series.map leaves NaN wherever a value has no entry in the table.
column_translations = (
    ('Pathogen', Pathogen_Mapping),
    ('SampleType', Sampletype_Mapping),
    ('EventType', EventType_Mapping),
    ('DiagnosticTest', DiagnosticTest_Mapping),
)
for column_name, translation in column_translations:
    results_df[column_name] = results_df[column_name].map(translation)



# Country, Breed and LabReference are the same for every row of this dataset.
constant_columns = {
    'Country': 'TheNetherlands',
    'Breed': 'Meat',
    "LabReference": "2",
}
for column_name, constant_value in constant_columns.items():
    results_df[column_name] = constant_value




In [None]:
# Display the results frame after renaming, translation and the constant columns.
results_df

In [110]:
#Group by relevant columns and concatenate results
#results_grouped = results_df.groupby(['SampleNumber', 'DateMonth', 'DateYear']).agg(
  #  Results=('SampleResult', lambda x: ';'.join(x.astype(str).unique()))).reset_index()
#results_grouped

# RDF Mapping Section

In [None]:
# Namespaces referenced while building the graph.
xsd = Namespace('http://www.w3.org/2001/XMLSchema#')
# Custom namespace for the ontology's classes and properties.
LHO = Namespace("https://www.purl.org/decide/LiveStockHealthOnto/LHO#")
SKOS = Namespace("http://www.w3.org/2004/02/skos/core#")
decide = Namespace("http://www.purl.org/decide#")
ncit = Namespace("http://purl.obolibrary.org/obo/NCIT_C25464")
agrovoc = Namespace("http://aims.fao.org/aos/agrovoc")

# Create the graph and register the xsd prefix before anything is loaded.
g = Graph()
g.bind('xsd', xsd)

# Load the existing RDF/XML file into the graph; new triples are added on top of it.
ontology_file = "output/RDFoutputPigSampleLab3.rdf"
g.parse(ontology_file, format="xml")

# Register the remaining prefixes after the file has been parsed.
g.bind('LHO', LHO)
g.bind('skos', SKOS)

In [None]:
# Base IRI under which every individual created in this cell is minted.
# NOTE(review): this uses the http:// scheme, while the LHO namespace used for classes
# and properties uses https://. It is kept unchanged so individual IRIs stay the same;
# confirm which scheme is intended.
LHO_INDIVIDUAL_BASE = "http://www.purl.org/decide/LiveStockHealthOnto/LHO#"


def _has_value(row, column):
    """Return True when ``column`` is present in ``row`` and holds a usable value.

    Missing values (NaN / None) and empty strings are rejected. Numeric zero is
    accepted, so a result of 0 is still written to the graph.
    """
    if column not in row:
        return False
    value = row[column]
    if isinstance(value, str):
        return value != ""
    return bool(pd.notna(value))


def _link_individual(graph, subject, predicate, local_name, rdf_class, comment=None):
    """Link ``subject`` to an individual named ``local_name`` and type that individual.

    Parameters:
        graph: graph that receives the triples.
        subject: IRI of the sample being described.
        predicate: property that connects the sample to the individual.
        local_name: value appended to LHO_INDIVIDUAL_BASE to form the individual's IRI.
        rdf_class: class asserted as the individual's rdf:type.
        comment: optional English rdfs:comment for the individual.

    Returns:
        The URIRef of the individual, so callers can attach further triples.
    """
    individual = URIRef(f"{LHO_INDIVIDUAL_BASE}{local_name}")
    graph.add((subject, predicate, individual))
    graph.add((individual, RDF.type, rdf_class))
    if comment is not None:
        graph.add((individual, RDFS.comment, Literal(comment, lang="en")))
    return individual


# (column, linking property, class of the individual, optional comment)
_SAMPLE_LINKS = [
    ("Pathogen", LHO.hasPathogen, decide.Pathogen, "An individual representing Pig pathogen."),
    ("Breed", LHO.hasBreed, LHO.Breed, "An individual representing a Piglet breed which is Meat."),
    ("DiagnosticTest", LHO.hasDiagnosticTest, LHO.DiagnosticTest, None),
    ("SampleType", LHO.hasSampleType, decide.SampleType, None),
    ("SampleResult", LHO.hasSampleResult, decide.SampleResult, None),
    ("LabReference", LHO.hasLabReference, LHO.LabReference, None),
    ("EventType", LHO.hasEventType, decide.EventType, None),
    ("Date", LHO.hasDate, LHO.Date, None),
]

for index, row in results_df.iterrows():

    # One sample individual per row, named after the row index.
    PigSample_uri = URIRef(f"{LHO_INDIVIDUAL_BASE}Lab5PigSample_{index}")
    g.add((PigSample_uri, RDF.type, LHO.PigSample))
    description = "An individual representing a sample from a Pig and Piglets."
    g.add((PigSample_uri, RDFS.comment, Literal(description, lang="en")))

    # Attributes that all follow the same "link + type (+ comment)" pattern.
    for column, predicate, rdf_class, comment in _SAMPLE_LINKS:
        if _has_value(row, column):
            _link_individual(g, PigSample_uri, predicate, row[column], rdf_class, comment)

    # Country additionally needs a percent-encoded IRI and a label.
    if _has_value(row, "Country"):
        country_name = row["Country"].strip()  # remove leading and trailing spaces
        Country_uri = _link_individual(
            g,
            PigSample_uri,
            LHO.hasCountry,
            urllib.parse.quote(country_name),  # encode so the value forms a valid IRI
            URIRef("http://purl.obolibrary.org/obo/NCIT_C25464"),
            "An individual representing different Countries.",
        )
        # Label the individual with the row's own country instead of a fixed name.
        g.add((Country_uri, RDFS.label, Literal(country_name)))

# Write the populated graph to disk as RDF/XML.
rdf_output_file = "output/RDFoutputPigSampleLab5.rdf"
g.serialize(destination=rdf_output_file, format="xml")


In [None]:
# Print the whole graph in Turtle syntax for a visual check.
turtle_dump = g.serialize(format="ttl")
print(turtle_dump)

In [114]:
# Select every pig sample together with the local names (the part after '#') of its
# linked individuals.
# - rdf: is declared explicitly so the query does not depend on the graph's default
#   prefix bindings.
# - The pathogen name is projected as ?PathogenName: SPARQL 1.1 does not allow
#   "AS ?Pathogen" when ?Pathogen is already bound in the WHERE clause.
query = """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX decide: <http://www.purl.org/decide#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX LHO: <https://www.purl.org/decide/LiveStockHealthOnto/LHO#>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>

SELECT
  (strafter(str(?Sample), "#") AS ?SampleName)
  (strafter(str(?Breed), "#") AS ?BreedName)
  (strafter(str(?SampleType), "#") AS ?SampleTypeName)
  (strafter(str(?LabReference), "#") AS ?LabReferenceName)
  (strafter(str(?DiagnosticTest), "#") AS ?DiagnosticTestName)
  (strafter(str(?Pathogen), "#") AS ?PathogenName)
  (strafter(str(?Country), "#") AS ?CountryName)
  (strafter(str(?SampleResult), "#") AS ?SampleResultName)

WHERE {
  ?Sample rdf:type LHO:PigSample .
  ?Sample LHO:hasBreed ?Breed .
  ?Sample LHO:hasSampleType ?SampleType .
  ?Sample LHO:hasLabReference ?LabReference .
  ?Sample LHO:hasDiagnosticTest ?DiagnosticTest .
  ?Sample LHO:hasPathogen ?Pathogen .
  ?Sample LHO:hasCountry ?Country .
  ?Sample LHO:hasSampleResult ?SampleResult .
  
}
"""

# Execute the query and retrieve the results
results = g.query(query)

# Convert the results to a Pandas dataframe; columns follow the SELECT order.
data = [list(row) for row in results]
# NOTE(review): this rebinds `df`, which earlier cells use for the raw lab data, so
# those cells cannot be re-run after this one without reloading the workbook.
df = pd.DataFrame(data, columns=["Sample",  "Breed", "SampleType", "LabReference", "DiagnosticTest","Pathogen", "Country", "SampleResult"])

# Display the first 50 rows
df.head(50)

Unnamed: 0,Sample,Breed,SampleType,LabReference,DiagnosticTest,Pathogen,Country,SampleResult
0,Lab5PigSample_429,Meat,Blood,2,ELISA,PorcineReproductiveAndRespiratorySyndrome,TheNetherlands,1.0
1,Lab5PigSample_585,Meat,Blood,2,ELISA,ActinobacillusPleuropneumoniae,TheNetherlands,1.0
2,Lab5PigSample_650,Meat,Blood,2,ELISA,PorcineReproductiveAndRespiratorySyndrome,TheNetherlands,1.0
3,Lab5PigSample_673,Meat,Blood,2,ELISA,SwineInfluenza,TheNetherlands,1.0
4,Lab5PigSample_812,Meat,Blood,2,ELISA,ActinobacillusPleuropneumoniae,TheNetherlands,1.0
5,Lab5PigSample_956,Meat,Blood,2,ELISA,PorcineReproductiveAndRespiratorySyndrome,TheNetherlands,1.0
6,Lab5PigSample_1245,Meat,Blood,2,ELISA,PorcineReproductiveAndRespiratorySyndrome,TheNetherlands,1.0
7,Lab5PigSample_1368,Meat,Blood,2,ELISA,PorcineReproductiveAndRespiratorySyndrome,TheNetherlands,1.0
8,Lab5PigSample_1394,Meat,Blood,2,ELISA,PorcineReproductiveAndRespiratorySyndrome,TheNetherlands,1.0
9,Lab5PigSample_1418,Meat,Blood,2,ELISA,ActinobacillusPleuropneumoniae,TheNetherlands,1.0
