In [1]:
using<-function(...) {
    libs<-unlist(list(...))
    req<-unlist(lapply(libs,require,character.only=TRUE))
    need<-libs[req==FALSE]
    if(length(need)>0){ 
        install.packages(need)
        lapply(need,require,character.only=TRUE)
    }
}

In [2]:
using("data.table", "tidyverse", "naniar", "stringr", "readr",  "dplyr", "magrittr", "readxl", "writexl", "sjmisc", "tidyr", "rdflib"
                  )

Loading required package: data.table

Loading required package: tidyverse

-- [1mAttaching core tidyverse packages[22m ------------------------ tidyverse 2.0.0 --
[32mv[39m [34mdplyr    [39m 1.1.0     [32mv[39m [34mreadr    [39m 2.1.4
[32mv[39m [34mforcats  [39m 1.0.0     [32mv[39m [34mstringr  [39m 1.5.0
[32mv[39m [34mggplot2  [39m 3.4.1     [32mv[39m [34mtibble   [39m 3.2.0
[32mv[39m [34mlubridate[39m 1.9.2     [32mv[39m [34mtidyr    [39m 1.3.0
[32mv[39m [34mpurrr    [39m 1.0.1     
-- [1mConflicts[22m ------------------------------------------ tidyverse_conflicts() --
[31mx[39m [34mdplyr[39m::[32mbetween()[39m     masks [34mdata.table[39m::between()
[31mx[39m [34mdplyr[39m::[32mfilter()[39m      masks [34mstats[39m::filter()
[31mx[39m [34mdplyr[39m::[32mfirst()[39m       masks [34mdata.table[39m::first()
[31mx[39m [34mlubridate[39m::[32mhour()[39m    masks [34mdata.table[39m::hour()
[31mx[39m [34mlubridate[3

In [3]:
##Function to apply SHA-256 hashing

sha256_hash <- function(data) {
  openssl::sha256(data)
}

In [4]:
## Data loading
barometer_dt_raw <- readxl::read_excel("Data/ARSIA/ARSIA_DECIDE_20221201.xlsx")




In [5]:
## Data manipulation

barometer_dt <- barometer_dt_raw %>% 
  dplyr::rename(
    Dossier = 'N° échantillon',
    Date = 'Date of Sample',
    Sample_type= 'Sample Type',
    Diagnostic_test = METH,
    Farm_ID = TRP,
    PM = P_multocida,
    MH = M_haemolytica,
    HS = H_somnus,
    MB = M_bovis,
    BRSV = BRSV,
    PI3 = PI3,
    BCV = Coronavirus
    ) %>% 
  tidyr::separate(ADDRESS, c('Postal_code', 'City')) %>%
  dplyr::mutate(
    Postal_code = as.double(Postal_code),
    Filenumber = str_sub(Dossier, 1, 12),
    Samplenumber = str_sub(Dossier, -3),
    Country ='Belgium',
    Lab_reference ='3', 
    Sample_type = case_when(
      Sample_type == "BAL" ~ 'BAL', 
      Sample_type == "SWAB" ~'Swab',
      Sample_type == "CARCASS" ~ 'Autopsy',
      TRUE ~ 'Missing'
    ),
    Breed = case_when(
      SPECUL == "MEAT" ~ 'Beef', 
      SPECUL == "MILK" ~'Dairy',
      SPECUL == "MXD" ~ 'Mixed',
      TRUE ~ 'Unknown'
    ),
    Province = case_when(
        between(as.numeric(Postal_code), 1000, 1299) ~ 'Brussels',
        between(as.numeric(Postal_code), 1300, 1499) ~ 'Walloon Brabant',
        between(as.numeric(Postal_code), 1500, 1999) ~ 'Flemish Brabant',
        between(as.numeric(Postal_code), 3000, 3499) ~ 'Antwerp',
        between(as.numeric(Postal_code), 2000, 2999) ~ 'Limburg',
        between(as.numeric(Postal_code), 3500, 3999) ~ 'Limburg',
        between(as.numeric(Postal_code), 4000, 4999) ~ 'Liège',
        between(as.numeric(Postal_code), 5000, 5999) ~ 'Namur',
        between(as.numeric(Postal_code), 6000, 6599) ~ 'Hainaut',
        between(as.numeric(Postal_code), 7000, 7999) ~ 'Hainaut',
        between(as.numeric(Postal_code), 6600, 6999) ~ 'Luxembourg',
        between(as.numeric(Postal_code), 8000, 8999) ~ 'West Flanders',
        TRUE ~ 'East Flanders'
    )
  
  )%>%
  dplyr::select(
    Filenumber,
    Diagnostic_test,
    Samplenumber,
    Country,
    Lab_reference,
    Sample_type,
    Breed,
    PM,
    MH,
    HS,
    MB,
    BRSV,
    PI3,
    BCV,
    Date,
    Postal_code,
    Province,
    Farm_ID
  ) %>%
  dplyr::distinct() %>%
  dplyr::mutate(
    Filenumber = sha256_hash(as.character(Filenumber)),
    Samplenumber = sha256_hash(as.character(Samplenumber)),
    Farm_ID = sha256_hash(as.character(Samplenumber))
  ) 

"[1m[22mExpected 2 pieces. Additional pieces discarded in 2840 rows [1, 2, 3, 4, 7, 8,
9, 10, 15, 16, 17, 18, 31, 33, 34, 37, 38, 43, 47, 48, ...]."


In [6]:
## Floor date to 1st of month

barometer_dt$Floored_date <- lubridate::floor_date(barometer_dt$Date, "month")

In [7]:
## Aggregate data based on farm_ID and month (WIDE)


barometer_groupby <- barometer_dt %>%
  group_by(Lab_reference, Country, Breed, Floored_date, Province, Farm_ID, Diagnostic_test, Sample_type) %>%
  summarise(across(c(PM, MH, HS, MB, BRSV, PI3, BCV), max))

[1m[22m`summarise()` has grouped output by 'Lab_reference', 'Country', 'Breed',
'Floored_date', 'Province', 'Farm_ID', 'Diagnostic_test'. You can override
using the `.groups` argument.


In [10]:
## Convert to LONG

barometer_long <- barometer_groupby %>%
  tidyr::pivot_longer(
    cols = c('PM', 'MH', 'HS', 'MB', 'BRSV', 'PI3', 'BCV'),
    names_to = 'Pathogen',
    values_to = 'Result',
  )
      

In [9]:
library(magrittr)
library(rdflib)
library(XML)

In [11]:
barometer_long

Lab_reference,Country,Breed,Floored_date,Province,Farm_ID,Diagnostic_test,Sample_type,Pathogen,Result
<chr>,<chr>,<chr>,<dttm>,<chr>,<hash>,<chr>,<chr>,<chr>,<dbl>
3,Belgium,Beef,2016-01-01,Luxembourg,31c259b5d294c74587b298985044c14f221da76cda3a6f12a26a717c74c5a38c,Culture,BAL,PM,0
3,Belgium,Beef,2016-01-01,Luxembourg,31c259b5d294c74587b298985044c14f221da76cda3a6f12a26a717c74c5a38c,Culture,BAL,MH,1
3,Belgium,Beef,2016-01-01,Luxembourg,31c259b5d294c74587b298985044c14f221da76cda3a6f12a26a717c74c5a38c,Culture,BAL,HS,0
3,Belgium,Beef,2016-01-01,Luxembourg,31c259b5d294c74587b298985044c14f221da76cda3a6f12a26a717c74c5a38c,Culture,BAL,MB,0
3,Belgium,Beef,2016-01-01,Luxembourg,31c259b5d294c74587b298985044c14f221da76cda3a6f12a26a717c74c5a38c,Culture,BAL,BRSV,
3,Belgium,Beef,2016-01-01,Luxembourg,31c259b5d294c74587b298985044c14f221da76cda3a6f12a26a717c74c5a38c,Culture,BAL,PI3,
3,Belgium,Beef,2016-01-01,Luxembourg,31c259b5d294c74587b298985044c14f221da76cda3a6f12a26a717c74c5a38c,Culture,BAL,BCV,
3,Belgium,Beef,2016-01-01,Luxembourg,31c259b5d294c74587b298985044c14f221da76cda3a6f12a26a717c74c5a38c,PCR,BAL,PM,1
3,Belgium,Beef,2016-01-01,Luxembourg,31c259b5d294c74587b298985044c14f221da76cda3a6f12a26a717c74c5a38c,PCR,BAL,MH,1
3,Belgium,Beef,2016-01-01,Luxembourg,31c259b5d294c74587b298985044c14f221da76cda3a6f12a26a717c74c5a38c,PCR,BAL,HS,1


In [14]:
# create an RDF object
Literal <- function(value, datatype="xsd:string") {
  rdf_literal(value, datatype=datatype)
}
rdf <- rdf()
 
# create a namespace prefix for the ontology
onto <- "http://www.purl.org/decide/LivestockHealthOnto"
 
# iterate through each row in the barometer DataFrame
for (i in 1:nrow(barometer_long)) {
  row <- barometer_long[i,] 
# create a URI for the CattleSample individual based on the row ID
CattleSample <- paste0(onto, "CattleSample", i)

# add triples to the RDF graph


  #rdf_add(rdf, CattleSample, paste0(prefix, "hasBreed"), as.character(row[3]), datatype="xsd:string")
  rdf_add(rdf, CattleSample, paste0(onto, "hasLabReference"), ifelse(is.na(row$Lab_reference), "", as.character(row$Lab_reference)))
  rdf_add(rdf, CattleSample, paste0(onto, "hasCountry"), ifelse(is.na(row$Country), "", as.character(row$Country)))
  rdf_add(rdf, CattleSample, paste0(onto, "hasBreed"), ifelse(is.na(row$Breed), "", as.character(row$Breed)), datatype="xsd:string")
  rdf_add(rdf, CattleSample, paste0(onto, "hasDate"), ifelse(is.na(row$Floored_date), "", as.character(row$Floored_date)), datatype="xsd:string")
  rdf_add(rdf, CattleSample, paste0(onto, "hasProvince"), ifelse(is.na(row$Province), "", as.character(row$Province)), datatype="xsd:string")
  rdf_add(rdf, CattleSample, paste0(onto, "hasFarmIdentification"), ifelse(is.na(row$Farm_ID), "", as.character(row$Farm_ID)), datatype="xsd:string")
  rdf_add(rdf, CattleSample, paste0(onto, "hasDiagnosticTest"), ifelse(is.na(row$Diagnostic_test), "", as.character(row$Diagnostic_test)), datatype="xsd:string")
  rdf_add(rdf, CattleSample, paste0(onto, "hasSampleType"), ifelse(is.na(row$Sample_type), "", as.character(row$Sample_type)), datatype="xsd:string")
  rdf_add(rdf, CattleSample, paste0(onto, "hasPathogen"), ifelse(is.na(row$Pathogen), "", as.character(row$Pathogen)), datatype="xsd:string")
  rdf_add(rdf, CattleSample, paste0(onto, "hasResult"), ifelse(is.na(row$Result), "Missing", as.character(row$Result)), datatype="xsd:string")

}
 
# print the RDF graph (for testing)
cat(rdf_serialize(rdf, format="turtle"))
 
# output the RDF graph to a file
## Save file to RDF (Long Version) instead of CSV
rdf_serialize(rdf, "output/RDFoutputCattleSampleArsia.ttl", format="turtle")

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

