In [28]:
import pandas as pd
from rdflib import Graph, Literal, RDF, Namespace, URIRef
from rdflib.namespace import XSD
from datetime import datetime
from geopy.geocoders import Nominatim
from decimal import Decimal, InvalidOperation

In [5]:
# Input CSV files for the two sighting datasets; both are relative paths and
# are passed to get_rdfs() by the pipeline call cell.
path_us = 'data/ufo_sightings_us.csv'
path_br = 'data/ufo_sightings_br.csv'

In [35]:
# Run the whole conversion pipeline. NOTE(review): get_rdfs and the helpers it
# calls are defined in cells that appear *after* this one, so those cells must
# have been executed first for this call to work on a fresh kernel.
get_rdfs(path_us, path_br)

In [16]:
def get_rdfs(path_us, path_br):
    """Convert both sighting CSVs to RDF and write three Turtle files.

    The two per-dataset graphs are built with shared registries so that
    location/shape nodes and event numbering stay unique across datasets.
    Each graph is written to its own file, then their union is written to
    ``ufo_sightings.rdf``. All files are serialized as Turtle even though
    the destinations use an ``.rdf`` extension.

    Parameters
    ----------
    path_us : path of the CSV handed to ``process_rdf_us``.
    path_br : path of the CSV handed to ``process_rdf_br``.

    Returns
    -------
    None
    """
    # State shared by both converters: node registries plus a list whose
    # length serves as the running sighting-event counter.
    location_registry, shape_registry, event_counter = {}, {}, []

    us_graph = process_rdf_us(path_us, location_registry, shape_registry, event_counter)
    br_graph = process_rdf_br(path_br, location_registry, shape_registry, event_counter)

    per_dataset_outputs = (
        (us_graph, 'ufo_sightings_us.rdf'),
        (br_graph, 'ufo_sightings_br.rdf'),
    )
    for graph, destination in per_dataset_outputs:
        graph.serialize(destination=destination, format='turtle')

    # Graph addition yields a new graph holding the triples of both operands.
    combined_graph = us_graph + br_graph
    combined_graph.serialize(destination='ufo_sightings.rdf', format='turtle')

In [15]:
def process_rdf_us(path, unique_locations, unique_shapes, num_seen):
    """Build an RDF graph of sighting events from the US sightings CSV.

    Rows containing any missing value are dropped and only the first 50
    remaining rows are converted. For every row a ``p1:SightingEvent`` node
    is created with optional date/time/duration, a location node, a shape
    node and a description.

    Parameters
    ----------
    path : path of a CSV file providing the columns ``date_time``,
        ``encounter_length``, ``latitude``, ``longitude``, ``ufo_shape``
        and ``description``.
    unique_locations : dict mapping ``(latitude, longitude)`` to the location
        node already created for it. Mutated: new locations are registered.
    unique_shapes : dict mapping a shape name to its node. Mutated likewise.
    num_seen : list whose length is used as the running event number; one
        element is appended per processed row.

    Returns
    -------
    rdflib.Graph
        The graph holding the triples created for this dataset.
    """
    df = pd.read_csv(path).dropna().iloc[:50]

    g = Graph()

    P1 = Namespace("https://idmc.univ-lorraine.fr/sw/p1#")
    EX = Namespace("https://idmc.univ-lorraine.fr/sw/ex#")

    g.bind("p1", P1)
    g.bind("ex", EX)

    for _, row in df.iterrows():
        # The shared counter keeps event identifiers unique across datasets.
        sighting_event = EX[f"SightingEvent{len(num_seen)}"]
        num_seen.append([])

        # Expected shape is "<date> <time>". A value with a missing or extra
        # token must not abort the whole conversion: whatever cannot be
        # parsed is simply left out, like any other unparseable date/time.
        parts = row['date_time'].split()
        date_str = parts[0] if parts else ""
        time_str = " ".join(parts[1:])
        iso_date = convert_date_to_iso(date_str)
        iso_time = convert_time_to_iso(time_str)

        g.add((sighting_event, RDF.type, P1.SightingEvent))
        if iso_date:
            g.add((sighting_event, P1.date, Literal(iso_date, datatype=XSD.date)))
        if iso_time:
            g.add((sighting_event, P1.time, Literal(iso_time, datatype=XSD.time)))
        if not pd.isna(row['encounter_length']):
            # The length is written as a whole number of seconds.
            g.add((sighting_event, P1.duration, Literal(f"PT{int(row['encounter_length'])}S", datatype=XSD.duration)))

        # Reuse the node of a coordinate pair that was already registered.
        location_key = (row['latitude'], row['longitude'])
        if location_key not in unique_locations:
            location = EX[f"Location{len(unique_locations)}"]
            unique_locations[location_key] = location
            g.add((location, RDF.type, P1.Location))
            g.add((location, P1.latitude, Literal(row['latitude'], datatype=XSD.decimal)))
            g.add((location, P1.longitude, Literal(row['longitude'], datatype=XSD.decimal)))
        else:
            location = unique_locations[location_key]
        g.add((sighting_event, P1.location, location))

        # Same registry pattern for shapes, keyed by the shape name.
        shape_key = row['ufo_shape']
        if shape_key not in unique_shapes:
            shape = EX[f"Shape{len(unique_shapes)}"]
            unique_shapes[shape_key] = shape
            g.add((shape, RDF.type, P1.Shape))
            g.add((shape, P1.name, Literal(row['ufo_shape'], datatype=XSD.string)))
        else:
            shape = unique_shapes[shape_key]
        g.add((sighting_event, P1.shape, shape))

        g.add((sighting_event, P1.description, Literal(row['description'], datatype=XSD.string)))

    return g

In [34]:
def process_rdf_br(path, unique_locations, unique_shapes, num_seen):
    """Build an RDF graph of sighting events from the second sightings CSV.

    Rows containing any missing value are dropped and only the first 50
    remaining rows are converted. Each row's town is geocoded through
    Nominatim (with ", United Kingdom" appended to the query) to obtain the
    coordinates of its location node; rows whose town cannot be resolved get
    no location triple.

    Parameters
    ----------
    path : path of a CSV file providing the columns ``Date``, ``Time``,
        ``Town/Village`` and ``Description``.
    unique_locations : dict mapping ``(latitude, longitude)`` to the location
        node already created for it. Mutated: new locations are registered.
    unique_shapes : accepted for signature parity with ``process_rdf_us``;
        not used here.
    num_seen : list whose length is used as the running event number; one
        element is appended per processed row.

    Returns
    -------
    rdflib.Graph
        The graph holding the triples created for this dataset.
    """
    df = pd.read_csv(path).dropna().iloc[:50]

    g = Graph()

    P1 = Namespace("https://idmc.univ-lorraine.fr/sw/p1#")
    EX = Namespace("https://idmc.univ-lorraine.fr/sw/ex#")

    g.bind("p1", P1)
    g.bind("ex", EX)

    # One geocoding client for the whole run instead of one per row.
    geolocator = Nominatim(user_agent="Mozilla/5.0")
    # town -> (latitude, longitude), or None when the lookup found nothing.
    # Caching avoids a network round trip for every repeated town.
    geocoded_towns = {}

    for _, row in df.iterrows():
        # The shared counter keeps event identifiers unique across datasets.
        sighting_event = EX[f"SightingEvent{len(num_seen)}"]
        num_seen.append([])

        iso_date = convert_date_to_iso(row['Date'])
        iso_time = convert_time_to_iso(row['Time'])

        g.add((sighting_event, RDF.type, P1.SightingEvent))
        if iso_date:
            g.add((sighting_event, P1.date, Literal(iso_date, datatype=XSD.date)))
        if iso_time:
            g.add((sighting_event, P1.time, Literal(iso_time, datatype=XSD.time)))

        town = row['Town/Village']
        if town not in geocoded_towns:
            match = geolocator.geocode(town + ', United Kingdom')
            geocoded_towns[town] = (match.latitude, match.longitude) if match else None
        location_key = geocoded_towns[town]

        if location_key:
            location_node = unique_locations.get(location_key)
            if location_node is None:
                location_node = EX[f"Location{len(unique_locations)}"]
                unique_locations[location_key] = location_node
                g.add((location_node, RDF.type, P1.Location))
                # Decimal(float) would expand to the float's exact binary
                # value (dozens of spurious digits); going through str()
                # keeps the short decimal form of the coordinate.
                g.add((location_node, P1.latitude, Literal(Decimal(str(location_key[0])), datatype=XSD.decimal)))
                g.add((location_node, P1.longitude, Literal(Decimal(str(location_key[1])), datatype=XSD.decimal)))
            g.add((sighting_event, P1.location, location_node))

        g.add((sighting_event, P1.description, Literal(row['Description'], datatype=XSD.string)))

    return g

In [9]:
def convert_date_to_iso(date_str):
    """Convert a date string to ISO 8601 (``YYYY-MM-DD``).

    The formats ``%m/%d/%Y`` (e.g. ``10/25/1999``) and ``%d-%b-%y``
    (e.g. ``25-Oct-99``) are tried in that order.

    Parameters
    ----------
    date_str : the date text to parse; surrounding whitespace is ignored.

    Returns
    -------
    str or None
        The ISO date, or None when the input is not a string or matches
        neither format.
    """
    # Missing values (None, NaN, ...) are reported as "no date" rather than
    # raising TypeError from strptime.
    if not isinstance(date_str, str):
        return None

    cleaned = date_str.strip()
    # NOTE(review): with %y, strptime maps 69-99 to 1969-1999 and 00-68 to
    # 2000-2068, so older two-digit years land in the 2000s - confirm this
    # suits the data.
    for date_format in ("%m/%d/%Y", "%d-%b-%y"):
        try:
            return datetime.strptime(cleaned, date_format).date().isoformat()
        except ValueError:
            continue
    return None

def convert_time_to_iso(time_str):
    """Convert an ``HH:MM`` time string to ISO 8601 (``HH:MM:SS``).

    Parameters
    ----------
    time_str : the time text to parse; surrounding whitespace is ignored.

    Returns
    -------
    str or None
        The ISO time, or None when the input is not a string or does not
        match ``%H:%M``.
    """
    # Missing values (None, NaN, ...) are reported as "no time" rather than
    # raising TypeError from strptime.
    if not isinstance(time_str, str):
        return None
    try:
        return datetime.strptime(time_str.strip(), "%H:%M").time().isoformat()
    except ValueError:
        return None