In [261]:
import re

import pandas as pd

# Directory holding artist.csv, museum.csv, work.csv and subject.csv
# (a relative path, so it is resolved against the current working directory).
DATA_DIR = "./src/data"

In [262]:
class DataCollector:
    """Load the painting CSV tables and expose the lookups used to build the RDF graph.

    On construction the four tables (``artist``, ``museum``, ``work``,
    ``subject``) are read from ``DATA_DIR``, exact duplicate rows are dropped,
    and each table gains an ``identifier`` column: the row's label with every
    non-alphanumeric ASCII character removed (works are additionally prefixed
    with ``<work_id>_``).
    """

    def __init__(self, DATA_DIR):
        """Read ``artist.csv``, ``museum.csv``, ``work.csv`` and ``subject.csv``.

        Args:
            DATA_DIR: Directory path (string) that contains the four CSV files.
        """
        self.artist = pd.read_csv(DATA_DIR + "/artist.csv")
        self.museum = pd.read_csv(DATA_DIR + "/museum.csv")
        self.work = pd.read_csv(DATA_DIR + "/work.csv")
        self.subject = pd.read_csv(DATA_DIR + "/subject.csv")
        self.clean_data()
        self.set_identifiers()

    @staticmethod
    def _slug(value):
        """Return ``str(value)`` with every non-alphanumeric ASCII character removed."""
        return re.sub(r"[^a-zA-Z0-9]", "", str(value))

    @staticmethod
    def _first_identifier(table, key_column, key):
        """Return the ``identifier`` of the first row where ``key_column == key``, else None."""
        matches = table.loc[table[key_column] == key, "identifier"]
        return None if matches.empty else matches.iloc[0]

    def clean_data(self):
        """Drop exact duplicate rows from each of the four tables."""
        self.artist = self.artist.drop_duplicates()
        self.museum = self.museum.drop_duplicates()
        self.work = self.work.drop_duplicates()
        self.subject = self.subject.drop_duplicates()

    def set_identifiers(self):
        """Add the ``identifier`` column to every table.

        Plain list comprehensions are used instead of a row-wise
        ``DataFrame.apply`` so that an empty table simply gets an empty column.
        A missing label is stringified like any other value, so it yields the
        identifier ``"nan"``.
        """
        self.artist["identifier"] = [self._slug(v) for v in self.artist["full_name"]]
        self.museum["identifier"] = [self._slug(v) for v in self.museum["name"]]
        self.work["identifier"] = [
            str(work_id) + "_" + self._slug(name)
            for work_id, name in zip(self.work["work_id"], self.work["name"])
        ]
        self.subject["identifier"] = [self._slug(v) for v in self.subject["subject"]]

    def get_artists(self):
        """Return the artist table, de-duplicated, with a fresh 0..n-1 index."""
        return self.artist.drop_duplicates().reset_index(drop=True)

    def get_museums(self):
        """Return the museum table, de-duplicated, with a fresh 0..n-1 index."""
        return self.museum.drop_duplicates().reset_index(drop=True)

    def get_works(self):
        """Return the work table, de-duplicated, with a fresh 0..n-1 index."""
        return self.work.drop_duplicates().reset_index(drop=True)

    def get_work_subjects(self):
        """Return the subject table, de-duplicated, with a fresh 0..n-1 index."""
        return self.subject.drop_duplicates().reset_index(drop=True)

    def get_countries(self):
        """Return the distinct non-missing values of ``museum["country"]`` as a list."""
        return self.museum["country"].dropna().unique().tolist()

    def get_artist_styles(self):
        """Return the distinct non-missing values of ``artist["style"]`` as a list."""
        return self.artist["style"].dropna().unique().tolist()

    def get_artist_nationalities(self):
        """Return the distinct non-missing values of ``artist["nationality"]`` as a list."""
        return self.artist["nationality"].dropna().unique().tolist()

    def get_subjects(self):
        """Return the distinct non-missing values of ``subject["subject"]`` as a list."""
        return self.subject["subject"].dropna().unique().tolist()

    def get_work_styles(self):
        """Return the distinct non-missing values of ``work["style"]`` as a list."""
        return self.work["style"].dropna().unique().tolist()

    def get_artist_by_id(self, id):
        """Return the identifier of the artist whose ``artist_id`` equals ``id``, or None."""
        return self._first_identifier(self.artist, "artist_id", id)

    def get_museum_by_id(self, id):
        """Return the identifier of the museum whose ``museum_id`` equals ``id``, or None."""
        return self._first_identifier(self.museum, "museum_id", id)

    def get_subject_by_id(self, id):
        """Return the identifier of the first subject row whose ``work_id`` equals ``id``, or None.

        Only the first matching row is returned, even if the subject table
        holds several rows for the same work.
        """
        return self._first_identifier(self.subject, "work_id", id)

    def get_style_by_id(self, id):
        """Return the slugged style of the work whose ``work_id`` equals ``id``.

        Returns None when no work has that id or when its style is missing,
        matching the behaviour of the other ``get_*_by_id`` lookups instead of
        raising IndexError / TypeError.
        """
        matches = self.work.loc[self.work["work_id"] == id, "style"]
        if matches.empty or pd.isna(matches.iloc[0]):
            return None
        return self._slug(matches.iloc[0])

In [263]:
# Load and clean the four CSV tables once; the add_*_to_model functions below
# read this module-level `collector`.
collector = DataCollector(DATA_DIR)

In [264]:
# De-duplicated, re-indexed copies of each table, one global per table.
artists = collector.get_artists()
museums = collector.get_museums()
works = collector.get_works()
subjects = collector.get_work_subjects()

In [265]:
from rdflib import Graph, URIRef, Literal, Namespace
from rdflib.namespace import RDF, XSD

In [266]:
# Create the model (RDF graph)
model = Graph()

# Define the namespaces
base = Namespace("http://www.semanticweb.org/ericarfs/ontologies/2024/10/famouspaintings#")
xsd  = Namespace("http://www.w3.org/2001/XMLSchema#")
fam  = Namespace("http://example.org/family#")
foaf = Namespace("http://xmlns.com/foaf/0.1/")
rdfs = Namespace("http://www.w3.org/2000/01/rdf-schema#")
rdf  = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")

# Bind the namespaces to the graph (optional, but useful for a readable serialization).
# The empty prefix makes `base` the default namespace.
for _prefix, _namespace in (
    ("", base),
    ("xsd", xsd),
    ("fam", fam),
    ("foaf", foaf),
    ("rdf", rdf),
    ("rdfs", rdfs),
):
    model.bind(_prefix, _namespace)

In [267]:
def add_countries_to_model():
    """Add one ``base.Country`` individual per distinct museum country.

    Labels come from the global ``collector``; triples go into the global
    ``model``. The URI fragment is the label with all non-alphanumeric
    characters removed, and the original label is kept as ``foaf.name``.
    """
    for label in collector.get_countries():
        fragment = re.sub(r"[^a-zA-Z0-9]", "", label)
        node = URIRef(base + fragment)
        for triple in (
            (node, RDF.type, base.Country),
            (node, foaf.name, Literal(label)),
        ):
            model.add(triple)

In [268]:
def add_styles_to_model():
    """Add one ``base.Style`` individual per distinct artist or work style.

    The artist and work style lists from the global ``collector`` are merged
    into a set so a style shared by both is added once. Each style gets an
    ``RDF.type`` triple and a ``foaf.name`` literal in the global ``model``.
    """
    unique_styles = set(collector.get_artist_styles() + collector.get_work_styles())

    for label in unique_styles:
        fragment = re.sub(r"[^a-zA-Z0-9]", "", label)
        node = URIRef(base + fragment)
        for triple in (
            (node, RDF.type, base.Style),
            (node, foaf.name, Literal(label)),
        ):
            model.add(triple)

In [270]:
def add_subjects_to_model():
    """Add one ``base.Subject`` individual per distinct subject label.

    Labels come from the global ``collector``; triples go into the global
    ``model``. The URI fragment is the label with all non-alphanumeric
    characters removed, and the original label is kept as ``foaf.name``.
    """
    for label in collector.get_subjects():
        fragment = re.sub(r"[^a-zA-Z0-9]", "", label)
        node = URIRef(base + fragment)
        for triple in (
            (node, RDF.type, base.Subject),
            (node, foaf.name, Literal(label)),
        ):
            model.add(triple)

In [271]:
def add_nationalities_to_model():
    """Add one ``base.Nationality`` individual per distinct artist nationality.

    Labels come from the global ``collector``; triples go into the global
    ``model``. The URI fragment is the label with all non-alphanumeric
    characters removed, and the original label is kept as ``foaf.name``.
    """
    for label in collector.get_artist_nationalities():
        fragment = re.sub(r"[^a-zA-Z0-9]", "", label)
        node = URIRef(base + fragment)
        for triple in (
            (node, RDF.type, base.Nationality),
            (node, foaf.name, Literal(label)),
        ):
            model.add(triple)

In [272]:
def add_museums_to_model(data):
    """Add one ``base.Museum`` individual per row of ``data`` to the global ``model``.

    Args:
        data: DataFrame with the columns ``identifier``, ``name``, ``country``,
            ``state``, ``city``, ``address``, ``postal``, ``phone`` and ``url``.

    Every museum gets an ``RDF.type`` triple and a ``foaf.name`` literal. A
    non-missing country becomes a ``base.hasCountry`` link to the URI built
    from the country label with non-alphanumeric characters removed. Each of
    the remaining columns is added as a string literal under the ``base``
    property of the same name, but only when that column itself is present.
    """
    # Columns copied verbatim into literals; the column name is also the property name.
    literal_columns = ("state", "city", "address", "postal", "phone", "url")

    for i in range(len(data)):
        # Materialise the row once instead of once per field.
        row = data.iloc[i]

        museum = URIRef(base + row["identifier"])
        model.add((museum, RDF.type, base.Museum))
        model.add((museum, foaf.name, Literal(str(row["name"]))))

        if pd.notna(row["country"]):
            country_name = re.sub(r"[^a-zA-Z0-9]", "", str(row["country"]))
            model.add((museum, base.hasCountry, URIRef(base + country_name)))

        for column in literal_columns:
            value = row[column]
            # Each field is guarded by its own value (the url used to be
            # guarded by the city, which emitted "nan" urls and dropped real ones).
            if pd.notna(value):
                model.add((museum, getattr(base, column), Literal(str(value))))

In [273]:
def add_artists_to_model(data):
    """Add one ``base.Artist`` individual per row of ``data`` to the global ``model``.

    Args:
        data: DataFrame with the columns ``identifier``, ``full_name``,
            ``first_name``, ``last_name``, ``birth``, ``death``,
            ``nationality`` and ``style``.

    Every artist gets an ``RDF.type`` triple and a ``foaf.name`` literal. The
    name/birth/death columns become string literals and the nationality/style
    columns become links to the URI built from their alphanumeric-only form.
    A value is treated as missing when its string form is ``"nan"``.
    """
    # (predicate, column) pairs emitted as string literals, in output order.
    literal_fields = (
        (foaf.firstName, "first_name"),
        (foaf.lastName, "last_name"),
        (base.birth, "birth"),
        (base.death, "death"),
    )
    # (predicate, column) pairs emitted as links to another individual.
    link_fields = (
        (base.hasNationality, "nationality"),
        (base.hasStyle, "style"),
    )

    for position in range(len(data)):
        row = data.iloc[position]

        node = URIRef(base + row["identifier"])
        model.add((node, RDF.type, base.Artist))
        model.add((node, foaf.name, Literal(str(row["full_name"]))))

        for predicate, column in literal_fields:
            text = str(row[column])
            if text == "nan":
                continue
            model.add((node, predicate, Literal(text)))

        for predicate, column in link_fields:
            fragment = re.sub(r"[^a-zA-Z0-9]", "", str(row[column]))
            if fragment == "nan":
                continue
            model.add((node, predicate, URIRef(base + fragment)))

In [274]:
def add_works_to_model(data):
    """Add one ``base.Work`` individual per row of ``data`` to the global ``model``.

    Args:
        data: DataFrame with the columns ``identifier``, ``name``,
            ``artist_id``, ``style``, ``work_id`` and ``museum_id``.

    Every work gets an ``RDF.type`` triple and a ``foaf.name`` literal. Links
    are added only when they can be resolved:

    * ``base.wasPaintedBy``: identifier returned by ``collector.get_artist_by_id``.
    * ``base.hasStyle``: the style label with non-alphanumeric characters removed.
    * ``base.hasSubject``: the single identifier returned by
      ``collector.get_subject_by_id`` for the work.
    * ``base.isLocatedAt``: identifier returned by ``collector.get_museum_by_id``.
    """
    for i in range(len(data)):
        # Materialise the row once instead of once per field.
        row = data.iloc[i]

        work = URIRef(base + row["identifier"])
        model.add((work, RDF.type, base.Work))
        model.add((work, foaf.name, Literal(str(row["name"]))))

        artist = collector.get_artist_by_id(row["artist_id"])
        if artist is not None:
            model.add((work, base.wasPaintedBy, URIRef(base + artist)))

        if pd.notna(row["style"]):
            style = re.sub(r"[^a-zA-Z0-9]", "", str(row["style"]))
            model.add((work, base.hasStyle, URIRef(base + style)))

        subject = collector.get_subject_by_id(row["work_id"])
        if subject is not None:
            # NOTE(review): was `base.haSubject`; renamed to follow the other
            # has* properties. Confirm the property name against the ontology.
            model.add((work, base.hasSubject, URIRef(base + subject)))

        museum_id = row["museum_id"]
        # A missing id is a float NaN, which never equals the string "nan",
        # so it has to be detected with pd.notna.
        if pd.notna(museum_id):
            work_museum = collector.get_museum_by_id(museum_id)
            if work_museum is not None:
                model.add((work, base.isLocatedAt, URIRef(base + work_museum)))
        

In [275]:
# Add one Nationality individual per distinct artist nationality to `model`.
add_nationalities_to_model()

In [276]:
# Add one Subject individual per distinct subject label to `model`.
add_subjects_to_model()

In [277]:
# Add one Country individual per distinct museum country to `model`.
add_countries_to_model()

In [278]:
# Add one Style individual per distinct artist or work style to `model`.
add_styles_to_model()

In [279]:
# Add every work; its artist, subject and museum links are resolved through
# `collector`, not through individuals already present in `model`.
add_works_to_model(works)

In [280]:
# Add every museum together with its country link and address literals.
add_museums_to_model(museums)

In [282]:
# Add every artist together with its name literals and nationality/style links.
add_artists_to_model(artists)

In [284]:
# Serialize the graph to a Turtle file.
# rdflib writes the destination itself as UTF-8, so the result does not depend
# on the platform's default text encoding or on whether serialize() hands back
# str or bytes. The trailing semicolon keeps the notebook from echoing the
# call's return value.
output_file = "output.ttl"  # Name of the output file
model.serialize(destination=output_file, format="turtle", encoding="utf-8");