In [40]:
from bs4 import BeautifulSoup, PageElement
from pathlib import Path
import os
import json

# Directory containing the saved Civilopedia HTML pages. process_files() scans
# it for civilization_*.html and leader_*.html files.
# The path is relative, so it is resolved against the working directory the
# notebook is run from.
folder_path = Path("../data/civilopedia/www.civilopedia.net/gathering-storm/civilizations/")

In [41]:
def parse_civilization(html_content: str) -> dict:
    """Extract a civilization's name and location from one Civilopedia page.

    Args:
        html_content: Full HTML source of a single civilization page.

    Returns:
        A dict with three keys: ``"type"`` (always ``"civilization"``),
        ``"name"`` (text of the page-header div) and ``"location"`` (text of
        the node that follows the block holding the "Location" label).

    Raises:
        ValueError: If the page header, the "Location" label or the node
            carrying the location value cannot be found.
    """
    soup = BeautifulSoup(html_content, "html.parser")

    # NOTE(review): the hashed suffix of this class name looks build-generated,
    # so it may change if the pages are scraped again -- confirm.
    header = soup.find("div", class_="App_pageHeaderText__SsfWm")
    if header is None:
        raise ValueError(
            "civilization page has no 'App_pageHeaderText__SsfWm' header div"
        )

    location_label = soup.find(string="Location")
    if location_label is None:
        raise ValueError("civilization page has no 'Location' label")

    # The value is not inside the label's own element: it is the node that
    # directly follows the label's grandparent. `next_sibling` is the
    # supported spelling of the deprecated BS3 alias `nextSibling`.
    location_node = location_label.parent.parent.next_sibling
    if location_node is None:
        raise ValueError("nothing follows the 'Location' label block")

    return {
        "type": "civilization",
        "name": header.text,
        "location": location_node.text,
    }
    
# Smoke-test the civilization parser on one saved page and display the record.
spain_page = folder_path / "civilization_spain.html"
civ_ex_data = parse_civilization(spain_page.read_text(encoding="utf-8"))

civ_ex_data

{'type': 'civilization', 'name': 'Spain', 'location': 'Europe'}

In [42]:
def parse_leader(html_content: str) -> dict:
    """Extract a leader's name and the civilization they rule from one page.

    Args:
        html_content: Full HTML source of a single leader page.

    Returns:
        A dict with three keys: ``"type"`` (always ``"leader"``), ``"name"``
        (text of the page-header div) and ``"ruler_of"`` (text of the element
        following the "Civilizations" label inside the "Traits" block).

    Raises:
        ValueError: If the page header, the "Traits" label, the
            "Civilizations" label or the element after it cannot be found.
    """
    soup = BeautifulSoup(html_content, "html.parser")

    header = soup.find("div", class_="App_pageHeaderText__SsfWm")
    if header is None:
        raise ValueError(
            "leader page has no 'App_pageHeaderText__SsfWm' header div"
        )

    traits_label = soup.find(string="Traits")
    if traits_label is None:
        raise ValueError("leader page has no 'Traits' label")

    # Restrict the search to the label's grandparent so that only a
    # "Civilizations" entry inside the Traits block is picked up.
    traits_block = traits_label.parent.parent
    civilizations_label = traits_block.find(string="Civilizations")
    if civilizations_label is None:
        raise ValueError("'Traits' block has no 'Civilizations' label")

    # The civilization name sits in the element right after the label's
    # grandparent.
    civilization_node = civilizations_label.parent.parent.find_next_sibling()
    if civilization_node is None:
        raise ValueError("no element follows the 'Civilizations' label block")

    return {
        "type": "leader",
        "name": header.text,
        "ruler_of": civilization_node.text,
    }
    
# Smoke-test the leader parser on one saved page and display the record.
suleiman_page = folder_path / "leader_suleiman.html"
leader_ex_data = parse_leader(suleiman_page.read_text(encoding="utf-8"))

leader_ex_data

{'type': 'leader',
 'name': 'Suleiman I, The Magnificent',
 'ruler_of': 'Ottomans'}

In [43]:
# JSON file that process_files() writes the combined civilization/leader
# records to, and that pd.read_json() loads again further down.
# The path is relative, so the file lands in the current working directory.
parsed_data_filepath = "parsed_data.json"


def process_files(folder_path: Path, output_path=None):
    """Parse every civilization/leader page in a folder and dump them to JSON.

    Each ``.html`` file is dispatched on its file-name prefix:
    ``civilization_`` goes to ``parse_civilization`` and ``leader_`` goes to
    ``parse_leader``. Any other ``.html`` file is reported on stdout and
    skipped without being read. Files without the ``.html`` suffix are
    ignored silently.

    Args:
        folder_path: Directory to scan (not recursive).
        output_path: Where to write the JSON list of records. Defaults to the
            module-level ``parsed_data_filepath``.

    Returns:
        None. The records are written to ``output_path`` as UTF-8 JSON with
        non-ASCII characters kept as-is and a 4-space indent.
    """
    if output_path is None:
        output_path = parsed_data_filepath
    folder_path = Path(folder_path)

    # File-name prefix -> parser for that page type.
    parsers = {
        "civilization_": parse_civilization,
        "leader_": parse_leader,
    }

    parsed_data = []

    # os.listdir() returns entries in arbitrary order; sorting keeps the JSON
    # output and the printed messages identical from run to run.
    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.endswith(".html"):
            continue

        parser = next(
            (
                parse
                for prefix, parse in parsers.items()
                if file_name.startswith(prefix)
            ),
            None,
        )
        if parser is None:
            print(f"Unknown file type: {file_name}")
            continue

        html_content = (folder_path / file_name).read_text(encoding="utf-8")
        parsed_data.append(parser(html_content))

    with open(output_path, "w", encoding="utf-8") as json_file:
        json.dump(
            parsed_data,
            json_file,
            ensure_ascii=False,
            indent=4,
        )

    print(f"Parsed data saved to {output_path}")


# Parse every civilization/leader page under folder_path and write the
# combined records to parsed_data_filepath.
process_files(folder_path)

Unknown file type: civilizations_intro.html
Unknown file type: leaders_intro.html
Parsed data saved to parsed_data.json


In [44]:
from rdflib import Graph, Namespace, Literal, RDF, RDFS

# Load the existing ontology into an in-memory rdflib graph. Later cells add
# the parsed civilizations and leaders to `g` and serialize it again.
g = Graph()
ontology_file = "../data/raw/civ6-ontology.owl"
# format="xml" selects rdflib's RDF/XML parser.
g.parse(ontology_file, format="xml")

<Graph identifier=N511da3a5061b4471b0377a782bd98d70 (<class 'rdflib.graph.Graph'>)>

In [45]:
import pandas as pd
# Base namespace used both for the ontology terms below and for the IRIs of
# the individuals created in later cells.
ex = Namespace('http://webprotege.stanford.edu/')
# Opaque term identifiers. Later cells use Leader and Civilization as the
# object of rdf:type and RuledBy as a predicate (civilization -> leader).
# NOTE(review): presumably these IDs match terms defined in the loaded
# ontology file -- confirm against civ6-ontology.owl.
Leader = ex.RBtVYtTDmwD9yyR2DUW0xm3
Civilization = ex.RHLVvXdWi6EA2dvTpWq8lH
RuledBy = ex.RB9IPkL7AbK2eV2GytXo19K

# Records written by process_files(): one row per civilization or leader,
# distinguished by the "type" column.
df = pd.read_json(parsed_data_filepath)

# Leftover sketch for adding a single individual; the same steps are done for
# every record in later cells. `person_class` is not defined in the cells
# shown here, so this would not run as written.
# new_individual = ex[civ_name]
# g.add((new_individual, RDF.type, person_class))
# g.add((new_individual, RDFS.label, Literal(civ_name)))
# updated_file = "updated_ontology_with_label.owl"
# g.serialize(destination=updated_file, format="xml")
# print(f"Updated ontology saved to {updated_file}")

In [46]:
# Add one Civilization individual to the graph per parsed civilization record.
civilizations = df[df.type == 'civilization']
for civ_label in civilizations['name']:
    # Spaces are replaced only for the IRI fragment of the individual.
    civ_name = civ_label.replace(' ', '_')
    ind = ex[civ_name]
    g.add((ind, RDF.type, Civilization))
    # rdfs:label is the human-readable name, so it keeps the original
    # spelling rather than the underscore form used in the IRI.
    g.add((ind, RDFS.label, Literal(civ_label)))

In [47]:
# Add one Leader individual per parsed leader record and link each matching
# civilization to it through RuledBy.
leaders = df[df.type == 'leader']
unmatched_leaders = []
for _, leader in leaders.iterrows():
    leader_label = leader['name']
    # Spaces are replaced only for the IRI fragment of the individual.
    leader_name = leader_label.replace(' ', '_')
    leader_ind = ex[leader_name]
    g.add((leader_ind, RDF.type, Leader))
    # rdfs:label is the human-readable name, so it keeps the original
    # spelling rather than the underscore form used in the IRI.
    g.add((leader_ind, RDFS.label, Literal(leader_label)))

    # The join is an exact string match between the leader's "ruler_of" text
    # and a civilization's "name" text.
    ruled_civs = civilizations.loc[
        civilizations['name'] == leader['ruler_of'], 'name'
    ]
    if ruled_civs.empty:
        # Record the miss instead of silently leaving the leader unlinked.
        unmatched_leaders.append((leader_label, leader['ruler_of']))
        continue

    for civ_label in ruled_civs:
        civ_name = civ_label.replace(' ', '_')
        civ_ind = ex[civ_name]
        g.add((civ_ind, RuledBy, leader_ind))

if unmatched_leaders:
    print(f"Leaders with no matching civilization: {unmatched_leaders}")
    

In [48]:
# Write the graph (original ontology plus the individuals added above) to a
# new file, leaving the source ontology file untouched.
updated_owl_file = "updated_ontology_with_label.owl"
# format="xml" selects rdflib's RDF/XML serializer.
g.serialize(destination=updated_owl_file, format="xml")
print(f"Updated ontology saved to {updated_owl_file}")

Updated ontology saved to updated_ontology_with_label.owl
