In [5]:
import os
from rdflib import ConjunctiveGraph, term
import csv

def process_uri(uri):
  uri_str = str(uri)
  if uri_str.startswith("<") and uri_str.endswith(">"):
    uri_str = uri_str[1:-1]
  if "http" in uri_str:
    parts = uri_str.split("http")
    if len(parts) > 2:
      uri_str = "http" + parts[-1]
  if uri_str.endswith("/body/body"):
    uri_str = uri_str.replace("/body/body", "/body")
  return term.URIRef(uri_str)

directory = os.path.join(os.pardir, os.pardir, "output post disamiguation")

print(f"Number of files in directory: {len(os.listdir(directory))}")

rows = []

for filename in os.listdir(directory):
  # Get all events
  if filename.endswith(".trig") and "event" in filename:
    file_path = os.path.join(directory, filename)
    g = ConjunctiveGraph()
    try:
      row = {
        "who": filename
      }
      # Parse the TRiG
      g.parse(file_path, format="trig")
      
      # Iterate over the triples
      for s, p, o in g:
        if str(p) == "http://www.cidoc-crm.org/cidoc-crm/P160_has_temporal_projection":
          row["when"] = process_uri(o)
        if str(p) == "http://www.cidoc-crm.org/cidoc-crm/P161_has_spatial_projection":
          row["where"] = process_uri(o)
        
      
      rows.append(row)

    except Exception as e:
        print(f"Error parsing {file_path}: {e}")
        

csv_file_path = os.path.join(os.curdir, "events.csv")

with open(csv_file_path, mode='w', newline='') as csv_file:
  fieldnames = ["when", "where", "who"]
  writer = csv.DictWriter(csv_file, fieldnames=fieldnames)

  writer.writeheader()
  for row in rows:
    writer.writerow(row)

print(f"CSV file created at: {csv_file_path}")

print(f"Number of events: {len(rows)}")
non_none_events = [row for row in rows if "None" not in str(row.get("when", "")) and "None" not in str(row.get("where", ""))]
print(f"Number of events not containing 'None' in URI: {len(non_none_events)}")

Number of files in directory: 1268


  g = ConjunctiveGraph()


CSV file created at: ./events.csv
Number of events: 445
Number of events not containing 'None' in URI: 422


In [8]:
import pandas as pd
import os
import logging
from rdflib import Dataset

logging.basicConfig(filename=os.path.join(os.curdir, "error_log.txt"), level=logging.ERROR)

data = pd.read_csv(csv_file_path)
non_none_events = data[(data["when"].str.contains("http")) & (data["where"].str.contains("http"))]

rows = []

for key, event in non_none_events.iterrows():
  
  file = event["who"].split("_")[0]
  for filename in os.listdir(directory):
    if filename.startswith(file) and "event" not in filename:
      ds = Dataset()
      try:
        ds.parse(os.path.join(directory, filename), format="trig")
        
        # Iterate over places
        same_as = []  
        for s, p, o, *rest in ds.quads((term.URIRef(event["where"]), None, None, None)):
          if str(p) == "http://www.w3.org/2002/07/owl#sameAs":
            same_as.append(str(o))
        
        if len(same_as) > 0:
          rows.append({
            "identifier": event["where"],
            "type": "place",
            "same_as": same_as,
            "diary": "1894-95",
            "page": file
          })
        
        # Iterate over dates
        same_as = []
        for s, p, o, *rest in ds.quads((term.URIRef(event["when"]), None, None, None)):
          if str(p) == "http://www.cidoc-crm.org/cidoc-crm/P181b_begin_of_the_end":
            same_as.append(str(o))
        
        if len(same_as) > 0:
          rows.append({
            "identifier": event["when"],
            "type": "date",
            "same_as": same_as,
            "diary": "1894-95",
            "page": int(file)
          })
      except Exception as e:
        print(f"Error parsing {filename}: {e}")
        logging.error(filename)
   
output_csv_file_path = os.path.join(os.curdir, "processed_entities.csv")

output_df = pd.DataFrame(rows)
output_df["page"] = output_df["page"].astype(int)
output_df = output_df.sort_values(by="page", ascending=True)
output_df.to_csv(output_csv_file_path, index=False)

print(f"Processed entities CSV file created at: {output_csv_file_path}")



Processed entities CSV file created at: ./processed_entities.csv


In [9]:
# Load the processed_entities.csv file
processed_entities_df = pd.read_csv(output_csv_file_path)

# Get the number of rows with type "place"
num_place_rows = processed_entities_df[processed_entities_df['type'] == 'place'].shape[0]

# Get the number of rows with type "date"
num_date_rows = processed_entities_df[processed_entities_df['type'] == 'date'].shape[0]

print(f"Number of rows with type 'place': {num_place_rows}")
print(f"Number of rows with type 'date': {num_date_rows}")

Number of rows with type 'place': 432
Number of rows with type 'date': 426
