In [88]:
import os
from rdflib import ConjunctiveGraph, term
import csv

def process_uri(uri):
  """Normalise a URI taken from a parsed graph and return it as a ``URIRef``.

  Clean-up steps, applied in order:
    1. Strip one pair of enclosing angle brackets (``<...>``).
    2. If the text contains "http" more than once, keep only the last
       "http..." segment.
    3. Collapse a trailing "/body/body" into a single "/body".

  Args:
    uri: Any object whose ``str()`` form is the URI text.

  Returns:
    A ``term.URIRef`` built from the cleaned-up text.
  """
  uri_str = str(uri)
  if uri_str.startswith("<") and uri_str.endswith(">"):
    uri_str = uri_str[1:-1]

  # split() yields one more part than there are "http" occurrences, so more
  # than two parts means the marker appears at least twice.
  parts = uri_str.split("http")
  if len(parts) > 2:
    uri_str = "http" + parts[-1]

  # Trim only the duplicated suffix. str.replace() would also rewrite any
  # earlier "/body/body" occurrence inside the URI.
  if uri_str.endswith("/body/body"):
    uri_str = uri_str[:-len("/body")]
  return term.URIRef(uri_str)

# Directory with the TriG files, two levels above the working directory.
directory = os.path.join(os.pardir, os.pardir, "output post disamiguation")

# os.listdir() returns names in arbitrary order. List once and sort so the row
# order of events.csv is reproducible between runs.
filenames = sorted(os.listdir(directory))

print(f"Number of files in directory: {len(filenames)}")

# Predicates whose objects fill the "when" and "where" columns.
TEMPORAL_PROJECTION = "http://www.cidoc-crm.org/cidoc-crm/P160_has_temporal_projection"
SPATIAL_PROJECTION = "http://www.cidoc-crm.org/cidoc-crm/P161_has_spatial_projection"

rows = []

for filename in filenames:
  # Get all events
  if not (filename.endswith(".trig") and "event" in filename):
    continue

  file_path = os.path.join(directory, filename)
  # NOTE(review): the recorded cell output flags this line with a warning,
  # presumably rdflib's deprecation of ConjunctiveGraph. Confirm before migrating.
  g = ConjunctiveGraph()
  try:
    # The "who" column stores the name of the file the event came from.
    row = {"who": filename}

    # Parse the TRiG
    g.parse(file_path, format="trig")

    # Iterate over the triples; if a predicate occurs more than once, the
    # last triple seen wins.
    for s, p, o in g:
      predicate = str(p)
      if predicate == TEMPORAL_PROJECTION:
        row["when"] = process_uri(o)
      if predicate == SPATIAL_PROJECTION:
        row["where"] = process_uri(o)

    # A file that fails anywhere above contributes no row.
    rows.append(row)

  except Exception as e:
    print(f"Error parsing {file_path}: {e}")


csv_file_path = os.path.join(os.curdir, "events.csv")

# Explicit UTF-8 so the file does not depend on the platform default encoding
# when it is read back later.
with open(csv_file_path, mode='w', newline='', encoding='utf-8') as csv_file:
  fieldnames = ["when", "where", "who"]
  writer = csv.DictWriter(csv_file, fieldnames=fieldnames)

  writer.writeheader()
  # Keys missing from a row ("when"/"where" never found) are written as empty cells.
  writer.writerows(rows)

print(f"CSV file created at: {csv_file_path}")

print(f"Number of events: {len(rows)}")
# Keep the events whose "when" and "where" text does not contain "None".
non_none_events = [row for row in rows if "None" not in str(row.get("when", "")) and "None" not in str(row.get("where", ""))]
print(f"Number of events not containing 'None' in URI: {len(non_none_events)}")

Number of files in directory: 1268


  g = ConjunctiveGraph()


CSV file created at: ./events.csv
Number of events: 445


In [90]:

import pandas as pd
import os
import logging
from rdflib import Dataset

# Parse failures are also appended to error_log.txt in the working directory.
logging.basicConfig(filename=os.path.join(os.curdir, "error_log.txt"), level=logging.ERROR)

data = pd.read_csv(csv_file_path)

places = []
ds = Dataset()

# List the directory once instead of once per event row, keeping only names
# that can match the "place" filter below.
place_filenames = sorted(name for name in os.listdir(directory) if "place" in name)
# Files already handed to ds.parse(). Events sharing a page prefix would
# otherwise trigger a re-parse of the same file.
parsed_filenames = set()

# Load trigs of places in dataset
for _, event in data.iterrows():

  # The page identifier is the part of the event filename before the first "_".
  page = event["who"].split("_")[0]
  places.append(event["where"])

  for filename in place_filenames:
    if filename in parsed_filenames or not filename.startswith(page):
      continue

    # Marked before parsing so a file that fails is reported once, not once
    # per matching event.
    parsed_filenames.add(filename)
    try:
      ds.parse(os.path.join(directory, filename), format="trig")

    except Exception as e:
      print(f"Error parsing {filename}: {e}")
      logging.error(filename)
print(f"Loaded {len(places)} places in dataset.")

Loaded 445 places in dataset.


In [92]:
# Columns of the output table. Passing them to the DataFrame constructor
# guarantees they exist even when no place produced a given value.
PLACE_COLUMNS = ["page", "value", "wikidata_url", "geonames_url"]

# Built once; ?place is bound per iteration through initBindings.
# `oa:` is declared explicitly so the query does not rely on whatever prefixes
# happen to be bound on `ds`. Assumes the data uses the W3C Web Annotation
# namespace - TODO confirm against the TriG files.
query = """
  PREFIX crm: <http://www.cidoc-crm.org/cidoc-crm/>
  PREFIX oa: <http://www.w3.org/ns/oa#>
  PREFIX owl: <http://www.w3.org/2002/07/owl#>
  PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
  SELECT ?same_as ?value ?source
  WHERE {
    GRAPH ?g {
      ?s a oa:Annotation ;
        oa:hasBody ?place ;
        oa:hasTarget ?target .

      ?place a crm:E52_place ;
        owl:sameAs ?same_as .

      ?target oa:hasSource ?source ;
        rdf:value ?value .
    }
  }
"""

rows = []

for place in places:
  # One row per place. A place with no query results stays empty and is
  # removed by the dropna() below.
  row = {}
  print(f"Processing place: {place}")

  # A missing "where" cell is read back from the CSV as NaN, which is not a
  # usable URI; keep the empty row and move on.
  if pd.isna(place):
    rows.append(row)
    continue

  # Each result carries one owl:sameAs link, so the Wikidata and GeoNames
  # links of a place arrive in separate results and accumulate in `row`.
  for same_as, value, source in ds.query(query, initBindings={"place": term.URIRef(place)}):
    # The page number is the last path segment of the source URI.
    row["page"] = int(str(source).split("/")[-1])
    row["value"] = value

    same_as_text = str(same_as)
    if same_as_text.startswith("https://www.wikidata.org"):
      # A link containing "None" is stored as the plain string "None".
      row["wikidata_url"] = "None" if "None" in same_as_text else same_as
    elif same_as_text.startswith("https://www.geonames.org"):
      row["geonames_url"] = "None" if "None" in same_as_text else same_as

  rows.append(row)

output_csv_file_path = os.path.join(os.curdir, "predicted_places.csv")
output_df = pd.DataFrame(rows, columns=PLACE_COLUMNS)
# Drop places for which any of the four values is missing.
output_df = output_df.dropna(subset=["value", "page", "wikidata_url", "geonames_url"])
output_df["page"] = output_df["page"].astype(int)
output_df = output_df.sort_values(by="page", ascending=True)
output_df.to_csv(output_csv_file_path, index=False)
print(f"Processed entities CSV file created at: {output_csv_file_path}")

Processing place: https://mbdiaries.itatti.harvard.edu/annotation/05c73bf9-1024-44f4-aca0-91f9c1fbfb2f/body
Processing place: https://mbdiaries.itatti.harvard.edu/annotation/bd7e69ed-5f4f-4907-9731-ddf5cd83bba9/body
Processing place: https://mbdiaries.itatti.harvard.edu/annotation/1ea57c04-edab-4db6-bb8d-94e862fca1e1/body
Processing place: https://mbdiaries.itatti.harvard.edu/annotation/e3df3100-eaaf-4799-8d04-03e709a53913/body
Processing place: https://mbdiaries.itatti.harvard.edu/annotation/936cfd54-a47c-4d0d-9b39-b9d9b37e2406/body
Processing place: https://mbdiaries.itatti.harvard.edu/annotation/c24d9c5e-f982-4ca2-9c75-e3ded86f6c42/body
Processing place: https://mbdiaries.itatti.harvard.edu/annotation/a6b07fb3-5590-48df-98a7-37278691ea9b/body
Processing place: https://mbdiaries.itatti.harvard.edu/annotation/26cb307e-a832-4aef-a712-75f992880a3b/body
Processing place: https://mbdiaries.itatti.harvard.edu/annotation/789c2262-09a7-445f-bc06-0104698c3f15/body
Processing place: https://mb

In [95]:
# Reload the CSV written to `output_csv_file_path` and report how many rows
# have a missing GeoNames or Wikidata link.
processed_entities_df = pd.read_csv(output_csv_file_path)

print(f"Number of rows with type 'place': {len(processed_entities_df)}")

# isna() flags the cells read_csv parsed as missing; summing the boolean mask
# counts them per column.
missing_counts = processed_entities_df[["geonames_url", "wikidata_url"]].isna().sum()
num_geonames_none = missing_counts["geonames_url"]
num_wikidata_none = missing_counts["wikidata_url"]

print(f"Number of rows with geonames_url None: {num_geonames_none}")
print(f"Number of rows with wikidata_url None: {num_wikidata_none}")

Number of rows with type 'place': 432
Number of rows with geonames_url None: 4
Number of rows with wikidata_url None: 2
