### Places extraction 
JSON to TriG

In [8]:
import os
import json
import uuid
import re
import anthropic
from datetime import datetime
from natsort import natsorted

class TextToRDFProcessor:
    def __init__(self, api_key):
        """Create the Anthropic client and make sure the output directory exists.

        Args:
            api_key: API key handed to ``anthropic.Anthropic``.
        """
        self.client = anthropic.Anthropic(api_key=api_key)
        self.output_dir = "rdf_output"
        # exist_ok=True replaces the separate exists() test, which could race
        # with another process creating the directory in between.
        os.makedirs(self.output_dir, exist_ok=True)

    def extract_places(self, text):
        """Ask Claude for the places visited on the page and parse its JSON reply.

        Args:
            text: Diary page text appended to the user message.

        Returns:
            A dict holding a "places" list, or None when the request or the
            parsing of the reply fails (the error is printed).
        """
        system_prompt = """You are an AI assistant specialized in extracting places from texts into JSON format.
            Your task is to analyze the text and extract ONLY the places in which Mary Berenson was in a particular day, create a JSON object with these exact fields:
            - name: name of the place. Be careful: if the place name contains also information related to streets or buildings, extract this information too. 
            - coordinates: coordinates in format (40.04, 30.55)
            - wiki_id: the CORRECT wikidata ID, if there's one representing the entity
            - line: text line reference in format /p[N] where N is line number. If a place is mentioned more than once in the text, identify correctly the line in which it's mentioned.
            - geoid: correct GeoNames ID

            Return the results in this exact format, with no additional text:
            {
                "places": [
                    {
                        "name": "place name",
                        "coordinates": "(lat, long)",
                        "wiki_id": "Q12345",
                        "line": "/p[1]",
                        "geoid": "3176959"
                    }
                ]
            }"""

        try:
            user_message = {
                "role": "user",
                "content": f"Please analyze carefully the diary page and extract only the places in which Mary Berenson PHYSICALLY WAS ON A PARTICULAR DAY. \n\n{text}"
            }
            reply = self.client.messages.create(
                model="claude-3-5-sonnet-latest",
                max_tokens=2048,
                system=system_prompt,
                temperature=0,
                messages=[user_message],
            )
            raw_reply = reply.content[0].text

            # The reply may surround the JSON with extra prose: keep only the
            # span from the first '{' to the last '}'.
            opening = raw_reply.find('{')
            closing = raw_reply.rfind('}')
            if opening < 0 or closing < 0:
                raise ValueError("No valid JSON found in response")

            parsed = json.loads(raw_reply[opening:closing + 1])

            # A reply without the "places" wrapper is treated as one place.
            return parsed if "places" in parsed else {"places": [parsed]}

        except Exception as e:
            print(f"Error extracting places: {str(e)}")
            return None

    def find_place_positions(self, text, place_name):
        """Find every occurrence of a place name in the text.

        Three strategies are tried in order and the first one that yields any
        match wins:

        1. the whole name as a standalone word (optionally followed by a
           possessive ``'s`` / ``'``),
        2. the name anywhere, even inside a longer word,
        3. each whitespace-separated token of the name as a standalone word
           (the first token with a match is used).

        Matching is case-insensitive and runs on the original text, so the
        reported offsets always refer to ``text`` itself.

        Args:
            text: Full page text.
            place_name: Name to look for; a trailing possessive marker
                ("Doney's", "Berensons'") is ignored when matching.

        Returns:
            A list of dicts with ``start_position`` and ``end_position``
            (character offsets counted from the start of the line the offset
            falls on) and ``line`` ("/p[N]", 1-indexed line of the match
            start). Empty when nothing matches or an argument is empty.
        """
        if not text or not place_name:
            return []

        base = place_name.lower().strip()
        # Remove only a trailing possessive marker. The previous
        # rstrip("s'") stripped a *set* of characters, so "st. james's"
        # became "st. jame" and could no longer match the text.
        for mark in ("'", "\u2019"):
            if base.endswith(mark + "s"):
                base = base[:-2]
                break
            if base.endswith("s" + mark):
                base = base[:-1]
                break
        if not base:
            # An empty pattern would "match" at every word boundary.
            return []

        # Optional possessive suffix, straight or typographic apostrophe.
        possessive = "(?:['\u2019]s?)?"

        def whole_word(term):
            # Lookarounds instead of \b so terms that start or end with
            # punctuation (e.g. "st.") can still match as a unit.
            return rf"(?<!\w){re.escape(term)}{possessive}(?!\w)"

        def describe(match):
            start, end = match.span()
            # Column = absolute offset minus the offset just after the
            # preceding newline (rfind returns -1 when there is none).
            return {
                'start_position': start - (text.rfind('\n', 0, start) + 1),
                'end_position': end - (text.rfind('\n', 0, end) + 1),
                'line': f"/p[{self._get_line_number(text, start)}]"
            }

        patterns = [whole_word(base), re.escape(base) + possessive]
        patterns.extend(whole_word(token) for token in base.split())

        for pattern in patterns:
            found = [describe(m) for m in re.finditer(pattern, text, re.IGNORECASE)]
            if found:
                return found

        # If all else fails, return empty list
        return []



    def _get_line_number(self, text, character_position):
        """
        Get the line number and the local character position within that line
        
        Args:
            text (str): Full text content
            character_position (int): Absolute character position in the text
        
        Returns:
            int: Line number (1-indexed)
        """
        # Split the text into lines
        lines = text.split('\n')
        
        # Track cumulative character count
        cumulative_chars = 0
        
        # Iterate through lines to find the correct line
        for line_num, line in enumerate(lines, 1):
            # Length of the line plus newline character
            line_length = len(line) + 1  # +1 for the newline character
            
            # Check if the character position falls within this line
            if character_position < cumulative_chars + line_length:
                return line_num
            
            # Move to next line's starting position
            cumulative_chars += line_length
        
        # Fallback to last line if position is beyond text length
        return len(lines)

    def generate_uuids(self):
        """Return seven fresh random UUID strings keyed 'main', 'uuid2' ... 'uuid7'."""
        labels = ['main'] + [f'uuid{index}' for index in range(2, 8)]
        return {label: str(uuid.uuid4()) for label in labels}

    def convert_to_trig(self, place_data, input_filename):
        """Convert a single place data to Trig format"""
        uuids = self.generate_uuids()
        creation_date = datetime.now().isoformat()
        ndiary = os.path.splitext(os.path.basename(input_filename))[0]

        # Merge extracted place data with position information
        trig_template = f"""
        @prefix crmdig: <http://www.ics.forth.gr/isl/CRMdig/> .
@prefix crminfluence: <http://www.cidoc-crm.org/cidoc-crm/influence/> .
@prefix oa: <http://www.w3.org/ns/oa#> .
@prefix crmsci: <http://www.ics.forth.gr/isl/CRMsci/> .
@prefix Help: <http://help.researchspace.org/resource/> .
@prefix bds: <http://www.bigdata.com/rdf/search#> .
@prefix crmba: <http://www.cidoc-crm.org/cidoc-crm/CRMba/> .
@prefix prov: <http://www.w3.org/ns/prov#> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
@prefix mbdiaries-annotation: <https://mbdiaries.itatti.harvard.edu/annotation/> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix iiif: <http://iiif.io/api/> .
@prefix crm: <http://www.cidoc-crm.org/cidoc-crm/> .
@prefix sim: <http://purl.org/ontology/similarity/> .
@prefix fc: <https://collection.itatti.harvard.edu/resource/custom/fc/> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix User: <http://www.researchspace.org/resource/user/> .
@prefix mbdiaries-type: <https://mbdiaries.itatti.harvard.edu/resource/type/> .
@prefix forms: <http://www.researchspace.org/resource/system/forms/> .
@prefix owl: <http://www.w3.org/2002/07/owl#> .
@prefix rshelp: <http://researchspace.org/help/> .
@prefix sp: <http://spinrdf.org/sp#> .
@prefix Platform: <http://www.researchspace.org/resource/system/> .
@prefix mbdiaries: <https://mbdiaries.itatti.harvard.edu/resource/> .
@prefix fr: <https://collection.itatti.harvard.edu/resource/custom/fr/> .
@prefix crmgeo: <http://www.ics.forth.gr/isl/CRMgeo/> .
@prefix skos: <http://www.w3.org/2004/02/skos/core#> .
@prefix mbdiaries_forms: <http://mbdiaries.itatti.harvard.edu/resource/forms> .
@prefix schema: <http://schema.org/> .
@prefix rso: <http://www.researchspace.org/ontology/> .
@prefix Admin: <http://www.researchspace.org/resource/admin/> .
@prefix vitiiif: <https://iiif.itatti.harvard.edu/iiif/2/> .
@prefix ontodia: <http://ontodia.org/schema/v1#> .
@prefix frbroo: <http://iflastandards.info/ns/fr/frbr/frbroo/> .
@prefix crmarchaeo: <http://www.cidoc-crm.org/cidoc-crm/CRMarchaeo/> .
@prefix rsp: <http://www.researchspace.org/resource/> .
@prefix Default: <https://collection.itatti.harvard.edu/resource/> .
@prefix mbdiaries-document: <https://mbdiaries.itatti.harvard.edu/document/> .
@prefix mbdiaries-ontology: <https://mbdiaries.itatti.harvard.edu/ontology/> .
@prefix ldp: <http://www.w3.org/ns/ldp#> .
        
    <https://mbdiaries.itatti.harvard.edu/annotation/{uuids['main']}/container/context> {{
      <https://mbdiaries.itatti.harvard.edu/diary/1894-95/document/{ndiary}/offset-{uuids['uuid7']}>
        a oa:TextPositionSelector;
        oa:end "{place_data.get('start_position', '')}"^^xsd:nonNegativeInteger;
        oa:start "{place_data.get('start_position', '')}"^^xsd:nonNegativeInteger .

      <https://mbdiaries.itatti.harvard.edu/diary/1894-95/document/{ndiary}/offset-{uuids['uuid3']}>
        a oa:TextPositionSelector;
        oa:end "{place_data.get('end_position', '')}"^^xsd:nonNegativeInteger;
        oa:start "{place_data.get('end_position', '')}"^^xsd:nonNegativeInteger .

      mbdiaries-annotation:{uuids['main']} a oa:Annotation, crmdig:D29_Annotation_Object;
        crmdig:L48i_was_annotation_created_by <https://mbdiaries.itatti.harvard.edu/diary/1894-95/document/{ndiary}/annotation-event-{uuids['uuid2']}>;
        oa:hasBody <https://mbdiaries.itatti.harvard.edu/annotation/{uuids['main']}/body>;
        oa:hasTarget <https://mbdiaries.itatti.harvard.edu/diary/1894-95/document/{ndiary}/range-source-{uuids['uuid4']}> .

      <https://mbdiaries.itatti.harvard.edu/annotation/{uuids['main']}/container>
        a ldp:Resource, prov:Entity;
        prov:wasAttributedTo User:agent;
        prov:generatedAtTime "{creation_date}"^^xsd:dateTime .

      <https://mbdiaries.itatti.harvard.edu/diary/1894-95/document/{ndiary}/range-source-{uuids['uuid4']}>
        a oa:SpecificResource;
        oa:hasSource <https://mbdiaries.itatti.harvard.edu/diary/1894-95/document/{ndiary}>;
        oa:hasSelector <https://mbdiaries.itatti.harvard.edu/diary/1894-95/document/{ndiary}/range-{uuids['uuid4']}>;
        rdf:value "{place_data.get('name', '')}" .

      <https://www.wikidata.org/wiki/{place_data.get('wiki_id', '')}> rdfs:label "{place_data.get('name', '')}" .

      <https://mbdiaries.itatti.harvard.edu/diary/1894-95/document/{ndiary}/range-{uuids['uuid4']}>
        a oa:RangeSelector;
        oa:hasEndSelector <https://mbdiaries.itatti.harvard.edu/diary/1894-95/document/{ndiary}/xpath-{uuids['uuid5']}>;
        oa:hasStartSelector <https://mbdiaries.itatti.harvard.edu/diary/1894-95/document/{ndiary}/xpath-{uuids['uuid6']}> .

      <https://mbdiaries.itatti.harvard.edu/diary/1894-95/document/{ndiary}/xpath-{uuids['uuid6']}>
        a oa:XPathSelector;
        oa:refinedBy <https://mbdiaries.itatti.harvard.edu/diary/1894-95/document/{ndiary}/offset-{uuids['uuid7']}>;
        rdf:value "{place_data.get('line', '/p[1]')}" .

      <https://mbdiaries.itatti.harvard.edu/diary/1894-95/document/{ndiary}/xpath-{uuids['uuid5']}>
        a oa:XPathSelector;
        oa:refinedBy <https://mbdiaries.itatti.harvard.edu/diary/1894-95/document/{ndiary}/offset-{uuids['uuid3']}>;
        rdf:value "{place_data.get('line', '/p[1]')}" .

      <https://mbdiaries.itatti.harvard.edu/diary/1894-95/document/{ndiary}/annotation-event-{uuids['uuid2']}>
        a crmdig:D30_Annotation_Event;
        crm:P4_has_time_span <https://mbdiaries.itatti.harvard.edu/diary/1894-95/document/{ndiary}/annotation-event-{uuids['uuid2']}/modifiedAt>;
        crm:P14_carried_out_by User:agent .

      <https://mbdiaries.itatti.harvard.edu/annotation/{uuids['main']}/body>
        a mbdiaries-ontology:Location;
        a crm:E52_place;
        crm:P168_place_is_defined_by "{place_data.get('coordinates', '')}";
        owl:sameAs <https://www.wikidata.org/wiki/{place_data.get('wiki_id', '')}>;
        owl:sameAs <https://www.geonames.org/{place_data.get('geoid', '')}> .

      <https://mbdiaries.itatti.harvard.edu/diary/1894-95/document/{ndiary}/annotation-event-{uuids['uuid2']}/modifiedAt>
        crm:P81b_begin_of_the_end "{creation_date}"^^xsd:dateTime;
        crm:P81a_end_of_the_begin "{creation_date}"^^xsd:dateTime .

      _:node1i8224na8x5257 ldp:contains <https://mbdiaries.itatti.harvard.edu/annotation/{uuids['main']}/container> .
    }}

    {{
      _:node1i8224na8x5257 a ldp:Container, ldp:Resource, prov:Entity .
    }}"""

        return trig_template

    def process_batch(self, input_directory):
        """Process all .txt files in the input directory.

        For every file (in natural sort order) the places are extracted, each
        occurrence of each place is located in the text, and one .trig file
        per distinct occurrence is written to ``self.output_dir``. Errors for
        a single file are printed and do not stop the batch.

        Args:
            input_directory: Directory containing the .txt pages.

        Returns:
            None. A missing input directory is reported and nothing is done.
        """
        if not os.path.exists(input_directory):
            print(f"Input directory {input_directory} does not exist.")
            return

        os.makedirs(self.output_dir, exist_ok=True)

        files = [f for f in os.listdir(input_directory) if f.endswith('.txt')]
        sorted_files = natsorted(files)

        for filename in sorted_files:
            filepath = os.path.join(input_directory, filename)
            base_name = os.path.splitext(filename)[0]

            try:
                with open(filepath, 'r', encoding='utf-8') as f:
                    text = f.read()

                places_data = self.extract_places(text)

                if not places_data or not places_data.get("places"):
                    print(f"No places extracted from {filename}")
                    continue

                annotation_count = 0
                processed_positions = set()  # occurrences already written for this file

                for place in places_data["places"]:
                    name = place.get('name') if isinstance(place, dict) else None
                    if not name:
                        # One malformed entry must not abort the remaining
                        # places of this file.
                        print(f"Skipping place without a name in {filename}")
                        continue

                    safe_place_name = "".join(x for x in name.lower() if x.isalnum() or x in (' ', '-', '_'))

                    for position_data in self.find_place_positions(text, name):
                        # start/end are offsets within a line, so the line is
                        # part of the identity: the same columns on two
                        # different lines are two different occurrences.
                        position_key = (
                            position_data['line'],
                            position_data['start_position'],
                            position_data['end_position'],
                        )
                        if position_key in processed_positions:
                            continue
                        processed_positions.add(position_key)

                        # Located values override the model's own guess for 'line'.
                        complete_place_data = {**place, **position_data}
                        trig_content = self.convert_to_trig(complete_place_data, filename)

                        annotation_count += 1
                        output_filename = f"{self.output_dir}/{base_name}_place_{annotation_count}_{safe_place_name}.trig"

                        with open(output_filename, 'w', encoding='utf-8') as f:
                            f.write(trig_content)
                        print(f"Processed and saved {output_filename}")

            except Exception as e:
                print(f"Error processing {filename}: {str(e)}")

def main():
    """Run the place extraction over every text file in ``./txt``.

    The API key is taken from the ANTHROPIC_API_KEY environment variable so
    the secret does not have to be written into the source; when the variable
    is not set an empty string is passed, as before.
    """
    # Initialize processor with the API key from the environment
    api_key = os.environ.get('ANTHROPIC_API_KEY', '')
    processor = TextToRDFProcessor(api_key)

    # Specify the input directory containing text files
    input_directory = "./txt"

    # Process all text files in the directory
    processor.process_batch(input_directory)


if __name__ == "__main__":
    main()

Processed and saved rdf_output/4_place_1_ancona.trig
Processed and saved rdf_output/5_place_1_12 lungarno acciajuoli florence.trig
Processed and saved rdf_output/5_place_2_doneys.trig
Processed and saved rdf_output/5_place_3_pistoia.trig
Processed and saved rdf_output/6_place_1_pistoia.trig
Processed and saved rdf_output/6_place_2_florence.trig
Processed and saved rdf_output/7_place_1_florence.trig
Processed and saved rdf_output/8_place_1_fiesole hill.trig
Processed and saved rdf_output/9_place_1_il palmerino maiano.trig
Processed and saved rdf_output/9_place_2_il palmerino maiano.trig
Processed and saved rdf_output/10_place_1_il palmerino maiano.trig
Processed and saved rdf_output/10_place_2_12 lungarno acciajuoli florence.trig
Processed and saved rdf_output/10_place_3_uffizi.trig
Processed and saved rdf_output/11_place_1_florence.trig
Processed and saved rdf_output/11_place_2_bargello.trig
Processed and saved rdf_output/11_place_3_alinaris.trig
Processed and saved rdf_output/11_place

### Dates extraction 

JSON to TriG

In [10]:
import os
import json
import uuid
import anthropic
from datetime import datetime
from natsort import natsorted

class TextToRDFProcessor:
    def __init__(self, api_key):
        """Create the Anthropic client and make sure the output directory exists.

        Args:
            api_key: API key handed to ``anthropic.Anthropic``.
        """
        self.client = anthropic.Anthropic(api_key=api_key)
        self.output_dir = "rdf_output"
        self.last_known_date = None  # most recent date value returned by extract_dates
        # exist_ok=True replaces the separate exists() test, which could race
        # with another process creating the directory in between.
        os.makedirs(self.output_dir, exist_ok=True)

    def extract_dates(self, text, previous_text=None):
        """Ask Claude for the dates on the page and parse its JSON reply.

        Args:
            text: Diary page text to analyse.
            previous_text: Optional text of the preceding page; when given it
                is placed before ``text`` in the request as context.

        Returns:
            A dict holding a "dates" list, or None when the request or the
            parsing of the reply fails (the error is printed). When the last
            returned entry carries a "date" value it is stored in
            ``self.last_known_date``.
        """
        context = ""
        if previous_text:
            context = f"Previous text for context:\n{previous_text}\n\nCurrent text to analyze:\n"
            
        system_prompt = """You are an AI assistant specialized in extracting dates from texts into JSON format.
        Your task is to analyze the text and extract ONLY dates which Mary Berenson mentioned and that could be associated to a location in which she says she was, create a JSON object with these exact fields:
        - date: date value in ISO 8601 format
        - name: name of the date
        - line: text line reference in format /p[N] where N is line number. If a date is mentioned more than once in the text, be careful to correctly identify the correct line in which it's mentioned.

        
        Return the results in this exact format, with no additional text:
        {
            "dates": [
                {
                    "name": "25th September 2023",
                    "date": "2023-09-25T00:00:00.000Z",
                    "line": "/p[1]"
                }
            ]
        }"""

        # Bound before the try block so the error handler can tell whether a
        # reply was received at all (it used to hit an unbound local when the
        # API call itself failed).
        response_text = None

        try:
            response = self.client.messages.create(
                model="claude-3-5-sonnet-latest",
                max_tokens=2048,
                system=system_prompt,
                temperature=0,
                messages=[
                    {
                        "role": "user",
                        "content": f"Please analyze carefully the diary page and extract all the dates which Mary Berenson mentioned and that could be associated to a location in which she says she was.  {context}{text}"
                    }
                ]
            )

            # Keep only the span from the first '{' to the last '}' of the reply
            response_text = response.content[0].text
            start_idx = response_text.find('{')
            end_idx = response_text.rfind('}') + 1

            if start_idx == -1 or end_idx == 0:
                raise ValueError("No valid JSON found in response")

            json_str = response_text[start_idx:end_idx]
            dates_data = json.loads(json_str)

            # A reply without the "dates" wrapper is treated as a single date
            if "dates" not in dates_data:
                dates_data = {"dates": [dates_data]}

            # Remember the last date value, but do not throw the whole result
            # away just because the final entry lacks a "date" field.
            dates = dates_data["dates"]
            if isinstance(dates, list) and dates and isinstance(dates[-1], dict):
                latest = dates[-1].get("date")
                if latest:
                    self.last_known_date = latest

            return dates_data

        except Exception as e:
            print(f"Error extracting dates: {str(e)}")
            if response_text is not None:
                print(f"Raw response: {response_text}")
            return None
    def find_place_positions(self, text, place_name):
        """
        Find the first occurrence of a name in the text.

        Three strategies are tried in order and the first match wins:
        the whole name as a standalone word, the name anywhere (even inside
        a longer word), then each whitespace-separated token of the name as
        a standalone word. Matching is case-insensitive and runs on the
        original text, so the offsets always refer to ``text`` itself.

        Args:
            text: Full page text.
            place_name: Name to look for.

        Returns:
            A dict with ``start_position`` and ``end_position`` (character
            offsets counted from the start of the line the offset falls on)
            and ``line`` ("/p[N]", 1-indexed line of the match start), or
            None when nothing matches or an argument is empty.
        """
        import re  # this cell's import block does not bring ``re`` into scope

        if not text or not place_name:
            return None

        needle = place_name.lower().strip()
        if not needle:
            # An empty pattern would produce a zero-length "match".
            return None

        def whole_word(term):
            # Lookarounds instead of \b so terms that start or end with
            # punctuation can still match as a unit.
            return rf"(?<!\w){re.escape(term)}(?!\w)"

        patterns = [whole_word(needle), re.escape(needle)]
        patterns.extend(whole_word(token) for token in needle.split())

        for pattern in patterns:
            # If multiple matches exist, the first one is used.
            match = re.search(pattern, text, re.IGNORECASE)
            if match is None:
                continue
            start, end = match.span()
            # Column = absolute offset minus the offset just after the
            # preceding newline (rfind returns -1 when there is none).
            return {
                'start_position': start - (text.rfind('\n', 0, start) + 1),
                'end_position': end - (text.rfind('\n', 0, end) + 1),
                'line': f"/p[{self._get_line_number(text, start)}]"
            }

        # If all else fails, return None
        return None

    def _get_line_number(self, text, character_position):
        """
        Get the line number and the local character position within that line
        
        Args:
            text (str): Full text content
            character_position (int): Absolute character position in the text
        
        Returns:
            int: Line number (1-indexed)
        """
        # Split the text into lines
        lines = text.split('\n')
        
        # Track cumulative character count
        cumulative_chars = 0
        
        # Iterate through lines to find the correct line
        for line_num, line in enumerate(lines, 1):
            # Length of the line plus newline character
            line_length = len(line) + 1  # +1 for the newline character
            
            # Check if the character position falls within this line
            if character_position < cumulative_chars + line_length:
                return line_num
            
            # Move to next line's starting position
            cumulative_chars += line_length
        
        # Fallback to last line if position is beyond text length
        return len(lines)

    def process_batch(self, input_directory):
        """Process all .txt files in the input directory.

        For every file (in natural sort order) the dates are extracted, with
        the previous file's text supplied as context, each date name is
        located in the page text, and one .trig file per date is written to
        ``self.output_dir``. Errors for a single file are printed and do not
        stop the batch.

        Args:
            input_directory: Directory containing the .txt pages.

        Returns:
            None. A missing input directory is reported and nothing is done.
        """
        if not os.path.exists(input_directory):
            print(f"Input directory {input_directory} does not exist.")
            return

        os.makedirs(self.output_dir, exist_ok=True)

        files = [f for f in os.listdir(input_directory) if f.endswith('.txt')]
        sorted_files = natsorted(files)

        previous_text = None

        for filename in sorted_files:
            filepath = os.path.join(input_directory, filename)
            base_name = os.path.splitext(filename)[0]

            try:
                with open(filepath, 'r', encoding='utf-8') as f:
                    current_text = f.read()

                # Extract dates with context from previous text
                dates_data = self.extract_dates(current_text, previous_text)

                if not dates_data or not dates_data.get("dates"):
                    print(f"No dates extracted from {filename}")
                    previous_text = current_text
                    continue

                # Process each date
                for i, date in enumerate(dates_data["dates"]):
                    name = date.get('name') if isinstance(date, dict) else None
                    if not name:
                        # One malformed entry must not abort the remaining
                        # dates of this file.
                        print(f"Skipping date without a name in {filename}")
                        continue

                    # Locate the date in the page so the serialised selectors
                    # get real offsets; previously start_position and
                    # end_position were never set for dates.
                    position = self.find_place_positions(current_text, name)
                    if position:
                        # Located values override the model's own 'line'.
                        located_date = {**date, **position}
                    else:
                        print(f"Could not locate '{name}' in {filename}")
                        located_date = date

                    trig_content = self.convert_to_trig(located_date, filename)

                    safe_date_name = "".join(x for x in name.lower() if x.isalnum() or x in (' ', '-', '_'))

                    output_filename = f"{self.output_dir}/{base_name}_date_{i+1}_{safe_date_name}.trig"
                    with open(output_filename, 'w', encoding='utf-8') as f:
                        f.write(trig_content)
                    print(f"Processed and saved {output_filename}")

                # Update previous text for next iteration
                previous_text = current_text

            except Exception as e:
                print(f"Error processing {filename}: {str(e)}")

    def generate_uuids(self):
        """Return seven fresh random UUID strings keyed 'main', 'uuid2' ... 'uuid7'."""
        fresh_ids = {}
        for label in ('main', 'uuid2', 'uuid3', 'uuid4', 'uuid5', 'uuid6', 'uuid7'):
            fresh_ids[label] = str(uuid.uuid4())
        return fresh_ids

    def convert_to_trig(self, date_data, input_filename):
        """Convert a single date data to Trig format"""
        uuids = self.generate_uuids()
        creation_date = datetime.now().isoformat()
        ndiary = os.path.splitext(os.path.basename(input_filename))[0]

        trig_template = f"""
        
    @prefix crmdig: <http://www.ics.forth.gr/isl/CRMdig/> .
@prefix crminfluence: <http://www.cidoc-crm.org/cidoc-crm/influence/> .
@prefix oa: <http://www.w3.org/ns/oa#> .
@prefix crmsci: <http://www.ics.forth.gr/isl/CRMsci/> .
@prefix Help: <http://help.researchspace.org/resource/> .
@prefix bds: <http://www.bigdata.com/rdf/search#> .
@prefix crmba: <http://www.cidoc-crm.org/cidoc-crm/CRMba/> .
@prefix prov: <http://www.w3.org/ns/prov#> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
@prefix mbdiaries-annotation: <https://mbdiaries.itatti.harvard.edu/annotation/> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix iiif: <http://iiif.io/api/> .
@prefix crm: <http://www.cidoc-crm.org/cidoc-crm/> .
@prefix sim: <http://purl.org/ontology/similarity/> .
@prefix fc: <https://collection.itatti.harvard.edu/resource/custom/fc/> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix User: <http://www.researchspace.org/resource/user/> .
@prefix mbdiaries-type: <https://mbdiaries.itatti.harvard.edu/resource/type/> .
@prefix forms: <http://www.researchspace.org/resource/system/forms/> .
@prefix owl: <http://www.w3.org/2002/07/owl#> .
@prefix rshelp: <http://researchspace.org/help/> .
@prefix sp: <http://spinrdf.org/sp#> .
@prefix Platform: <http://www.researchspace.org/resource/system/> .
@prefix mbdiaries: <https://mbdiaries.itatti.harvard.edu/resource/> .
@prefix fr: <https://collection.itatti.harvard.edu/resource/custom/fr/> .
@prefix crmgeo: <http://www.ics.forth.gr/isl/CRMgeo/> .
@prefix skos: <http://www.w3.org/2004/02/skos/core#> .
@prefix mbdiaries_forms: <http://mbdiaries.itatti.harvard.edu/resource/forms> .
@prefix schema: <http://schema.org/> .
@prefix rso: <http://www.researchspace.org/ontology/> .
@prefix Admin: <http://www.researchspace.org/resource/admin/> .
@prefix vitiiif: <https://iiif.itatti.harvard.edu/iiif/2/> .
@prefix ontodia: <http://ontodia.org/schema/v1#> .
@prefix frbroo: <http://iflastandards.info/ns/fr/frbr/frbroo/> .
@prefix crmarchaeo: <http://www.cidoc-crm.org/cidoc-crm/CRMarchaeo/> .
@prefix rsp: <http://www.researchspace.org/resource/> .
@prefix Default: <https://collection.itatti.harvard.edu/resource/> .
@prefix mbdiaries-document: <https://mbdiaries.itatti.harvard.edu/document/> .
@prefix mbdiaries-ontology: <https://mbdiaries.itatti.harvard.edu/ontology/> .
@prefix ldp: <http://www.w3.org/ns/ldp#> .

    <https://mbdiaries.itatti.harvard.edu/annotation/{uuids['main']}/container/context> {{
      <https://mbdiaries.itatti.harvard.edu/diary/1894-95/document/{ndiary}/offset-{uuids['uuid7']}>
        a oa:TextPositionSelector;
        oa:end "{date_data.get('start_position', '')}"^^xsd:nonNegativeInteger;
        oa:start "{date_data.get('start_position', '')}"^^xsd:nonNegativeInteger .

      <https://mbdiaries.itatti.harvard.edu/diary/1894-95/document/{ndiary}/offset-{uuids['uuid3']}>
        a oa:TextPositionSelector;
        oa:end "{date_data.get('end_position', '')}"^^xsd:nonNegativeInteger;
        oa:start "{date_data.get('end_position', '')}"^^xsd:nonNegativeInteger .

      mbdiaries-annotation:{uuids['main']} a oa:Annotation, crmdig:D29_Annotation_Object;
        crmdig:L48i_was_annotation_created_by <https://mbdiaries.itatti.harvard.edu/diary/1894-95/document/{ndiary}/annotation-event-{uuids['uuid2']}>;
        oa:hasBody <https://mbdiaries.itatti.harvard.edu/annotation/{uuids['main']}/body>;
        oa:hasTarget <https://mbdiaries.itatti.harvard.edu/diary/1894-95/document/{ndiary}/range-source-{uuids['uuid4']}> .

      <https://mbdiaries.itatti.harvard.edu/annotation/{uuids['main']}/container>
        a ldp:Resource, prov:Entity;
        prov:wasAttributedTo User:agent;
        prov:generatedAtTime "{creation_date}"^^xsd:dateTime .

      <https://mbdiaries.itatti.harvard.edu/diary/1894-95/document/{ndiary}/range-source-{uuids['uuid4']}>
        a oa:SpecificResource;
        oa:hasSource <https://mbdiaries.itatti.harvard.edu/diary/1894-95/document/{ndiary}>;
        oa:hasSelector <https://mbdiaries.itatti.harvard.edu/diary/1894-95/document/{ndiary}/range-{uuids['uuid4']}>;
        rdf:value "{date_data.get('name', '')}" .

      <https://mbdiaries.itatti.harvard.edu/diary/1894-95/document/{ndiary}/range-{uuids['uuid4']}>
        a oa:RangeSelector;
        oa:hasEndSelector <https://mbdiaries.itatti.harvard.edu/diary/1894-95/document/{ndiary}/xpath-{uuids['uuid5']}>;
        oa:hasStartSelector <https://mbdiaries.itatti.harvard.edu/diary/1894-95/document/{ndiary}/xpath-{uuids['uuid6']}> .

      <https://mbdiaries.itatti.harvard.edu/diary/1894-95/document/{ndiary}/xpath-{uuids['uuid6']}>
        a oa:XPathSelector;
        oa:refinedBy <https://mbdiaries.itatti.harvard.edu/diary/1894-95/document/{ndiary}/offset-{uuids['uuid7']}>;
        rdf:value "{date_data.get('line', '/p[1]')}" .

      <https://mbdiaries.itatti.harvard.edu/diary/1894-95/document/{ndiary}/xpath-{uuids['uuid5']}>
        a oa:XPathSelector;
        oa:refinedBy <https://mbdiaries.itatti.harvard.edu/diary/1894-95/document/{ndiary}/offset-{uuids['uuid3']}>;
        rdf:value "{date_data.get('line', '/p[1]')}" .

      <https://mbdiaries.itatti.harvard.edu/diary/1894-95/document/{ndiary}/annotation-event-{uuids['uuid2']}>
        a crmdig:D30_Annotation_Event;
        crm:P4_has_time_span <https://mbdiaries.itatti.harvard.edu/diary/1894-95/document/{ndiary}/annotation-event-{uuids['uuid2']}/modifiedAt>;
        crm:P14_carried_out_by User:agent .

      <https://mbdiaries.itatti.harvard.edu/annotation/{uuids['main']}/body>
        a crm:E52_time_span;
        crm:P181b_begin_of_the_end "{date_data.get('date', "")}"^^xsd:dateTime.

      <https://mbdiaries.itatti.harvard.edu/diary/1894-95/document/{ndiary}/annotation-event-{uuids['uuid2']}/modifiedAt>
        crm:P81b_begin_of_the_end "{creation_date}"^^xsd:dateTime;
        crm:P81a_end_of_the_begin "{creation_date}"^^xsd:dateTime .

      _:node1i8224na8x5257 ldp:contains <https://mbdiaries.itatti.harvard.edu/annotation/{uuids['main']}/container> .
    }}

    {{
      _:node1i8224na8x5257 a ldp:Container, ldp:Resource, prov:Entity .
    }}"""

        return trig_template



    def process_batch(self, input_directory):
        """Extract dates from every ``.txt`` file in a directory and save each as a TriG file.

        Files are handled in natural order. For each file the dates returned by
        ``self.extract_dates`` are located in the text with
        ``self.find_place_positions`` (despite its name, it is used here to
        locate the date string), converted with ``self.convert_to_trig`` and
        written to ``<output_dir>/<file stem>_date_<n>_<date name>.trig``.
        An error while handling one file is printed and the batch continues
        with the next file.

        Args:
            input_directory: Path of the directory containing the ``.txt`` files.

        Returns:
            None. Progress and errors are reported with ``print``.
        """
        if not os.path.exists(input_directory):
            print(f"Input directory {input_directory} does not exist.")
            return
        # An existing regular file would otherwise make os.listdir() raise
        # NotADirectoryError outside of any error handling.
        if not os.path.isdir(input_directory):
            print(f"Input path {input_directory} is not a directory.")
            return

        # exist_ok avoids the check-then-create race of a separate exists() test.
        os.makedirs(self.output_dir, exist_ok=True)

        # Natural sort so that "10.txt" is processed after "9.txt".
        files = [f for f in os.listdir(input_directory) if f.endswith('.txt')]
        sorted_files = natsorted(files)

        for filename in sorted_files:
            filepath = os.path.join(input_directory, filename)

            try:
                with open(filepath, 'r', encoding='utf-8') as f:
                    text = f.read()

                dates_data = self.extract_dates(text)

                if not dates_data or not dates_data.get("dates"):
                    print(f"No dates extracted from {filename}")
                    continue

                base_name = os.path.splitext(filename)[0]

                for i, date in enumerate(dates_data["dates"]):
                    # Locate the date string inside the page text.
                    position_data = self.find_place_positions(text, date['name'])

                    if position_data:
                        # Position fields take precedence over the extracted ones.
                        complete_date_data = {**date, **position_data}
                    else:
                        # Fallback when the date string cannot be located.
                        complete_date_data = {
                            **date,
                            'start_position': '',
                            'end_position': '',
                            'line': f"/p[{i+1}]"
                        }
                    trig_content = self.convert_to_trig(complete_date_data, filename)

                    # Keep only characters that are safe in a file name.
                    safe_date_name = "".join(
                        x for x in date['name'].lower()
                        if x.isalnum() or x in (' ', '-', '_')
                    )

                    # The original file stem is used as prefix of the output name.
                    output_filename = f"{self.output_dir}/{base_name}_date_{i+1}_{safe_date_name}.trig"
                    with open(output_filename, 'w', encoding='utf-8') as f:
                        f.write(trig_content)
                    print(f"Processed and saved {output_filename}")

            except Exception as e:
                # Best-effort batch: report the failure and move on to the next file.
                print(f"Error processing {filename}: {str(e)}")

def main():
    """Run the date-extraction batch over the ``txt`` directory."""
    # Directory that holds the diary pages as text files
    source_directory = "txt"

    # Replace the empty string with your actual API key
    rdf_processor = TextToRDFProcessor('')

    # Convert every text file found in the directory
    rdf_processor.process_batch(source_directory)


if __name__ == "__main__":
    main()

Processed and saved rdf_output/4_date_1_may 1893.trig
Processed and saved rdf_output/5_date_1_february 14 1894.trig
Processed and saved rdf_output/6_date_1_saturday the 3rd.trig
Processed and saved rdf_output/7_date_1_feb 15 1894.trig
No dates extracted from 8.txt
Processed and saved rdf_output/9_date_1_friday feb 16 1894.trig
Processed and saved rdf_output/9_date_2_saturday feb 17 1894.trig
Processed and saved rdf_output/10_date_1_sunday feb 18 1894.trig
Processed and saved rdf_output/10_date_2_monday feb 19 1894.trig
Processed and saved rdf_output/11_date_1_tuesday feb 20 1894.trig
Processed and saved rdf_output/12_date_1_wednesday feb 21 1894.trig
Processed and saved rdf_output/12_date_2_thursday feb 22 1894.trig
Processed and saved rdf_output/13_date_1_friday feb 23 1894.trig
Processed and saved rdf_output/13_date_2_saturday feb 24 1894.trig
Processed and saved rdf_output/14_date_1_sunday feb 25 1894.trig
Processed and saved rdf_output/14_date_2_monday feb 26 1894.trig
Processed an

### Event creation 
This code analyzes a diary text together with its corresponding place and date TriG files to create events that follow a predefined template aligned with the main data model (detailed in the README). It supports generating sequential events across multiple diary pages, provided the pages are supplied in chronological order. Besides processing the current text, the code also anticipates subsequent events by looking ahead at the following texts, extracting and structuring their relevant values in advance so that consecutive events can be linked.

In [12]:
import os
import json
import uuid
import anthropic
from datetime import datetime
import shutil
import re
from pathlib import Path
from typing import Dict, List, Optional, Tuple

def natural_sort_key(s):
    """
    Build a sort key that orders embedded numbers numerically ("9" before "10").

    The string form of ``s`` is split on runs of ASCII digits. Digit runs
    become ints and the text between them is lower-cased, so the key always
    alternates str, int, str, ... and two keys can be compared element-wise.

    Args:
        s: Value to build the key for; it is converted with ``str()``.

    Returns:
        List alternating lower-cased text chunks and ints.
    """
    parts = re.split(r'([0-9]+)', str(s))
    # With a single capturing group, re.split puts the matched digit runs at
    # the odd indices. Deciding by position instead of str.isdigit() keeps
    # non-ASCII digits (e.g. "²"), which the pattern never matches, as text:
    # int() would reject some of them and others would put an int in a text
    # slot and break key comparison.
    return [int(part) if index % 2 else part.lower()
            for index, part in enumerate(parts)]

class EventProcessor:
    """Create event TriG files by matching date and place annotation TriG files.

    The diary text and its annotation files are sent to the Claude API, which
    returns a JSON list of events; each event is then serialised as a TriG
    named graph in ``self.output_dir``.
    """

    def __init__(self, api_key: str):
        """Initialize EventProcessor with API key and create output directory."""
        self.client = anthropic.Anthropic(api_key=api_key)
        self.output_dir = Path("rdf_output")
        self.output_dir.mkdir(exist_ok=True)

    def collect_files(self, text_id: str, trig_dir: Path,
                      text_dir: Optional[Path] = None) -> Tuple[str, List[Dict], List[Dict]]:
        """
        Collect text and trig files for a given text ID using natural sorting.

        Args:
            text_id: The ID of the text file
            trig_dir: Directory containing trig files
            text_dir: Directory containing ``<text_id>.txt``; defaults to
                ``./1891/1891/txt`` (the previously hard-coded location)

        Returns:
            Tuple containing text content and lists of date and place file dictionaries

        Raises:
            FileNotFoundError: If the text file does not exist
        """
        text_dir = Path("./1891/1891/txt") if text_dir is None else Path(text_dir)
        text_file = text_dir / f"{text_id}.txt"
        if not text_file.exists():
            raise FileNotFoundError(f"Text file {text_file} not found")

        text_content = text_file.read_text(encoding='utf-8')

        date_files = []
        place_files = []

        # Use natural sorting for trig files
        trig_files = sorted(Path(trig_dir).glob(f"{text_id}_*.trig"),
                            key=lambda x: natural_sort_key(x.stem))

        for trig_file in trig_files:
            entry = {'filename': trig_file.name,
                     'content': trig_file.read_text(encoding='utf-8')}
            stem = trig_file.stem.lower()
            # NOTE(review): substring matching means a place file whose name
            # happens to contain "date" is classified as a date file - confirm
            # the naming scheme of the place files makes this impossible.
            if 'date' in stem:
                date_files.append(entry)
            elif 'place' in stem:
                place_files.append(entry)

        return text_content, date_files, place_files

    def extract_events(self, text: str, date_files: List[Dict], place_files: List[Dict]) -> Optional[Dict]:
        """
        Extract events using Claude API with both text and trig files.

        Args:
            text: The diary text content
            date_files: List of date trig file dictionaries
            place_files: List of place trig file dictionaries

        Returns:
            Dictionary containing extracted events, or None when the request,
            the JSON parsing or the UUID sequence validation fails (the error
            is printed)
        """
        # The JSON example must itself be valid JSON (no trailing comma):
        # a model that copies it verbatim would otherwise produce output that
        # json.loads() rejects.
        system_prompt = """Analyze the diary excerpt and the provided trig files to create a sequence of events representing Mary Berenson's movements in time and space. Each event should be created by matching places and dates from the text that are also represented in the trig files.

For each event pair, you need to:
1. Match places and dates that clearly belong together in the text
2. Find the corresponding trig files for both the place and date
3. Extract the UUIDs from the body annotations in the trig files (found in the pattern: annotation/UUID/body)
4. Create a chronologically ordered sequence of events

Return the results in this exact format, with no additional text:
{
    "events": [
        {
            "event_value": "Place, day month year",
            "place_annotation_uuid": "uuid-from-place-trig",
            "date_annotation_uuid": "uuid-from-date-trig",
            "event_uuid": "new-generated-uuid",
            "next_event_value": "Place, day month year",
            "next_event_uuid": "uuid-for-next-event"
        }
    ]
}"""

        content = f"""Here is the diary excerpt:
        {text}

        Here are the date trig files:
        {'-' * 50}
        """

        for date_file in sorted(date_files, key=lambda x: natural_sort_key(x['filename'])):
            content += f"\nFile: {date_file['filename']}\n{date_file['content']}\n{'-' * 50}"

        content += "\n\nHere are the place trig files:\n" + ('-' * 50)

        for place_file in sorted(place_files, key=lambda x: natural_sort_key(x['filename'])):
            content += f"\nFile: {place_file['filename']}\n{place_file['content']}\n{'-' * 50}"

        try:
            response = self.client.messages.create(
                model="claude-3-sonnet-20240229",
                max_tokens=4096,
                system=system_prompt,
                temperature=0,
                messages=[
                    {
                        "role": "user",
                        "content": f"Please analyze the following text and trig files to extract and match events:\n\n{content}"
                    }
                ]
            )

            # Keep only the outermost {...} span in case the reply has extra text.
            response_text = response.content[0].text
            json_start = response_text.find('{')
            json_end = response_text.rfind('}') + 1

            if json_start == -1 or json_end == 0:
                raise ValueError("No valid JSON found in response")

            events_data = json.loads(response_text[json_start:json_end])
            self.validate_event_uuid_sequence(events_data)
            return events_data

        except Exception as e:
            # Best-effort: callers treat None as "no events for this text".
            print(f"Error extracting events: {str(e)}")
            return None

    def validate_event_uuid_sequence(self, events_data: Dict) -> None:
        """
        Validate that next_event_uuid matches the event_uuid of the next event.

        The last event is not checked because it has no successor in the list.

        Args:
            events_data: Dictionary containing event data

        Raises:
            ValueError: If an event's next_event_uuid differs from the
                event_uuid of the event that follows it
        """
        events = events_data.get('events') or []
        for i, (current_event, next_event) in enumerate(zip(events, events[1:])):
            if current_event.get('next_event_uuid') != next_event.get('event_uuid'):
                raise ValueError(
                    f"UUID sequence mismatch: Event {i}'s next_event_uuid "
                    f"({current_event.get('next_event_uuid')}) does not match "
                    f"Event {i+1}'s event_uuid ({next_event.get('event_uuid')})"
                )

    def validate_uuids(self, events_data: Dict, date_files: List[Dict], place_files: List[Dict]) -> bool:
        """
        Validate that UUIDs in events match those in trig files.

        An event whose place or date UUID is missing, empty or not a string is
        treated as invalid.

        Args:
            events_data: Dictionary containing event data
            date_files: List of date trig file dictionaries
            place_files: List of place trig file dictionaries

        Returns:
            Boolean indicating if all UUIDs are valid
        """
        def appears_in(identifier, files) -> bool:
            # "None in str" raises TypeError and "'' in str" is always True,
            # so only non-empty strings are searched for.
            if not isinstance(identifier, str) or not identifier:
                return False
            return any(identifier in trig_file['content'] for trig_file in files)

        for event in events_data.get('events') or []:
            place_found = appears_in(event.get('place_annotation_uuid'), place_files)
            date_found = appears_in(event.get('date_annotation_uuid'), date_files)

            if not (place_found and date_found):
                print(f"UUID validation failed for event: {event.get('event_value', '<unknown>')}")
                return False
        return True

    def convert_to_trig(self, event_data: Dict) -> str:
        """
        Convert a single event data to Trig format.

        The ``crm:P183_ends_before_the_start`` link is written only when the
        event has a non-empty ``next_event_uuid``; otherwise the event would
        point at the bare ``.../event/`` IRI, which identifies no event.

        Args:
            event_data: Dictionary containing event data

        Returns:
            String containing Trig format data
        """
        event_iri = f"https://mbdiaries.itatti.harvard.edu/event/{event_data.get('event_uuid', '')}"
        annotation_base = "https://mbdiaries.itatti.harvard.edu/annotation"

        statements = [
            "a crm:E5_event",
            f"crm:P160_has_temporal_projection <{annotation_base}/{event_data.get('date_annotation_uuid', '')}/body>",
            f"crm:P161_has_spatial_projection <{annotation_base}/{event_data.get('place_annotation_uuid', '')}/body>",
        ]
        next_event_uuid = event_data.get('next_event_uuid')
        if next_event_uuid:
            statements.append(
                f"crm:P183_ends_before_the_start <https://mbdiaries.itatti.harvard.edu/event/{next_event_uuid}>"
            )
        properties = ";\n            ".join(statements)

        trig_template = f"""
@prefix crmdig: <http://www.ics.forth.gr/isl/CRMdig/> .
@prefix crminfluence: <http://www.cidoc-crm.org/cidoc-crm/influence/> .
@prefix oa: <http://www.w3.org/ns/oa#> .
@prefix crmsci: <http://www.ics.forth.gr/isl/CRMsci/> .
@prefix Help: <http://help.researchspace.org/resource/> .
@prefix bds: <http://www.bigdata.com/rdf/search#> .
@prefix crmba: <http://www.cidoc-crm.org/cidoc-crm/CRMba/> .
@prefix prov: <http://www.w3.org/ns/prov#> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
@prefix mbdiaries-annotation: <https://mbdiaries.itatti.harvard.edu/annotation/> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix iiif: <http://iiif.io/api/> .
@prefix crm: <http://www.cidoc-crm.org/cidoc-crm/> .
@prefix sim: <http://purl.org/ontology/similarity/> .
@prefix fc: <https://collection.itatti.harvard.edu/resource/custom/fc/> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix User: <http://www.researchspace.org/resource/user/> .
@prefix mbdiaries-type: <https://mbdiaries.itatti.harvard.edu/resource/type/> .
@prefix forms: <http://www.researchspace.org/resource/system/forms/> .
@prefix owl: <http://www.w3.org/2002/07/owl#> .
@prefix rshelp: <http://researchspace.org/help/> .
@prefix sp: <http://spinrdf.org/sp#> .
@prefix Platform: <http://www.researchspace.org/resource/system/> .
@prefix mbdiaries: <https://mbdiaries.itatti.harvard.edu/resource/> .
@prefix fr: <https://collection.itatti.harvard.edu/resource/custom/fr/> .
@prefix crmgeo: <http://www.ics.forth.gr/isl/CRMgeo/> .
@prefix skos: <http://www.w3.org/2004/02/skos/core#> .
@prefix mbdiaries_forms: <http://mbdiaries.itatti.harvard.edu/resource/forms> .
@prefix schema: <http://schema.org/> .
@prefix rso: <http://www.researchspace.org/ontology/> .
@prefix Admin: <http://www.researchspace.org/resource/admin/> .
@prefix vitiiif: <https://iiif.itatti.harvard.edu/iiif/2/> .
@prefix ontodia: <http://ontodia.org/schema/v1#> .
@prefix frbroo: <http://iflastandards.info/ns/fr/frbr/frbroo/> .
@prefix crmarchaeo: <http://www.cidoc-crm.org/cidoc-crm/CRMarchaeo/> .
@prefix rsp: <http://www.researchspace.org/resource/> .
@prefix Default: <https://collection.itatti.harvard.edu/resource/> .
@prefix mbdiaries-document: <https://mbdiaries.itatti.harvard.edu/document/> .
@prefix mbdiaries-ontology: <https://mbdiaries.itatti.harvard.edu/ontology/> .
@prefix ldp: <http://www.w3.org/ns/ldp#> .

        <{event_iri}/container/context> {{
        <{event_iri}>
            {properties} .

        _:node1i8224na8x5257 ldp:contains <{event_iri}/container> .
              }}
        {{
        _:node1i8224na8x5257 a ldp:Container, ldp:Resource, prov:Entity .
        }}"""
        return trig_template

    def save_events_as_trig(self, events_data: Dict, text_id: str):
        """
        Save events to individual Trig files.

        Each event is written to ``<output_dir>/<text_id>_event_<event_uuid>.trig``.
        Events without an ``event_uuid`` are reported and skipped.

        Args:
            events_data: Dictionary containing event data
            text_id: The ID of the text file
        """
        if not events_data or 'events' not in events_data:
            print("No events to save")
            return

        for i, event in enumerate(events_data['events'] or [], 1):
            event_uuid = event.get('event_uuid')
            if not event_uuid:
                # Without a UUID there is neither a file name nor an event IRI.
                print(f"Error saving Trig file for event {i}: missing event_uuid")
                continue

            trig_content = self.convert_to_trig(event)
            trig_filename = self.output_dir / f"{text_id}_event_{event_uuid}.trig"

            try:
                trig_filename.write_text(trig_content, encoding='utf-8')
                print(f"Saved Trig file: {trig_filename}")
            except OSError as e:
                print(f"Error saving Trig file for event {i}: {str(e)}")

def _collect_trig_files(trig_dir, text_id):
    """Read the date and place trig files of one text, in natural order.

    Returns a ``(date_files, place_files)`` tuple of lists of
    ``{'filename': ..., 'content': ...}`` dictionaries.
    """
    date_files, place_files = [], []
    trig_files = sorted(trig_dir.glob(f"{text_id}_*.trig"),
                        key=lambda x: natural_sort_key(x.stem))
    for trig_file in trig_files:
        entry = {'filename': trig_file.name,
                 'content': trig_file.read_text(encoding='utf-8')}
        stem = trig_file.stem.lower()
        if 'date' in stem:
            date_files.append(entry)
        elif 'place' in stem:
            place_files.append(entry)
    return date_files, place_files


def _events_for_text(processor, text_file, trig_dir, cache):
    """Return ``(has_trig_files, events_data)`` for a text, extracting at most once.

    Results are memoised in ``cache`` (keyed by the file stem) so that a text
    inspected as look-ahead is not sent to the API again when it becomes the
    current text, and so that both uses see the same event UUIDs.
    """
    text_id = text_file.stem
    if text_id not in cache:
        date_files, place_files = _collect_trig_files(trig_dir, text_id)
        if date_files and place_files:
            events_data = processor.extract_events(
                text_file.read_text(encoding='utf-8'), date_files, place_files)
            cache[text_id] = (True, events_data)
        else:
            cache[text_id] = (False, None)
    return cache[text_id]


def main():
    """Extract events for every text in ``./txt`` and chain them across texts.

    For each text (in natural order) the events are extracted, the last event
    is linked to the first event found in the next three texts, and the
    result is written as JSON to ``output`` and as Trig files by the processor.
    """
    api_key = ""
    input_dir = Path("./txt")
    trig_dir = Path("rdf_output")
    output_dir = Path("output")
    output_dir.mkdir(exist_ok=True)

    try:
        processor = EventProcessor(api_key)

        # Get naturally sorted list of all text files
        text_files = sorted(input_dir.glob('*.txt'),
                            key=lambda x: natural_sort_key(x.stem))

        events_cache = {}

        # Process each text file
        for i, text_file in enumerate(text_files):
            text_id = text_file.stem

            has_trig_files, current_events_data = _events_for_text(
                processor, text_file, trig_dir, events_cache)
            # The entry is only needed while the text can still be a look-ahead.
            events_cache.pop(text_id, None)

            if not has_trig_files:
                print(f"Skipping {text_id}: No matching trig files found")
                continue

            if not current_events_data or 'events' not in current_events_data:
                # Nothing to link or save; going on would subscript None below.
                print(f"Skipping {text_id}: No events extracted")
                continue

            current_events = current_events_data['events'] or []

            # Reset for every text so a value from a previous iteration is
            # never reused, then look for the first event in the next 3 texts.
            next_event = None
            for next_text_file in text_files[i + 1:i + 4]:
                _, next_events_data = _events_for_text(
                    processor, next_text_file, trig_dir, events_cache)
                if next_events_data and next_events_data.get('events'):
                    next_event = next_events_data['events'][0]
                    break

            # Update the last event in current text with next event information
            if next_event and next_event.get('event_uuid') and current_events:
                last_event = current_events[-1]
                last_event['next_event_value'] = next_event.get('event_value', '')
                last_event['next_event_uuid'] = next_event['event_uuid']

            # Save processed events
            output_json_path = output_dir / f"{text_id}_events.json"
            with output_json_path.open('w', encoding='utf-8') as f:
                json.dump(current_events_data, f, indent=2)

            # Save events as Trig files
            processor.save_events_as_trig(current_events_data, text_id)

            print(f"\nProcessed {text_id}:")
            for event in current_events:
                print(f"\nEvent: {event.get('event_value', '')}")
                print(f"Place UUID: {event.get('place_annotation_uuid', '')}")
                print(f"Date UUID: {event.get('date_annotation_uuid', '')}")
                print(f"Event UUID: {event.get('event_uuid', '')}")
                if 'next_event_value' in event:
                    print(f"Next Event: {event['next_event_value']}")
                    print(f"Next Event UUID: {event.get('next_event_uuid', '')}")

    except Exception as e:
        # Top-level boundary: report the failure together with setup hints.
        print(f"Error in main execution: {str(e)}")
        print("Please ensure:")
        print(f"1. The '{input_dir}' directory exists and contains the text files")
        print(f"2. The date and place trig files are in the '{trig_dir}' directory")

if __name__ == "__main__":
    main()

Saved Trig file: rdf_output\4_event_e7b6d9e0-f9c4-4c8f-9f9d-5a0d5c8d1d7a.trig

Processed 4:

Event: Ancona, May 1893
Place UUID: affa6c80-3556-41fe-a562-bbf1da506edc
Date UUID: 5b30ee26-265e-4060-8eb2-9183d8124fa5
Event UUID: e7b6d9e0-f9c4-4c8f-9f9d-5a0d5c8d1d7a
Next Event: 12 Lungarno Acciajuoli, Florence, 14 February 1894
Next Event UUID: e8d4a51c-c63d-4ed8-bc94-019975ad4322
Saved Trig file: rdf_output\5_event_e8d4a51c-c63d-4ed8-bc94-019975ad4322.trig
Saved Trig file: rdf_output\5_event_a3d86ee7-ab9c-4c2d-b7d5-6174dbc6db91.trig
Saved Trig file: rdf_output\5_event_f7d4b611-9af7-4941-9dc4-48bd5a11268a.trig

Processed 5:

Event: 12 Lungarno Acciajuoli, Florence, 14 February 1894
Place UUID: e3df3100-eaaf-4799-8d04-03e709a53913
Date UUID: 49dee9bf-730a-4506-907e-be873f7203db
Event UUID: e8d4a51c-c63d-4ed8-bc94-019975ad4322
Next Event: Doney's, 14 February 1894
Next Event UUID: a3d86ee7-ab9c-4c2d-b7d5-6174dbc6db91

Event: Doney's, 14 February 1894
Place UUID: 55a33e85-7268-4506-bda1-78e8a