Produce a JSON file first, and then use it to fill in the TriG template.
The first cell is a test; the remaining cells are what I actually used.

In [None]:
import os
import base64
import json
import pandas as pd
import anthropic
from PIL import Image
from io import BytesIO

def process_text_pair(client, front_text):
    """Extract visited places from a text with Claude and save each one as JSON.

    The text is sent to the Messages API with the assistant turn prefilled
    with an opening brace, so the reply is expected to continue a JSON object.
    When the parsed reply has a ``"places"`` list, every entry is written to
    ``extracted_places/1p_place_<n>_<name>.json``.

    Args:
        client: Object exposing ``messages.create(...)`` (an
            ``anthropic.Anthropic`` client in this notebook).
        front_text: The text to analyse.

    Returns:
        The parsed JSON on success. If the reply is not valid JSON, a dict
        ``{"error": "Invalid JSON format", "raw_text": <reply>}``. If anything
        else fails, ``{"error": <message>}``.
    """
    try:
        system_prompt = """
you're an ai assistant specialized in extracting places from texts as JSON files. JSON that you produce must follow this format:
{
  name: name of the place
  coordinates: coordinates of the place in the format (40,04, 30.55)
  wiki_id:wikidata id of the place
  start_position: starting position of the place in the text
  end_position: ending position of the place in the text
  line: line of the text where the place is mentioned formatted in this way /p[1] where 1 is the line number. Line is different from paragraph
  uuid: a VALID UUID identifier.
}
        """

        initial_prompt = """
extract only the visited places from text. response must be in JSON. Create a different JSON for different places extracted, where you must add a field for the name of the places and one for the coordinates
        """

        response = client.messages.create(
            model="claude-3-5-sonnet-20240620",
            max_tokens=2048,
            system=system_prompt,
            temperature=0,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": initial_prompt},
                        {"type": "text", "text": front_text}
                    ]
                },
                {
                    # Prefill: the reply continues after this opening brace.
                    "role": "assistant",
                    "content": "Here is the JSON requested:\n{"
                }
            ]
        )

        # The prefilled "{" is not part of the reply, so put it back unless
        # the reply already starts with one.
        transcription_text = response.content[0].text
        if not transcription_text.strip().startswith('{'):
            transcription_text = '{' + transcription_text.strip()

        # Parse the text as JSON to validate it
        try:
            transcription = json.loads(transcription_text)
        except json.JSONDecodeError as e:
            print(f"Error parsing JSON: {str(e)}")
            return {"error": "Invalid JSON format", "raw_text": transcription_text}

        # Create output directory if it doesn't exist
        output_dir = "extracted_places"
        os.makedirs(output_dir, exist_ok=True)

        # Save each place as a separate JSON file
        if "places" in transcription:
            for i, place in enumerate(transcription["places"]):
                # Keep only filesystem-safe characters: a raw name containing
                # "/" would otherwise point into a non-existent directory.
                safe_name = "".join(
                    ch for ch in str(place.get('name', '')).lower()
                    if ch.isalnum() or ch in (' ', '-', '_')
                )
                filename = f"{output_dir}/1p_place_{i+1}_{safe_name}.json"
                with open(filename, 'w', encoding='utf-8') as f:
                    json.dump(place, f, indent=4)
                print(f"Saved {filename}")

        return transcription

    except Exception as e:
        # Best-effort boundary: report the failure instead of raising.
        print(f"Error processing text: {str(e)}")
        return {"error": str(e)}

def main():
    """Run the place extraction once on a hard-coded sample diary page.

    Builds an Anthropic client, passes the sample text to
    ``process_text_pair`` and pretty-prints whatever it returns. Any exception
    is reported on stdout rather than propagated.
    """
    # Sample diary page used for the test run.
    sample_page = """Friday, October 25, 1895, Villa Rosa, Fiesole
A day with little to record, as we worked over the proofs of the Florentine Painters. We went down to the Academy
in the afternoon, and spent an hour in the Library, reading the Giornali.
We discovered Michelangelo’s S. Matteo!!!
Saturday, October 26, 1895
Again the damned Proofs - and a run into Florence to get the number of a Pollaiuolo. We meet Miss Hertz and a
        friend of hers in the Uffizi. Miss Hertz made us think of nothing but Bouvard, except possibly Aunty Lill!
        Bernhard then called on La baronne Puliga (“Brada”) and on Benn, while I walked home and devoted myself to some
        deadly dull writing on the French provincial galleries
All the evening we corrected, and corrected, and corrected proofs - until nearly 11, when we read Bernhard’s
        article on the Italians in New York and Boston
"""

    try:
        # The API key is intentionally left blank here; fill it in to run.
        claude = anthropic.Anthropic(api_key='')
        result = process_text_pair(claude, sample_page)
        print("Complete response:")
        print(json.dumps(result, indent=4))
    except Exception as err:
        print(f"Error in main: {err}")


if __name__ == "__main__":
    main()

Saved extracted_places/1p_place_1_villa rosa, fiesole.json
Saved extracted_places/1p_place_2_academy.json
Saved extracted_places/1p_place_3_florence.json
Saved extracted_places/1p_place_4_uffizi.json
Complete response:
{
    "places": [
        {
            "name": "Villa Rosa, Fiesole",
            "coordinates": "(43.8080, 11.2926)",
            "wiki_id": "Q3746",
            "start_position": 29,
            "end_position": 48,
            "line": "/p[1]",
            "uuid": "f47ac10b-58cc-4372-a567-0e02b2c3d479"
        },
        {
            "name": "Academy",
            "coordinates": "(43.7766, 11.2588)",
            "wiki_id": "Q1756942",
            "start_position": 165,
            "end_position": 172,
            "line": "/p[2]",
            "uuid": "550e8400-e29b-41d4-a716-446655440000"
        },
        {
            "name": "Florence",
            "coordinates": "(43.7696, 11.2558)",
            "wiki_id": "Q2044",
            "start_position": 321,
            "e

In [None]:
import os
import json
import uuid
import anthropic
from datetime import datetime

class TextToRDFProcessor:
    """Extract visited places from diary text files and write one TriG file per place.

    For each ``.txt`` file, the text is sent to the Anthropic Messages API,
    the JSON in the reply is parsed, and every entry of its ``"places"`` list
    is rendered through a fixed TriG annotation template and saved in
    ``self.output_dir`` (``rdf_output``).
    """

    def __init__(self, api_key):
        """Create the API client and make sure the output directory exists."""
        self.client = anthropic.Anthropic(api_key=api_key)
        self.output_dir = "rdf_output"
        os.makedirs(self.output_dir, exist_ok=True)

    def extract_places(self, text):
        """Ask Claude for the places mentioned in ``text``.

        Args:
            text: Content of one diary page.

        Returns:
            A dict with a ``"places"`` list (a reply without that key is
            wrapped as a single-element list), or ``None`` if the request or
            the JSON parsing fails.
        """
        # NOTE(review): the start_position/end_position lines below say "date
        # mention" although this prompt is about places - looks like a
        # copy-paste slip; left untouched because it changes model input.
        system_prompt = """You are an AI assistant specialized in extracting places from texts into JSON format.
        Your task is to analyze the text and extract ONLY the places in which Mary Berenson says she was in a particular day, create a JSON object with these exact fields:
        - name: name of the place
        - coordinates: coordinates in format (40.04, 30.55)
        - wiki_id: the CORRECT wikidata ID
        - line: text line reference in format /p[N] where N is line number
        - start_position: character position where date mention starts: Offset resets after each line, so first character of the second line will have start_position = 0
        - end_position: character position where date mention ends. Offset resets after each line.

        Return the results in this exact format, with no additional text:
        {
            "places": [
                {
                    "name": "place name",
                    "coordinates": "(lat, long)",
                    "wiki_id": "Q12345",
                    "start_position": "23",
                    "end_position": "35",
                    "line": "/p[1]"
                }
            ]
        }"""

        # Defined before the try block so the error handler can always refer
        # to it, even when the API call itself is what failed.
        response_text = None
        try:
            response = self.client.messages.create(
                model="claude-3-5-sonnet-latest",
                max_tokens=2048,
                system=system_prompt,
                temperature=0,
                messages=[
                    {
                        "role": "user",
                        "content": f"Please analyze carefully the diary page and extract all the places in which Mary Berenson said SHE WAS IN A PARTICULAR DAY and that could be associated with coordinates. \n\n{text}"
                    }
                ]
            )

            # Extract just the JSON part from Claude's response
            response_text = response.content[0].text

            # Find the JSON object bounds (first "{" to last "}")
            start_idx = response_text.find('{')
            end_idx = response_text.rfind('}') + 1

            if start_idx == -1 or end_idx == 0:
                raise ValueError("No valid JSON found in response")

            places_data = json.loads(response_text[start_idx:end_idx])

            # A bare place object is wrapped so callers always get a list
            if "places" not in places_data:
                places_data = {"places": [places_data]}

            return places_data

        except Exception as e:
            print(f"Error extracting places: {str(e)}")
            if response_text is not None:
                print(f"Raw response: {response_text}")
            return None

    def generate_uuids(self):
        """Generate a set of UUIDs for use in the Trig template"""
        keys = ('main', 'uuid2', 'uuid3', 'uuid4', 'uuid5', 'uuid6', 'uuid7')
        return {key: str(uuid.uuid4()) for key in keys}

    @staticmethod
    def _escape_literal(value):
        """Return ``value`` as text safe to place inside a double-quoted TriG literal."""
        text = str(value)
        # Backslash first, so the escapes added afterwards are not doubled.
        for raw, escaped in (('\\', '\\\\'), ('"', '\\"'), ('\n', '\\n'), ('\r', '\\r')):
            text = text.replace(raw, escaped)
        return text

    def convert_to_trig(self, place_data, input_filename):
        """Render one extracted place as a TriG annotation document.

        Args:
            place_data: Dict for one place; the keys read are ``name``,
                ``coordinates``, ``wiki_id``, ``line``, ``start_position`` and
                ``end_position``. Missing keys render as empty strings
                (``line`` falls back to ``/p[1]``).
            input_filename: Not used by the template; kept for callers.

        Returns:
            The TriG text. Fresh UUIDs and the current local time are embedded
            on every call, so two calls never return identical text.
        """
        # NOTE(review): each TextPositionSelector carries the same offset as
        # both oa:start and oa:end (a single point); presumably intended since
        # one selector refines the start XPath and the other the end - confirm.
        # NOTE(review): the body is typed crm:E52_place; verify the class name
        # against the ontology in use.
        uuids = self.generate_uuids()
        creation_date = datetime.now().isoformat()

        # Model-supplied strings may contain quotes or backslashes that would
        # otherwise terminate the literal early.
        name = self._escape_literal(place_data.get('name', ''))
        coordinates = self._escape_literal(place_data.get('coordinates', ''))
        line = self._escape_literal(place_data.get('line', '/p[1]'))

        trig_template = f"""<https://mbdiaries.itatti.harvard.edu/annotation/{uuids['main']}/container/context> {{
      <https://mbdiaries.itatti.harvard.edu/diary/1894-95/document/5/offset-{uuids['uuid7']}>
        a oa:TextPositionSelector;
        oa:end "{place_data.get('start_position', '')}"^^xsd:nonNegativeInteger;
        oa:start "{place_data.get('start_position', '')}"^^xsd:nonNegativeInteger .

      <https://mbdiaries.itatti.harvard.edu/diary/1894-95/document/5/offset-{uuids['uuid3']}>
        a oa:TextPositionSelector;
        oa:end "{place_data.get('end_position', '')}"^^xsd:nonNegativeInteger;
        oa:start "{place_data.get('end_position', '')}"^^xsd:nonNegativeInteger .

      mbdiaries-annotation:{uuids['main']} a oa:Annotation, crmdig:D29_Annotation_Object;
        crmdig:L48i_was_annotation_created_by <https://mbdiaries.itatti.harvard.edu/diary/1894-95/document/5/annotation-event-{uuids['uuid2']}>;
        oa:hasBody <https://mbdiaries.itatti.harvard.edu/annotation/{uuids['main']}/body>;
        oa:hasTarget <https://mbdiaries.itatti.harvard.edu/diary/1894-95/document/5/range-source-{uuids['uuid4']}> .

      <https://mbdiaries.itatti.harvard.edu/annotation/{uuids['main']}/container>
        a ldp:Resource, prov:Entity;
        prov:wasAttributedTo User:agent;
        prov:generatedAtTime "{creation_date}"^^xsd:dateTime .

      <https://mbdiaries.itatti.harvard.edu/diary/1894-95/document/5/range-source-{uuids['uuid4']}>
        a oa:SpecificResource;
        oa:hasSource <https://mbdiaries.itatti.harvard.edu/diary/1894-95/document/5>;
        oa:hasSelector <https://mbdiaries.itatti.harvard.edu/diary/1894-95/document/5/range-{uuids['uuid4']}>;
        rdf:value "{name}" .

      <https://www.wikidata.org/wiki/{place_data.get('wiki_id', '')}> rdfs:label "{name}" .

      <https://mbdiaries.itatti.harvard.edu/diary/1894-95/document/5/range-{uuids['uuid4']}>
        a oa:RangeSelector;
        oa:hasEndSelector <https://mbdiaries.itatti.harvard.edu/diary/1894-95/document/5/xpath-{uuids['uuid5']}>;
        oa:hasStartSelector <https://mbdiaries.itatti.harvard.edu/diary/1894-95/document/5/xpath-{uuids['uuid6']}> .

      <https://mbdiaries.itatti.harvard.edu/diary/1894-95/document/5/xpath-{uuids['uuid6']}>
        a oa:XPathSelector;
        oa:refinedBy <https://mbdiaries.itatti.harvard.edu/diary/1894-95/document/5/offset-{uuids['uuid7']}>;
        rdf:value "{line}" .

      <https://mbdiaries.itatti.harvard.edu/diary/1894-95/document/5/xpath-{uuids['uuid5']}>
        a oa:XPathSelector;
        oa:refinedBy <https://mbdiaries.itatti.harvard.edu/diary/1894-95/document/5/offset-{uuids['uuid3']}>;
        rdf:value "{line}" .

      <https://mbdiaries.itatti.harvard.edu/diary/1894-95/document/5/annotation-event-{uuids['uuid2']}>
        a crmdig:D30_Annotation_Event;
        crm:P4_has_time_span <https://mbdiaries.itatti.harvard.edu/diary/1894-95/document/5/annotation-event-{uuids['uuid2']}/modifiedAt>;
        crm:P14_carried_out_by User:agent .

      <https://mbdiaries.itatti.harvard.edu/annotation/{uuids['main']}/body>
        a mbdiaries-ontology:Location;
        a crm:E52_place;
        crm:P168_place_is_defined_by "{coordinates}";
        owl:sameAs <https://www.wikidata.org/wiki/{place_data.get('wiki_id', '')}> .

      <https://mbdiaries.itatti.harvard.edu/diary/1894-95/document/5/annotation-event-{uuids['uuid2']}/modifiedAt>
        crm:P81b_begin_of_the_end "{creation_date}"^^xsd:dateTime;
        crm:P81a_end_of_the_begin "{creation_date}"^^xsd:dateTime .

      _:node1i8224na8x5257 ldp:contains <https://mbdiaries.itatti.harvard.edu/annotation/{uuids['main']}/container> .
    }}

    {{
      _:node1i8224na8x5257 a ldp:Container, ldp:Resource, prov:Entity .
    }}"""

        return trig_template

    def process_batch(self, input_directory):
        """Process all .txt files in the input directory.

        Each file is read, its places are extracted, and one
        ``<file>_place_<n>_<name>.trig`` file per place is written to
        ``self.output_dir``. A failure on one file is printed and does not
        stop the remaining files.
        """
        # Ensure input directory exists
        if not os.path.exists(input_directory):
            print(f"Input directory {input_directory} does not exist.")
            return

        # Create output directory if it doesn't exist
        os.makedirs(self.output_dir, exist_ok=True)

        # Sorted so runs are reproducible; os.listdir order is arbitrary.
        for filename in sorted(os.listdir(input_directory)):
            if not filename.endswith(".txt"):
                continue
            filepath = os.path.join(input_directory, filename)

            try:
                # Read the text file
                with open(filepath, 'r', encoding='utf-8') as f:
                    text = f.read()

                # Extract places
                places_data = self.extract_places(text)

                if not places_data or not places_data.get("places"):
                    print(f"No places extracted from {filename}")
                    continue

                base_name = os.path.splitext(filename)[0]

                # Process each place
                for i, place in enumerate(places_data["places"]):
                    # Convert to Trig
                    trig_content = self.convert_to_trig(place, filename)

                    # Create safe filename; a missing name yields an empty
                    # suffix instead of aborting the rest of the file.
                    safe_place_name = "".join(
                        x for x in str(place.get('name', '')).lower()
                        if x.isalnum() or x in (' ', '-', '_')
                    )

                    # Save to file with original filename as prefix
                    output_filename = f"{self.output_dir}/{base_name}_place_{i+1}_{safe_place_name}.trig"
                    with open(output_filename, 'w', encoding='utf-8') as f:
                        f.write(trig_content)
                    print(f"Processed and saved {output_filename}")

            except Exception as e:
                print(f"Error processing {filename}: {str(e)}")

def main():
    """Run place extraction over every text file in the configured directory."""
    # Directory of .txt diary pages to process; change as needed.
    source_dir = "./1891/1891/txt"

    # Fill in the Anthropic API key before running.
    key = ''
    TextToRDFProcessor(key).process_batch(source_dir)


if __name__ == "__main__":
    main()

No places extracted from 100.txt
No places extracted from 101.txt
Processed and saved rdf_output/102_place_1_lake garda.trig
Processed and saved rdf_output/103_place_1_ratisbon.trig
Processed and saved rdf_output/103_place_2_schottenkirche.trig
Processed and saved rdf_output/103_place_3_cathedral of regensburg.trig
Processed and saved rdf_output/104_place_1_regensburg.trig
Processed and saved rdf_output/104_place_2_walhalla.trig
No places extracted from 105.txt
Processed and saved rdf_output/106_place_1_munich.trig
Processed and saved rdf_output/107_place_1_regensburg.trig
Processed and saved rdf_output/107_place_2_munich.trig
Processed and saved rdf_output/107_place_3_pinacothek.trig
Processed and saved rdf_output/107_place_4_glaspalast.trig
Processed and saved rdf_output/108_place_1_propylaia.trig
Processed and saved rdf_output/108_place_2_glyptothek.trig
Processed and saved rdf_output/109_place_1_munich.trig
Processed and saved rdf_output/110_place_1_munich.trig
Processed and saved 

KeyboardInterrupt: 

In [None]:
import os
import json
import uuid
import anthropic
from datetime import datetime

class TextToRDFProcessor:
    """Extract dates from diary text files and write one TriG file per date.

    For each ``.txt`` file, the text is sent to the Anthropic Messages API,
    the JSON in the reply is parsed, and every entry of its ``"dates"`` list
    is rendered through a fixed TriG annotation template and saved in
    ``self.output_dir`` (``rdf_output``).
    """

    def __init__(self, api_key):
        """Create the API client and make sure the output directory exists."""
        self.client = anthropic.Anthropic(api_key=api_key)
        self.output_dir = "rdf_output"
        os.makedirs(self.output_dir, exist_ok=True)

    def extract_dates(self, text):
        """Ask Claude for the dates mentioned in ``text``.

        Args:
            text: Content of one diary page.

        Returns:
            A dict with a ``"dates"`` list (a reply without that key is
            wrapped as a single-element list), or ``None`` if the request or
            the JSON parsing fails.
        """
        # NOTE(review): the example timestamp in the prompt uses ":" before
        # the milliseconds ("00:00:00:000Z"); ISO 8601 would use "." - left
        # untouched because it changes model input.
        system_prompt = """You are an AI assistant specialized in extracting dates from texts into JSON format.
        Your task is to analyze the text and extract ONLY dates which Mary Berenson mentioned and that could be associated to a location in which she says she was, create a JSON object with these exact fields:
        - date: date value in ISO 8601 format
        - name: name of the date
        - line: text line reference in format /p[N] where N is line number
        - start_position: character position where date mention starts: Offset resets after each line, so first character of the second line will have start_position = 0
        - end_position: character position where date mention ends. Offset resets after each line.

        Return the results in this exact format, with no additional text:
        {
            "dates": [
                {
                    "name": "25th September 2023",
                    "date": "2023-09-25T00:00:00:000Z",
                    "start_position": "23",
                    "end_position": "35",
                    "line": "/p[1]"
                }
            ]
        }"""

        # Defined before the try block so the error handler can always refer
        # to it, even when the API call itself is what failed.
        response_text = None
        try:
            response = self.client.messages.create(
                model="claude-3-5-sonnet-latest",
                max_tokens=2048,
                system=system_prompt,
                temperature=0,
                messages=[
                    {
                        "role": "user",
                        "content": f"Please analyze carefully the diary page and extract all the dates which Mary Berenson mentioned and that could be associated to a location in which she says she was. \n\n{text}"
                    }
                ]
            )

            # Extract just the JSON part from Claude's response
            response_text = response.content[0].text

            # Find the JSON object bounds (first "{" to last "}")
            start_idx = response_text.find('{')
            end_idx = response_text.rfind('}') + 1

            if start_idx == -1 or end_idx == 0:
                raise ValueError("No valid JSON found in response")

            dates_data = json.loads(response_text[start_idx:end_idx])

            # A bare date object is wrapped so callers always get a list
            if "dates" not in dates_data:
                dates_data = {"dates": [dates_data]}

            return dates_data

        except Exception as e:
            print(f"Error extracting dates: {str(e)}")
            if response_text is not None:
                print(f"Raw response: {response_text}")
            return None

    def generate_uuids(self):
        """Generate a set of UUIDs for use in the Trig template"""
        keys = ('main', 'uuid2', 'uuid3', 'uuid4', 'uuid5', 'uuid6', 'uuid7')
        return {key: str(uuid.uuid4()) for key in keys}

    @staticmethod
    def _escape_literal(value):
        """Return ``value`` as text safe to place inside a double-quoted TriG literal."""
        text = str(value)
        # Backslash first, so the escapes added afterwards are not doubled.
        for raw, escaped in (('\\', '\\\\'), ('"', '\\"'), ('\n', '\\n'), ('\r', '\\r')):
            text = text.replace(raw, escaped)
        return text

    def convert_to_trig(self, date_data, input_filename):
        """Render one extracted date as a TriG annotation document.

        Args:
            date_data: Dict for one date; the keys read are ``name``,
                ``date``, ``line``, ``start_position`` and ``end_position``.
                Missing keys render as empty strings (``line`` falls back to
                ``/p[1]``).
            input_filename: Not used by the template; kept for callers.

        Returns:
            The TriG text. Fresh UUIDs and the current local time are embedded
            on every call, so two calls never return identical text.
        """
        # NOTE(review): each TextPositionSelector carries the same offset as
        # both oa:start and oa:end (a single point); presumably intended since
        # one selector refines the start XPath and the other the end - confirm.
        # NOTE(review): the body is typed mbdiaries-ontology:Location and uses
        # crm:P181b_begin_of_the_end, while the modifiedAt node uses
        # crm:P81b_begin_of_the_end; verify both against the ontology in use.
        uuids = self.generate_uuids()
        creation_date = datetime.now().isoformat()

        # Model-supplied strings may contain quotes or backslashes that would
        # otherwise terminate the literal early.
        name = self._escape_literal(date_data.get('name', ''))
        date_value = self._escape_literal(date_data.get('date', ''))
        line = self._escape_literal(date_data.get('line', '/p[1]'))

        trig_template = f"""<https://mbdiaries.itatti.harvard.edu/annotation/{uuids['main']}/container/context> {{
      <https://mbdiaries.itatti.harvard.edu/diary/1894-95/document/5/offset-{uuids['uuid7']}>
        a oa:TextPositionSelector;
        oa:end "{date_data.get('start_position', '')}"^^xsd:nonNegativeInteger;
        oa:start "{date_data.get('start_position', '')}"^^xsd:nonNegativeInteger .

      <https://mbdiaries.itatti.harvard.edu/diary/1894-95/document/5/offset-{uuids['uuid3']}>
        a oa:TextPositionSelector;
        oa:end "{date_data.get('end_position', '')}"^^xsd:nonNegativeInteger;
        oa:start "{date_data.get('end_position', '')}"^^xsd:nonNegativeInteger .

      mbdiaries-annotation:{uuids['main']} a oa:Annotation, crmdig:D29_Annotation_Object;
        crmdig:L48i_was_annotation_created_by <https://mbdiaries.itatti.harvard.edu/diary/1894-95/document/5/annotation-event-{uuids['uuid2']}>;
        oa:hasBody <https://mbdiaries.itatti.harvard.edu/annotation/{uuids['main']}/body>;
        oa:hasTarget <https://mbdiaries.itatti.harvard.edu/diary/1894-95/document/5/range-source-{uuids['uuid4']}> .

      <https://mbdiaries.itatti.harvard.edu/annotation/{uuids['main']}/container>
        a ldp:Resource, prov:Entity;
        prov:wasAttributedTo User:agent;
        prov:generatedAtTime "{creation_date}"^^xsd:dateTime .

      <https://mbdiaries.itatti.harvard.edu/diary/1894-95/document/5/range-source-{uuids['uuid4']}>
        a oa:SpecificResource;
        oa:hasSource <https://mbdiaries.itatti.harvard.edu/diary/1894-95/document/5>;
        oa:hasSelector <https://mbdiaries.itatti.harvard.edu/diary/1894-95/document/5/range-{uuids['uuid4']}>;
        rdf:value "{name}" .

      <https://mbdiaries.itatti.harvard.edu/diary/1894-95/document/5/range-{uuids['uuid4']}>
        a oa:RangeSelector;
        oa:hasEndSelector <https://mbdiaries.itatti.harvard.edu/diary/1894-95/document/5/xpath-{uuids['uuid5']}>;
        oa:hasStartSelector <https://mbdiaries.itatti.harvard.edu/diary/1894-95/document/5/xpath-{uuids['uuid6']}> .

      <https://mbdiaries.itatti.harvard.edu/diary/1894-95/document/5/xpath-{uuids['uuid6']}>
        a oa:XPathSelector;
        oa:refinedBy <https://mbdiaries.itatti.harvard.edu/diary/1894-95/document/5/offset-{uuids['uuid7']}>;
        rdf:value "{line}" .

      <https://mbdiaries.itatti.harvard.edu/diary/1894-95/document/5/xpath-{uuids['uuid5']}>
        a oa:XPathSelector;
        oa:refinedBy <https://mbdiaries.itatti.harvard.edu/diary/1894-95/document/5/offset-{uuids['uuid3']}>;
        rdf:value "{line}" .

      <https://mbdiaries.itatti.harvard.edu/diary/1894-95/document/5/annotation-event-{uuids['uuid2']}>
        a crmdig:D30_Annotation_Event;
        crm:P4_has_time_span <https://mbdiaries.itatti.harvard.edu/diary/1894-95/document/5/annotation-event-{uuids['uuid2']}/modifiedAt>;
        crm:P14_carried_out_by User:agent .

      <https://mbdiaries.itatti.harvard.edu/annotation/{uuids['main']}/body>
        a mbdiaries-ontology:Location;
        a crm:E52_time_span;
        crm:P181b_begin_of_the_end "{date_value}"^^xsd:dateTime.

      <https://mbdiaries.itatti.harvard.edu/diary/1894-95/document/5/annotation-event-{uuids['uuid2']}/modifiedAt>
        crm:P81b_begin_of_the_end "{creation_date}"^^xsd:dateTime;
        crm:P81a_end_of_the_begin "{creation_date}"^^xsd:dateTime .

      _:node1i8224na8x5257 ldp:contains <https://mbdiaries.itatti.harvard.edu/annotation/{uuids['main']}/container> .
    }}

    {{
      _:node1i8224na8x5257 a ldp:Container, ldp:Resource, prov:Entity .
    }}"""

        return trig_template

    def process_batch(self, input_directory):
        """Process all .txt files in the input directory.

        Each file is read, its dates are extracted, and one
        ``<file>_date_<n>_<name>.trig`` file per date is written to
        ``self.output_dir``. A failure on one file is printed and does not
        stop the remaining files.
        """
        # Ensure input directory exists
        if not os.path.exists(input_directory):
            print(f"Input directory {input_directory} does not exist.")
            return

        # Create output directory if it doesn't exist
        os.makedirs(self.output_dir, exist_ok=True)

        # Sorted so runs are reproducible; os.listdir order is arbitrary.
        for filename in sorted(os.listdir(input_directory)):
            if not filename.endswith(".txt"):
                continue
            filepath = os.path.join(input_directory, filename)

            try:
                # Read the text file
                with open(filepath, 'r', encoding='utf-8') as f:
                    text = f.read()

                # Extract dates
                dates_data = self.extract_dates(text)

                if not dates_data or not dates_data.get("dates"):
                    print(f"No dates extracted from {filename}")
                    continue

                base_name = os.path.splitext(filename)[0]

                # Process each date
                for i, date in enumerate(dates_data["dates"]):
                    # Convert to Trig
                    trig_content = self.convert_to_trig(date, filename)

                    # Create safe filename; a missing name yields an empty
                    # suffix instead of aborting the rest of the file.
                    safe_date_name = "".join(
                        x for x in str(date.get('name', '')).lower()
                        if x.isalnum() or x in (' ', '-', '_')
                    )

                    # Save to file with original filename as prefix
                    output_filename = f"{self.output_dir}/{base_name}_date_{i+1}_{safe_date_name}.trig"
                    with open(output_filename, 'w', encoding='utf-8') as f:
                        f.write(trig_content)
                    print(f"Processed and saved {output_filename}")

            except Exception as e:
                print(f"Error processing {filename}: {str(e)}")

def main():
    """Run date extraction over every text file in the configured directory."""
    # Directory of .txt diary pages to process; change as needed.
    source_dir = "./1891/1891/txt"

    # Fill in the Anthropic API key before running.
    key = ''
    TextToRDFProcessor(key).process_batch(source_dir)


if __name__ == "__main__":
    main()

Processed and saved rdf_output/100_date_1_10 jan 1872.trig
Processed and saved rdf_output/100_date_2_16 jan 1872.trig
Processed and saved rdf_output/100_date_3_1 sept 1873.trig
No dates extracted from 101.txt
Processed and saved rdf_output/102_date_1_20 jan 1876.trig
Processed and saved rdf_output/102_date_2_3 juillet 1870.trig
Processed and saved rdf_output/103_date_1_september 2 1891.trig
No dates extracted from 104.txt
No dates extracted from 105.txt
No dates extracted from 106.txt
Processed and saved rdf_output/107_date_1_september 3 1891.trig
No dates extracted from 108.txt
Processed and saved rdf_output/109_date_1_september 4 1891.trig
Processed and saved rdf_output/110_date_1_saturday september 5 1891.trig
Processed and saved rdf_output/111_date_1_sunday september 6 1891.trig


KeyboardInterrupt: 

In [None]:
import os
import json
import uuid
import anthropic
from datetime import datetime
import shutil
import re
from pathlib import Path
from typing import Dict, List, Optional, Tuple

class EventProcessor:
    """Combine previously generated place and date TriG files into event triples.

    The diary text and its ``*_date_*`` / ``*_place_*`` TriG files are sent to
    the Anthropic Messages API, which is asked to pair places with dates and
    return a chained list of events. Each event is then written as a small
    TriG fragment in ``self.output_dir`` (``rdf_output``).
    """

    def __init__(self, api_key: str):
        """Create the API client and make sure the output directory exists."""
        self.client = anthropic.Anthropic(api_key=api_key)
        self.output_dir = Path("rdf_output")
        self.output_dir.mkdir(exist_ok=True)

    def collect_files(self, text_id: str, trig_dir: Path) -> Tuple[str, List[Dict], List[Dict]]:
        """Collect text and trig files for a given text ID.

        Args:
            text_id: Stem of the text file under ``./1891/1891/txt``.
            trig_dir: Directory searched for ``<text_id>_*.trig`` files.

        Returns:
            ``(text_content, date_files, place_files)`` where each file entry
            is ``{'filename': ..., 'content': ...}``.

        Raises:
            FileNotFoundError: If the text file does not exist.
        """
        text_file = Path(f"./1891/1891/txt/{text_id}.txt")
        if not text_file.exists():
            raise FileNotFoundError(f"Text file {text_file} not found")

        with text_file.open('r', encoding='utf-8') as f:
            text_content = f.read()

        date_files = []
        place_files = []

        for trig_file in trig_dir.glob(f"{text_id}_*.trig"):
            with trig_file.open('r', encoding='utf-8') as f:
                content = f.read()
            # Files are classified by a substring of their name; "date" is
            # checked first, so it wins if both substrings are present.
            stem = trig_file.stem.lower()
            if 'date' in stem:
                date_files.append({'filename': trig_file.name, 'content': content})
            elif 'place' in stem:
                place_files.append({'filename': trig_file.name, 'content': content})

        return text_content, date_files, place_files

    def extract_events(self, text: str, date_files: List[Dict], place_files: List[Dict]) -> Optional[Dict]:
        """Extract events using Claude API with both text and trig files.

        Args:
            text: Diary excerpt.
            date_files: Entries with ``filename`` and ``content`` for date TriG files.
            place_files: Entries with ``filename`` and ``content`` for place TriG files.

        Returns:
            The parsed JSON reply, or ``None`` if the request fails, no JSON
            is found, parsing fails, or the ``next_event_uuid`` chain in the
            reply is inconsistent.
        """
        # NOTE(review): the JSON example in the prompt has a trailing comma
        # after "next_event_uuid", which is not valid JSON; left untouched
        # because it changes model input.
        system_prompt = """Analyze the diary excerpt and the provided trig files to create a sequence of events representing Mary Berenson's movements in time and space. Each event should be created by matching places and dates from the text that are also represented in the trig files.

For each event pair, you need to:
1. Match places and dates that clearly belong together in the text
2. Find the corresponding trig files for both the place and date
3. Extract the UUIDs from the body annotations in the trig files (found in the pattern: annotation/UUID/body)
4. Create a chronologically ordered sequence of events

Return the results in this exact format, with no additional text:
{
    "events": [
        {
            "event_value": "Fiesole, 23 September 1900",
            "place_annotation_uuid": "uuid-from-place-trig",
            "date_annotation_uuid": "uuid-from-date-trig",
            "event_uuid": "new-generated-uuid",
            "next_event_value": "Florence, 24 September 1900",
            "next_event_uuid": "uuid-for-next-event",
        }
    ]
}"""

        content = f"""Here is the diary excerpt:
        {text}

        Here are the date trig files:
        {'-' * 50}
        """

        for date_file in date_files:
            content += f"\nFile: {date_file['filename']}\n{date_file['content']}\n{'-' * 50}"

        content += "\n\nHere are the place trig files:\n" + ('-' * 50)

        for place_file in place_files:
            content += f"\nFile: {place_file['filename']}\n{place_file['content']}\n{'-' * 50}"

        try:
            response = self.client.messages.create(
                model="claude-3-sonnet-20240229",
                max_tokens=4096,
                system=system_prompt,
                temperature=0,
                messages=[
                    {
                        "role": "user",
                        "content": f"Please analyze the following text and trig files to extract and match events:\n\n{content}"
                    }
                ]
            )

            # Keep only the span from the first "{" to the last "}"
            response_text = response.content[0].text
            json_start = response_text.find('{')
            json_end = response_text.rfind('}') + 1

            if json_start == -1 or json_end == 0:
                raise ValueError("No valid JSON found in response")

            events_data = json.loads(response_text[json_start:json_end])
            self.validate_event_uuid_sequence(events_data)
            return events_data

        except Exception as e:
            print(f"Error extracting events: {str(e)}")
            return None

    def validate_event_uuid_sequence(self, events_data: Dict) -> None:
        """Validate that next_event_uuid matches the event_uuid of the next event.

        Raises:
            ValueError: On the first pair of consecutive events whose UUIDs
                do not chain.
        """
        events = events_data.get('events', [])
        for i, (current_event, next_event) in enumerate(zip(events, events[1:])):
            if current_event.get('next_event_uuid') != next_event.get('event_uuid'):
                raise ValueError(
                    f"UUID sequence mismatch: Event {i}'s next_event_uuid "
                    f"({current_event.get('next_event_uuid')}) does not match "
                    f"Event {i+1}'s event_uuid ({next_event.get('event_uuid')})"
                )

    def validate_uuids(self, events_data: Dict, date_files: List[Dict], place_files: List[Dict]) -> bool:
        """Validate that UUIDs in events match those in trig files.

        Returns:
            ``True`` when every event's place and date UUID occurs in the
            content of at least one place / date file respectively; ``False``
            at the first event where either is missing or not found.
        """
        for event in events_data.get('events', []):
            place_uuid = event.get('place_annotation_uuid')
            date_uuid = event.get('date_annotation_uuid')

            # A missing UUID counts as "not found"; testing `None in str`
            # would raise TypeError.
            place_found = bool(place_uuid) and any(
                place_uuid in place_file['content'] for place_file in place_files
            )
            date_found = bool(date_uuid) and any(
                date_uuid in date_file['content'] for date_file in date_files
            )

            if not (place_found and date_found):
                print(f"UUID validation failed for event: {event.get('event_value')}")
                return False
        return True

    def convert_to_trig(self, event_data: Dict) -> str:
        """Convert a single event data to Trig format.

        The event body links to the date body (temporal projection), the place
        body (spatial projection) and the next event's body. Missing keys
        render as empty strings.
        """
        # NOTE(review): the fragment has no closing "." and no graph wrapper;
        # presumably it is embedded elsewhere - confirm before parsing it
        # stand-alone.
        trig_template = f"""
        <https://mbdiaries.itatti.harvard.edu/annotation/{event_data.get('event_uuid', '')}/body>
            a crm:E5_event;
            crm:P160_has_temporal_projection <https://mbdiaries.itatti.harvard.edu/annotation/{event_data.get('date_annotation_uuid', '')}/body>;
            crm:P161_has_spatial_projection <https://mbdiaries.itatti.harvard.edu/annotation/{event_data.get('place_annotation_uuid', '')}/body>;
            crm:P183_ends_before_the_start <https://mbdiaries.itatti.harvard.edu/annotation/{event_data.get('next_event_uuid', '')}/body>
        """
        return trig_template

    def save_events_as_trig(self, events_data: Dict, text_id: str):
        """Save events to individual Trig files.

        One ``<text_id>_event_<event_uuid>.trig`` file is written per event in
        ``self.output_dir``. Events without an ``event_uuid`` are skipped, and
        a write failure is printed without stopping the remaining events.
        """
        if not events_data or 'events' not in events_data:
            print("No events to save")
            return

        for i, event in enumerate(events_data['events'], 1):
            event_uuid = event.get('event_uuid')
            if not event_uuid:
                # Without a UUID there is no filename to write to.
                print(f"Skipping event {i}: missing event_uuid")
                continue

            trig_content = self.convert_to_trig(event)
            trig_filename = self.output_dir / f"{text_id}_event_{event_uuid}.trig"

            try:
                with trig_filename.open('w', encoding='utf-8') as f:
                    f.write(trig_content)
                print(f"Saved Trig file: {trig_filename}")
            except Exception as e:
                print(f"Error saving Trig file for event {i}: {str(e)}")

def _collect_trig_files(trig_dir, text_id):
    """Load the date and place Trig files in *trig_dir* belonging to *text_id*.

    Files are selected with the ``{text_id}_*.trig`` glob and classified by
    whether their lower-cased stem contains ``date`` (checked first) or
    ``place``; any other match is ignored without being read.

    Returns:
        ``(date_files, place_files)``: two lists of dicts with the keys
        ``filename`` and ``content``.
    """
    date_files, place_files = [], []
    for trig_file in trig_dir.glob(f"{text_id}_*.trig"):
        stem = trig_file.stem.lower()
        if 'date' in stem:
            bucket = date_files
        elif 'place' in stem:
            bucket = place_files
        else:
            continue
        bucket.append({
            'filename': trig_file.name,
            'content': trig_file.read_text(encoding='utf-8'),
        })
    return date_files, place_files


def _extract_events_cached(processor, cache, text_file, date_files, place_files):
    """Return the extracted events for *text_file*, extracting at most once.

    Results that contain an ``events`` key are stored in *cache* under the
    file stem, so the look-ahead pass and the later "current text" pass see
    the same event records (and therefore the same event UUIDs). Unusable
    results are returned as-is but not cached, so a later pass tries again.
    """
    text_id = text_file.stem
    if text_id in cache:
        return cache[text_id]

    text_content = text_file.read_text(encoding='utf-8')
    events_data = processor.extract_events(text_content, date_files, place_files)
    if events_data and 'events' in events_data:
        cache[text_id] = events_data
    return events_data


def main():
    """Extract events for every text file and save them as JSON and Trig.

    For each ``*.txt`` file in the input directory (in sorted order):

    1. load the date/place Trig files that share the text's id, skipping the
       text when either kind is missing;
    2. extract the events of the text;
    3. look at up to three following texts for the first available event and
       record it on the last event of the current text as
       ``next_event_value`` / ``next_event_uuid``;
    4. write ``{text_id}_events.json`` to the output directory, save one Trig
       file per event, and print a summary.

    Any exception aborts the run and is reported on stdout.
    """
    # Local import so this block does not depend on the import cell of the file.
    import os

    # The key is taken from the environment instead of being hard-coded; when
    # the variable is unset the value is the empty string, as before.
    api_key = os.environ.get("ANTHROPIC_API_KEY", "")
    input_dir = Path("./1891/1891/txt")
    trig_dir = Path("rdf_output")
    output_dir = Path("output")
    output_dir.mkdir(exist_ok=True)

    try:
        processor = EventProcessor(api_key)

        # Sorted so that "the next texts" below means the following file names.
        text_files = sorted(input_dir.glob('*.txt'))

        # text_id -> extraction result. Without it every look-ahead text would
        # be sent to the model again when it becomes the current text, and the
        # next_event_uuid stored earlier could differ from the UUID that event
        # is finally saved under.
        extraction_cache = {}

        for i, text_file in enumerate(text_files):
            text_id = text_file.stem

            date_files, place_files = _collect_trig_files(trig_dir, text_id)
            if not date_files or not place_files:
                print(f"Skipping {text_id}: No matching trig files found")
                continue

            current_events_data = _extract_events_cached(
                processor, extraction_cache, text_file, date_files, place_files
            )
            # Look-ahead only moves forward, so this entry is never needed again.
            extraction_cache.pop(text_id, None)

            if not current_events_data or 'events' not in current_events_data:
                continue

            # Look for the first event in the next 3 texts
            next_event = None
            for next_text_file in text_files[i + 1:i + 4]:
                next_date_files, next_place_files = _collect_trig_files(
                    trig_dir, next_text_file.stem
                )
                if not (next_date_files and next_place_files):
                    continue

                next_events_data = _extract_events_cached(
                    processor, extraction_cache, next_text_file,
                    next_date_files, next_place_files,
                )
                if next_events_data and next_events_data.get('events'):
                    next_event = next_events_data['events'][0]
                    break

            # Update the last event in current text with next event information
            if next_event and current_events_data['events']:
                last_event = current_events_data['events'][-1]
                last_event['next_event_value'] = next_event['event_value']
                last_event['next_event_uuid'] = next_event['event_uuid']

            # Save processed events
            output_json_path = output_dir / f"{text_id}_events.json"
            with output_json_path.open('w', encoding='utf-8') as f:
                json.dump(current_events_data, f, indent=2)

            # Save events as Trig files
            processor.save_events_as_trig(current_events_data, text_id)

            print(f"\nProcessed {text_id}:")
            for event in current_events_data['events']:
                print(f"\nEvent: {event['event_value']}")
                print(f"Place UUID: {event['place_annotation_uuid']}")
                print(f"Date UUID: {event['date_annotation_uuid']}")
                print(f"Event UUID: {event['event_uuid']}")
                if 'next_event_value' in event:
                    print(f"Next Event: {event['next_event_value']}")
                    print(f"Next Event UUID: {event['next_event_uuid']}")

    except Exception as e:
        # Top-level boundary: report the problem together with the paths that
        # are actually used (the old hint named a non-existent 'input' folder).
        print(f"Error in main execution: {str(e)}")
        print("Please ensure:")
        print(f"1. The input directory exists: {input_dir}")
        print(f"2. Text files are in {input_dir} and trig files are in {trig_dir}")

# Run the pipeline only when this code is executed directly (script or
# notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()

Skipping 100: No matching trig files found
Skipping 101: No matching trig files found
Saved Trig file: rdf_output\102_event_e4a8d0c6-f8c4-4c9d-9d7f-1a3b5c2a7d9a.trig

Processed 102:

Event: Lake Garda, 20 January 1876
Place UUID: 3e93d1af-cb54-4816-bdc3-d363bee29773
Date UUID: 0c0195e4-2590-444a-821d-86364bca9e77
Event UUID: e4a8d0c6-f8c4-4c9d-9d7f-1a3b5c2a7d9a
Next Event: Ratisbon, 23 September 1900
Next Event UUID: d8d7f9c0-6d4f-4d9f-9f9f-e9d6d8f9d9d9
Saved Trig file: rdf_output\103_event_d8d7f9c0-6d4f-4d9f-9f9f-e9d6d8f9d9d9.trig
Saved Trig file: rdf_output\103_event_f7a6e2c1-b9b0-4f9a-8d8d-f9d9d9d9f9f9.trig
Saved Trig file: rdf_output\103_event_d9d9f9f9-f9f9-f9d9-9d9d-9f9f9d9d9d9d.trig

Processed 103:

Event: Ratisbon, 23 September 1900
Place UUID: e4f5dde1-881f-450d-a754-8486fb8e5b2b
Date UUID: 6bce0e16-56ca-4383-8273-875c0ab5670e
Event UUID: d8d7f9c0-6d4f-4d9f-9f9f-e9d6d8f9d9d9
Next Event: Schottenkirche, 23 September 1900
Next Event UUID: f7a6e2c1-b9b0-4f9a-8d8d-f9d9d9d9f9f9

Eve

A questo punto userei un approccio di questo tipo:
al momento dell'estrazione dei JSON aggiungerei un campo UUID in cui estraggo il main;
successivamente formatto in Trig.

Poi chiedo all'LLM di rileggere il testo ed estrarre le combinazioni di luogo e tempo, da definire poi come "event" in CIDOC CRM. In questo file devo anche indicare l'UUID dell'evento successivo come next event e gli UUID del luogo e del tempo in cui l'evento avviene.

