Directly asking the model to produce a TriG file.

In [None]:
from anthropic import Anthropic
import os
from typing import Optional, List
from datetime import datetime

class DiaryPlaceExtractor:
    """Extract places from diary text through the Anthropic API and save them as TriG.

    The diary text is sent to the model together with an RDF template; the
    reply is split on ``rdf_separator`` and every non-empty chunk becomes one
    TriG document prefixed with the namespace declarations from
    ``get_prefixes``.
    """

    # Marker the model is asked to emit between RDF entries. Kept at class
    # level so both prompts and the splitter always agree on the same value.
    rdf_separator = "---"

    def __init__(self, api_key: str, model: str = "claude-3-5-sonnet-latest"):
        """Initialize the Anthropic client with API key.

        Args:
            api_key: Key handed to the ``Anthropic`` client.
            model: Model identifier passed to ``messages.create``.
        """
        self.api_key = api_key
        self.client = Anthropic(api_key=api_key)
        self.model = model
        self.rdf_template = self.get_rdf_template()
        # Built last: the system prompt embeds the template and the separator.
        self.system = self.get_system_prompt()

    def read_diary_text(self, file_path: str) -> Optional[str]:
        """Read the diary text from a UTF-8 file.

        Args:
            file_path: Path of the file to read.

        Returns:
            The file content, or ``None`` when reading failed (a message is
            printed in that case).
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                return file.read()
        except FileNotFoundError:
            print(f"Error: File not found at {file_path}")
            return None
        except Exception as e:
            # Best effort by design: callers only check for None.
            print(f"Error reading file: {e}")
            return None

    def get_rdf_template(self) -> str:
        """Return the TriG template the model is asked to fill in.

        Placeholders appearing in the template: ``{uuid}``, ``{uuid2}`` ..
        ``{uuid7}``, ``{start_position}``, ``{end_position}``,
        ``{creation_date}``, ``{place_name}``, ``{wiki_id}``, ``{xpath}`` and
        ``{coordinates}``. The string is never passed through ``str.format``;
        it is only embedded verbatim in the system prompt.

        NOTE(review): the body is typed ``crm:E52_place``; in CIDOC CRM E52 is
        Time-Span and E53 is Place -- confirm against the project ontology
        before changing the template.
        """
        return """<https://mbdiaries.itatti.harvard.edu/annotation/{uuid}/container/context> {
      <https://mbdiaries.itatti.harvard.edu/diary/1894-95/document/5/offset-{uuid7}>
        a oa:TextPositionSelector;
        oa:end "{start_position}"^^xsd:nonNegativeInteger;
        oa:start "{start_position}"^^xsd:nonNegativeInteger .

      <https://mbdiaries.itatti.harvard.edu/diary/1894-95/document/5/offset-{uuid3}>
        a oa:TextPositionSelector;
        oa:end "{end_position}"^^xsd:nonNegativeInteger;
        oa:start "{end_position}"^^xsd:nonNegativeInteger .

      mbdiaries-annotation:{uuid} a oa:Annotation, crmdig:D29_Annotation_Object;
        crmdig:L48i_was_annotation_created_by <https://mbdiaries.itatti.harvard.edu/diary/1894-95/document/5/annotation-event-{uuid2}>;
        oa:hasBody <https://mbdiaries.itatti.harvard.edu/annotation/{uuid}/body>;
        oa:hasTarget <https://mbdiaries.itatti.harvard.edu/diary/1894-95/document/5/range-source-{uuid4}> .

      <https://mbdiaries.itatti.harvard.edu/annotation/{uuid}/container>
        a ldp:Resource, prov:Entity;
        prov:wasAttributedTo User:admin;
        prov:generatedAtTime "{creation_date}"^^xsd:dateTime .

      <https://mbdiaries.itatti.harvard.edu/diary/1894-95/document/5/range-source-{uuid4}>
        a oa:SpecificResource;
        oa:hasSource <https://mbdiaries.itatti.harvard.edu/diary/1894-95/document/5>;
        oa:hasSelector <https://mbdiaries.itatti.harvard.edu/diary/1894-95/document/5/range-{uuid4}>;
        rdf:value "{place_name}" .

      <https://www.wikidata.org/wiki/{wiki_id}> rdfs:label "{place_name}" .

      <https://mbdiaries.itatti.harvard.edu/diary/1894-95/document/5/range-{uuid4}>
        a oa:RangeSelector;
        oa:hasEndSelector <https://mbdiaries.itatti.harvard.edu/diary/1894-95/document/5/xpath-{uuid5}>;
        oa:hasStartSelector <https://mbdiaries.itatti.harvard.edu/diary/1894-95/document/5/xpath-{uuid6}> .

      <https://mbdiaries.itatti.harvard.edu/diary/1894-95/document/5/xpath-{uuid6}>
        a oa:XPathSelector;
        oa:refinedBy <https://mbdiaries.itatti.harvard.edu/diary/1894-95/document/5/offset-{uuid7}>;
        rdf:value "{xpath}" .

      <https://mbdiaries.itatti.harvard.edu/diary/1894-95/document/5/xpath-{uuid5}>
        a oa:XPathSelector;
        oa:refinedBy <https://mbdiaries.itatti.harvard.edu/diary/1894-95/document/5/offset-{uuid3}>;
        rdf:value "{xpath}" .

      <https://mbdiaries.itatti.harvard.edu/diary/1894-95/document/5/annotation-event-{uuid2}>
        a crmdig:D30_Annotation_Event;
        crm:P4_has_time_span <https://mbdiaries.itatti.harvard.edu/diary/1894-95/document/5/annotation-event-{uuid2}/modifiedAt>;
        crm:P14_carried_out_by User:admin .

      <https://mbdiaries.itatti.harvard.edu/annotation/{uuid}/body>
        a mbdiaries-ontology:Location;
        a crm:E52_place;
        crm:P168_place_is_defined_by "{coordinates}";
        owl:sameAs <https://www.wikidata.org/wiki/{wiki_id}> .

      <https://mbdiaries.itatti.harvard.edu/diary/1894-95/document/5/annotation-event-{uuid2}/modifiedAt>
        crm:P81b_begin_of_the_end "{creation_date}"^^xsd:dateTime;
        crm:P81a_end_of_the_begin "{creation_date}"^^xsd:dateTime .

      _:node1i8224na8x5257 ldp:contains <https://mbdiaries.itatti.harvard.edu/annotation/{uuid}/container> .
    }

    {
      _:node1i8224na8x5257 a ldp:Container, ldp:Resource, prov:Entity .
    }"""

    def get_system_prompt(self) -> str:
        """Return the system prompt: task description, separator and RDF template.

        Reads ``self.rdf_template`` and ``self.rdf_separator``.
        """
        return f"""You're an AI assistant specialized in analysing historical diaries written by Mary Berenson and extracting places from there. Your job is to serialize this information into an RDF following the template structure below. Extract only places mentioned in the text as beginning of the diary (e.g. "Fiesole, 18 january 1900" or "18 january 1900\\n Fiesole") or places mentioned in the text that have been visited that day, ONLY IF those places can be associated to coordinates.

Please separate different RDF entries with "{self.rdf_separator}".

RDF Template:
{self.rdf_template}

Start your response directly with the RDF file, avoid adding introductions and contextualizations.
"""

    def get_prefixes(self) -> str:
        """Return the RDF prefixes needed for the TriG output.

        Every prefix used by the template is declared, including ``rdfs:``
        (required by the ``rdfs:label`` triple on the Wikidata resource).
        """
        return """@prefix oa: <http://www.w3.org/ns/oa#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix crm: <http://www.cidoc-crm.org/cidoc-crm/> .
@prefix crmdig: <http://www.ics.forth.gr/isl/CRMdig/> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
@prefix ldp: <http://www.w3.org/ns/ldp#> .
@prefix prov: <http://www.w3.org/ns/prov#> .
@prefix owl: <http://www.w3.org/2002/07/owl#> .
@prefix mbdiaries-annotation: <https://mbdiaries.itatti.harvard.edu/annotation/> .
@prefix mbdiaries-ontology: <https://mbdiaries.itatti.harvard.edu/ontology/> .
@prefix User: <https://mbdiaries.itatti.harvard.edu/user/> ."""

    def _build_user_prompt(self) -> str:
        """Return the instruction message that precedes the diary text.

        The placeholder names listed here are the ones that actually occur in
        the template (``start_position`` / ``end_position``).
        """
        return f"""Please analyze the following diary text and generate RDF entries for each place mentioned, following the template and requirements specified in the system prompt. For each place found, you should substitute:
- uuid: A VALID UUID identifier. uuid, uuid2, uuid3, uuid4, uuid5, uuid6, uuid7 MUST BE DIFFERENT AND VALID UUIDs.
- start_position: Character offset of the first character of the place name. It fills BOTH oa:start and oa:end of the first offset selector, so the two have the SAME value.
- end_position: Character offset of the last character of the place name. It fills BOTH oa:start and oa:end of the second offset selector, so the two have the SAME value.
- place_name: The extracted place name
- creation_date: remains a placeholder.
- coordinates: Geographical coordinates as "(longitude, latitude)"
- wiki_id: Wikidata ID for the place
- xpath: XPath to the text node containing the place name in the format /p[1], number in [] represents the line from which the entity has been extracted: 3rd line = p[3]. Line means line of text, not paragraph.

Separate different RDF entries with "{self.rdf_separator}"
Remember: extract all the places in which Mary said she was in a particular day and that could be associated with coordinates. """

    @staticmethod
    def _response_text(content) -> str:
        """Flatten a response ``content`` value (string or list of items) to text.

        Items that are neither strings nor carry a ``text`` attribute are
        skipped instead of raising, so one unexpected item does not discard
        the whole reply.
        """
        if isinstance(content, str):
            return content
        parts = []
        for item in content:
            text = item if isinstance(item, str) else getattr(item, "text", None)
            if text is not None:
                parts.append(text)
        return '\n'.join(parts)

    def _split_entries(self, content: str) -> List[str]:
        """Split model output on ``rdf_separator``; drop blank chunks, strip the rest."""
        stripped = (chunk.strip() for chunk in content.split(self.rdf_separator))
        return [chunk for chunk in stripped if chunk]

    def process_diary(self, diary_text: str) -> List[str]:
        """Process the diary text and return each RDF entry as a separate TriG string.

        Args:
            diary_text: Full text of the diary to analyse.

        Returns:
            One string per entry, each starting with the prefix declarations
            and a ``# Generated on`` comment. Empty when ``diary_text`` is
            empty or the request fails (the error is printed).
        """
        if not diary_text:
            return []

        try:
            response = self.client.messages.create(
                model=self.model,
                max_tokens=4000,
                temperature=0,
                system=self.system,
                messages=[
                    {"role": "user", "content": self._build_user_prompt()},
                    {"role": "assistant", "content": "I'll analyze the diary text and generate RDF entries for the places mentioned."},
                    {"role": "user", "content": diary_text}
                ]
            )
            content = self._response_text(response.content)
        except Exception as e:
            # API boundary: report and fall back to "no entries".
            print(f"Error processing diary: {str(e)}")
            return []

        # Same header for every entry of this run; built once.
        header = f"{self.get_prefixes()}\n# Generated on {datetime.now().isoformat()}\n\n"
        return [header + entry for entry in self._split_entries(content)]

    def save_to_trig(self, content: str, output_path: str) -> bool:
        """Save the RDF content to a .trig file.

        Returns:
            ``True`` when the file was written, ``False`` otherwise (the
            error is printed).
        """
        try:
            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(content)
            return True
        except Exception as e:
            print(f"Error saving TriG file: {e}")
            return False

    def save_all_entries(self, entries: List[str], output_dir: str) -> None:
        """Save each RDF entry as ``place_entry_<n>.trig`` inside ``output_dir``.

        The directory is created when missing. "Saved" is reported only for
        files that were actually written; failures are reported by
        ``save_to_trig``.
        """
        os.makedirs(output_dir, exist_ok=True)
        for i, entry in enumerate(entries, start=1):
            output_path = os.path.join(output_dir, f"place_entry_{i}.trig")
            if self.save_to_trig(entry, output_path):
                print(f"Saved: {output_path}")

def main():
    """Run the extraction on ``prova.txt`` and write the entries to ``rdf_output3``.

    The API key is read from the ``ANTHROPIC_API_KEY`` environment variable;
    when it is unset or empty an error is printed and nothing else happens.
    """
    # Read the key from the environment rather than hard-coding it in the file.
    api_key = os.environ.get("ANTHROPIC_API_KEY", "")
    if not api_key:
        print("Error: ANTHROPIC_API_KEY environment variable not set")
        return

    extractor = DiaryPlaceExtractor(api_key)
    diary_file_path = "prova.txt"
    diary_text = extractor.read_diary_text(diary_file_path)
    if not diary_text:
        # Covers both a failed read (None) and an empty file.
        return

    entries = extractor.process_diary(diary_text)
    if entries:
        output_dir = 'rdf_output3'
        extractor.save_all_entries(entries, output_dir)
        print(f"All entries have been saved to {output_dir}")
    else:
        print("No entries generated")


if __name__ == "__main__":
    main()