### System libraries & setup

In [1]:
import chromadb
import os
import json
from langchain_openai import ChatOpenAI
from dotenv import load_dotenv

In [2]:
# Chunk index. NOTE(review): no later cell visible here reads `doc`; the
# chunk-selection cells below hard-code index 0 — confirm which is intended.
doc = 48

In [3]:
# Load variables from a .env file (python-dotenv) into the process
# environment, then read the OpenAI key from it. The value is None when the
# variable is not set.
load_dotenv()
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

In [4]:
# Chat model used for the metadata extraction below; temperature is set to 0.
llm_openai = ChatOpenAI(
    model="gpt-4o",
    temperature=0,
)

In [5]:
# Open the persistent Chroma database stored under ./semantic_db
# (the path is relative to the notebook's working directory).
client = chromadb.PersistentClient(path="./semantic_db")

In [6]:
# Look up the existing collection named "docs"; the cells below read chunk
# texts and ids from it.
collection_chunks = client.get_collection("docs")

In [7]:
# Number of records in the collection. count() asks Chroma for the total
# directly instead of loading every document and metadata entry just to
# measure the length of the id list.
collection_chunks.count()

49

### Metadata builder

In [8]:
# Text of the first record in the collection. limit=1 makes Chroma return a
# single record instead of loading the whole collection to read element 0.
chunk = collection_chunks.get(limit=1)['documents'][0]

In [9]:
# Id of the first record in the collection. get() returns ids and documents
# as parallel lists, so position 0 here is the record whose text sits at
# position 0 of 'documents'. limit=1 avoids fetching everything for one id.
doc_id = collection_chunks.get(limit=1)['ids'][0]

In [10]:
# Show the id of the selected chunk.
doc_id

'02077054-496d-4b39-935b-7fa950ba2b13'

In [11]:
# Document-level metadata. The prompt below instructs the model to carry
# these fields through to its output unchanged.
metadata_document = dict(
    source="alice_in_wonderland.txt",
    title="Alice's Adventures in Wonderland",
    author="Lewis Carroll",
    edition="The Millennium Fulcrum Edition 3.0",
)

In [12]:
# Empty per-chunk metadata template. Its repr is interpolated into the prompt
# so the model knows which fields to fill in from the chunk text.
metadata_chunk = dict(
    chapter="",
    chapter_title="",
    characters=[],
    key_events=[],
    themes=[],
    setting="",
    quotes=[],
    narrative_style="",
    summary="",
)

In [13]:
# Extraction prompt sent as the human message.
# Despite the name this is not a reusable template: it is an f-string, so
# `chunk`, `metadata_document` and `metadata_chunk` are interpolated right here
# with their current values (the dicts appear as their Python repr). Re-run
# this cell after changing any of them. Doubled braces ({{ }}) are literal
# braces of the JSON example.
# NOTE(review): `metadata_chunk` shows list-valued fields while step 7 tells
# the model not to output lists — confirm the mismatch is intentional.
prompt_template = f"""
Follow these instructions carefully:

1. Read and analyze the content of the CHUNK: {chunk}.   
   Pay attention to key information, facts, dates, names, and any other relevant details.

2. Accurately extract relevant information to add to the metadata {metadata_document}, using the specific details from the {metadata_chunk}.

3. For each field in the metadata_document, ensure that they remain unchanged.

4. For each field in metadata_chunk structures, extract the corresponding information from the CHUNK.
   Ensure that the information you extract is accurate and directly related to the field.

5. Fill in the metadata fields with the extracted information.
   Be precise and concise in your entries.
   If a particular field cannot be filled based on the information in the CHUNK, leave it empty (do not use null, undefined, or any placeholder values).

6. Pay special attention to the data types of the metadata fields:
   - Use strings (str) for text-based information
   - Use integers (int) for whole numbers
   - Use floats (float) for decimal numbers
   - Use booleans (bool) for true/false values

7. Do not use lists or nested structures in your output.
   When listing items, use a string format with items separated by commas.
   For example, instead of using a list structure like ['Federico', 'Marco', 'Luigi'], write "Federico, Marco, Luigi"

8. Ensure that your final output is a valid JSON string that can be correctly parsed using json.loads().
   The output should be a single dictionary containing all the filled metadata fields.

9. Here's an example of how your output should be structured:

   {{
     "title": "Sample Article Title",
     "author": "John Doe, Federico Palma",
     "publication_date": "2023-05-15",
     "word_count": 500,
     "is_peer_reviewed": true,
     "average_rating": 4.5
   }}

10. Remember to maintain the integrity and accuracy of the metadata. Double-check your entries to ensure they correctly represent the information in the CHUNK.

11. If you're unsure about a particular field or if the information is not explicitly stated in the CHUNK, use your best judgment based on the context.
    However, avoid making unfounded assumptions.

12. After completing the analysis and extraction, review your output to ensure all fields are correctly filled and the format is valid JSON.

Provide your final output as a JSON string that combines metadata_document and metadata_chunk.
Do not include any explanations or additional text outside of the JSON string.
"""

In [14]:
# Conversation sent to the model as (role, content) pairs: a fixed system
# instruction followed by the extraction prompt as the human turn.
messages = [
    (
        "system",
        "You are tasked with analyzing a given text chunk and extracting "
        "relevant information to populate metadata fields.",
    ),
    ("human", prompt_template),
]

In [15]:
# Send the system + human messages to the chat model and keep its reply.
answer = llm_openai.invoke(messages)

In [16]:
# Content of the model reply; the prompt asks for a JSON string, which the
# next cell parses with json.loads().
metadata = answer.content

In [17]:
# Parse the model reply into a dictionary.
# Chat models sometimes wrap JSON in a Markdown code fence (```json ... ```)
# even when told not to, so strip one if present before parsing.
metadata_text = metadata.strip()
if metadata_text.startswith("```"):
    # Drop the opening fence line (including an optional language tag) ...
    metadata_text = metadata_text.partition("\n")[2]
    # ... and everything from the closing fence onwards.
    metadata_text = metadata_text.rsplit("```", 1)[0]

try:
    # Convert the cleaned string into a dictionary.
    metadata_json = json.loads(metadata_text)
except json.JSONDecodeError as e:
    print(f"JSON error: {e}")
    # Re-raise: otherwise `metadata_json` stays undefined (fresh kernel) or
    # keeps a stale value from an earlier run, and later cells would display
    # or store the wrong metadata.
    raise

In [18]:
# Display the parsed metadata dictionary.
metadata_json

{'source': 'alice_in_wonderland.txt',
 'title': "Alice's Adventures in Wonderland",
 'author': 'Lewis Carroll',
 'edition': 'The Millennium Fulcrum Edition 3.0',
 'chapter': 'XI',
 'chapter_title': 'Who Stole the Tarts?',
 'characters': "Alice, Gryphon, Mock Turtle, King of Hearts, Queen of Hearts, Knave of Hearts, White Rabbit, Hatter, March Hare, Dormouse, Bill the Lizard, Duchess's Cook",
 'key_events': "The trial begins, The Hatter gives evidence, Alice grows larger, The Queen orders the Hatter's execution, The Duchess's Cook is called as a witness",
 'themes': 'Justice, Absurdity, Growth',
 'setting': 'Court of the King and Queen of Hearts',
 'quotes': "'The Queen of Hearts, she made some tarts, All on a summer day: The Knave of Hearts, he stole those tarts, And took them quite away!', 'Consider your verdict,' the King said to the jury.",
 'narrative_style': 'Third-person',
 'summary': "The trial of the Knave of Hearts begins, with various characters giving evidence. Alice grows l

### Update metadata

In [19]:
# Optional manual correction (disabled): uncomment to override the extracted chapter number.
# metadata_json['chapter'] = '7'

In [20]:
# Optional manual correction (disabled): uncomment to override the extracted chapter title.
# metadata_json['chapter_title'] = "A Mad Tea-Party."

In [21]:
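# Disabled by default: uncomment to write `metadata_json` back to Chroma as the metadata of chunk `doc_id`.
# collection_chunks.update(ids=[doc_id], metadatas=metadata_json)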