In [1]:
import os
from typing import Any

from pydantic import BaseModel
from unstructured.partition.pdf import partition_pdf
import json
# import tesseract
import pytesseract
from typing import *
from box import ConfigBox
from pathlib import Path
import re, sys, os
from functools import partial



import io
import os
import base64
import numpy as np
from PIL import Image
from langchain.chat_models import ChatOpenAI
from langchain.schema.messages import HumanMessage, SystemMessage, AIMessage


import uuid
from langchain.vectorstores import Chroma
from langchain.storage import InMemoryStore
from langchain.schema.document import Document
from langchain.embeddings import OpenAIEmbeddings
from langchain.retrievers.multi_vector import MultiVectorRetriever

from dotenv import load_dotenv

from dataclasses import dataclass

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
load_dotenv()

True

In [3]:
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")

In [21]:
from backend.src.utils.common import *
from backend.logger import logger
from backend.src.constants import *


In [5]:
os.getcwd()
os.chdir("../../")

In [6]:
%load_ext watermark
%watermark -a "Rajesh goldy" -vmp langchain,unstructured,openai,pydantic,pytesseract

Author: Rajesh goldy

Python implementation: CPython
Python version       : 3.9.21
IPython version      : 8.18.1

langchain   : 0.3.17
unstructured: 0.16.11
openai      : 1.61.1
pydantic    : 2.10.6
pytesseract : 0.3.13

Compiler    : Clang 14.0.6 
OS          : Darwin
Release     : 24.1.0
Machine     : arm64
Processor   : arm
CPU cores   : 8
Architecture: 64bit



In [None]:
# tracker elements


Extractor
    -> DataIngestion (handles -> text, image, table, list)
    -> ImageSummarizer 
    -> TextSummarizer
    

In [200]:
class Extractor:
    def __init__(self):
        pass
    
    def get_metadata(raw_pdf_element: List) -> List:
        """To get the metadata of elements

        Args:
            raw_pdf_element (List): list of elements in pdf

        Returns:
            List: containing meta data
        """
        metadata = []
        for element in raw_pdf_element:
            metadata.append(element.metadata.to_dict())
        
        return metadata
    
    def get_year(file: str) -> str:
        """To get the year from file name
        Args:
            file: name of with extension"""
        
        year = re.search(r"\d{4}", file).group()    
        if year:
            return str(year)
        else:
            try:
                file_name = os.path.split(file)[-1].split(".")[0]
                return file_name
            except Exception as e:
                logger.error(f"Error in getting year from file: {file} with error: {e}")
                return "1.0"
        
    def extract_data(self, pdf_file, **kwargs):
        raw_pdf_elements = partition_pdf(filename=pdf_file, **kwargs)
        return raw_pdf_elements
    
    def generate_unique_id(self, data: Iterator)-> List[str]:
        """To generate unique id for each element"""
        
        unique_id = []
        for _ in data:
            unique_id.append(str(uuid.uuid4()))
        
        return unique_id
    
    def generate_document(self, content: List, id_key: str):
        """To generate document from content"""
                
        ids = self.generate_unique_id(content)
        summaries = [Document(page_content=summary, metadata={id_key: ids[index]})
                         for index, summary in enumerate(content)]
        
        return summaries
    
    @staticmethod
    def seprate_data_metadata(data: List[dict], text_name: str="text", metadata_name: str="metadata") -> Tuple[List, List]:
        """To seprate data and metadata from dict

        Args:
            data (List[dict]): List containing dict >> text with metadata
            text_name (str): key to get text from dict. Defaults to "text"
            metadata_name (str): key to get metadata from dict. Defaults to "metadata"

        Returns:
            Tuple: containing text, metadata
        """
        text, metadata = [], []
        for i in data:
            text.append(i[text_name])
            metadata.append(i[metadata_name])
        return text, metadata
        

In [193]:
from unstructured.documents.elements import CompositeElement, Table, Image, ListItem  # Import ListItem



@dataclass
class DataIngestionConfig:
    raw: str # raw pdf path
    reports: str # processed reports path
    metadata: list # metadata to collect from raw_pdf_elements
    
    
class ConfigurationManager:
    def __init__(self, CONFIG_FILE_PATH, PARAMS_FILE_PATH):
        self.config = read_yaml(CONFIG_FILE_PATH)
        self.params = read_yaml(PARAMS_FILE_PATH)
        
    def get_data_ingestion_params(self) -> DataIngestionConfig:
        params = DataIngestionConfig(
            raw=self.config.data_dir.raw,
            reports=self.config.data_dir.reports,
            metadata=self.config.metadata)
        return params
    
    
class DataIngestion(Extractor):
    def __init__(self, config):
        super(DataIngestion, self).__init__()
        self.config = config
        self.raw_pdf_elements = None
        self.current_pdf_file = None # to keep track of current pdf file being processed (future case)
    
    def data_with_metadata(self, raw_pdf_elements: List) -> Tuple[List]:
        """To get the data with metadata from raw_pdf_elements

        Args:
            raw_pdf_elements (List): list of elements returned by partition_pdf

        Returns:
            tuple: containing text_elements, table_elements, list_items
        """
        text_elements = []
        table_elements = []
        list_items = []
        # add new structure here 
        
        for element in raw_pdf_elements:
            metadata = element.metadata.to_dict() if element.metadata else {}

            # Store text with metadata
            if isinstance(element, CompositeElement):
                text_elements.append({
                    "text": str(element),
                    "metadata": metadata
                })
            # Store tables with metadata
            elif isinstance(element, Table):
                table_elements.append({
                    "table": str(element),
                    "metadata": metadata
                })
            # Store ListItem with metadata
            elif isinstance(element, ListItem):
                list_items.append({
                    "list_item": str(element),
                    "metadata": metadata
                })
                
        return (text_elements, table_elements, list_items)
    
    def process_pdf(self, pdf_file_path: str,save:bool, **kwargs):
        """To extract the data from pdf file using unstructured library

        Args:
            pdf_file_path (str): full path of pdf file
            save (bool): to save the extracted data

        Returns:
            configBox: Box containing text_elements, table_elements, list_items
        """
        self.current_pdf_file = pdf_file_path
        
        report_year = Extractor.get_year() # get the year from file name and set it as directory name
        # pdf_dir = Path(pdf_file_path).resolve().parent # get the directory of pdf file   backend/data/raw_pdfs -> backend/data
        
        report_dir = os.path.join(self.config.reports, report_year) 
        # print("image Dir: ", image_dir)
        # to store the images extracted from pdf
        create_directory(report_dir, is_extension_present=False) # create directory with year name
        if kwargs.get("extract_image_block_output_dir", None) == None:
            kwargs["extract_image_block_output_dir"] = os.path.join(report_dir, "images")
        raw_pdf_elements = self.extract_data(pdf_file_path, **kwargs)
        self.raw_pdf_elements = raw_pdf_elements
  
        text_elements, table_elements, list_items = self.data_with_metadata(raw_pdf_elements)
        
        if save:
            save_json(table_elements, os.path.join(report_dir, "table_elements.json"))
            save_json(text_elements, os.path.join(report_dir, "text_elements.json"))
            save_json(list_items, os.path.join(report_dir, "list_items.json"))
            # add new structure here if needed
            
            print(f"Processed {pdf_file_path} and saved results in {report_dir}")
                
        return ConfigBox({
            "text_elements": text_elements,
            "table_elements": table_elements,
            "list_items": list_items
            })
    

                

In [194]:
config_manager = ConfigurationManager(CONFIG_FILE_PATH, PARAMS_FILE_PATH)
data_extractor_config = config_manager.get_data_ingestion_params()
data_extractor = DataIngestion(data_extractor_config)

2025-02-08 22:31:39,918 - root - INFO - Yaml read successfully from config.yaml
2025-02-08 22:31:39,922 - root - ERROR - FileNotFoundError: params.yaml


In [14]:
data = data_extractor.process_pdf(pdf_file_path="backend/data/raw_pdfs/2023_removed.pdf",
                                  save=True,
                                    strategy="hi_res", # 
                                    split_pdf_page=True,  # to process each page seprately
                                    split_pdf_allow_failed=True, # continue processing even if some pages fail
                                    extract_images_in_pdf=True,
                                    infer_table_structure=True,
                                    chunking_strategy="by_title",
                                    extract_image_block_types = ["Image" , "Table"],
                                    max_characters=4000,
                                    new_after_n_chars=3800,
                                    combine_text_under_n_chars=2000
                                    )

2025-02-08 11:34:14,954 - pikepdf._core - INFO - pikepdf C++ to Python logger bridge initialized
2025-02-08 11:34:14,998 - pdfminer.psparser - DEBUG - seek: 0
2025-02-08 11:34:15,000 - pdfminer.pdfdocument - DEBUG - find_xref: b''
2025-02-08 11:34:15,000 - pdfminer.pdfdocument - DEBUG - find_xref: b'%%EOF'
2025-02-08 11:34:15,001 - pdfminer.pdfdocument - DEBUG - find_xref: b'10081360'
2025-02-08 11:34:15,002 - pdfminer.pdfdocument - DEBUG - find_xref: b'startxref'
2025-02-08 11:34:15,003 - pdfminer.pdfdocument - DEBUG - xref found: pos=b'10081360'
2025-02-08 11:34:15,003 - pdfminer.psparser - DEBUG - seek: 10081360
2025-02-08 11:34:15,004 - pdfminer.psparser - DEBUG - nexttoken: (10081360, /b'xref')
2025-02-08 11:34:15,005 - pdfminer.pdfdocument - DEBUG - read_xref_from: start=10081360, token=/b'xref'
2025-02-08 11:34:15,005 - pdfminer.psparser - DEBUG - nextline: 10081364, b'\n'
2025-02-08 11:34:15,006 - pdfminer.psparser - DEBUG - nextline: 10081365, b'0 62\n'
2025-02-08 11:34:15,006

Processed backend/data/raw_pdfs/2023_removed.pdf and saved results in backend/data/reports/2023


# Image Summarizer

In [17]:
@dataclass
class ImageSummarizerConfig:
    model: str # name of model
    image_dir: str # image directory
    image_summary_dir: str # image summaries directory
    summarizer_prompt_dir: str
    
class ConfigurationManager:
    def __init__(self, CONFIG_FILE_PATH, PARAMS_FILE_PATH):
        self.config = read_yaml(CONFIG_FILE_PATH)
        self.params = read_yaml(PARAMS_FILE_PATH)
        
    def get_image_summarizer_params(self) -> ImageSummarizerConfig:
        params = ImageSummarizerConfig(
            model=self.config.model.chat_model,
            image_dir=self.config.image_summarizer.image_dir,
            image_summaries=self.config.image_summarizer.image_summaries,
            summarizer_prompt_dir=self.config.prompts.summarizer_prompt_dir)
        return params
    
class ImageSummarizer(Extractor):
    def __init__(self, config, model: object):
        super().__init__()
        self.config = config
        self.model = model
        self.image_summary_prompt = read_json(self.config.summarizer_prompt_dir)["image_summarizer_prompt"]
            
    def encode_image(self, image_path):
        """Encode image to base64
        Args:
            image_path: path to image
        Returns:
            str: base64 encoded image"""
        try:
            with open(image_path, "rb") as image:
                return base64.b64encode(image.read()).decode("utf-8")
        except Exception as e:
            print(f"Error: {e}")
            raise e
        
    def image_summarize(self, img_base64, prompt) -> str:
        """Summarize image using LLM model API
        Args:
            img_base64: base64 encoded image
            prompt: prompt to summarize image
        Returns:
            str: summary of image"""
        
        msg = self.model.invoke(
            [
                HumanMessage(
                    content=[
                        {"type": "text", "text": prompt},
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{img_base64}"
                            },
                        },
                    ]
                )
            ]
        )
        return msg.content
    
    def store_summaries_to_vb(self, encoded_images, image_summaries, id_key: str, retriever: object) -> None:
        """Store images(Encoded images) and summaries to vectorbase

        Args:
            encoded_images (_type_): base64 encoded images
            image_summaries (_type_): Summaries of images generated by model
            id_key (str): Unique ID key for file processing
            retriever (object): Vectorbase retriever object
        Returns:
            None
        """
        image_ids = Extractor.generate_unique_id(image_summaries)
        image_summaries = [Document(page_content=summary.content, metadata={id_key: image_ids[index]})
                         for index, summary in enumerate(image_summaries)]
        
        retriever.vectorstore.add_documents(image_summaries)
        retriever.docstore.mset(list(zip(image_ids, encoded_images)))
        print("Stored images and summaries to vectorbase")


In [20]:
model = ChatOpenAI(temperature=0, model="gpt-4o")
config_manager = ConfigurationManager(CONFIG_FILE_PATH, PARAMS_FILE_PATH)
image_summarizer_config = config_manager.get_image_summarizer_params()
image_summarizer = ImageSummarizer(image_summarizer_config, model)

2025-02-08 11:43:29,240 - root - INFO - Yaml read successfully from config.yaml
2025-02-08 11:43:29,244 - root - ERROR - FileNotFoundError: params.yaml


2025-02-08 13:54:35,486 - root - INFO - Json object read sucessfully 


In [30]:
os.getcwd()

'/Users/goldyrana/mess/deep_learning/projects/rag'

In [None]:
image_path = "backend/data/reports/2023/images"
images = os.listdir(image_path)
encoded_images = []
summaries = []

PROMPT_FILE_PATH = str(Path("backend/src/prompts/summarizer.json"))
image_prompt = read_json(PROMPT_FILE_PATH)["image_summarizer_prompt"]

for image in images:
    encoded_image = image_summarizer.encode_image(os.path.join(image_path, image))
    summaries.append(image_summarizer.image_summarize(encoded_image, image_prompt))
    
    encoded_images.append(encoded_image)
    

In [48]:
extractor = Extractor()
summaries = extractor.generate_document(content=summaries, id_key = "2023")

In [54]:
summaries

[Document(metadata={'2023': 'aae66a9d-c3dc-4b2e-aed8-a6ce154d36c7'}, page_content='The image is a simple blue icon depicting a calculator and a document. The calculator is on the left, featuring a grid of buttons. The document on the right has a dollar sign at the top, indicating financial or accounting content. There are a few horizontal lines below the dollar sign, representing text or figures on the document. There are no graphs or specific numbers in this image.'),
 Document(metadata={'2023': 'ccf0f223-3f65-47ef-9022-d1c1245bd794'}, page_content='The image is a stacked area chart illustrating the 10-year production compound annual growth rate (CAGR) for different energy production categories. The vertical axis represents production levels, with increments marked at 500, 1,000, 1,500, 2,000, and 2,500 units.\n\n1. **Unconventional (Lower 48 + Montney)**: \n   - Represented in dark blue.\n   - Shows a ~6% CAGR.\n   - Occupies the largest portion of the chart.\n\n2. **Conventional**:\

# Multivector storage


In [49]:
from langchain_chroma import Chroma
from langchain.storage import InMemoryStore
from langchain.schema.document import Document
from langchain_openai import OpenAIEmbeddings
from langchain.retrievers.multi_vector import MultiVectorRetriever


In [50]:
vectorstore = Chroma("test", OpenAIEmbeddings(), persist_directory="test_vectorstore")   
store = InMemoryStore()
id_key = "doc_id"

# The retriever (empty to start)
retriever = MultiVectorRetriever(
    vectorstore=vectorstore,
    docstore=store,
    id_key=id_key,
)

2025-02-08 14:35:24,683 - chromadb.telemetry.product.posthog - INFO - Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
2025-02-08 14:35:24,828 - chromadb.config - DEBUG - Starting component System
2025-02-08 14:35:24,828 - chromadb.config - DEBUG - Starting component Posthog
2025-02-08 14:35:24,829 - chromadb.config - DEBUG - Starting component OpenTelemetryClient
2025-02-08 14:35:24,830 - chromadb.config - DEBUG - Starting component SqliteDB
2025-02-08 14:35:24,873 - chromadb.config - DEBUG - Starting component SimpleQuotaEnforcer
2025-02-08 14:35:24,874 - chromadb.config - DEBUG - Starting component SimpleRateLimitEnforcer
2025-02-08 14:35:24,874 - chromadb.config - DEBUG - Starting component LocalSegmentManager
2025-02-08 14:35:24,875 - chromadb.config - DEBUG - Starting component LocalExecutor
2025-02-08 14:35:24,876 - chromadb.config - DEBUG - Starting component SegmentAPI


In [56]:
retriever.vectorstore.aadd_documents(summaries)
retriever.docstore.mset(list(zip([doc.metadata["2023"] for doc in summaries], encoded_images)))

  retriever.vectorstore.aadd_documents(summaries)


In [None]:
retriever.vectorstore.aadd_texts()

In [None]:
data

In [61]:
retrieved_docs = vectorstore.similarity_search("DELIVER COMPETITIVE RETURN", k=2)
for doc in retrieved_docs:
    print("\n--- Retrieved Document ---")
    print("Content:", doc.page_content)
    print("Metadata:", doc.metadata) 

2025-02-08 15:44:26,942 - openai._base_client - DEBUG - Request options: {'method': 'post', 'url': '/embeddings', 'files': None, 'post_parser': <function Embeddings.create.<locals>.parser at 0x35cfc3940>, 'json_data': {'input': [[39432, 19323, 22735, 1372, 45450, 31980]], 'model': 'text-embedding-ada-002', 'encoding_format': 'base64'}}
2025-02-08 15:44:26,952 - openai._base_client - DEBUG - Sending HTTP Request: POST https://api.openai.com/v1/embeddings
2025-02-08 15:44:26,963 - httpcore.connection - DEBUG - connect_tcp.started host='api.openai.com' port=443 local_address=None timeout=None socket_options=None
2025-02-08 15:46:57,205 - httpcore.connection - DEBUG - connect_tcp.failed exception=ConnectError(TimeoutError(60, 'Operation timed out'))
2025-02-08 15:46:57,210 - openai._base_client - DEBUG - Encountered Exception
Traceback (most recent call last):
  File "/Users/goldyrana/miniconda3/envs/bounce/lib/python3.9/site-packages/httpx/_transports/default.py", line 101, in map_httpcor

KeyboardInterrupt: 

# Store data summaries into vectorstore


In [69]:
text_elements = data.text_elements

In [79]:
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.schema.output_parser import StrOutputParser
import openai

In [220]:
@dataclass 
class TextSummarizerConfig:
    model: str # model name used to generate summary
    text_summary_dir: str # path to save text summaries
    summarizer_prompt_dir: str
    text_summarizer_prompt: str 
    
class ConfigurationManager:
    def __init__(self, CONFIG_FILE_PATH, PARAMS_FILE_PATH):
        self.config = read_yaml(CONFIG_FILE_PATH)
        self.params = read_yaml(PARAMS_FILE_PATH)
        
    def get_text_summarizer(self) -> TextSummarizerConfig:
        summarizer_prompt_dir=read_json(self.config.prompts.summarizer_prompt_dir)
        
        params = TextSummarizerConfig(
            model=self.config.text_summarizer.model,
            text_summary_dir=self.config.text_summarizer.text_summary_dir,
            summarizer_prompt_dir=self.config.prompts.summarizer_prompt_dir,
            text_summarizer_prompt=summarizer_prompt_dir["text_summarizer_prompt"],
        )
        return params
    
class TextSummarizer(Extractor):
    def __init__(self, config, model):
        self.config = config
        self.model = model
        
    def generate_summary(self, docs: List[dict]):
        """To generate summary of the text

        Args:
            model (object): model used to generate summary of text
            docs (List[dict]): List containing dict >> text with metadata
                len of docs = len of dict containing text with metadata
                        dict -> text, metadata 
        """
                    
        text, metadata = Extractor.seprate_data_metadata(docs, text_name="text", metadata_name="metadata")
        prompt = ChatPromptTemplate.from_template(self.config.text_summarizer_prompt)
        summarize_chain = {"element": lambda x: x} | prompt | self.model | StrOutputParser()
        summaries = summarize_chain.batch(text, {"max_concurrency": 5})
        return summaries, metadata
    

In [224]:
config_manager = ConfigurationManager(CONFIG_FILE_PATH, PARAMS_FILE_PATH)
text_summarizer_config = config_manager.get_text_summarizer()
model = ChatOpenAI(temperature=0, model="gpt-4o")
text_summarizer = TextSummarizer(text_summarizer_config, model)

2025-02-08 22:42:49,913 - root - INFO - Yaml read successfully from config.yaml
2025-02-08 22:42:49,914 - root - ERROR - FileNotFoundError: params.yaml
2025-02-08 22:42:49,918 - root - INFO - Json object read sucessfully 


In [223]:
text_data = data.text_elements

In [225]:
summaries, summary_metadata = text_summarizer.generate_summary(text_data)

2025-02-08 22:42:50,252 - openai._base_client - DEBUG - Request options: {'method': 'post', 'url': '/chat/completions', 'files': None, 'json_data': {'messages': [{'role': 'user', 'content': 'Act as an assistant tasked with summarizing the tables and text. Give a concise summary of the table or text. Table or text chunk: Cautionary Statement\n\nThis presentation provides management\'s current operational plan for ConocoPhillips over roughly the next decade, for the assets currently in our portfolio, and is subject to multiple assumptions, including, unless otherwise specifically noted: * anoil price of $60/BBL West Texas Intermediate in 2022 dollars, escalating at 2.25% annually;\n\n* anoil price of $65/BBL Brent in 2022 dollars, escalating at 2.25% annually;\n\n* agas price of $3.75/MMBTU Henry Hub in 2022 dollars, escalating at 2.25%\n\nannually;\n\nan international gas price of $8/MMBTU Title Transfer Facility & Japan Korea Marker in 2022 dollars, escalating at 2.25% annually;\n\n* c

In [226]:
summaries

["The presentation outlines ConocoPhillips' operational plan for the next decade, based on current assets and several assumptions. Key assumptions include oil prices of $60/BBL for West Texas Intermediate and $65/BBL for Brent, both escalating at 2.25% annually, and gas prices of $3.75/MMBTU for Henry Hub and $8/MMBTU for international markets, also escalating at 2.25% annually. The plan assumes cost and capital escalation in line with price increases, with a specific planning case for $60/BBL WTI involving capital de-escalation from 2022 levels. Production growth rates are calculated over 2023-2032, and carbon taxes are included in cash flow forecasts where applicable. The presentation contains forward-looking statements about future events, plans, and operational results, identified by specific terminology, and is subject to federal securities laws.",
 "The text outlines the inherent uncertainties and risks associated with forward-looking statements made by a company regarding its fu

In [253]:
@dataclass
class VectorDatabaseConfig:
    vectorstore_name: str
    persist_directory: str


class ConfigurationManager:
    def __init__(self, CONFIG_FILE_PATH, PARAMS_FILE_PATH):
        self.config = read_yaml(CONFIG_FILE_PATH)
        self.params = read_yaml(PARAMS_FILE_PATH)
        
    def get_vectordatabase_config(self) -> VectorDatabaseConfig:
        params = VectorDatabaseConfig(
            vectorstore_name=self.config.vector_database.vectorstore_name,
            persist_directory=self.config.vector_database.persist_directory)
        return params
    
    
class VectorDatabase(Extractor):
    def __init__(self, config):
        super(VectorDatabase, self).__init__()
        self.config = config
    
    def init_chromadb(self, embeddings):
        return Chroma(self.config.vectorstore_name, 
                      embeddings, 
                      persist_directory=self.config.persist_directory)
        
    def get_reteriver(self, vectorstore, id_key):
        store = InMemoryStore()
        return MultiVectorRetriever(
            vectorstore=vectorstore,
            docstore=store,
            id_key=id_key
        )
          
    def generate_document(self, data, metadata=None):
        if metadata == None:
            metadata = self.generate_unique_id(data)
        
        documents = [Document(page_content=data_, metadata=metadata_)
                        for data_, metadata_ in zip(data, metadata)]
        return documents
    
    def store_data_to_vb(self, data, data_metadata=None, 
                            summary_text=None, summary_metadata=None, 
                            retriever=None):
        try:
            if summaries != None:
                retriever.vectorstore.add_documents(summary_text, metadata=summary_metadata)
                
            retriever.docstore.mset(list(zip(data, data_metadata)))
            print("Stored summaries to vectorbase")
        except Exception as e:
            print("Error in storing data to vectorbase: ", e)
            raise e
            


In [254]:
config_manager = ConfigurationManager(CONFIG_FILE_PATH, PARAMS_FILE_PATH)
vb_params = config_manager.get_vectordatabase_config()
print(vb_params)

2025-02-08 23:00:01,294 - root - INFO - Yaml read successfully from config.yaml


2025-02-08 23:00:01,307 - root - ERROR - FileNotFoundError: params.yaml


VectorDatabaseConfig(vectorstore_name='test', persist_directory='test_directory')


In [255]:
vb = VectorDatabase(vb_params)
vectorstore = vb.init_chromadb(OpenAIEmbeddings())


2025-02-08 23:00:01,864 - chromadb.api.segment - DEBUG - Collection test already exists, returning existing collection.


In [256]:
retriever = vb.get_reteriver(vectorstore, "2023_removed")

In [251]:
text_data, text_metadata = Extractor.seprate_data_metadata(data.text_elements, text_name="text", metadata_name="metadata")

In [260]:
for i in summary_metadata:
    i["id"] = str(uuid.uuid4())

In [262]:
summary_metadata

[ConfigBox({'filetype': 'application/pdf', 'languages': ['eng'], 'last_modified': '2025-02-06T12:34:29', 'page_number': 1, 'orig_elements': 'eJzNWW1v2zgS/iuEce+wtBLFF6n7qS2w295tD8U1h/vQLQKKHNm8yqKglzjG4v77DUk5SVu3yBpwUCAIODSHLzN8Zp6h3v+2ghZ20E3X1qyekZWUJuNZoxLKMpOwAkRSCtBJWWSaAQhTGb5ak9UOJmXUpFDnt5V2bjC2UxOMQW7Vwc3T9RbsZjthT86zDHWW7r010xZ7qRASe3tnu8nrvX8vWcrWRGYp/7Ami5RzlkovljQ7JYfRKK/GwzjBzp/hrb2F9l2vNKz+hz8YmEBP1nXXulXjeN0PrsZhWSoFrXIc0NgWpkMPQfftm1XYareZ1Sac5/0Kus3qQ+gdp+udM7axEKxFM8qTjCaZuMrps4I9o5XX7lHzupt3NQz+9H4TE9x6S6xeqtlvRQ0H8m5Cg3nTe5Xj+ld2asO2P3cMlLWiuWiSsuZVwgSTSVVlRcIZZWVtqKRCXcwxnKcVmr7Mo+mjSAuR5l6kRVGlxYmOqHCmc0oqn9o5V1s7kn6AEc2u/KZQcDfWwEh2qkM9748/j0TPw4At4noYwjjVkh53RRo3kJeuc9q93dq2tf1I3A0MZHDzZtseyLQF0uFixIBWBtZBwXfi0WG6mxhH2o64eSC9G6bGtdatieoMwe2Nc/1ftBiZHNnN7WT7NmjPu95vZFyjpm5ndPpmTeauhRG3gCsMezsCGXvQaB+tWlyicxOYZ+RvOLOzeIDBaiCuIX8Q2Q8vXvxC/gPjRK7gVo3kdTfBsANj8Sb5raFpKTGubdWAK8KIE6Idug1RE6Ep5X/EObvZr/Ljw8v9TzV4e93AlTf4iUvOtYCGllVieK0SJjlLKi4gYVyXUoqmFlxf7JILnpb+Eud

In [267]:
text_elements[0].metadata

ConfigBox({'filetype': 'application/pdf', 'languages': ['eng'], 'last_modified': '2025-02-06T12:34:29', 'page_number': 1, 'orig_elements': 'eJzNWW1v2zgS/iuEce+wtBLFF6n7qS2w295tD8U1h/vQLQKKHNm8yqKglzjG4v77DUk5SVu3yBpwUCAIODSHLzN8Zp6h3v+2ghZ20E3X1qyekZWUJuNZoxLKMpOwAkRSCtBJWWSaAQhTGb5ak9UOJmXUpFDnt5V2bjC2UxOMQW7Vwc3T9RbsZjthT86zDHWW7r010xZ7qRASe3tnu8nrvX8vWcrWRGYp/7Ami5RzlkovljQ7JYfRKK/GwzjBzp/hrb2F9l2vNKz+hz8YmEBP1nXXulXjeN0PrsZhWSoFrXIc0NgWpkMPQfftm1XYareZ1Sac5/0Kus3qQ+gdp+udM7axEKxFM8qTjCaZuMrps4I9o5XX7lHzupt3NQz+9H4TE9x6S6xeqtlvRQ0H8m5Cg3nTe5Xj+ld2asO2P3cMlLWiuWiSsuZVwgSTSVVlRcIZZWVtqKRCXcwxnKcVmr7Mo+mjSAuR5l6kRVGlxYmOqHCmc0oqn9o5V1s7kn6AEc2u/KZQcDfWwEh2qkM9748/j0TPw4At4noYwjjVkh53RRo3kJeuc9q93dq2tf1I3A0MZHDzZtseyLQF0uFixIBWBtZBwXfi0WG6mxhH2o64eSC9G6bGtdatieoMwe2Nc/1ftBiZHNnN7WT7NmjPu95vZFyjpm5ndPpmTeauhRG3gCsMezsCGXvQaB+tWlyicxOYZ+RvOLOzeIDBaiCuIX8Q2Q8vXvxC/gPjRK7gVo3kdTfBsANj8Sb5raFpKTGubdWAK8KIE6Idug1RE6Ep5X/EObvZr/Ljw8v9TzV4e93AlTf4iUvOtYCGllVieK0SJjlLKi4gYVyXUoqmFlxf7JILnpb+Eudp

In [263]:
vb.store_data_to_vb(text_elements, text_metadata, summaries, summary_metadata, retriever)

Error in storing data to vectorbase:  'str' object has no attribute 'id'


AttributeError: 'str' object has no attribute 'id'