In [1]:
import dotenv
dotenv.load_dotenv()

True

In [2]:
import sys
from pathlib import Path

# Add the parent directory of the notebook to the Python path
notebook_dir = Path('.').resolve()
project_root = notebook_dir.parent
sys.path.append(str(project_root))

import json
import os
from pathlib import Path
from typing import Optional

from loguru import logger
from unstructured.partition.auto import partition

import chromadb
from llama_index.core import VectorStoreIndex
from llama_index.core.schema import TextNode
from llama_index.core.storage import StorageContext
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.vector_stores.chroma import ChromaVectorStore

from rag._defaults import DEFAULT_MAX_EMBED_BSZ, DEFAULT_SYSTEM_PROMPT
from rag._utils import get_tag_from_dir
from rag.rag_schema import DataElement, DataType, Document, Metadata


  from .autonotebook import tqdm as notebook_tqdm


In [38]:
from llama_index.llms.openllm import OpenLLM
from transformers import AutoTokenizer
model_name = "meta-llama/Meta-Llama-3.1-70B-Instruct"
chat_model_endpoint = "http://llama-31-70b-jordan.models.mlds-kserve.us.rdlabs.hpecorp.net/v1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
generate_kwargs = {
    "do_sample": False,
}
llm = OpenLLM(
    model=model_name,
    api_base =chat_model_endpoint,
    api_key="fake",
    generate_kwargs=generate_kwargs,
    max_tokens=200
)
uninformative_prompt = """
Does the text extract below from a parsed PDF look like it's a part of the table of contents, or repeated header and footer, or a random gibberish of characters?
{context}

Only respond with "yes" or "no".
"""

question_answered_prompt = """
Generate the main question that is answered by the information provided in the passage below.  Ignore weird formatting or characters that look out of place.
{context}

Only respond with the question.
"""
def generate_completion(llm, tokenizer, text, system_prompt=DEFAULT_SYSTEM_PROMPT):

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": text},
    ]
    toks = tokenizer.apply_chat_template(messages, add_generation_prompt=True)
    text = tokenizer.decode(toks)
    return llm.complete(text)


In [4]:
# Parsing functions
def elements_to_rag_schema(elements: list, tag=None) -> list[DataElement]:
    output_list = Document()
    for element in elements:
        el = element.to_dict()
        if "url" in el["metadata"]:
            source = el["metadata"]["url"]
        elif "filename" in el["metadata"]:
            source = el["metadata"]["filename"]
        else:
            source = "Unknown"
        el["metadata"]["source"] = source
        page_number = el["metadata"].get("page_number", 1)
        url = el["metadata"].get("url", None)
        text_as_html = el["metadata"].get("text_as_html", None)
    
        output_list.append(
            DataElement(
                id=el["element_id"],
                data_type=DataType(el["type"]),
                content=el["text"],
                metadata=Metadata(
                    source=el["metadata"]["source"],
                    page_number=page_number,
                    url=url,
                    text_as_html=text_as_html,
                    tag=tag,
                ),
            )
        )
    return output_list

def parse(
    input_file,
    output,
    strategy,
    chunking_strategy: str,
    combine_text_under_n_chars: Optional[int] = None,
    max_characters: Optional[int] = None,
    new_after_n_chars: Optional[int] = None,
    tag=None,
) -> None:
    logger.info(f"Processing {input_file}. Using {tag=}")
    elements = partition(
        filename=input_file,
        skip_infer_table_types=[],
        strategy=strategy,
        chunking_strategy=chunking_strategy,
        combine_text_under_n_chars=combine_text_under_n_chars,
        max_characters=max_characters,
        new_after_n_chars=new_after_n_chars,
    )
    logger.info("Done parsing elements")
    output_list = elements_to_rag_schema(elements, tag=tag)
    input_file_path = Path(input_file)

    print(f"{input_file_path=} {input_file_path.parent=}, {input_file_path.stem + '.json'=}")
    output_path = Path(os.path.join(output, input_file_path.stem + ".json"))
    if not output_path.parent.exists():
        print(f"Creating {input_file_path.parent.absolute()=}")
        os.makedirs(output_path.parent.absolute())
    with open(output_path, "w") as f:
        logger.info(f"Writing output to {output_path}")
        json.dump(output_list, f, indent=4)


In [35]:
def clean_parsed(json_file):
    results = []
    print(json_file)
    with open(json_file, "r") as f:
        input_text = json.load(f)
        for doc in input_text:
            if isinstance(doc, dict):
                if doc["data_type"] == "Table":
                    text = doc["metadata"]["text_as_html"]
                else:
                    text = doc["content"]
            uninformative = generate_completion(llm, tokenizer, uninformative_prompt.format(context=text)).text
            if uninformative=="no":
                print("************** UNINFORMATIVE **************")
                print(text)
            else:
                prefix = question_answered_prompt.format(context=text)
                question_answered = generate_completion(llm, tokenizer, prefix).text
                print("************** Generating question **************")
                print(prefix)
                print(question_answered)
                doc["metadata"]["question_answered"] = question_answered
                results.append(doc)
    with open(json_file, "w") as f:
        json.dump(results, f, indent=4)

In [41]:
# Embedding functions
import weaviate
from llama_index.vector_stores.weaviate import WeaviateVectorStore
from llama_index.core.indices.vector_store.base import VectorStoreIndex
from llama_index.core.storage import StorageContext
from llama_index.core.schema import TextNode

    
def embed(data_path: str, path_to_db: str, embed_model, weaviate_client) -> None:
    weaviate_client.collections.delete("Documents")
    vector_store = WeaviateVectorStore(weaviate_client=weaviate_client, index_name="Documents")
    storage_context = StorageContext.from_defaults(vector_store=vector_store)

    docs = []
    for dirpath, _, files in os.walk(data_path):
        for file in files:
            input_file = os.path.join(dirpath, file)

            with open(input_file, "r") as f:
                input_text = json.load(f)
                for doc in input_text:
                    if isinstance(doc, dict):
                        if doc["data_type"] == "Table":
                            text = doc["metadata"]["text_as_html"]
                        else:
                            text = doc["content"]
                        source = doc["metadata"]["source"]
                        page_number = doc["metadata"].get("page_number", 1)
                        tag = doc["metadata"].get("tag", "")
                        question_answered = doc["metadata"].get("question_answered", "")
                        metadata = {
                            "Source": source,
                            "PageNumber": page_number,
                            "Commit": os.environ.get("PACH_JOB_ID", ""),
                            "Tag": tag,
                            "QuestionAnswered": question_answered
                        }
                        docs.append(
                            TextNode(
                                text=text, 
                                metadata=metadata, 
                                excluded_embed_metadata_keys=["Source", "PageNumber", "Commit", "Tag"], 
                                excluded_llm_metadata_keys=["Source", "PageNumber", "Commit", "Tag", "QuestionAnswered"],
                                metadata_template="{value}"
                            )
                        )

    print("Number of chunks: ", len(docs))

    # Insert nodes into both indices
    index = VectorStoreIndex(
        docs,
        storage_context=storage_context,
        embed_model=embed_model,
        insert_batch_size=DEFAULT_MAX_EMBED_BSZ,
    )
    print("Indexing done!")
    index.storage_context.persist(persist_dir=path_to_db)
    print(f"Persisting done! Saved at {path_to_db}")
    return weaviate_client

In [7]:
def embed_chroma(data_path: str, path_to_db: str, embed_model, db: chromadb.PersistentClient) -> None:
    collection = db.get_or_create_collection(name="documents", metadata={"hnsw:space": "cosine"})
    vector_store = ChromaVectorStore(chroma_collection=collection)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)

    docs = []
    index = VectorStoreIndex(
        docs,
        storage_context=storage_context,
        embed_model=embed_model,
        insert_batch_size=DEFAULT_MAX_EMBED_BSZ,
    )
    for dirpath, _, files in os.walk(data_path):
        for file in files:
            input_file = os.path.join(dirpath, file)

            with open(input_file, "r") as f:
                input_text = json.load(f)
                for doc in input_text:
                    if isinstance(doc, dict):
                        if doc["data_type"] == "Table":
                            text = doc["metadata"]["text_as_html"]
                        else:
                            text = doc["content"]
                        source = doc["metadata"]["source"]
                        page_number = doc["metadata"].get("page_number", 1)
                        tag = doc["metadata"].get("tag", "")
                        metadata = {
                            "Source": source,
                            "PageNumber": page_number,
                            "Commit": os.environ.get("PACH_JOB_ID", ""),
                            "Tag": tag,
                        }
                        docs.append(TextNode(text=text, metadata=metadata))

    print("Number of chunks: ", len(docs))

    index.insert_nodes(docs, show_progress=True)
    print("Indexing done!")
    index.storage_context.persist(persist_dir=path_to_db)
    print(f"Persisting done! Saved at {path_to_db}")

In [39]:
root_dir = "/home/liam/Projects/determined-repos/rag-pdf"
HOSTED_CHAT = "http://llama-31-70b-jordan.models.mlds-kserve.us.rdlabs.hpecorp.net/v1"
HOSTED_EMBED = "http://embedding-tyler.models.mlds-kserve.us.rdlabs.hpecorp.net/v1"
input_dir = f"{root_dir}/private/RFQ_Commercial"
output_dir = f"{root_dir}/private/test/parsed"
db_path = f"{root_dir}/private/test/embedded"
embedding_model_path = HOSTED_EMBED
input_path = Path(input_dir)
parsing_strategy = "hi_res"
chunking_strategy = "by_title"
combine_text_under_n_chars = 200
max_characters = 1500
new_after_n_chars = 1500


if not input_path.is_dir():
    raise ValueError("Input must be a directory.")

for subdir in input_path.iterdir():
    if subdir.is_dir() and "Petrobras" in subdir.name:
        tag = get_tag_from_dir(subdir)
        for file in subdir.rglob("*"):
            if file.suffix.lower() in (".pdf", ".docx"):
                parse(
                    str(file),
                    output_dir,
                    parsing_strategy,
                    chunking_strategy,
                    combine_text_under_n_chars,
                    max_characters,
                    new_after_n_chars,
                    tag,
                )

[32m2024-09-19 04:49:55.107[0m | [1mINFO    [0m | [36m__main__[0m:[36mparse[0m:[36m43[0m - [1mProcessing /home/liam/Projects/determined-repos/rag-pdf/private/RFQ_Commercial/Petrobras/Petrobras.pdf. Using tag='Petrobras'[0m
[32m2024-09-19 04:52:56.681[0m | [1mINFO    [0m | [36m__main__[0m:[36mparse[0m:[36m53[0m - [1mDone parsing elements[0m
[32m2024-09-19 04:52:56.858[0m | [1mINFO    [0m | [36m__main__[0m:[36mparse[0m:[36m63[0m - [1mWriting output to /home/liam/Projects/determined-repos/rag-pdf/private/test/parsed/Petrobras.json[0m


input_file_path=PosixPath('/home/liam/Projects/determined-repos/rag-pdf/private/RFQ_Commercial/Petrobras/Petrobras.pdf') input_file_path.parent=PosixPath('/home/liam/Projects/determined-repos/rag-pdf/private/RFQ_Commercial/Petrobras'), input_file_path.stem + '.json'='Petrobras.json'


In [40]:
for file in Path(output_dir).rglob("*"):
    clean_parsed(file)

/home/liam/Projects/determined-repos/rag-pdf/private/test/parsed/Petrobras.json
************** Generating question **************

Generate the main question that is answered by the information provided in the passage below.  Ignore weird formatting or characters that look out of place.
<table><tr><td/><td/><td/><td/><td>Doc. No.:</td><td>20901-SPOG-50000-IN-SP-0002</td></tr><tr><td/><td/><td/><td/><td>Rev No.:</td><td>Al</td></tr><tr><td>supoor</td><td>ators</td><td>QUILL &amp;</td><td>GAS</td><td>Page:</td><td>Page 1 of 95</td></tr></table>

Only respond with the question.

What is the document number and revision number of this document?
************** Generating question **************

Generate the main question that is answered by the information provided in the passage below.  Ignore weird formatting or characters that look out of place.
= BR PETROBRAS

PETROBRAS ALBACORA FPSO PROJECT No. 20901

TECHNICAL SPECIFICATION FOR ICSS

Only respond with the question.

What are the tech

In [36]:
# Embedding
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.embeddings.openai import OpenAIEmbedding
if embedding_model_path.startswith("http"):
    print(f"\nUsing Embedding API model endpoint: {embedding_model_path}\n")
    embed_model = OpenAIEmbedding(api_base=embedding_model_path, api_key="dummy")
else:
    print(f"\nUsing local Embedding model: {embedding_model_path}\n")
    embed_model = HuggingFaceEmbedding(model_name=embedding_model_path)

try:
    weaviate_client = weaviate.WeaviateClient(
        embedded_options=weaviate.EmbeddedOptions(
            persistence_data_path=db_path
        )
    )
    weaviate_client.connect()
except:
    weaviate_client = weaviate.connect_to_local(port=8079, grpc_port=50060)
    print(weaviate_client.is_ready())


Using Embedding API model endpoint: http://embedding-tyler.models.mlds-kserve.us.rdlabs.hpecorp.net/v1



{"action":"startup","default_vectorizer_module":"none","level":"info","msg":"the default vectorizer modules is set to \"none\", as a result all new schema classes without an explicit vectorizer setting, will use this vectorizer","time":"2024-09-19T04:47:49-07:00"}
{"action":"startup","auto_schema_enabled":true,"level":"info","msg":"auto schema enabled setting is set to \"true\"","time":"2024-09-19T04:47:49-07:00"}
{"level":"info","msg":"No resource limits set, weaviate will use all available memory and CPU. To limit resources, set LIMIT_RESOURCES=true","time":"2024-09-19T04:47:49-07:00"}
{"level":"info","msg":"module offload-s3 is enabled","time":"2024-09-19T04:47:49-07:00"}
{"level":"info","msg":"open cluster service","servers":{"Embedded_at_8079":59801},"time":"2024-09-19T04:47:49-07:00"}
{"address":"192.168.68.51:59802","level":"info","msg":"starting cloud rpc server ...","time":"2024-09-19T04:47:49-07:00"}
{"level":"info","msg":"starting raft sub-system ...","time":"2024-09-19T04:4

In [42]:
embed(output_dir, db_path, embed_model, weaviate_client)

{"action":"startup","default_vectorizer_module":"none","level":"info","msg":"the default vectorizer modules is set to \"none\", as a result all new schema classes without an explicit vectorizer setting, will use this vectorizer","time":"2024-09-19T05:03:14-07:00"}
{"action":"startup","auto_schema_enabled":true,"level":"info","msg":"auto schema enabled setting is set to \"true\"","time":"2024-09-19T05:03:14-07:00"}
{"level":"info","msg":"No resource limits set, weaviate will use all available memory and CPU. To limit resources, set LIMIT_RESOURCES=true","time":"2024-09-19T05:03:14-07:00"}
{"level":"info","msg":"module offload-s3 is enabled","time":"2024-09-19T05:03:14-07:00"}
{"level":"info","msg":"open cluster service","servers":{"Embedded_at_8079":49821},"time":"2024-09-19T05:03:14-07:00"}
{"address":"192.168.68.51:49822","level":"info","msg":"starting cloud rpc server ...","time":"2024-09-19T05:03:14-07:00"}
{"level":"info","msg":"starting raft sub-system ...","time":"2024-09-19T05:0

Number of chunks:  316


{"action":"telemetry_push","level":"info","msg":"telemetry started","payload":"\u0026{MachineID:ec9b4e0c-3094-4557-b420-372242d01387 Type:INIT Version:1.26.1 NumObjects:0 OS:linux Arch:amd64 UsedModules:[]}","time":"2024-09-19T05:03:17-07:00"}
{"action":"bootstrap","level":"info","msg":"node reporting ready, node has probably recovered cluster from raft config. Exiting bootstrap process","time":"2024-09-19T05:03:17-07:00"}
{"action":"hnsw_prefill_cache_async","level":"info","msg":"not waiting for vector cache prefill, running in background","time":"2024-09-19T05:03:17-07:00","wait_for_cache_prefill":false}
{"level":"info","msg":"Created shard documents_1AqC9BmQNOW8 in 5.115213ms","time":"2024-09-19T05:03:17-07:00"}
{"action":"hnsw_vector_cache_prefill","count":1000,"index_id":"main","level":"info","limit":1000000000000,"msg":"prefilled vector cache","time":"2024-09-19T05:03:17-07:00","took":235072}


Indexing done!
Persisting done! Saved at /home/liam/Projects/determined-repos/rag-pdf/private/test/embedded


<weaviate.client.WeaviateClient at 0x729de82f4220>