In [None]:
from llama_parse import LlamaParse
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from groq import Groq
from langchain_groq import ChatGroq
import joblib
import os

import nest_asyncio  # noqa: E402
nest_asyncio.apply()

from dotenv import load_dotenv
load_dotenv()

In [None]:
def load_or_parse_data(fp_pdf):
    """Return the LlamaParse documents for a PDF, backed by an on-disk cache.

    If ``./data/parsed_data.pkl`` exists, it is loaded with joblib and returned
    without parsing anything. Otherwise ``fp_pdf`` is parsed with LlamaParse
    (markdown output), the result is dumped to that pickle file, and returned.

    Args:
        fp_pdf: Path of the PDF passed to ``LlamaParse.load_data``.

    Returns:
        The object produced by ``parser.load_data`` (or the copy of it that
        was previously pickled).

    Note:
        The cache path is fixed and does not depend on ``fp_pdf``: once the
        pickle exists, it is returned for any PDF path. Delete the file to
        force a re-parse.
    """
    fp__parsed_data = "./data/parsed_data.pkl"

    if os.path.exists(fp__parsed_data):
        print("PKL-FILE ALREADY EXISTS.")
        # NOTE(security): joblib.load unpickles the file, which can execute
        # arbitrary code. Only keep a cache file here that this notebook wrote.
        parsed_data = joblib.load(fp__parsed_data)
    else:
        print("PKL-FILE DOESNT EXIST YET. CREATE IT.")
        # Create the cache directory before parsing: joblib.dump does not create
        # missing directories, so without this a fresh checkout would finish the
        # parse and then fail on the dump, losing the result.
        os.makedirs(os.path.dirname(fp__parsed_data), exist_ok=True)

        # Perform the parsing step and store the result in llama_parse_documents
        parsingInstructionUber10k = """The document offers an extensive overview of the UBER's financial performance for the
        fourth quarter of 2022. It includes unaudited financial statements, management discussion and analysis, and other relevant
        disclosures as required by the SEC. The form features several tables for detailed examination. Please provide precise
        responses when addressing the questions contained within."""
        parser = LlamaParse(api_key=os.getenv("LLAMA_CLOUD_API_KEY"),
                            result_type="markdown",
                            parsing_instruction=parsingInstructionUber10k,
                            max_timeout=5000,)
        llama_parse_documents = parser.load_data(fp_pdf)

        # Save the parsed data to a file
        print("SAVE PARSING RESULT IN PICKEL FILE.")
        joblib.dump(llama_parse_documents, fp__parsed_data)

        # Set the parsed data to the variable
        parsed_data = llama_parse_documents

    return parsed_data

def remove_non_utf8_characters(source_file_path, cleaned_file_path):
    """Rewrite a file so that its content is valid UTF-8 text.

    The source is read as raw bytes and decoded as UTF-8 with
    ``errors='replace'``: every undecodable byte sequence becomes U+FFFD
    (the replacement character). Invalid bytes are substituted, not dropped.

    Args:
        source_file_path: File to read.
        cleaned_file_path: File to create or overwrite with the cleaned text.
            May be the same path as ``source_file_path``, because the input is
            read completely before the output file is opened.
    """
    with open(source_file_path, 'rb') as file:
        bytes_data = file.read()

    # Invalid UTF-8 sequences are replaced with U+FFFD.
    clean_text = bytes_data.decode('utf-8', errors='replace')

    # newline='' turns off newline translation on write. The input was read in
    # binary mode, so any "\r\n" is still present in clean_text; with the default
    # translation a Windows run would write it back as "\r\r\n".
    with open(cleaned_file_path, 'w', encoding='utf-8', newline='') as clean_file:
        clean_file.write(clean_text)

    print(f"Cleaned file saved as {cleaned_file_path}")

# Create vector database
def create_vector_database(fp_pdf):
    """
    Build a persisted Chroma vector store from the parsed PDF.

    Steps: obtain the parsed documents via ``load_or_parse_data``, write their
    text to ``data/output.md``, pass that file through
    ``remove_non_utf8_characters``, load it with ``UnstructuredMarkdownLoader``,
    split it into chunks (chunk_size=2000, chunk_overlap=100), embed the chunks
    with FastEmbed (``BAAI/bge-base-en-v1.5``) and store them in the Chroma
    collection ``rag`` under ``chroma_db_llamaparse1``.

    Args:
        fp_pdf: Path of the source PDF, forwarded to ``load_or_parse_data``.

    Returns:
        Tuple ``(vs, embed_model)``: the Chroma vector store and the embedding
        model instance used to build it.
    """
    # Call the function to either load or parse the data
    llama_parse_documents = load_or_parse_data(fp_pdf)

    markdown_path = "data/output.md"
    # The output directory is not guaranteed to exist (e.g. fresh checkout).
    os.makedirs(os.path.dirname(markdown_path), exist_ok=True)
    # Write UTF-8 explicitly: with the platform default (a legacy code page on
    # Windows) characters could fail to encode or be stored as non-UTF-8 bytes,
    # which the cleanup step below would then turn into U+FFFD. newline='\n'
    # keeps the line endings the same on every platform.
    with open(markdown_path, 'w', encoding='utf-8', newline='\n') as f:
        for doc in llama_parse_documents:
            f.write(doc.text + '\n')

    remove_non_utf8_characters(markdown_path, markdown_path)

    loader = UnstructuredMarkdownLoader(markdown_path)

    documents = loader.load()
    # Split loaded documents into chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=100)
    docs = text_splitter.split_documents(documents)

    # Initialize Embeddings
    embed_model = FastEmbedEmbeddings(model_name="BAAI/bge-base-en-v1.5")

    # Create and persist a Chroma vector database from the chunked documents
    vs = Chroma.from_documents(documents=docs, embedding=embed_model
                               , persist_directory="chroma_db_llamaparse1", collection_name="rag")

    return vs, embed_model

# Location of the sample PDF below the current user's Documents folder.
# The home directory comes from os.path.expanduser("~") rather than
# os.getlogin(): getlogin() can raise OSError when the process has no
# controlling terminal, and the profile is not always under C:\Users.
fp_pdf = os.path.join(
    os.path.expanduser("~"),
    "Documents",
    "gitProjects",
    "dataAndAiExamples",
    "ragComplexPdfs",
    "in",
    "Uber-Q4-22-Earnings-Press-Release.pdf",
)
vectorstore, embed_model = create_vector_database(fp_pdf)

In [None]:
# Chat model served through Groq; the API key is read from GROQ_API_KEY.
chat_model = ChatGroq(
    temperature=0,
    model_name="mixtral-8x7b-32768",
    api_key=os.getenv("GROQ_API_KEY"),
)

# Retriever view of the vector store, configured with k=3.
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

In [None]:
# Prompt text for the QA chain. It contains the two placeholders {context} and
# {question}, which set_custom_prompt declares as the template's input variables.
# The literal is sent to the model as-is, so its wording and whitespace matter.
custom_prompt_template = """Use the following pieces of information to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context: {context}
Question: {question}

Only return the helpful answer below and nothing else.
Helpful answer:
"""

def set_custom_prompt(custom_prompt_template):
    """
    Wrap a template string in a PromptTemplate for retrieval QA.

    The template is declared with the input variables ``context`` and
    ``question``; the resulting PromptTemplate is returned.
    """
    return PromptTemplate(
        input_variables=['context', 'question'],
        template=custom_prompt_template,
    )

# Build the prompt object from the template text defined above.
prompt = set_custom_prompt(custom_prompt_template)

# Retrieval QA chain of type "stuff" using the chat model, the retriever and the
# custom prompt; source documents are returned together with the answer.
qa = RetrievalQA.from_chain_type(
    llm=chat_model,
    retriever=retriever,
    chain_type="stuff",
    chain_type_kwargs={"prompt": prompt},
    return_source_documents=True,
)

In [None]:
# Run the chain on the first question; the answer text is the cell's output.
response = qa.invoke(
    {"query": "What is the segment adjusted ebitda of mobility end of 2021 and 2022?"}
)
response["result"]

In [None]:
# Run the chain on the second question and print the answer text.
response = qa.invoke(
    {"query": "What are the Non GAAP Research and development expenses?"}
)
print(response["result"])