In [1]:
from dotenv import load_dotenv
load_dotenv()

##### LLAMAPARSE #####
from llama_parse import LlamaParse

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
#
from groq import Groq
from langchain_groq import ChatGroq
#
import joblib
import os
import nest_asyncio  # noqa: E402
nest_asyncio.apply()

In [2]:
def load_or_parse_data(
    data_file="./data/parsed_data.pkl",
    pdf_path=r"C:\Users\oliver.koehn\Documents\gitProjects\dataAndAiExamples\ragComplexPdfs\in\Uber-Q4-22-Earnings-Press-Release.pdf",
):
    """Return the LlamaParse documents for a PDF, cached on disk with joblib.

    If ``data_file`` exists it is loaded and returned without contacting
    LlamaParse. Otherwise ``pdf_path`` is parsed to markdown, the result is
    dumped to ``data_file`` (creating its parent directory if needed) and
    returned.

    Args:
        data_file: Path of the joblib cache file.
        pdf_path: PDF handed to ``LlamaParse.load_data``. The default is the
            original author's machine-specific location; pass your own path
            when running elsewhere.

    Returns:
        The object returned by ``LlamaParse.load_data`` (or the cached copy
        of it).
    """
    if os.path.exists(data_file):
        # SECURITY: joblib.load unpickles the file, which can execute
        # arbitrary code. Only point data_file at caches you created yourself.
        return joblib.load(data_file)

    # Instruction text is passed verbatim to the parser; keep it unchanged.
    parsing_instruction = """The provided document is a quarterly report filed by Uber Technologies,
        Inc. with the Securities and Exchange Commission (SEC).
        This form provides detailed financial information about the company's performance for a specific quarter.
        It includes unaudited financial statements, management discussion and analysis, and other relevant disclosures required by the SEC.
        It contains many tables.
        Try to be precise while answering the questions"""
    parser = LlamaParse(api_key=os.getenv("LLAMA_CLOUD_API_KEY"),
                        result_type="markdown",
                        parsing_instruction=parsing_instruction,
                        max_timeout=5000,)
    llama_parse_documents = parser.load_data(pdf_path)

    if not llama_parse_documents:
        # Do not cache an empty result (presumably a failed parse - verify
        # against the LlamaParse docs): a cached empty file would be served
        # on every later run and the PDF would never be parsed again.
        print("Parsing returned no documents; nothing was cached.")
        return llama_parse_documents

    # joblib.dump does not create missing directories.
    cache_dir = os.path.dirname(data_file)
    if cache_dir:
        os.makedirs(cache_dir, exist_ok=True)

    print("Saving the parse results in .pkl format ..........")
    joblib.dump(llama_parse_documents, data_file)

    return llama_parse_documents

def remove_non_utf8_characters(source_file_path, cleaned_file_path, errors='replace'):
    """Rewrite a file so that its content is valid UTF-8.

    The source file is read as raw bytes and decoded as UTF-8; byte sequences
    that are not valid UTF-8 are handled according to ``errors``. The decoded
    text is then written to ``cleaned_file_path`` encoded as UTF-8. Source and
    destination may be the same path, because the whole source is read before
    the destination is opened for writing.

    Args:
        source_file_path: File to read.
        cleaned_file_path: File to write (created or overwritten).
        errors: Error handler passed to ``bytes.decode``. The default
            ``'replace'`` substitutes U+FFFD for each invalid sequence (the
            original behaviour); pass ``'ignore'`` to drop invalid bytes
            entirely.
    """
    with open(source_file_path, 'rb') as source_file:
        raw_bytes = source_file.read()

    clean_text = raw_bytes.decode('utf-8', errors=errors)

    # newline='' disables newline translation on write. The text was decoded
    # from raw bytes, so it still holds its original line endings; without
    # this, every existing "\r\n" would be written as "\r\r\n" on Windows.
    with open(cleaned_file_path, 'w', encoding='utf-8', newline='') as clean_file:
        clean_file.write(clean_text)

    print(f"Cleaned file saved as {cleaned_file_path}")

# Create vector database
def create_vector_database(
    markdown_path="data/output.md",
    persist_directory="chroma_db_llamaparse1",
    collection_name="rag",
):
    """Build a persisted Chroma vector store from the parsed PDF.

    Steps:
      1. Obtain the parsed documents via ``load_or_parse_data()``.
      2. Write their text to ``markdown_path`` and re-clean the file with
         ``remove_non_utf8_characters``.
      3. Load the markdown with ``UnstructuredMarkdownLoader`` and split it
         into chunks of 2000 characters with 100 characters of overlap.
      4. Embed the chunks with FastEmbed (``BAAI/bge-base-en-v1.5``) and store
         them in a Chroma collection under ``persist_directory``.

    Args:
        markdown_path: Intermediate markdown file (overwritten on each call).
        persist_directory: Directory passed to Chroma as ``persist_directory``.
        collection_name: Name of the Chroma collection.

    Returns:
        tuple: ``(vs, embed_model)`` - the Chroma vector store and the
        embedding model used to build it.

    Raises:
        ValueError: If no parsed documents are available.
    """
    llama_parse_documents = load_or_parse_data()
    if not llama_parse_documents:
        # Fail with a clear message instead of an IndexError on [0] below.
        raise ValueError("No parsed documents available; cannot build the vector database.")
    print(llama_parse_documents[0].text[:300])

    markdown_dir = os.path.dirname(markdown_path)
    if markdown_dir:
        os.makedirs(markdown_dir, exist_ok=True)

    # Explicit UTF-8: the platform default encoding (e.g. cp1252 on Windows)
    # can fail on, or mis-encode, characters in the parsed text.
    with open(markdown_path, 'w', encoding='utf-8') as f:
        for doc in llama_parse_documents:
            f.write(doc.text + '\n')

    # Safety net: rewrite the file in place as valid UTF-8.
    remove_non_utf8_characters(markdown_path, markdown_path)

    loader = UnstructuredMarkdownLoader(markdown_path)
    documents = loader.load()

    # Split loaded documents into chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=100)
    docs = text_splitter.split_documents(documents)

    print(f"length of documents loaded: {len(documents)}")
    print(f"total number of document chunks generated :{len(docs)}")

    # Initialize Embeddings
    embed_model = FastEmbedEmbeddings(model_name="BAAI/bge-base-en-v1.5")

    # Create a Chroma collection from the chunks, stored under persist_directory.
    # NOTE(review): calling this again with the same directory and collection
    # presumably adds the same chunks a second time - confirm, and clear the
    # collection first if duplicates matter.
    vs = Chroma.from_documents(
        documents=docs,
        embedding=embed_model,
        persist_directory=persist_directory,
        collection_name=collection_name,
    )

    print('Vector DB created successfully !')
    return vs, embed_model

In [3]:
# Build the vector store and keep the embedding model for later cells.
# (`os` is already imported in the first cell, so it is not re-imported here.)
vs, embed_model = create_vector_database()

# Uber Quarterly Report Q4 2022

# Uber Quarterly Report Q4 2022

## Financial Highlights for Fourth Quarter 2022

- Gross Bookings grew 19% year-over-year to $30.7 billion, or 26% on a constant currency basis.
- Revenue grew 49% YoY to $8.6 billion, or 59% on a constant currency basis.
- Net income
Cleaned file saved as data/output.md
length of documents loaded: 1
total number of document chunks generated :15


  from .autonotebook import tqdm as notebook_tqdm
Fetching 8 files: 100%|██████████| 8/8 [00:00<?, ?it/s]


Vector DB created successfully !


In [4]:
# Chat model served by Groq; the API key is read from the GROQ_API_KEY
# environment variable (loaded from .env in the first cell).
# NOTE(review): confirm that this model id is still offered by Groq.
chat_model = ChatGroq(
    temperature=0,
    model_name="mixtral-8x7b-32768",
    api_key=os.getenv("GROQ_API_KEY"),
)

In [5]:
# Open the "rag" collection stored under chroma_db_llamaparse1, using the
# same embedding model that was used to build it.
vectorstore = Chroma(
    collection_name="rag",
    persist_directory="chroma_db_llamaparse1",
    embedding_function=embed_model,
)

# Retriever that returns the 3 best-matching chunks per query.
retriever = vectorstore.as_retriever(search_kwargs={'k': 3})

In [6]:
# Prompt text for the QA chain. `{context}` and `{question}` are the two
# placeholders filled in by the PromptTemplate built from this string.
custom_prompt_template = (
    "Use the following pieces of information to answer the user's question.\n"
    "If you don't know the answer, just say that you don't know, don't try to make up an answer.\n"
    "\n"
    "Context: {context}\n"
    "Question: {question}\n"
    "\n"
    "Only return the helpful answer below and nothing else.\n"
    "Helpful answer:\n"
)

In [7]:
def set_custom_prompt():
    """Return a PromptTemplate built from the module-level ``custom_prompt_template``.

    The template declares two input variables, ``context`` and ``question``.
    """
    return PromptTemplate(
        input_variables=['context', 'question'],
        template=custom_prompt_template,
    )
# Build the prompt once and display it as the cell output.
prompt = set_custom_prompt()
prompt

########################### RESPONSE ###########################
PromptTemplate(input_variables=['context', 'question'], template=custom_prompt_template)

PromptTemplate(input_variables=['context', 'question'], template="Use the following pieces of information to answer the user's question.\nIf you don't know the answer, just say that you don't know, don't try to make up an answer.\n\nContext: {context}\nQuestion: {question}\n\nOnly return the helpful answer below and nothing else.\nHelpful answer:\n")

In [8]:
# Retrieval QA chain: the retrieved chunks are "stuffed" into the custom
# prompt, and the source documents are returned alongside the answer.
qa = RetrievalQA.from_chain_type(
    llm=chat_model,
    retriever=retriever,
    chain_type="stuff",
    chain_type_kwargs={"prompt": prompt},
    return_source_documents=True,
)

In [9]:
# Query text is kept verbatim (including its spelling) so that it matches
# the recorded output below.
response = qa.invoke(
    {"query": "What is the sehment adjusted ebitda of mobility end of 2021 and 2022?"}
)
response

{'query': 'What is the sehment adjusted ebitda of mobility end of 2021 and 2022?',
 'result': 'The Segment Adjusted EBITDA for Mobility was $575 million at the end of 2021 and $1,012 million at the end of 2022, representing a 76% increase.',
 'source_documents': [Document(page_content='| |Three Months Ended December 31|% Change|% Change (Constant Currency)|\n\n|---|---|---|---|\n\n|Mobility|$11,340|$14,894|31%|37%|\n\n|Delivery|$13,444|$14,315|6%|14%|\n\n|Freight (1)|$1,082|$1,540|42%|43%|\n\n|Total|$25,866|$30,749|19%|26%|\n\n(1) Beginning in Q4 2021, Freight Gross Bookings include contributions from the acquisition of Transplace which closed on November 12, 2021.\n\n| |Three Months Ended December 31|% Change|% Change (Constant Currency)|\n\n|---|---|---|---|\n\n|Mobility (1)|$2,278|$4,136|82%|94%|\n\n|Delivery (2)|$2,420|$2,931|21%|33%|\n\n|Freight (3)|$1,080|$1,540|43%|43%|\n\n|Total|$5,778|$8,607|49%|59%|\n\n(1) Mobility Revenue in Q4 2022 benefited by a net amount of $1.2 billion 

In [12]:
# Query text is kept verbatim (including its spelling) so that it matches
# the recorded output below.
response = qa.invoke(
    {"query": "What are teh numbers regarding Bad debt expense?"}
)
response

{'query': 'What are teh numbers regarding Bad debt expense?',
 'result': 'Bad debt expense for the years ended December 31, 2021 and 2022 was $109 and $114, respectively.\nFor the three months ended December 31, 2021 and 2022, bad debt expense was $34 and $38, respectively.',
 'source_documents': [Document(page_content='($550)\n\n($3,834)\n\nInterest Expense\n\n($130)\n\n($483)\n\nOther Income (Expense), Net\n\n$1,471\n\n$3,292\n\nIncome (Loss) Before Income Taxes\n\n$791\n\n($1,025)\n\nNet Income (Loss) Attributable to Uber Technologies, Inc.\n\n$892\n\n($496)\n\nNet Income (Loss) Per Share\n\nBasic\n\n$0.46\n\n($0.26)\n\nDiluted\n\n$0.44\n\n($0.29)\n\nWeighted-average shares used to compute net income per share: Basic - 1,936,736 (2021) and 1,994,800 (2022), Diluted - 2,005,591 (2021) and 2,060,575 (2022).\n\nUber Technologies, Inc. - Quarterly Report\n\nUber Technologies, Inc. - Condensed Consolidated Statements of Cash Flows\n\nThree Months Ended December 31, 2021\n\nThree Months E