# RAG Implementation

The source PDFs are the 10-K filings of Uber, Lyft, and United.

### Set Up the Azure OpenAI Client

In [1]:
import os

from openai import AzureOpenAI

# Azure OpenAI configuration.
#
# SECURITY: credentials must never be committed inside a notebook. The API key
# is read from the environment instead of being hardcoded. A key was previously
# stored in this cell in plain text, so it should be treated as leaked and
# rotated in the Azure portal.
#
# The endpoint is not a secret, so the original value is kept as the default
# and can be overridden with AZURE_OPENAI_ENDPOINT.
AZURE_OPENAI_ENDPOINT = os.environ.get(
    "AZURE_OPENAI_ENDPOINT",
    "https://traversaal-ai-openai-urdu-llmam.openai.azure.com",
)
# AZURE_OPENAI_API_KEY is the variable name used by the openai SDK itself;
# AZURE_OPENAI_KEY is accepted as a fallback to match this notebook's naming.
AZURE_OPENAI_KEY = os.environ.get("AZURE_OPENAI_API_KEY") or os.environ.get("AZURE_OPENAI_KEY")
AZURE_API_VERSION = "2023-03-15-preview"

if not AZURE_OPENAI_KEY:
    # Fail here with an actionable message rather than later with an opaque
    # authentication error on the first API call.
    raise RuntimeError(
        "Azure OpenAI API key not found. Set the AZURE_OPENAI_API_KEY "
        "environment variable before running this notebook."
    )

# Shared client used by the embedding and chat-completion calls below.
client = AzureOpenAI(
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    api_key=AZURE_OPENAI_KEY,
    api_version=AZURE_API_VERSION
)


In [2]:
import os
import pickle
from pathlib import Path

import faiss

# Directory that holds the prebuilt FAISS indices and their metadata pickles.
# The original machine-specific location is kept as the default so existing
# runs behave the same; set RAG_DATA_DIR to run the notebook elsewhere.
DATA_DIR = Path(os.environ.get("RAG_DATA_DIR", "/Users/zhengyaojin/Desktop/Capstone_1205"))

# Load the FAISS indices (faiss.read_index takes a string path)
text_index = faiss.read_index(str(DATA_DIR / "text_index.faiss"))
table_index = faiss.read_index(str(DATA_DIR / "table_index.faiss"))

# Load the metadata dictionaries.
# NOTE(security): pickle.load can execute arbitrary code; only load metadata
# files that were produced by this project's own indexing step.
with open(DATA_DIR / "text_metadata.pkl", "rb") as f:
    text_metadata = pickle.load(f)
with open(DATA_DIR / "table_metadata.pkl", "rb") as f:
    table_metadata = pickle.load(f)

# Verify loaded data: each index should hold as many vectors as its metadata has entries
print(f"Text Index Size: {text_index.ntotal}")
print(f"Table Index Size: {table_index.ntotal}")
print(f"Text Metadata Entries: {len(text_metadata)}")
print(f"Table Metadata Entries: {len(table_metadata)}")


Text Index Size: 2626
Table Index Size: 712
Text Metadata Entries: 2626
Table Metadata Entries: 712


In [3]:
import faiss
import pickle

def confirm_faiss_and_metadata():
    """
    Reload the FAISS indices and metadata pickles from the current working
    directory and print a consistency report.

    The report lists the size of each index, the number of metadata entries,
    whether the two counts agree, and the first metadata entry of each kind.
    Any exception raised along the way is caught and printed; nothing is
    returned.
    """
    try:
        # Read everything up front; a failure here skips the whole report.
        txt_idx = faiss.read_index("text_index.faiss")
        tbl_idx = faiss.read_index("table_index.faiss")

        with open("text_metadata.pkl", "rb") as text_fh:
            txt_meta = pickle.load(text_fh)
        with open("table_metadata.pkl", "rb") as table_fh:
            tbl_meta = pickle.load(table_fh)

        # Raw counts
        print(f"Text FAISS Index Size: {txt_idx.ntotal}")
        print(f"Table FAISS Index Size: {tbl_idx.ntotal}")
        print(f"Number of Text Metadata Entries: {len(txt_meta)}")
        print(f"Number of Table Metadata Entries: {len(tbl_meta)}")

        # One vector per metadata entry is expected for each index
        text_matches = txt_idx.ntotal == len(txt_meta)
        print(
            "✅ Text index and metadata are consistent."
            if text_matches
            else "❌ Text index and metadata mismatch!"
        )

        table_matches = tbl_idx.ntotal == len(tbl_meta)
        print(
            "✅ Table index and metadata are consistent."
            if table_matches
            else "❌ Table index and metadata mismatch!"
        )

        # Show the first entry of each metadata dict for a manual sanity check
        print("\nSample Text Metadata Entry:")
        first_text = list(txt_meta.items())[:1]
        if first_text:
            key, value = first_text[0]
            print(f"Key: {key}, Metadata: {value}")

        print("\nSample Table Metadata Entry:")
        first_table = list(tbl_meta.items())[:1]
        if first_table:
            key, value = first_table[0]
            print(f"Key: {key}, Metadata: {value}")

    except Exception as err:
        print(f"Error during confirmation: {err}")

# Run the confirmation immediately so its printed report (sizes, consistency
# checks, and sample entries) appears in this cell's output.
confirm_faiss_and_metadata()


Text FAISS Index Size: 2626
Table FAISS Index Size: 712
Number of Text Metadata Entries: 2626
Number of Table Metadata Entries: 712
✅ Text index and metadata are consistent.
✅ Table index and metadata are consistent.

Sample Text Metadata Entry:
Key: 0, Metadata: {'file_name': 'Lyft 2020.pdf', 'content_type': 'text', 'content': 'UNITED STATES\nSECURITIES AND EXCHANGE COMMISSION\nWashington, D.C. 20549\nFORM 10-K\n \n(Mark One)\n☒\nANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934\nFor the fiscal year ended December 31, 2019\nOR\n☐\nTRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934 FOR THE TRANSITION\nPERIOD FROM                      TO\nCommission File Number 001-38846\nLyft, Inc.\n(Exact name of Registrant as specified in its Charter)\nDelaware\n20-8809830\n(State or other jurisdiction of \nincorporation or organization)\n(I.R.S. Employer \nIdentification No.)\n185 Berry Street, Suite 5000\nSan Francisco, Califo

### Implement Retrieval-Augmented Generation (RAG)

In [4]:
def retrieve_relevant_content(query, top_k=15):
    """
    Retrieve relevant content (both text and tables) for the query.

    The query is embedded with the module-level Azure OpenAI ``client`` and the
    resulting vector is searched against both ``text_index`` and
    ``table_index``. The ``content`` field of every hit is formatted as a
    markdown bullet.

    Args:
        query (str): Natural-language question to search for.
        top_k (int): Maximum number of hits to take from EACH index, so up to
            ``2 * top_k`` snippets may be returned.

    Returns:
        str: Text hits followed by table hits, separated by blank lines.

    Raises:
        ValueError: If ``query`` is not a non-empty string or ``top_k`` is
            less than 1.
    """
    # Validate before spending an embedding API call on unusable input.
    if not isinstance(query, str) or not query.strip():
        raise ValueError("query must be a non-empty string")
    if top_k < 1:
        raise ValueError("top_k must be a positive integer")

    # Imported locally because the notebook's `import numpy as np` lives in a
    # later cell; this keeps the function usable regardless of cell order.
    import numpy as np

    # Embed the user query
    query_response = client.embeddings.create(
        input=query,
        model="text-embedding-3-small"
    )

    # Build the (1, dim) float32 query matrix once and reuse it for both searches.
    query_matrix = np.asarray(
        query_response.data[0].embedding, dtype="float32"
    ).reshape(1, -1)

    # Search text index; indices equal to -1 are skipped (no match for that slot)
    _, text_indices = text_index.search(query_matrix, top_k)
    text_results = [
        f"- **Text Insight**: {text_metadata[idx]['content']}"
        for idx in text_indices[0] if idx != -1
    ]

    # Search table index
    _, table_indices = table_index.search(query_matrix, top_k)
    table_results = [
        f"- **Table Data**: {table_metadata[idx]['content']}"
        for idx in table_indices[0] if idx != -1
    ]

    # Combine text and table results into a single context
    combined_results = text_results + table_results
    return "\n\n".join(combined_results)


In [5]:
import numpy as np

def generate_rag_response(user_query, top_k=15, model="gpt-4o-large",
                          max_tokens=1500, temperature=0.5):
    """
    Generate a detailed response using RAG by combining text and table data.

    Context is fetched with ``retrieve_relevant_content`` and embedded in a
    prompt that is sent to the chat-completions endpoint of the module-level
    ``client``.

    Args:
        user_query (str): User query.
        top_k (int): Number of hits requested from each index during retrieval.
        model (str): Chat model / deployment name passed to the API.
        max_tokens (int): Upper bound on the length of the generated answer.
        temperature (float): Sampling temperature passed to the API.

    Returns:
        str: Generated response, or a fixed error message if the chat
        completion call fails (the underlying error is printed).

    Note:
        Errors raised during retrieval are NOT caught here and propagate to
        the caller; only the chat completion call is wrapped.
    """
    # Retrieve relevant content
    context = retrieve_relevant_content(user_query, top_k=top_k)

    # Construct the prompt with explicit instructions
    prompt = f"""
    The following context includes text insights and table data extracted from financial documents. Analyze the information and provide a detailed response to the user's query.

    Context:
    {context}

    User Query:
    {user_query}

    Instructions:
    - Combine insights from text and table data.
    - Summarize key points and provide numerical trends from tables.
    - Draw comparisons and explain implications clearly.
    - If possible, provide actionable insights or next steps.
    """

    # Call Azure OpenAI API to generate a response
    try:
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": "You are a financial data assistant."},
                {"role": "user", "content": prompt}
            ],
            max_tokens=max_tokens,
            temperature=temperature
        )
        return response.choices[0].message.content.strip()

    except Exception as e:
        # Best-effort: report the failure and hand back a displayable message
        # instead of raising.
        print(f"Error: {e}")
        return "An error occurred while generating the response."



### Run RAG queries

In [6]:
# Example user queries. Only the uncommented assignment is active; the
# commented-out lines are alternative questions that can be swapped in.
# user_query = "What was Uber's total revenue in 2023, and how does it compare to their expenses?"
# user_query = "What was United's total revenue in 2023, and how does it compare to their expenses?"
# user_query = "What was Lyft's total revenue in 2023, and how does it compare to their expenses?"
user_query = "What was Uber's total revenue in 2023, and how does it compare to Lyft 2023 total revenue?"

# Generate the RAG response (retrieval and the chat completion both happen
# inside generate_rag_response)
response = generate_rag_response(user_query)

# Display the response
print(f"\nRAG Response:\n{response}")


RAG Response:
In 2023, Uber's total revenue was $37.281 billion, as detailed in the "Results of Operations" table for the year ended December 31, 2023. Comparatively, Lyft's total revenue for the same year was $4.404 billion, as shown in the "Lyft, Inc. Consolidated Statements of Operations" table.

### Key Points and Numerical Trends:
1. **Uber's Revenue**:
   - Total revenue for 2023: $37.281 billion.
   - This represented a 17% increase from the previous year (2022), where the revenue was $31.877 billion.

2. **Lyft's Revenue**:
   - Total revenue for 2023: $4.404 billion.
   - This represented a 7.5% increase from the previous year (2022), where the revenue was $4.095 billion.

### Comparison:
- **Revenue Scale**: Uber's total revenue in 2023 was significantly higher than Lyft's, with Uber generating approximately 8.5 times more revenue than Lyft.
- **Growth Rate**: Uber's revenue growth rate from 2022 to 2023 was 17%, while Lyft's was 7.5%. This indicates that Uber not only had a

In [None]:
import streamlit as st

# NOTE(review): Streamlit apps are normally launched with `streamlit run <script>.py`;
# confirm this cell is exported to a script together with the cells that define
# `generate_rag_response` and its dependencies.

# App Title and Description
st.title("10-K Analysis and Question Answering")
st.write("""
A system to process and analyze Lyft's 10-K filings and answer questions related to financial metrics, cost structure, and ride volume.
""")

# Question Input Section
st.header("Ask a Question")
user_question = st.text_input("Enter your question here:")

# Submit Button Logic
if st.button("Submit Question"):
    # Treat whitespace-only input the same as no input.
    if not user_question or not user_question.strip():
        st.error("Please enter a question.")
    else:
        st.info("Processing your query...")
        try:
            # Use the notebook's own RAG pipeline. The previous code called
            # `retrieve_relevant_sections` and `model`, neither of which is
            # defined in this notebook, so every submit ended in the error branch.
            answer = generate_rag_response(user_question)
            st.subheader("Response")
            st.write(answer)
        except Exception as e:
            st.error(f"Error during query processing: {e}")

# Footer
st.write("---")
st.write("Developed for Lyft's 10-K financial analysis.")
