# CorpPulse: A RAG Application to query against shareholder letters

### This application is hosted on [Streamlit Community Cloud](https://corppulse.streamlit.app)

First, we need to install and import all of the dependencies for the project.

In [23]:
!pip install altair numpy pandas pydeck streamlit haystack-ai datasets>=2.6.1 \
sentence-transformers>=2.2.0 streamlit transformers

from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from haystack.components.embedders import SentenceTransformersTextEmbedder
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever
from haystack.components.builders import PromptBuilder
from haystack.components.generators.openai import OpenAIGenerator
from haystack.components.generators.utils import print_streaming_chunk
from haystack.utils import Secret
from haystack import Pipeline
from haystack import Document
import streamlit as st
import os
import os
import time
from streamlit.logger import get_logger
from google.colab import userdata


Next, we define our constants: our OpenAI API key and our data directory.

In [11]:
# OpenAI API key, fetched from Colab's `userdata` under the name "OPENAI_KEY"
OPENAI_KEY = userdata.get("OPENAI_KEY")
# Directory containing the .txt letters that are loaded into the document store
DATA_DIR = "data/Letters"

We first create our pipeline function. This function houses our pipeline logic, allowing the pipeline to be called from our Streamlit chat surface. See the inline comments for how it is set up.

In [12]:
@st.cache_resource
def rag_pipeline() -> Pipeline:
    """Build the retrieval-augmented generation pipeline over the files in DATA_DIR.

    Every ``.txt`` file in ``DATA_DIR`` is read, embedded and written to an
    in-memory document store. The returned pipeline embeds the query text,
    retrieves documents from that store, renders them into the prompt
    template and passes the prompt to the OpenAI generator.

    The function is wrapped in ``st.cache_resource``.

    Returns:
        A connected Haystack ``Pipeline`` with the components
        ``text_embedder``, ``retriever``, ``prompt_builder`` and ``llm``.
    """
    # Gather documents (sorted so the document order is the same on every run)
    docs = []
    for filename in sorted(os.listdir(DATA_DIR)):
        if not filename.endswith('.txt'):  # Ensure we're reading text files
            continue
        file_path = os.path.join(DATA_DIR, filename)
        # Explicit encoding so reading does not depend on the platform default
        with open(file_path, 'r', encoding="utf-8") as f:
            docs.append(Document(content=f.read()))

    # Initialise the document embedder
    doc_embedder = SentenceTransformersDocumentEmbedder(model="sentence-transformers/all-MiniLM-L6-v2")
    # The embedder is run outside a pipeline here, so it is warmed up explicitly
    doc_embedder.warm_up()
    # Run the document embedder to get the embeddings
    docs_with_embeddings = doc_embedder.run(docs)["documents"]

    # Initialize the document store and store our documents here
    document_store = InMemoryDocumentStore(embedding_similarity_function="cosine")
    document_store.write_documents(docs_with_embeddings)

    # Initialize our retriever and ensure it's retrieving from our document store
    retriever = InMemoryEmbeddingRetriever(document_store=document_store)

    # Initialize our text embedder (same model as the document embedder)
    text_embedder = SentenceTransformersTextEmbedder(model="sentence-transformers/all-MiniLM-L6-v2")

    # Answer user question, focusing on the most relevant documents
    template = """
    Carefully analyze the provided documents and answer the question.  Highlight trends, patterns, or significant conclusions that can be drawn by considering the information as a whole:

    {% for document in documents %}
         {{ document.content }}
     {% endfor %}
    Question: {{question}}
    Answer:
    """

    # Initialize our prompt builder
    prompt_builder = PromptBuilder(template=template)

    # Initialize our LLM generator using GPT-3.5-Turbo
    generator = OpenAIGenerator(
        api_key=Secret.from_token(OPENAI_KEY),
        model="gpt-3.5-turbo",
        streaming_callback=print_streaming_chunk,
    )

    # Initialize our pipeline (named `pipe` so it does not shadow this function)
    pipe = Pipeline()

    # Add our components
    pipe.add_component("text_embedder", text_embedder)
    pipe.add_component("retriever", retriever)
    pipe.add_component("prompt_builder", prompt_builder)
    pipe.add_component("llm", generator)

    # Connect our components to each other
    pipe.connect("text_embedder.embedding", "retriever.query_embedding")
    pipe.connect("retriever", "prompt_builder.documents")
    pipe.connect("prompt_builder", "llm")

    return pipe


Next, we need to generate our answers when a query is entered on the Streamlit chat surface.

In [14]:
# Call the factory to obtain the Pipeline object; binding the function itself
# would leave `pipeline` without a `.run` method.
pipeline = rag_pipeline()


# Function to execute the RAG pipeline
def generate_answer(query: str) -> str:
    """Run the RAG pipeline for ``query`` and return the first reply as text.

    Args:
        query: The user's question. It is passed both to the text embedder
            and to the prompt builder as the question.

    Returns:
        The first reply from the ``llm`` component, with every ``$``
        replaced by ``\\$``.
    """
    result = pipeline.run(
        {
            "text_embedder": {"text": query},
            "prompt_builder": {"question": query}
        }
    )
    answer = result["llm"]["replies"][0]
    # Escape dollar signs, presumably so Streamlit's Markdown rendering does
    # not treat them as LaTeX delimiters. "\\$" is the same two characters the
    # original "\$" produced, written as a valid escape sequence.
    return str(answer).replace("$", "\\$")

We want to make our chat experience pleasant, so we stream the answer generated by the LLM to the chat surface word by word.

In [15]:
def stream_answer(answer):
    """Yield ``answer`` one space-separated piece at a time.

    Each piece is emitted with a trailing space, followed by a 0.02 second
    pause before the next one is produced.
    """
    pause_seconds = 0.02
    for chunk in map("{} ".format, answer.split(" ")):
        yield chunk
        time.sleep(pause_seconds)

We now want to add our Streamlit code so we can view this on a nice GUI

In [17]:
def run():
    """Render the Streamlit page: upload sidebar, header and chat surface.

    An uploaded ``.txt`` file is saved into ``DATA_DIR``. A question typed
    into the chat input is answered with ``generate_answer`` and the reply is
    written to the chat via ``stream_answer``.
    """
    # Sidebar for uploading new documents
    st.sidebar.title("Upload a New File")
    uploaded_file = st.sidebar.file_uploader("Choose a .txt file", type="txt")

    if uploaded_file is not None:
        # The name comes from the client; keep only its final path component
        # so the file cannot be written outside the data directory.
        filename = os.path.basename(uploaded_file.name)

        if filename in ("", ".", ".."):
            st.sidebar.error("Invalid file name.")
        else:
            # Save the uploaded file to the data directory
            with open(os.path.join(DATA_DIR, filename), "wb") as f:
                f.write(uploaded_file.getbuffer())

            st.sidebar.success('File uploaded successfully!')
            # NOTE(review): rag_pipeline is wrapped in st.cache_resource, so a
            # file saved here is presumably not indexed until that cache is
            # rebuilt. Confirm the intended behaviour.

    # Set column layout for header
    col1, col2 = st.columns(2)
    with col1:
        col1.image("img/header-small.svg", use_column_width=True)
    with col2:
        col2.markdown("<h3 style='text-align: right; margin-top: 20%;'>Query a dataset of over 125 shareholder letters</h3>", unsafe_allow_html=True)

    st.divider()

    user_question = st.chat_input("Ask us a question:")

    if user_question:
        user = st.chat_message("human")
        user.write(user_question)
        message = st.chat_message("assistant")
        # Show a spinner while the pipeline produces the full answer
        with st.spinner("Thinking..."):
            answer = generate_answer(user_question)
        message.write("Hello, ")
        message.write_stream(stream_answer(answer))


# Start the app when this file is executed as a script
if __name__ == "__main__":
    run()

There you have it! This is our complete project. We can query against our existing dataset and we can even upload new documents to add to the dataset.

The Streamlit app will not run in this notebook. For instructions on running it locally, please consult the README.md in the GitHub [repo](https://github.com/dhasty1/CorpPulse).