In [1]:
from langchain_chroma import Chroma
from langchain.prompts import ChatPromptTemplate
from langchain_ollama.llms import OllamaLLM
from langchain_ollama.embeddings import OllamaEmbeddings
from langchain.document_loaders.pdf import PyPDFDirectoryLoader, OnlinePDFLoader
from langchain_community.document_loaders.pdf import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter, TextSplitter
from langchain.schema.document import Document

In [2]:
CHROMA_PATH = "chroma"
DATA_PATH = "data"

In [3]:
def load_documents(text_splitter: TextSplitter = None) -> list[Document]:
    """Load documents from a list of URLs."""
    pdf_loader = PyPDFDirectoryLoader(DATA_PATH)
    return pdf_loader.load_and_split(text_splitter=text_splitter)

In [4]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1024,
    chunk_overlap=80,
    length_function=len,
    is_separator_regex=False,
)

In [5]:
def get_embedding_function():
    embeddings = OllamaEmbeddings(model="mistral")
    return embeddings

In [6]:
def calculate_chunk_ids(chunks: list[Document]) -> list[Document]:

    # This will create IDs like "data/monopoly.pdf:6:2"
    # Page Source : Page Number : Chunk Index

    last_page_id = None
    current_chunk_index = 0

    for chunk in chunks:
        source = chunk.metadata.get("source")
        page = chunk.metadata.get("page")
        current_page_id = f"{source}:{page}"

        # If the page ID is the same as the last one, increment the index.
        if current_page_id == last_page_id:
            current_chunk_index += 1
        else:
            current_chunk_index = 0

        # Calculate the chunk ID.
        chunk_id = f"{current_page_id}:{current_chunk_index}"
        last_page_id = current_page_id

        # Add it to the page meta-data.
        chunk.metadata["id"] = chunk_id

    return chunks

In [8]:
def add_to_chroma(chunks: list[Document]):
    # Load the existing database.
    db = Chroma(
        collection_name="study_ai",
        persist_directory=CHROMA_PATH,
        embedding_function=get_embedding_function(),
    )

    # Calculate Page IDs.
    chunks_with_ids = calculate_chunk_ids(chunks)

    # Add or Update the documents.
    existing_items = db.get(include=[])  # IDs are always included by default
    existing_ids = set(existing_items["ids"])
    print(f"Number of existing documents in DB: {len(existing_ids)}")

    # Only add documents that don't exist in the DB.
    new_chunks = []
    for chunk in chunks_with_ids:
        if chunk.metadata["id"] not in existing_ids:
            new_chunks.append(chunk)

    if len(new_chunks):
        print(f"👉 Adding new documents: {len(new_chunks)}")
        new_chunk_ids = [chunk.metadata["id"] for chunk in new_chunks]
        # Run a for loop to add max 1000 documents at a time.

        for i in range(0, len(new_chunks), 5000):
            db.add_documents(new_chunks[i : i + 5000], ids=new_chunk_ids[i : i + 5000])
        # db.add_documents(new_chunks, ids=new_chunk_ids)
        # db.persist()
    else:
        print("✅ No new documents to add")

In [9]:
def clear_database():
    db = Chroma(
        collection_name="study_ai",
        persist_directory=CHROMA_PATH,
        embedding_function=get_embedding_function(),
    )
    db.reset_collection()

In [12]:
def populate_databse(reset = False):
    if reset:
        print("✨ Clearing Database")
        clear_database()

    # Create (or update) the data store.
    documents = load_documents(text_splitter=text_splitter)
    add_to_chroma(documents)

In [13]:
populate_databse(reset=True)

✨ Clearing Database
Number of existing documents in DB: 0
👉 Adding new documents: 2179


In [81]:
from langchain.prompts import PromptTemplate, ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough, RunnableMap
import json

template = """
Format these lists of books and queries into a JSON metadata format with the same names.

List of books:
{books}

Query: "{query}"

Just return JSON metadata formatted like this:
{{
  "book_title": "...",
  "author": "..."
}}
"""


def gen_filters(query: str):
    # Prepare the DB.
    embedding_function = get_embedding_function()
    db = Chroma(
        collection_name="study_ai",
        persist_directory=CHROMA_PATH,
        embedding_function=embedding_function,
    )

    # Get unique books from the db
    unique_books = db.get(include=["metadatas"])
    unique_books = set([meta["source"] for meta in unique_books["metadatas"]])

    prompt = ChatPromptTemplate.from_template(template)
    llm = OllamaLLM(model="mistral")
    chain = (
        RunnableMap({"query": RunnablePassthrough(), "books": RunnablePassthrough()}) 
        | prompt 
        | llm 
        | StrOutputParser()
    )
    metadata_json = chain.invoke({"query": query, "books": str(list(unique_books))})
    print(f"Metadata JSON: {metadata_json}")
    try:
        return json.loads(metadata_json)
    except json.JSONDecodeError as e:
        return {}

In [82]:
PROMPT_TEMPLATE = """
Answer the question based only on the following context:

{context}

---

Answer the question based on the above context: {question}
---
Do no include any information that is not in the context.
If the context does not contain the answer, say "I don't know".
If the context contains the answer, say the answer. Do not talk about the given context.
"""

In [83]:
def query_rag(query_text: str):
    # Prepare the DB.
    embedding_function = get_embedding_function()
    db = Chroma(
        collection_name="study_ai",
        persist_directory=CHROMA_PATH,
        embedding_function=embedding_function,
    )
    filters = gen_filters(query_text)
    # Search the DB.
    results = db.similarity_search_with_score(query_text, k=5, filter=filters)

    context_text = "\n\n---\n\n".join([doc.page_content for doc, _score in results])
    prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
    prompt = prompt_template.format(context=context_text, question=query_text)
    # print(prompt)

    model = OllamaLLM(model="mistral")
    for chunk in model.stream(prompt):
        print(chunk, end="", flush=True)

    sources = [doc.metadata.get("id", None) for doc, _score in results]
    print("\n\n---\n\n")
    print(f"Sources: {len(sources)}")
    for source in sources:
        print(source)

In [84]:
query_rag("What is vibration?")

Metadata JSON:  Here's the JSON metadata for each book based on your examples. Please note that I can't provide actual titles or authors, as I don't have access to that information. In these examples, I assumed that each book has a title and an author.

```json
[
  {
    "book_title": "Thomson 4th ed.",
    "author": "Author of Thomson 4th ed."
  },
  {
    "book_title": "Ferguson 3rd ed.",
    "author": "Author of Ferguson 3rd ed."
  }
]
```
In the given example, each book is represented as an object with two properties: "book_title" and "author". The values for these properties are placeholders for real titles and authors. Adjust the placeholders according to the actual data you have.


ValueError: Expected where to have exactly one operator, got {} in query.

In [17]:
query_rag("What is the definition of vibration from Thomson's book?")

 Based on the provided context, there is no explicit definition of "vibration" given in the text. However, the context suggests that we are discussing oscillations or vibrations in a physical system, as seen in Figure 8.5-1 and discussed in relation to mode shapes. In a broader sense, in Thomson's book, vibration could be referred to as the periodic motion of a system about its equilibrium position.

---


Sources: 5
data\Ferguson 3rd ed.pdf:247:3
data\Thomson 4th ed.pdf:242:1
data\Thomson 4th ed.pdf:460:2
data\Thomson 4th ed.pdf:349:2
data\Thomson 4th ed.pdf:257:1


In [18]:
query_rag("Describe different types of vibrations.")

 Based on the given context, it appears that the discussion revolves around Harmonically Excited Vibration, as mentioned in Chapter 3. However, no explicit definition for this type of vibration is provided in the text.

The equations presented seem to be related to normal mode vibration, expressed either in terms of stiffness (K) or flexibility (1/K). The stiffness approach expresses force in terms of displacement and flexibility approaches express displacement in terms of force.

The context does not explicitly mention other types of vibrations such as free, forced, or damped vibration. However, it does discuss the concept of damping, which is a mechanism for energy dissipation in oscillatory systems, and mentions that energy can be dissipated into heat, radiated away, or lost through waves (like a buoy bobbing in water).

In summary, based on the provided context, the discussion appears to be limited to Harmonically Excited Vibration, with some mention of the effects of damping. Othe

In [19]:
query_rag("What is the difference between free and forced vibrations?")

 According to the provided context, free vibrations occur when a system is initially disturbed from equilibrium and then left to oscillate on its own without any external forces acting on it. The equations of motion for free vibrations are typically expressed in terms of the stiffness (K) of the system, such as equation (a): (-co^[M] + [/:]){A-} = {0}.

On the other hand, forced vibrations occur when an external force is applied to the system causing it to oscillate. The displacement in this case can be written in terms of the force using the flexibility (1/K) as shown in equation (b): {F}=[/C]{A'}.

In essence, free vibrations are self-sustaining oscillations of a system without any external forces, while forced vibrations involve an external force acting on the system and causing it to vibrate.

---


Sources: 5
data\Ferguson 3rd ed.pdf:247:3
data\Thomson 4th ed.pdf:242:1
data\Thomson 4th ed.pdf:257:1
data\Thomson 4th ed.pdf:460:2
data\Thomson 4th ed.pdf:183:0


In [27]:
embedding_function = get_embedding_function()
db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)

# Search the DB.
results = db.similarity_search_with_score("runtime", k=5)
results

[]

In [24]:
query_rag("Prerequisites to set up a Lambda function?")
# query_rag("How to set up a Lambda function in AWS?")

 Based on the given context, there are no explicit prerequisites mentioned for setting up an AWS Lambda function. However, in general, to set up an AWS Lambda function, you would typically need the following:

1. An Amazon Web Services (AWS) account.
2. The AWS Command Line Interface (CLI) installed and configured on your local machine or access to the AWS Management Console.
3. A suitable runtime environment for your Lambda function (e.g., Node.js, Python, etc.) and a compatible ZIP file containing your code.
4. Proper IAM permissions to create and manage Lambda functions within your account.
5. Depending on the use case, you might also need to configure event sources, such as Amazon S3 buckets, DynamoDB tables, or API Gateway endpoints.

---


Sources: 0


In [21]:
query_rag("How to set up a Lambda function in AWS?")

 To set up an AWS Lambda function, follow these steps:

1. Sign in to the AWS Management Console and open the Lambda service.

2. Click on the 'Create function' button to begin setting up your new Lambda function.

3. In the 'Designer' section, provide a name for your Lambda function and choose a runtime environment (e.g., Node.js, Python, etc.). Then click 'Create function'.

4. Under the 'Code entry type', select 'Upload a .zip file' or 'Author from scratch' based on whether you have an existing code package or plan to write your Lambda function within the AWS Console.

   If you choose 'Upload a .zip file', you will be prompted to upload the compressed code for your Lambda function. Make sure that all dependencies are included in the zip file.

5. In the 'Handler' field, specify the name of the handler function within your code package (e.g., index.handler if you have an Node.js project).

6. Click on 'Add or create an execution role' to assign a suitable IAM role to your Lambda fun

In [None]:
query_rag("How to set up a Lambda function in AWS?")