## Step 1: Data Ingestion

In [14]:
from unstructured.partition.auto import partition
from unstructured.chunking.title import chunk_by_title

# Path of the PDF to ingest
filename = "data/llama2.pdf"

# Split the PDF into raw elements, using the "fast" partitioning strategy
elements = partition(
    filename=filename,
    strategy="fast",
)

# Group the partitioned elements into chunks with chunk_by_title
chunks = chunk_by_title(elements)

## Step 2: Prepare Vector Store And Retriever

In [15]:
import os

from dotenv import load_dotenv
from langchain_chroma import Chroma
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings

# Load the environment variables
load_dotenv()

# Convert every chunk into a LangChain Document.
# The 'languages' metadata entry is dropped before storage (presumably because
# it is a list value, which Chroma metadata does not accept -- TODO confirm).
# A default is passed to pop() so that chunks lacking the key do not raise
# KeyError and abort the whole ingestion.
documents = []
for chunk in chunks:
    metadata = chunk.metadata.to_dict()
    metadata.pop("languages", None)
    documents.append(Document(page_content=chunk.text, metadata=metadata))

# Embed the documents and store them in the "chroma-db-papers" collection,
# persisted under ./db
embedding_model = OpenAIEmbeddings()
db = Chroma.from_documents(
    documents=documents,
    embedding=embedding_model,
    collection_name="chroma-db-papers",
    persist_directory="./db",
)

# Expose the collection through an MMR (maximal marginal relevance) retriever
mmr_search_kwargs = {"k": 6, "lambda_mult": 0.25, "fetch_k": 30}
retriever = db.as_retriever(search_type="mmr", search_kwargs=mmr_search_kwargs)

## Step 3: Create The Conversation Chain

In [16]:
from dotenv import load_dotenv
from langchain import hub
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

# Populate the environment before any client objects are created
load_dotenv()

# Fetch the "rlm/rag-prompt" template from the LangChain hub
prompt = hub.pull("rlm/rag-prompt")

# Chat model used to generate the answers
llm = ChatOpenAI(model="gpt-4o")

def format_docs(docs):
    """Return the ``page_content`` of every document joined by blank lines."""
    contents = [doc.page_content for doc in docs]
    return "\n\n".join(contents)


# Build the RAG pipeline: the input is routed to the retriever (whose
# documents are flattened by format_docs) as "context" and passed through
# unchanged as "question"; the result feeds the prompt, the LLM and finally
# a string output parser.
rag_inputs = {"context": retriever | format_docs, "question": RunnablePassthrough()}
chain = rag_inputs | prompt | llm | StrOutputParser()

In [23]:
# Question for the chain; it also asks for citations at the end of the answer
llama2_question = """What is reward modelling in llama2?
    Provide citations or references at the end.
    """
answer = chain.invoke(llama2_question)

In [24]:
# Show the generated answer as this cell's output
answer

"Reward modeling in Llama 2 involves taking a model response and its corresponding prompt as inputs and outputting a scalar score to indicate the quality of the response, such as its helpfulness and safety. These scores are used as rewards to optimize Llama 2-Chat during RLHF (Reinforcement Learning with Human Feedback) for better alignment with human preferences. This process aims to enhance the model's helpfulness and safety (3.2.2 Reward Modeling)."

## Optional: Using A Pre-Existing Chroma DB

In [49]:
import chromadb

from langchain_chroma import Chroma
from dotenv import load_dotenv
from langchain import hub
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings

# Populate the environment before any client objects are created
load_dotenv()

# Fetch the "rlm/rag-prompt" template from the LangChain hub
prompt = hub.pull("rlm/rag-prompt")

# Chat model used to generate the answers
llm = ChatOpenAI(model="gpt-4o")

def format_docs(docs):
    """Return the ``page_content`` of every document joined by blank lines."""
    contents = [doc.page_content for doc in docs]
    return "\n\n".join(contents)

# Create a chromadb client over the on-disk store in ./db.
# Failures are deliberately NOT swallowed here: every statement below needs
# `client`, so printing the error and continuing would only replace the real
# cause with a NameError on the next line.
client = chromadb.PersistentClient("./db")

# View existing collections
collections = client.list_collections()
for collection in collections:
    print(f"COLLECTION_NAME: {collection.name}\nCOLLECTION_METADATA: {collection.metadata}\n")

# Attach to the "chroma-db-manuals" collection stored under ./db
embedding_model = OpenAIEmbeddings()
db = Chroma(
    collection_name="chroma-db-manuals",
    embedding_function=embedding_model,
    persist_directory="./db",
)

# Expose the collection through an MMR (maximal marginal relevance) retriever
mmr_search_kwargs = {"k": 6, "lambda_mult": 0.25, "fetch_k": 50}
retriever = db.as_retriever(search_type="mmr", search_kwargs=mmr_search_kwargs)

COLLECTION_NAME: chroma-db-manuals
COLLECTION_METADATA: None

COLLECTION_NAME: chroma-db-papers
COLLECTION_METADATA: None



In [50]:
# Create the conversation chain: the input is routed to the retriever (whose
# documents are flattened by format_docs) as "context" and passed through
# unchanged as "question"; the result feeds the prompt, the LLM and finally
# a string output parser.
rag_inputs = {"context": retriever | format_docs, "question": RunnablePassthrough()}
chain = rag_inputs | prompt | llm | StrOutputParser()

In [53]:
# Question for the chain; it also asks for citations at the end of the answer
rc10r_question = """How to select a new rhythm pattern in RC 10R?
    Provide citations or references at the end.
    """
answer = chain.invoke(rc10r_question)

In [54]:
# Show the generated answer as this cell's output
answer

'To select a new rhythm pattern in the RC-10R, first press the [VALUE] knob to move the cursor to the genre. Then, turn the [VALUE] knob to select the desired genre. If the screen is not correct, press the [EXIT] button several times to access the top screen.\n\nReferences: BOSS RC-10R Basic Rhythm Operation.'

## Optional: Initializing An Alternative Vector Store

### Using In-Memory Vector Store (FAISS)

In [None]:
from langchain_core.documents import Document
from langchain.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings

# Wrap each chunk's text and metadata dictionary in a LangChain Document
documents = [
    Document(page_content=chunk.text, metadata=chunk.metadata.to_dict())
    for chunk in chunks
]

# Embedding model used to vectorise the documents
bge_embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5")

# Build the FAISS index from the documents
db = FAISS.from_documents(documents, bge_embeddings)

# Similarity-search retriever over the FAISS index
faiss_retriever = db.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 4},
)

In [None]:
# Query the FAISS retriever built in this section. The bare name `retriever`
# is bound to a Chroma retriever by earlier cells, so using it here would
# search a different store than the one this section sets up.
retrieved_documents = faiss_retriever.invoke("llama2 pretrained model evaluation")

In [None]:
# Print each retrieved document's text followed by its metadata dictionary,
# leaving one empty line after every document.
for document in retrieved_documents:
    print(
        f"CONTENT:\n{document.page_content}\n\nMETADATA:\npage: {document.metadata}",
        end="\n\n",
    )