#### Install dependencies

In [1]:
# New OS parameter to avoid warnings.  
# This will not have a material impact on your code, but prevents warnings from appearing related to new LangChain features.
import os
os.environ['USER_AGENT'] = 'RAGUserAgent'

In [2]:
import openai
import bs4
import chromadb

from langchain_community.document_loaders import WebBaseLoader
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_ollama import OllamaEmbeddings, ChatOllama
from langchain import hub
from langchain_core.documents.base import Document
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_community.vectorstores import Chroma
from langchain_experimental.text_splitter import SemanticChunker
from langchain_core.vectorstores.base import VectorStoreRetriever
from langchain_core.runnables import RunnableParallel
from langchain_core.prompts import PromptTemplate

from modules import utils

from typing import List

In [3]:
envs = utils.load_env_file("./../secrets/env")

# Indexing
## Web loading and crawling

In [4]:
# Load Documents
loader = WebBaseLoader(
    web_paths=("https://kbourne.github.io/chapter1.html",), 
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("post-content", "post-title", "post-header")
        )
    ),
)
docs: List[Document] = loader.load()

In [5]:
print(f"The number of docs loaded is {len(docs)}")

The number of docs loaded is 1


In [6]:
for doc in docs:
    print(doc.page_content)



      Introduction to Retrieval Augmented Generation (RAG)
    
Date: March 10, 2024  |  Estimated Reading Time: 15 min  |  Author: Keith Bourne

  In the rapidly evolving field of artificial intelligence, Retrieval-Augmented Generation (RAG) is emerging as a significant addition to the Generative AI toolkit. RAG harnesses the strengths of Large Language Models (LLMs) and integrates them with internal data, offering a method to enhance organizational operations significantly. This book delves into the essential aspects of RAG, examining its role in augmenting the capabilities of LLMs and leveraging internal corporate data for strategic advantage.
As it progresses, the book outlines the potential of RAG in business, suggesting how it can make AI applications smarter, more responsive, and aligned with organizational objectives. RAG is positioned as a key facilitator of customized, efficient, and insightful AI solutions, bridging the gap between Generative AI's potential and specific bu

## Splitting

In [7]:
embedding_model = "llama3.1:8b"
base_url = envs["OLLAMA_HOST"]  # for example, "http://localhost:11434"

# Split
text_splitter: SemanticChunker = SemanticChunker(OllamaEmbeddings(base_url=base_url, model=embedding_model))
splits: List[Document] = text_splitter.split_documents(docs)

In [8]:
print(f"The number of splits is {len(splits)}")

The number of splits is 2


In [9]:
for split in splits:
    print(split.page_content, end=f"\n\n\n{'='*300}\n\n\n")



      Introduction to Retrieval Augmented Generation (RAG)
    
Date: March 10, 2024  |  Estimated Reading Time: 15 min  |  Author: Keith Bourne

  In the rapidly evolving field of artificial intelligence, Retrieval-Augmented Generation (RAG) is emerging as a significant addition to the Generative AI toolkit. RAG harnesses the strengths of Large Language Models (LLMs) and integrates them with internal data, offering a method to enhance organizational operations significantly. This book delves into the essential aspects of RAG, examining its role in augmenting the capabilities of LLMs and leveraging internal corporate data for strategic advantage. As it progresses, the book outlines the potential of RAG in business, suggesting how it can make AI applications smarter, more responsive, and aligned with organizational objectives. RAG is positioned as a key facilitator of customized, efficient, and insightful AI solutions, bridging the gap between Generative AI's potential and specific bu

## Embedding

In [10]:
# Embed
vectorstore: Chroma = Chroma.from_documents(documents=splits, 
                                    embedding=OllamaEmbeddings(base_url=base_url, model=embedding_model))

retriever: VectorStoreRetriever = vectorstore.as_retriever()

In [11]:
print(f"Length of retriever is {len(vectorstore.get())}")

Length of retriever is 7


In [12]:
len(vectorstore.get()["documents"])

2

## Retrieval

In [13]:
query = "How does RAG compare with fine-tuning?"
relevant_docs = retriever.invoke(query)

Number of requested results 4 is greater than number of elements in index 2, updating n_results = 2


In [14]:
for doc in relevant_docs:
    print(doc.page_content, end=f"\n\n\n{'='*300}\n\n\n")

Input/Prompts - This is where you actually "use" the model, using the prompt/input to introduce new knowledge that the LLM can act upon. Why not use fine-tuning in all situations? Once you have introduced the new knowledge, it will always have it! It is also how the model was originally created, by training with data, right? That sounds right in theory, but in practice, fine-tuning has been more reliable in teaching a model specialized tasks (like teaching a model how to converse in a certain way), and less reliable for factual recall. The reason is complicated, but in general, a model’s knowledge of facts is like a human’s long-term memory. If you memorize a long passage from a speech or book and then try to recall it a few months later, you will likely still understand the context of the information, but you may forget specific details. Whereas, adding knowledge through the input of the model is like our short-term memory, where the facts, details, and even the order of wording is al

# TF-IDF

In [15]:
# USING TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [16]:
# Extract the text content from the splits
tfidf_documents = [split.page_content for split in splits]

# Create a TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Generate TF-IDF matrix
tfidf_matrix = tfidf_vectorizer.fit_transform(tfidf_documents)

In [17]:
# Get the vocabulary, term frequencies, and corresponding IDF values
vocab = tfidf_vectorizer.get_feature_names_out()
tf_values = tfidf_matrix.toarray()
idf_values = tfidf_vectorizer.idf_

In [18]:
# Create a list of tuples containing word, TF, and IDF values
word_stats = list(zip(vocab, tf_values.sum(axis=0), idf_values))

# Sort the list by IDF values in descending order
word_stats.sort(key=lambda x: x[2], reverse=True)

In [19]:
# Print the grid of top 10 words, TF, and IDF values
print("Word\t\tTF\t\tIDF")
print("----\t\t--\t\t---")
for word, tf, idf in word_stats[:10]:
    print(f"{word:<12}\t{tf:.2f}\t\t{idf:.2f}")

Word		TF		IDF
----		--		---
000         	0.04		1.41
1024        	0.01		1.41
123         	0.00		1.41
13          	0.01		1.41
15          	0.00		1.41
16          	0.00		1.41
192         	0.02		1.41
1m          	0.02		1.41
200         	0.02		1.41
2024        	0.00		1.41


Using TF-IDF to query

In [20]:
user_query = "What are the advantages of using RAG?"

# TD-IDF scoring for user query
# User query embedding
tfidf_user_query = [user_query]
new_tfidf_matrix = tfidf_vectorizer.transform(tfidf_user_query)

# Calculate cosine similarity between the new content and the original documents
tfidf_similarity_scores = cosine_similarity(new_tfidf_matrix, tfidf_matrix)

# Find the index of the document with the highest similarity score
tfidf_top_doc_index = tfidf_similarity_scores.argmax()

# Print the text of the top document
print("TF-IDF Top Document:\n", tfidf_documents[tfidf_top_doc_index])

TF-IDF Top Document:
 

      Introduction to Retrieval Augmented Generation (RAG)
    
Date: March 10, 2024  |  Estimated Reading Time: 15 min  |  Author: Keith Bourne

  In the rapidly evolving field of artificial intelligence, Retrieval-Augmented Generation (RAG) is emerging as a significant addition to the Generative AI toolkit. RAG harnesses the strengths of Large Language Models (LLMs) and integrates them with internal data, offering a method to enhance organizational operations significantly. This book delves into the essential aspects of RAG, examining its role in augmenting the capabilities of LLMs and leveraging internal corporate data for strategic advantage. As it progresses, the book outlines the potential of RAG in business, suggesting how it can make AI applications smarter, more responsive, and aligned with organizational objectives. RAG is positioned as a key facilitator of customized, efficient, and insightful AI solutions, bridging the gap between Generative AI's pot

# Doc2Vec

In [21]:
# CREATING AND SAVING DOC2VEC MODEL
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.metrics.pairwise import cosine_similarity

In [22]:
# Extract the text content from the splits
doc2vec_documents = [split.page_content for split in splits]

# Tokenize the documents
doc2vec_tokenized_documents = [doc.lower().split() for doc in doc2vec_documents]

# Create tagged documents for Doc2Vec
doc2vec_tagged_documents = [TaggedDocument(words=doc, tags=[str(i)]) for i, doc in enumerate(doc2vec_tokenized_documents)]

# Train the Doc2Vec model
# Use this version first.
doc2vec_model = Doc2Vec(doc2vec_tagged_documents, vector_size=100, window=5, min_count=1, workers=4)

# After running the previous version of model, comment the previous line out and uncomment this one. Try it with 1536D vectors.
# doc2vec_model = Doc2Vec(doc2vec_tagged_documents, vector_size=1536, window=5, min_count=1, workers=4)

# Save the trained model to a file
doc2vec_model.save("bin/doc2vec_model.bin")

In [23]:
# USING DOC2VEC SAVED MODEL

# Load the saved model
loaded_doc2vec_model = Doc2Vec.load("bin/doc2vec_model.bin")

# Calculate the document vectors
doc2vec_document_vectors = [loaded_doc2vec_model.dv[str(i)] for i in range(len(doc2vec_documents))]

# User query for embedding
doc2vec_user_query = [user_query]

# Tokenize the new content
doc2vec_tokenized_user_query = [content.lower().split() for content in doc2vec_user_query]

# Infer the vector for the new content
doc2vec_user_query_vector = loaded_doc2vec_model.infer_vector(doc2vec_tokenized_user_query[0])

# Calculate cosine similarity between the new content vector and the document vectors
doc2vec_similarity_scores = cosine_similarity([doc2vec_user_query_vector], doc2vec_document_vectors)

# Find the index of the document with the highest similarity score
doc2vec_top_doc_index = doc2vec_similarity_scores.argmax()

# Print the text of the top document
print("\nDoc2Vec Top Document:\n", doc2vec_documents[doc2vec_top_doc_index])


Doc2Vec Top Document:
 Input/Prompts - This is where you actually "use" the model, using the prompt/input to introduce new knowledge that the LLM can act upon. Why not use fine-tuning in all situations? Once you have introduced the new knowledge, it will always have it! It is also how the model was originally created, by training with data, right? That sounds right in theory, but in practice, fine-tuning has been more reliable in teaching a model specialized tasks (like teaching a model how to converse in a certain way), and less reliable for factual recall. The reason is complicated, but in general, a model’s knowledge of facts is like a human’s long-term memory. If you memorize a long passage from a speech or book and then try to recall it a few months later, you will likely still understand the context of the information, but you may forget specific details. Whereas, adding knowledge through the input of the model is like our short-term memory, where the facts, details, and even th

# Bert

In [24]:
# USING BERT
from transformers import BertTokenizer, BertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

  from .autonotebook import tqdm as notebook_tqdm


In [25]:
# Extract the text content from the splits
bert_documents = [split.page_content for split in splits]

# Load BERT tokenizer and model
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

# Get the vector size of the BERT embeddings
bert_vector_size = bert_model.config.hidden_size
print(f"Vector size of BERT (base-uncased) embeddings: {bert_vector_size}\n")

# Tokenize the documents
bert_tokenized_documents = [bert_tokenizer(doc, return_tensors='pt', max_length=512, truncation=True) for doc in bert_documents]

Vector size of BERT (base-uncased) embeddings: 768



In [26]:
# Calculate the document embeddings
bert_document_embeddings = []
with torch.no_grad():
    for doc in bert_tokenized_documents:
        bert_outputs = bert_model(**doc)
        bert_doc_embedding = bert_outputs.last_hidden_state[0, 0, :].numpy()
        bert_document_embeddings.append(bert_doc_embedding)

In [27]:
# New content (question) for embedding
bert_user_query = [user_query]

# Tokenize the new content
bert_tokenized_user_query = bert_tokenizer(bert_user_query[0], return_tensors='pt', max_length=512, truncation=True)

# Calculate the embedding for the new content
bert_user_query_embedding = []
with torch.no_grad():
    bert_outputs = bert_model(**bert_tokenized_user_query)
    bert_user_query_embedding = bert_outputs.last_hidden_state[0, 0, :].numpy()

In [28]:
# Calculate cosine similarity between the new content embedding and the document embeddings
bert_similarity_scores = cosine_similarity([bert_user_query_embedding], bert_document_embeddings)

# Find the index of the document with the highest similarity score
bert_top_doc_index = bert_similarity_scores.argmax()

# Print the text of the top document
print("BERT Top Document:\n", bert_documents[bert_top_doc_index])

BERT Top Document:
 Input/Prompts - This is where you actually "use" the model, using the prompt/input to introduce new knowledge that the LLM can act upon. Why not use fine-tuning in all situations? Once you have introduced the new knowledge, it will always have it! It is also how the model was originally created, by training with data, right? That sounds right in theory, but in practice, fine-tuning has been more reliable in teaching a model specialized tasks (like teaching a model how to converse in a certain way), and less reliable for factual recall. The reason is complicated, but in general, a model’s knowledge of facts is like a human’s long-term memory. If you memorize a long passage from a speech or book and then try to recall it a few months later, you will likely still understand the context of the information, but you may forget specific details. Whereas, adding knowledge through the input of the model is like our short-term memory, where the facts, details, and even the or

# Embedding

In [29]:
# Embed
vectorstore = Chroma.from_documents(documents=splits,
                                    embedding=OllamaEmbeddings(base_url=base_url, model=embedding_model))

retriever = vectorstore.as_retriever()

In [30]:
# Retrieve the first result using the new content
result = retriever.invoke(user_query)[0]

# Print the retrieved document
print("\nRetrieved Document:\n", result.page_content)


Retrieved Document:
 Input/Prompts - This is where you actually "use" the model, using the prompt/input to introduce new knowledge that the LLM can act upon. Why not use fine-tuning in all situations? Once you have introduced the new knowledge, it will always have it! It is also how the model was originally created, by training with data, right? That sounds right in theory, but in practice, fine-tuning has been more reliable in teaching a model specialized tasks (like teaching a model how to converse in a certain way), and less reliable for factual recall. The reason is complicated, but in general, a model’s knowledge of facts is like a human’s long-term memory. If you memorize a long passage from a speech or book and then try to recall it a few months later, you will likely still understand the context of the information, but you may forget specific details. Whereas, adding knowledge through the input of the model is like our short-term memory, where the facts, details, and even the 

# RETRIEVAL and GENERATION

In [31]:
# Prompt - ignore LangSmith warning, you will not need langsmith for this coding exercise
prompt = hub.pull("jclemens24/rag-prompt")



In [32]:
# Relevance check prompt
relevance_prompt_template = PromptTemplate.from_template(
    """
    Given the following question and retrieved context, determine if the context is relevant to the question.
    Provide a score from 1 to 5, where 1 is not at all relevant and 5 is highly relevant.
    Return ONLY the numeric score, without any additional text or explanation.

    Question: {question}
    Retrieved Context: {retrieved_context}

    Relevance Score:"""
)

In [33]:
# Post-processing
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

In [34]:
def extract_score(llm_output):
    try:
        score = float(llm_output.strip())
        return score
    except ValueError:
        return 0

# Chain it all together with LangChain
def conditional_answer(x):
    relevance_score = extract_score(x['relevance_score'])
    if relevance_score < 4:
        return "I don't know."
    else:
        return x['answer']

In [35]:
llm = ChatOllama(base_url=base_url, 
                 model=embedding_model,
                 temperature=0.01)

In [36]:
rag_chain_from_docs = (
    RunnablePassthrough.assign(context=(lambda x: format_docs(x["context"])))
    | RunnableParallel(
        {"relevance_score": (
            RunnablePassthrough()
            | (lambda x: relevance_prompt_template.format(question=x['question'], retrieved_context=x['context']))
            | llm
            | StrOutputParser()
        ), "answer": (
            RunnablePassthrough()
            | prompt
            | llm
            | StrOutputParser()
        )}
    )
    | RunnablePassthrough().assign(final_answer=conditional_answer)
)

In [37]:
rag_chain_with_source = RunnableParallel(
    {"context": retriever, "question": RunnablePassthrough()}
).assign(answer=rag_chain_from_docs)

In [38]:

# Question - relevant question
result = rag_chain_with_source.invoke(user_query)
relevance_score = result['answer']['relevance_score']
final_answer = result['answer']['final_answer']

print(f"Relevance Score: {relevance_score}")
print(f"Final Answer:\n{final_answer}")

Relevance Score: It seems like you've provided a large block of text about Retrieval Augmented Generation (RAG) and its applications in various industries. I'll try to summarize the key points for you:

**What is RAG?**

RAG combines information retrieval with generative language models to enhance the quality and relevance of generated text.

**Applications of RAG:**

1. Customer Support and Chatbots
2. Automated Reporting
3. Product Descriptions
4. Searchability and Utility of Internal Knowledge Bases
5. Searchability and Utility of General Knowledge Bases
6. Innovation Scouting
7. Content Personalization
8. Product Recommendations
9. Training and Education

**Benefits of RAG:**

1. Combines internal data with generative language models to create more accurate and relevant text.
2. Can access all internal data resources, not just a small portion like conventional Generative AI.

**Comparison with Conventional Generative AI:**

Conventional Generative AI is limited by its inability to 