# Build your knowledge base for RAG
RAG example using a small textual dataset.

In [1]:
from langchain_core.messages.base import BaseMessage
from langchain_core.messages.human import HumanMessage
from langchain_core.messages.system import SystemMessage
from langchain_core.output_parsers import StrOutputParser
from langchain_core.language_models.chat_models import BaseChatModel
from langchain_core.prompts import ChatPromptTemplate
from langchain_google_genai import ChatGoogleGenerativeAI
from openai import OpenAI
from sklearn.metrics.pairwise import cosine_similarity
from tqdm.notebook import tqdm

import csv
import enum
import gradio as gr
import logging as log
import numpy as np
import os
import pandas as pd
import pickle
import pprint

14:42:23 [ INFO] HTTP Request: GET https://api.gradio.app/gradio-messaging/en "HTTP/1.1 200 OK"


In [2]:
log.basicConfig(level=log.INFO, format='%(asctime)s [%(levelname)5s] %(message)s',datefmt='%H:%M:%S')

In [3]:
df= None # Dataframe containing the dataset
embeddings= None
loadSavedData= True

### LLM Model Factory

In [4]:
class LlmVendor( enum.Enum):
    Google=1,
    OpenAI=2

class LlmModel( enum.Enum):
    GeminiFlash=1,
    GeminiPro=2
    OpenAI=3
    
class LlmModelFactory():
    def __init__(self, model: LlmModel, vendor: LlmVendor, envVar:str):
        self._model = model
        self._vendor= vendor
        self._envVar= envVar

        if not envVar in os.environ:
            raise ValueError(f'{envVar} not found in the environment.')

    @staticmethod
    def createGoogleGeminiFlash(): 
        return LlmModelFactory(model= LlmModel.GeminiFlash,  
                               vendor= LlmVendor.Google,
                               envVar='GOOGLE_API_KEY')
    @staticmethod
    def modelName( llmModel: LlmModel):
        match llmModel:
            case LlmModel.GeminiFlash:
                return "gemini-1.5-flash"
            case LlmModel.GeminiPro:     
                return "gemini-1.5-pro"
        
    def __str__(self):
        return f'(model:{self._model}, vendor:{self._vendor}, envVar:{self._envVar})'

    def createChatModel(self, 
                       maxOutputTokens=250,
                       temperature=0.0):
        match self._vendor:
            case LlmVendor.Google:
                return ChatGoogleGenerativeAI(model= LlmModelFactory.modelName(self._model), 
                                              max_output_tokens=maxOutputTokens, 
                                              temperature=temperature)
                

llmModelFactory= LlmModelFactory.createGoogleGeminiFlash()
print(llmModelFactory)

(model:LlmModel.GeminiFlash, vendor:LlmVendor.Google, envVar:GOOGLE_API_KEY)


## Load the dataset

In [5]:
TaiDatasetRootEnvVar='TAI_DATASET_ROOT'
assert TaiDatasetRootEnvVar in os.environ
TaiDatasetRoot= os.environ[TaiDatasetRootEnvVar]
print(f'TaiDatasetRoot: {TaiDatasetRoot}')

datasetFile= os.path.join(TaiDatasetRoot, 'rag_ai_tutor', 'mini-llama-articles.txt')
processedDatasetFile= os.path.join(TaiDatasetRoot, 'rag_ai_tutor', 'RAG_AI_TUTOR.csv')
ragAiTutorEmbeddingsFile= os.path.join(TaiDatasetRoot, 'rag_ai_tutor', 'rag_ai_tutor_embeddings.pickle')

print(f'datasetFile:     {datasetFile}')
print(f'Embeddings file: {ragAiTutorEmbeddingsFile}')
assert os.path.exists( datasetFile)

TaiDatasetRoot: /home/minguzzi/repo/towards_ai_course/dataset
datasetFile:     /home/minguzzi/repo/towards_ai_course/dataset/rag_ai_tutor/mini-llama-articles.txt
Embeddings file: /home/minguzzi/repo/towards_ai_course/dataset/rag_ai_tutor/rag_ai_tutor_embeddings.pickle


In [6]:
miniDataset = pd.read_csv(datasetFile)
miniDataset

Unnamed: 0,title,content,url,source
0,Beyond GPT-4: What's New?,LLM Variants and Meta's Open Source Before she...,https://pub.towardsai.net/beyond-gpt-4-whats-n...,towards_ai
1,Building a Q&A Bot over Private Documents with...,Private data to be used The example provided c...,https://pub.towardsai.net/building-a-q-a-bot-o...,towards_ai
2,Enhancing E-commerce Product Search Using LLMs,Problem Statement Despite the pioneers like Am...,https://pub.towardsai.net/enhancing-e-commerce...,towards_ai
3,Exploring Large Language Models -Part 3,Fine Tuning on Custom Domain Data All the popu...,https://pub.towardsai.net/exploring-large-lang...,towards_ai
4,Fine-Tuning a Llama-2 7B Model for Python Code...,"New Llama-2 model In mid-July, Meta released i...",https://pub.towardsai.net/fine-tuning-a-llama-...,towards_ai
5,Foundation Models: Scaling Large Language Models,New Moore's Laws Achieving Zettascale Computin...,https://pub.towardsai.net/foundation-models-37...,towards_ai
6,GPTQ Quantization on a Llama 2 7B Fine-Tuned M...,GPTQ: Post-training quantization on generative...,https://pub.towardsai.net/gptq-quantization-on...,towards_ai
7,LLaMA by Meta leaked by an anonymous forum: Qu...,LLaMA: Meta's new AI tool According to the off...,https://pub.towardsai.net/llama-by-meta-leaked...,towards_ai
8,LLaMA-GPT4All: Simplified Local ChatGPT,Introduce GPT4All GPT4All is a large language ...,https://pub.towardsai.net/llama-gpt4all-simpli...,towards_ai
9,Inside Code Llama: Meta AI's Entrance in the C...,Inside Code Llama The release of Code Llama do...,https://pub.towardsai.net/inside-code-llama-me...,towards_ai


### Utility Functions

In [7]:
# --------------------------------------------------------------------------------------------------
# Split the input text into chunks of specified size.
def simpleSplitIntoChunks(text, chunkSize=1024):
  chunks = []
  lastChunk=0
  for i in range(0, len(text), chunkSize):
    lastChunk= i+chunkSize  
    chunks.append(text[i:i+chunkSize])

  if lastChunk < len(text)-1:
      chunks.append(text[lastChunk:])
  return chunks
    
print(simpleSplitIntoChunks('0123456789ABC', chunkSize=5))
assert 3 == len( simpleSplitIntoChunks('0123456789ABC', chunkSize=5))

['01234', '56789', 'ABC']


### Split the dataset into chunks.

In [8]:
datasetFile= os.path.join(TaiDatasetRoot, 'rag_ai_tutor', 'mini-llama-articles.txt')
print(f'datasetFile: {datasetFile}')
assert os.path.exists( datasetFile)

chunks = []

# Load the file as a CSV
with open( datasetFile, mode="r", encoding="utf-8") as file:
  csvReader = csv.reader(file)

  for index, row in enumerate( csvReader):
    if index == 0: 
        continue; # Skip header row
    chunks.extend( simpleSplitIntoChunks(row[1]))

print("Number of articles:", index-1)
print("Number of chunks:  ", len(chunks))

assert len( chunks ) > 0

print("First chunk:\n", chunks[0])
print(f"First chunk size: {len(chunks[0])}")

datasetFile: /home/minguzzi/repo/towards_ai_course/dataset/rag_ai_tutor/mini-llama-articles.txt
Number of articles: 13
Number of chunks:   174
First chunk:
 LLM Variants and Meta's Open Source Before shedding light on four major trends, I'd share the latest Meta's Llama 2 and Code Llama. Meta's Llama 2 represents a sophisticated evolution in LLMs. This suite spans models pretrained and fine-tuned across a parameter spectrum of 7 billion to 70 billion. A specialized derivative, Llama 2-Chat, has been engineered explicitly for dialogue-centric applications. Benchmarking revealed Llama 2's superior performance over most extant open-source chat models. Human-centric evaluations, focusing on safety and utility metrics, positioned Llama 2-Chat as a potential contender against proprietary, closed-source counterparts. The development trajectory of Llama 2 emphasized rigorous fine-tuning methodologies. Meta's transparent delineation of these processes aims to catalyze community-driven advanceme

In [9]:
df = pd.DataFrame(chunks, columns=['chunk'])
print(df.keys())

df.to_csv(processedDatasetFile, index=False)
print(f'Saved the processed dataset to: {processedDatasetFile}')

Index(['chunk'], dtype='object')
Saved the processed dataset to: /home/minguzzi/repo/towards_ai_course/dataset/rag_ai_tutor/RAG_AI_TUTOR.csv


## Generating the embeddings

In [10]:
OPENAI_EMBEDDING_MODEL= 'text-embedding-3-small'

In [11]:
def getEmbedding(text, client):
  totalTokens= -1
  try:
    text = text.replace("\n", " ")
    res = client.embeddings.create(input=[text], model=OPENAI_EMBEDDING_MODEL)
    promptTokens= res.usage.total_tokens       
    totalTokens= res.usage.total_tokens 
    log.debug(f'getEmbedding: promptTokens:{promptTokens}, totalTokens:{totalTokens}')      
    return (res.data[0].embedding, totalTokens)

  except:
    return (None, totalTokens)

In [12]:
embeddingsClient= OpenAI()

In [13]:
(embeddings, totalTokens) = getEmbedding( chunks[0], embeddingsClient)
print(f'Embeddings size: {len(embeddings)}')

14:43:12 [ INFO] HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Embeddings size: 1536


In [17]:
# Generate embeddings
embeddings = []
totalTokens= 0

if loadSavedData:
    log.info('Loading back the embeddings.')
    with open(ragAiTutorEmbeddingsFile,'rb') as pkFile:
      embeddings= pickle.load(pkFile)      
    print(f'Loaded the embeddings: {len(embeddings)}')
else:
    print("Generating embeddings...")
    for index, row in tqdm(df.iterrows()):
      (chunkEmbeddings, chunkTokens) = getEmbedding(row['chunk'], embeddingsClient)
      if chunkEmbeddings is None:
          log.error('get Embedding failed.')
          assert False
      
      embeddings.append( chunkEmbeddings)
      totalTokens += chunkTokens
      print(f'Total tokens:{totalTokens}')          

14:45:58 [ INFO] Loading back the embeddings.


Loaded the embeddings: 174


### Saves the Embeddings in a Pickle file

In [18]:
if not loadSavedData:
    with open(ragAiTutorEmbeddingsFile,'wb') as pkFile:
        pickle.dump(embeddings, pkFile)    

In [19]:
if len(embeddings)>0:
    print(f'Embedding size:{len(embeddings[0])}')
    normEmbedding = np.linalg.norm( embeddings[0])
    print(f'First embedding norm:{normEmbedding}')

# Checks that all embeddings are already normalized.    
for embedding in embeddings:
    assert abs( np.linalg.norm( embedding) - 1.0 ) < 0.0000001
# They are.    

Embedding size:1536
First embedding norm:1.0000000549974424


In [20]:
# Add the "embedding" column to the dataframe
embeddingsColumn = pd.Series(embeddings)
df.insert(loc=1, column='embedding', value=embeddingsColumn)

<b>Saves the dataset processed so far.</b>

In [21]:
df.to_csv(processedDatasetFile, index=False)
print(f'Saved the processed dataset to: {processedDatasetFile}')

Saved the processed dataset to: /home/minguzzi/repo/towards_ai_course/dataset/rag_ai_tutor/RAG_AI_TUTOR.csv


## Cosine similarity
### Test

In [22]:
question = "How many parameters does the LLaMA2 model have?"

(embQuestion, totalTokens) = getEmbedding(question, embeddingsClient)
assert not embQuestion is None

(embBadSource, totalTokens) = getEmbedding("The sky is blue.", embeddingsClient)
assert not embBadSource is None

(embGoodSource, totalTokens) = getEmbedding("LLaMA2 model has a total of 2B parameters.", embeddingsClient)
assert not embGoodSource is None

14:47:04 [ INFO] HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
14:47:05 [ INFO] HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
14:47:05 [ INFO] HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


In [27]:
print( f'Bad source similarity:  { cosine_similarity([embQuestion], [embBadSource])[0][0]}' )
print( f'Good source similarity: { cosine_similarity([embQuestion], [embGoodSource])[0][0]}' )
print( f'Similarity with itself: { cosine_similarity([embGoodSource], [embGoodSource])[0][0]}' )

Bad source similarity:  0.022523741117329046
Good source similarity: 0.8481372724529019
Similarity with itself: 1.0


### Loading back previously processed data

In [23]:
assert os.path.exists(processedDatasetFile)
if df is None:
    log.info('Loading back the dataframe.')
    df= pd.read_csv( processedDatasetFile,index_col=False)
df   

Unnamed: 0,chunk,embedding
0,LLM Variants and Meta's Open Source Before she...,"[0.02095666341483593, -0.010611234232783318, 0..."
1,ational code model;Codel Llama - Python specia...,"[0.00975396391004324, 0.0061698732897639275, 0..."
2,"erm ""multimodal"" refers to their ability to pr...","[0.001414407161064446, 0.024413447827100754, 0..."
3,"es it matter? LLM connections, like the LlamaI...","[0.0024489574134349823, 0.014700595289468765, ..."
4,understand data in the AI-driven future. Fro...,"[-0.03746529296040535, 0.018381565809249878, 0..."
...,...,...
169,versity. In-breadth Evolving solves this probl...,"[0.03136121854186058, 0.003093137638643384, 0...."
170,"ns are done, the initial instruction dataset (...","[0.02660106122493744, -0.016632692888379097, 0..."
171,"er, the Prompt should be as follows: Best Use...","[-0.0027382965199649334, 0.01911219209432602, ..."
172,"sis, and visualization.Machine Learning Pipeli...","[0.012401485815644264, -0.0017868350259959698,..."


### Similarity in action

In [24]:
question = "How many parameters LLaMA2 model has?"
(embQuestion, totalTokens) = getEmbedding(question, embeddingsClient)
assert not embQuestion is None

14:47:31 [ INFO] HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


In [25]:
cosineSimilarities= cosine_similarity([embQuestion],embeddings)[0]
len(cosineSimilarities)

174

In [26]:
bestIndex = np.argsort(cosineSimilarities)[::-1][:1]
bestIndex

array([114])

In [27]:
NumberOfChunksToRetrieve= 3
indices = np.argsort(cosineSimilarities)[::-1][:NumberOfChunksToRetrieve]
print(indices)

[114  75  89]


In [28]:
for idx, item in enumerate(df.chunk[indices]):
  print(f"> Chunk {idx+1}")
  print(item)
  print("----")

> Chunk 1
by Meta that ventures into both the AI and academic spaces. The model aims to help researchers, scientists, and engineers advance their work in exploring AI applications. It will be released under a non-commercial license to prevent misuse, and access will be granted to academic researchers, individuals, and organizations affiliated with the government, civil society, academia, and industry research facilities on a selective case-by-case basis. The sharing of codes and weights allows other researchers to test new approaches in LLMs. The LLaMA models have a range of 7 billion to 65 billion parameters. LLaMA-65B can be compared to DeepMind's Chinchilla and Google's PaLM. Publicly available unlabeled data was used to train these models, and training smaller foundational models require less computing power and resources. LLaMA 65B and 33B have been trained on 1.4 trillion tokens in 20 different languages, and according to the Facebook Artificial Intelligence Research (FAIR) team,

## Augmenting the prompt with the retrieve chunks

In [42]:
systemPrompt= ('You are an assistant and expert in answering questions from a chunks of content. '
               +'Only answer AI-related question, else say that you cannot answer this question.')
userPrompt=  ('Read the following informations that might contain the context you require to answer the question. '
              +'You can use the informations starting from the <START_OF_CONTEXT> tag and end with the <END_OF_CONTEXT> tag. '
              +'Here is the content:\n\n<START_OF_CONTEXT>\n{context}\n<END_OF_CONTEXT>\n\n'
              +'Please provide an informative and accurate answer to the following question based on the avaiable context. '
              +'Be concise and take your time. \nQuestion: {question}\nAnswer:')
NumberOfChunksToRetrieve= 3
indices = np.argsort(cosineSimilarities)[::-1][:NumberOfChunksToRetrieve]
print(indices)

context= ''.join( df.chunk[indices])
context

[114  75  89]


'by Meta that ventures into both the AI and academic spaces. The model aims to help researchers, scientists, and engineers advance their work in exploring AI applications. It will be released under a non-commercial license to prevent misuse, and access will be granted to academic researchers, individuals, and organizations affiliated with the government, civil society, academia, and industry research facilities on a selective case-by-case basis. The sharing of codes and weights allows other researchers to test new approaches in LLMs. The LLaMA models have a range of 7 billion to 65 billion parameters. LLaMA-65B can be compared to DeepMind\'s Chinchilla and Google\'s PaLM. Publicly available unlabeled data was used to train these models, and training smaller foundational models require less computing power and resources. LLaMA 65B and 33B have been trained on 1.4 trillion tokens in 20 different languages, and according to the Facebook Artificial Intelligence Research (FAIR) team, the mo

In [44]:
question = 'How many parameters does the LLaMA 2 model have?'

try:
    systemPrompt= ('You are an assistant and expert in answering questions from a chunks of content. '
                   'Only answer AI-related question, else say that you cannot answer this question.')
    
    userPrompt = ('Read the following informations that might contain the context you require to answer the question. '
                  'You can use the informations starting from the <START_OF_CONTEXT> tag and end with the <END_OF_CONTEXT> tag. '
                  'Here is the content:\n\n<START_OF_CONTEXT>\n{context}\n<END_OF_CONTEXT>\n\n'
                  'Please provide an informative and accurate answer to the following question based on the avaiable context. '
                  'Be concise and take your time. \nQuestion: {question}\nAnswer:')
    
    messages= [('system',systemPrompt),
               ('human',userPrompt)]

    llmModelFactory= LlmModelFactory.createGoogleGeminiFlash()
    model = llmModelFactory.createChatModel()

    chain = ChatPromptTemplate(messages) | model | StrOutputParser()
    result= chain.invoke({'context': context, 'question': question})
    print(result)
except Exception as exc:
    log.error(f'Exception:{exc}')

LLaMA 2 comes in four sizes: 7 billion, 13 billion, 34 billion, and 70 billion parameters.


### How augmenting the prompt addresses Knowledge Cutoff and Hallucinations
#### Zero-shot question without additional data

In [34]:
question = 'How many parameters LLaMA 3.1 model has?'
try:
    systemPrompt='You are an assistant and expert in answering questions.'
    userPrompt= 'Be concise and take your time to answer the following question. Be succint in the answer.\nQuestion: {question}\nAnswer:"'
    
    messages= [('system',systemPrompt),
               ('human',userPrompt)]

    llmModelFactory= LlmModelFactory.createGoogleGeminiFlash()
    print(llmModelFactory)
    model = llmModelFactory.createChatModel()

    chain = ChatPromptTemplate(messages) | model | StrOutputParser()
    result= chain.invoke({"question": question})
    print(result)
except Exception as exc:
    log.error(f'Exception:{exc}')

(model:LlmModel.GeminiFlash, vendor:LlmVendor.Google, envVar:GOOGLE_API_KEY)
LLaMA 3.1's parameter count isn't publicly released by Meta.  No official number exists.


#### Question with augmented context

In [36]:
exampleChunk = """
Introducing Llama 3.1 Llama 3.1 405B is the first openly available model that rivals the top AI models when it comes to state-of-the-art capabilities in general knowledge, steerability, math, tool use, and multilingual translation. With the release of the 405B model, we’re poised to supercharge innovation—with unprecedented opportunities for growth and exploration. We believe the latest generation of Llama will ignite new applications and modeling paradigms, including synthetic data generation to enable the improvement and training of smaller models, as well as model distillation—a capability that has never been achieved at this scale in open source.
As part of this latest release, we’re introducing upgraded versions of the 8B and 70B models. These are multilingual and have a significantly longer context length of 128K, state-of-the-art tool use, and overall stronger reasoning capabilities. This enables our latest models to support advanced use cases, such as long-form text summarization, multilingual conversational agents, and coding assistants. We’ve also made changes to our license, allowing developers to use the outputs from Llama models—including the 405B—to improve other models. True to our commitment to open source, starting today, we’re making these models available to the community for download on llama.meta.com and Hugging Face and available for immediate development on our broad ecosystem of partner platforms. Model evaluations
For this release, we evaluated performance on over 150 benchmark datasets that span a wide range of languages. In addition, we performed extensive human evaluations that compare Llama 3.1 with competing models in real-world scenarios. Our experimental evaluation suggests that our flagship model is competitive with leading foundation models across a range of tasks, including GPT-4, GPT-4o, and Claude 3.5 Sonnet. Additionally, our smaller models are competitive with closed and open models that have a similar number of parameters.
Model Architecture As our largest model yet, training Llama 3.1 405B on over 15 trillion tokens was a major challenge. To enable training runs at this scale and achieve the results we have in a reasonable amount of time, we significantly optimized our full training stack and pushed our model training to over 16 thousand H100 GPUs, making the 405B the first Llama model trained at this scale.
To address this, we made design choices that focus on keeping the model development process scalable and straightforward. We opted for a standard decoder-only transformer model architecture with minor adaptations rather than a mixture-of-experts model to maximize training stability.
We adopted an iterative post-training procedure, where each round uses supervised fine-tuning and direct preference optimization. This enabled us to create the highest quality synthetic data for each round and improve each capability’s performance.
Compared to previous versions of Llama, we improved both the quantity and quality of the data we use for pre- and post-training. These improvements include the development of more careful pre-processing and curation pipelines for pre-training data, the development of more rigorous quality assurance, and filtering approaches for post-training data.
"""

In [41]:
temperature=0.7
question = 'How many parameters LLaMA 3.1 model has?'
try:
    systemPrompt=('You are an assistant and expert in answering questions from a chunks of content. '
                  'Only answer AI-related question, else say that you cannot answer this question.')
    
    userPrompt= ('Read the following informations that might contain the context you require to answer the question. '
                 'You can use the informations starting from the <START_OF_CONTEXT> tag and end with the <END_OF_CONTEXT> tag. '
                 'Here is the content:\n\n<START_OF_CONTEXT>\n{content}\n<END_OF_CONTEXT>\n\n'
                 'Please provide an informative and accurate answer to the following question based on the avaiable context. '
                 'Be concise and take your time. \nQuestion: {question}\nAnswer:')
    
    messages= [('system',systemPrompt),
               ('human',userPrompt)]

    llmModelFactory= LlmModelFactory.createGoogleGeminiFlash()
    model = llmModelFactory.createChatModel(temperature=temperature)

    chain = ChatPromptTemplate(messages) | model | StrOutputParser()
    result= chain.invoke({'question': question, 'content': exampleChunk})
    print(result)
except Exception as exc:
    log.error(f'Exception:{exc}')

The provided text mentions a 405B parameter LLaMA 3.1 model, as well as upgraded 8B and 70B parameter models.
