# Creating the Vector DB dataset

In [6]:
from huggingface_hub import hf_hub_download
from llama_index.core import Document
from openai import OpenAI
from typing import List

import json
import os

# Name of the environment variable that must be set before running this
# notebook (presumably the key picked up by the OpenAI-backed components).
# Fail fast here rather than part-way through a long ingestion run.
OPEN_AI_KEY_NAME = "OPENAI_API_KEY"
assert OPEN_AI_KEY_NAME in os.environ

# Name of the environment variable whose value is used as the root directory
# for the dataset and vector DB paths built in the cells below.
TAI_DATASET_ROOT_ENV_VAR = "TAI_DATASET_ROOT"
assert TAI_DATASET_ROOT_ENV_VAR in os.environ

In [2]:
# Local directory for the raw knowledge dataset, located under the root
# directory named by the TAI_DATASET_ROOT_ENV_VAR environment variable.
datasetDirpath = os.path.join( os.environ[TAI_DATASET_ROOT_ENV_VAR], 'ai_tutor_knowledge')
print(f'datasetDirpath: {datasetDirpath}')

datasetDirpath: /home/minguzzi/repo/towards_ai_course/dataset/ai_tutor_knowledge


In [3]:
# Fetch the JSONL knowledge file from the Hugging Face Hub dataset repository
# into datasetDirpath and keep the path of the local file that is returned.
datasetFilepath = hf_hub_download(
    repo_id="jaiganesan/ai_tutor_knowledge",
    filename="ai_tutor_knowledge.jsonl",
    repo_type="dataset",
    local_dir=datasetDirpath,
)
print(f'datasetFilepath: {datasetFilepath}')

datasetFilepath: /home/minguzzi/repo/towards_ai_course/dataset/ai_tutor_knowledge/ai_tutor_knowledge.jsonl


In [5]:
# Load the dataset: the file holds one JSON object per line (JSONL).
# The encoding is pinned to UTF-8 so parsing does not depend on the platform's
# locale default, and blank lines (e.g. a trailing newline) are skipped instead
# of making json.loads raise.
with open(datasetFilepath, "r", encoding="utf-8") as file:
    ai_tutor_knowledge = [json.loads(line) for line in file if line.strip()]
# Display the text of the second record as a quick sanity check.
ai_tutor_knowledge[1]['content']

"Github Repo: https://github.com/vaibhawkhemka/ML-Umbrella/tree/main/NLP/Product-Categorization   From e-commerce to Customer support  all businesses require some kind of NER model to process huge amounts of texts from users.   To automate this whole  one requires NER models to extract relevant and important entities from text.   Final Result/OutputInput text = EL D68 (Green  32 GB) 3 GB RAM [3 GB RAM U+007C 32 GB ROM U+007C Expandable Upto 128 GB  15.46 cm (6.088 inch) Display  13MP Rear Camera U+007C 8MP Front Camera  4000 mAh Battery  Quad-Core Processor]   Output =   Green ->>>> COLOR 32 GB ->>>> STORAGE 3 GB RAM ->>>> RAM 3 GB RAM ->>>> RAM 32 GB ROM ->>>> STORAGE Expandable Upto 128 GB ->>>> EXPANDABLE_STORAGE 15.46 cm (6.088 inch) ->>>> SCREEN_SIZE 13MP Rear Camera ->>>> BACK_CAMERA 8MP Front Camera ->>>> FRONT_CAMERA 4000 mAh Battery ->>>> BATTERY_CAPACITY Quad-Core Processor ->>>> PROCESSOR_CORE   Data PreparationA tool for creating this dataset (https://github.com/tecoholic/n

## Create the LlamaIndex documents

In [7]:
def create_docs_from_list(data_list: List[dict]) -> List[Document]:
    """Build one ``Document`` per record of ``data_list``, in the same order.

    Args:
        data_list: Records that each provide the keys ``doc_id``, ``content``,
            ``url``, ``name``, ``tokens`` and ``source``.

    Returns:
        A list of ``Document`` objects. ``content`` becomes the document text;
        ``url``, ``name`` (stored as ``title``), ``tokens`` and ``source`` are
        stored as metadata.

    Raises:
        KeyError: If a record lacks one of the keys listed above.
    """
    # Going by the parameter names, these exclusions leave only ``url`` in the
    # metadata passed to the LLM and only ``title`` in the metadata that is
    # embedded.
    llm_excluded_keys = ("title", "tokens", "source")
    embed_excluded_keys = ("url", "tokens", "source")

    return [
        Document(
            doc_id=record["doc_id"],
            text=record["content"],
            metadata={  # type: ignore
                "url": record["url"],
                "title": record["name"],
                "tokens": record["tokens"],
                "source": record["source"],
            },
            # Fresh list per document, so no list object is shared between them.
            excluded_llm_metadata_keys=list(llm_excluded_keys),
            excluded_embed_metadata_keys=list(embed_excluded_keys),
        )
        for record in data_list
    ]

# Convert every dataset record into a Document, then display one of them as a
# spot check of the text and metadata.
doc = create_docs_from_list(ai_tutor_knowledge)
doc[2]

Document(id_='45501b72-9391-529e-8e5e-59a2604ba26e', embedding=None, metadata={'url': 'https://towardsai.net/p/machine-learning/adaboost-explained-from-its-original-paper', 'title': 'AdaBoost Explained From Its Original Paper', 'tokens': 1697, 'source': 'tai_blog'}, excluded_embed_metadata_keys=['url', 'tokens', 'source'], excluded_llm_metadata_keys=['title', 'tokens', 'source'], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text_resource=MediaResource(embeddings=None, data=None, text="This publication is meant to show a very popular ML algorithm in complete detail  how it works  the math behind it  how to execute it in Python and an explanation of the proofs of the original paper. There will be math and code  but it is written in a way that allows you to decide which are the fun parts.   A bit on the origins of the algorithm: It was proposed by Yoav Freund and Robert E. Schapire in a 1997 paper  A Decision-Theoretic Generalization of On-Line Learning a

In [8]:
# Number of documents created (one per dataset record).
len(doc)

762

In [9]:
# NOTE(review): presumably applied so that the async LLM calls made by the
# ingestion pipeline below can run inside the notebook's already-running event
# loop — confirm this is still required.
import nest_asyncio
nest_asyncio.apply()

In [10]:
# Directory passed to chromadb.PersistentClient by the ingestion cell below,
# i.e. where the vector DB built by this notebook is stored on disk.
vectorDbPath = os.path.join( os.environ[TAI_DATASET_ROOT_ENV_VAR], 'ai_tutor_knowledge_vect_db')
print(f'vectorDbPath: {vectorDbPath}')

vectorDbPath: /home/minguzzi/repo/towards_ai_course/dataset/ai_tutor_knowledge_vect_db


In [11]:
import chromadb
from llama_index.core.extractors import (
    SummaryExtractor,
    QuestionsAnsweredExtractor,
    KeywordExtractor,
)
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.node_parser import TokenTextSplitter
from llama_index.embeddings.openai import OpenAIEmbedding
# NOTE: this rebinds the name `OpenAI`, which the first cell imported from the
# `openai` package, to the LlamaIndex LLM wrapper. From here on `OpenAI` refers
# to the LlamaIndex class.
from llama_index.llms.openai import OpenAI
from llama_index.vector_stores.chroma import ChromaVectorStore

# Define the splitter object that splits the text into segments of 512 tokens,
# with a 128-token overlap between consecutive segments.
text_splitter = TokenTextSplitter(separator=" ", chunk_size=512, chunk_overlap=128)

# Persist the vector store on disk under vectorDbPath.
# NOTE(review): the collection is reused when it already exists
# (get_or_create_collection), so re-running this cell presumably adds the nodes
# to it a second time — confirm before re-running.
db = chromadb.PersistentClient(path=vectorDbPath)
chroma_collection = db.get_or_create_collection("ai_tutor_knowledge")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

# LLM handed to the three metadata extractors below.
llm = OpenAI(temperature=0, model="gpt-4o-mini")

# NOTE(review): all three extractors are given the LLM, so the number of LLM
# requests grows with the number of chunks. The recorded run of this cell
# (5834 chunks) ended with a RateLimitError on the gpt-4o-mini
# requests-per-day limit of 10000.
pipeline = IngestionPipeline(
    transformations=[
        text_splitter,
        QuestionsAnsweredExtractor(questions=2, llm=llm),
        SummaryExtractor(summaries=["prev", "self"], llm=llm),
        KeywordExtractor(keywords=10, llm=llm),
        OpenAIEmbedding(model = "text-embedding-3-small"),
    ],
    vector_store=vector_store,
)

# Run the transformation pipeline.
nodes = pipeline.run(documents=doc, show_progress=True)

Parsing nodes:   0%|          | 0/762 [00:00<?, ?it/s]

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5834/5834 [56:57<00:00,  1.71it/s]
Retrying llama_index.llms.openai.base.OpenAI._achat in 1.0 seconds as it raised RateLimitError: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o-mini in organization org-neGD6TXgMpurKxduMXmsAgtL on requests per day (RPD): Limit 10000, Used 10000, Requested 1. Please try again in 8.64s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'requests', 'param': None, 'code': 'rate_limit_exceeded'}}.
Retrying llama_index.llms.openai.base.OpenAI._achat in 1.0 seconds as it raised RateLimitError: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o-mini in organization org-neGD6TXgMpurKxduMXmsAgtL on requests per day (RPD): Limit 10000, Used 10000, Requested 1. Please try again in 8.64s. Visit https://platform.openai.com/account/rate-limits

RateLimitError: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o-mini in organization org-neGD6TXgMpurKxduMXmsAgtL on requests per day (RPD): Limit 10000, Used 10000, Requested 1. Please try again in 8.64s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'requests', 'param': None, 'code': 'rate_limit_exceeded'}}

Retrying llama_index.llms.openai.base.OpenAI._achat in 1.0 seconds as it raised RateLimitError: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o-mini in organization org-neGD6TXgMpurKxduMXmsAgtL on requests per day (RPD): Limit 10000, Used 10000, Requested 1. Please try again in 8.64s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'requests', 'param': None, 'code': 'rate_limit_exceeded'}}.
Retrying llama_index.llms.openai.base.OpenAI._achat in 1.0 seconds as it raised RateLimitError: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o-mini in organization org-neGD6TXgMpurKxduMXmsAgtL on requests per day (RPD): Limit 10000, Used 10000, Requested 1. Please try again in 8.64s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'requests', 'param': None, 'code': 'rate_limit_exceeded'}}.
Retrying llama_index.llms.openai.base.OpenAI._achat in 1.0 seconds as it raised RateLimitError: Error co

# Querying the Vector DB

In [19]:
import chromadb
from llama_index.core import VectorStoreIndex
from llama_index.vector_stores.chroma import ChromaVectorStore

# NOTE(review): this cell reads from 'ai_tutor_knowledge_vectdb', whereas the
# ingestion section of this notebook writes to 'ai_tutor_knowledge_vect_db'.
# Confirm which store is intended; as written, this cell does not load the DB
# produced by the ingestion pipeline above.
assert TAI_DATASET_ROOT_ENV_VAR in os.environ
vectorDbPath = os.path.join(
    os.environ[TAI_DATASET_ROOT_ENV_VAR],
    'ai_tutor_knowledge_vectdb',
)
print(f'vectorDbPath: {vectorDbPath}')

# Open the on-disk Chroma database and wrap its existing collection as a
# vector store.
db = chromadb.PersistentClient(path=vectorDbPath)
chroma_collection = db.get_collection("ai_tutor_knowledge")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

# Build the index on top of the vector store.
index = VectorStoreIndex.from_vector_store(vector_store=vector_store)

vectorDbPath: /home/minguzzi/repo/towards_ai_course/dataset/ai_tutor_knowledge_vectdb


⚠️ It looks like you upgraded from a version below 0.5.6 and could benefit from vacuuming your database. Run chromadb utils vacuum --help for more information.


In [20]:
# Query engine over the index, configured with similarity_top_k=4.
query_engine = index.as_query_engine(similarity_top_k=4)

# Ask a sample question and keep the full response object.
res = query_engine.query("Explain how RAG works?")

# Print the answer text followed by a visual separator.
print("top results:")
print("\t", res.response)
print("-_" * 20)

top results:
	 RAG (Retrieval-Augmented Generation) works by combining retrieval-based and generation-based approaches in natural language processing tasks. It involves retrieving relevant information from a large database or knowledge source using a retriever model and then generating responses or outputs based on this retrieved information using a generator model. This hybrid approach allows for more accurate and contextually relevant responses by leveraging both the retrieval of existing knowledge and the generation of new content.
-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_
