# Towards AI Online course: Improving Data Sources and Prompts

In [34]:
from llama_index.core import Document, Settings, VectorStoreIndex
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.node_parser import TokenTextSplitter
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.gemini import Gemini
from llama_index.vector_stores.chroma import ChromaVectorStore

import chromadb
import csv
import nest_asyncio
import os

# Patch asyncio via nest_asyncio before any library call is made.
# NOTE(review): presumably needed so async code can run inside the notebook's
# already-running event loop — confirm it is still required.
nest_asyncio.apply()

In [7]:
# All tunable constants are gathered in this cell.

# Name of the OpenAI embedding model used for the documents and the queries.
EmbeddingModel = 'text-embedding-3-small'

# Chroma collection name; the database directory is derived from it.
CollectionName = 'mini-llama-articles'
VectorDbPath = f'./{CollectionName}'

In [10]:
# Register the embedding model on the global llama_index Settings object.
Settings.embed_model = OpenAIEmbedding(model=EmbeddingModel)

In [12]:
# Open a Chroma client rooted at VectorDbPath and fetch the collection,
# creating it when it does not exist yet.
chromaClient = chromadb.PersistentClient(path=VectorDbPath)
collection = chromaClient.get_or_create_collection(name=CollectionName)

# A count of zero means the collection holds no entries at this point.
isCollectionEmpty = collection.count() == 0
isCollectionEmpty

True

In [14]:
# Wrap the Chroma collection in the llama_index vector-store adapter; the
# ingestion pipeline and the index below both receive this object.
vectorStore = ChromaVectorStore(chroma_collection=collection)

## Downloads and prepares the dataset

In [17]:
!rm -rfv another_dataset
!mkdir -pv another_dataset
!curl -o ./another_dataset/mini-dataset.csv https://raw.githubusercontent.com/AlaFalaki/tutorial_notebooks/main/data/mini-llama-articles.csv

removed 'another_dataset/mini-dataset.csv'
removed directory 'another_dataset'
mkdir: created directory 'another_dataset'
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  169k  100  169k    0     0   642k      0 --:--:-- --:--:-- --:--:--  644k


In [20]:
# Load every data row of the downloaded CSV into `rows` (a list of lists of str).
# newline="" hands line-ending handling to the csv module, as its documentation
# requires, so quoted fields that contain embedded newlines are parsed correctly.
with open("./another_dataset/mini-dataset.csv", mode="r", encoding="utf-8", newline="") as file:
    csvReader = csv.reader(file)
    next(csvReader, None)  # Skip the header row; harmless if the file is empty.
    rows = list(csvReader)

# The number of rows (one per article) in the dataset.
len(rows)

14

In [23]:
# Column 0 of the first row; the next cell stores this column as the "title" metadata.
rows[0][0]

"Beyond GPT-4: What's New?"

In [26]:
# Build one Document per CSV row: column 1 is the text, while columns 0, 2 and 3
# are stored as the "title", "url" and "sourceName" metadata.
documents = [
    Document(
        text=record[1],
        metadata={
            "title": record[0],
            "url": record[2],
            "sourceName": record[3],
        },
    )
    for record in rows
]
print(f'Number of documents:{len(documents)}')

Number of documents:14


In [27]:
# Inspect the metadata attached to the first document (title, url, sourceName).
documents[0].metadata

{'title': "Beyond GPT-4: What's New?",
 'url': 'https://pub.towardsai.net/beyond-gpt-4-whats-new-cbd61a448eb9#dda8',
 'sourceName': 'towards_ai'}

## Processes the documents

In [29]:
# Token-based splitter, configured with chunk_size=512 and chunk_overlap=128,
# using a single space as the separator.
textSplitter = TokenTextSplitter(
    separator=" ",
    chunk_size=512,
    chunk_overlap=128,
)

In [31]:
# Ingestion pipeline: split the documents, embed each chunk, and write the
# results into the Chroma-backed vector store.
pipeline = IngestionPipeline(
    transformations=[
        textSplitter,
        # Use the shared constant so the documents and the queries
        # (Settings.embed_model) are always embedded with the same model.
        OpenAIEmbedding(model=EmbeddingModel),
    ],
    vector_store=vectorStore,
)

# The collection is persisted on disk, so running the pipeline again would
# append a second copy of every chunk. Ingest only while it is still empty;
# the count is re-read here so that re-running this cell alone is also safe.
if collection.count() == 0:
    nodes = pipeline.run(documents=documents, show_progress=True)
else:
    nodes = []
    print("Collection already contains data; skipping ingestion.")

Parsing nodes:   0%|          | 0/14 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/108 [00:00<?, ?it/s]

In [33]:
# Build the index on top of the vector store that the pipeline wrote into.
index = VectorStoreIndex.from_vector_store(vectorStore)

## Queries the document store

In [35]:
# Gemini model used to write the answers in every query below.
llm = Gemini(
    model="models/gemini-2.0-flash",
    temperature=1,
    max_tokens=512,
)

In [40]:
# Query engine with the default response mode, retrieving with similarity_top_k=5.
queryEngine = index.as_query_engine(
    llm=llm,
    similarity_top_k=5,
)
res = queryEngine.query("How many parameters LLaMA2 model has?")

# Show only the generated answer text.
print(res.response)

The Llama 2 model is available in four sizes: 7 billion, 13 billion, 34 billion, and 70 billion parameters.



In [41]:
# List the source nodes attached to the response, one per block:
# node id, article title, score and URL, followed by a separator line.
for src in res.source_nodes:
    print(f"Node ID\t {src.node_id}")
    print(f"Title\t {src.metadata['title']}")
    print(f"Score\t {src.score}")
    print(f"URL\t {src.metadata['url']}")
    print("-_" * 20)

Node ID	 79b6aeec-c502-4b4c-a474-a573f095e841
Title	 Fine-Tuning a Llama-2 7B Model for Python Code Generation
Score	 0.3637786840450056
URL	 https://pub.towardsai.net/fine-tuning-a-llama-2-7b-model-for-python-code-generation-865453afdf73#bf4e
-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_
Node ID	 a2ef178f-a57b-42af-9cfa-7eb9bd267889
Title	 Meta's Llama 2: Revolutionizing Open Source Language Models for Commercial Use
Score	 0.3566809510280558
URL	 https://pub.towardsai.net/metas-llama-2-revolutionizing-open-source-language-models-for-commercial-use-1492bec112b#148f
-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_
Node ID	 300ff917-2a24-48a9-805c-15581108f942
Title	 Fine-Tuning a Llama-2 7B Model for Python Code Generation
Score	 0.35481100704185614
URL	 https://pub.towardsai.net/fine-tuning-a-llama-2-7b-model-for-python-code-generation-865453afdf73#bf4e
-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_
Node ID	 37b66172-43e9-4243-bb95-151245e43d2d
Title	 LLaMA-GPT4All: Simplified Local ChatGPT
Score	 0.352

## Response modes

In [46]:
# Same question, this time with response_mode="refine" and similarity_top_k=3.
queryEngineRefine = index.as_query_engine(
    response_mode="refine",
    llm=llm,
    similarity_top_k=3,
)
res = queryEngineRefine.query("How many parameters LLaMA2 model has?")

In [47]:
# Printing the response object itself displays the generated answer text.
print(res)

The Llama 2 model is available in four sizes: 7 billion, 13 billion, 34 billion, and 70 billion parameters.



In [48]:
# Source nodes behind the "refine" answer: id, title, score and URL per node.
for src in res.source_nodes:
    print(f"Node ID\t {src.node_id}")
    print(f"Title\t {src.metadata['title']}")
    print(f"Score\t {src.score}")
    print(f"URL\t {src.metadata['url']}")
    print("-_" * 20)

Node ID	 79b6aeec-c502-4b4c-a474-a573f095e841
Title	 Fine-Tuning a Llama-2 7B Model for Python Code Generation
Score	 0.3637786840450056
URL	 https://pub.towardsai.net/fine-tuning-a-llama-2-7b-model-for-python-code-generation-865453afdf73#bf4e
-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_
Node ID	 a2ef178f-a57b-42af-9cfa-7eb9bd267889
Title	 Meta's Llama 2: Revolutionizing Open Source Language Models for Commercial Use
Score	 0.3566809510280558
URL	 https://pub.towardsai.net/metas-llama-2-revolutionizing-open-source-language-models-for-commercial-use-1492bec112b#148f
-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_
Node ID	 300ff917-2a24-48a9-805c-15581108f942
Title	 Fine-Tuning a Llama-2 7B Model for Python Code Generation
Score	 0.35481100704185614
URL	 https://pub.towardsai.net/fine-tuning-a-llama-2-7b-model-for-python-code-generation-865453afdf73#bf4e
-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_


### The `no_text` response mode

In [49]:
# response_mode="no_text": as the printed output and the next cell show, the
# response carries no generated text but still exposes the source nodes.
# The engine gets its own name so that it neither mislabels this mode nor
# overwrites the "refine" engine created earlier in the notebook.
queryEngineNoText = index.as_query_engine(
    response_mode="no_text",
    llm=llm,
    similarity_top_k=3,
)
res = queryEngineNoText.query("How many parameters LLaMA2 model has?")
print(res)

None


In [50]:
# Source nodes attached to the "no_text" response: id, title, score and URL per node.
for src in res.source_nodes:
    print(f"Node ID\t {src.node_id}")
    print(f"Title\t {src.metadata['title']}")
    print(f"Score\t {src.score}")
    print(f"URL\t {src.metadata['url']}")
    print("-_" * 20)

Node ID	 79b6aeec-c502-4b4c-a474-a573f095e841
Title	 Fine-Tuning a Llama-2 7B Model for Python Code Generation
Score	 0.3637786840450056
URL	 https://pub.towardsai.net/fine-tuning-a-llama-2-7b-model-for-python-code-generation-865453afdf73#bf4e
-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_
Node ID	 a2ef178f-a57b-42af-9cfa-7eb9bd267889
Title	 Meta's Llama 2: Revolutionizing Open Source Language Models for Commercial Use
Score	 0.3566809510280558
URL	 https://pub.towardsai.net/metas-llama-2-revolutionizing-open-source-language-models-for-commercial-use-1492bec112b#148f
-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_
Node ID	 300ff917-2a24-48a9-805c-15581108f942
Title	 Fine-Tuning a Llama-2 7B Model for Python Code Generation
Score	 0.35481100704185614
URL	 https://pub.towardsai.net/fine-tuning-a-llama-2-7b-model-for-python-code-generation-865453afdf73#bf4e
-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_
