## Install ChromaDB

In [1]:
!pip install chromadb -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m526.8/526.8 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m26.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.0/92.0 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.4/62.4 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.3/41.3 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m54.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.9/59.9 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.0/107.0 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━

## Create ChromaDB client

In [2]:
import chromadb

# Client() is called with no arguments: no storage path or server address is
# configured in this notebook.
client = chromadb.Client()

## Create ChromaDB collection

In [3]:
# get_or_create_collection hands back the existing "my_collection" when this
# cell is re-run in the same session; create_collection would raise because the
# name is already taken.
collection = client.get_or_create_collection(name="my_collection")

## Add data to ChromaDB collection

In [4]:
# Sample articles used as the documents stored in the vector database.
# Each literal opens with a line break right after the triple quote, so every
# text starts with "\n".

# Sports: cricket (closing quotes on their own line -> text also ends with "\n").
cricket_news = """
The T20 World Cup 2024 is in full swing, bringing excitement and drama to cricket fans worldwide.
India's team, captained by Rohit Sharma, is preparing for a crucial match against Ireland, with standout player Jasprit Bumrah expected to play a pivotal role in their campaign.
The tournament has already seen controversy, particularly concerning the pitch conditions at Nassau County International Cricket Stadium in New York, which came under fire after a low-scoring game between Sri Lanka and South Africa.
"""

# Sports: football (closing quotes on their own line -> text also ends with "\n").
football_news = """
The world of football is buzzing with excitement as major tournaments and league matches continue to captivate fans globally.
In the UEFA Champions League, the semi-final matchups have been set, with defending champions Real Madrid set to face Manchester City, while Bayern Munich will take on Paris Saint-Germain.
Both ties promise thrilling encounters, featuring some of the best talents in world football.
"""

# Politics: election coverage (closing quotes on the last text line -> no trailing "\n").
election_news = """
As election season heats up, the latest developments reveal a highly competitive atmosphere across several key races.
The presidential election has seen intense campaigning from all major candidates, with recent polls indicating a tight race.
Incumbent President Jane Doe is seeking re-election on a platform of economic stability and healthcare reform, while her main rival, Senator John Smith, focuses on education and climate change initiatives."""


# Technology: AI (closing quotes on the last text line -> no trailing "\n").
ai_revolution_news = """
The AI revolution continues to transform industries and reshape the global economy.
Significant advancements in artificial intelligence have led to breakthroughs in healthcare, with AI-driven diagnostics improving patient outcomes and reducing costs.
Autonomous systems are becoming increasingly prevalent in logistics and transportation, enhancing efficiency and safety."""

In [5]:
# Keep each article next to its id and source tag so the three parallel lists
# handed to add() cannot drift out of alignment.
seed_records = [
    ("id1", "cricket", cricket_news),
    ("id2", "football", football_news),
    ("id3", "election", election_news),
    ("id4", "ai revolution", ai_revolution_news),
]

collection.add(
    ids=[doc_id for doc_id, source, text in seed_records],
    metadatas=[{"source": source} for doc_id, source, text in seed_records],
    documents=[text for doc_id, source, text in seed_records],
)

/root/.cache/chroma/onnx_models/all-MiniLM-L6-v2/onnx.tar.gz: 100%|██████████| 79.3M/79.3M [00:01<00:00, 49.6MiB/s]


## Similarity search

In [6]:
# Ask for the two stored documents closest to a one-word text query.
query_text = "technology"
results = collection.query(n_results=2, query_texts=[query_text])

results

{'ids': [['id4', 'id1']],
 'distances': [[1.5124857425689697, 1.699276089668274]],
 'metadatas': [[{'source': 'ai revolution'}, {'source': 'cricket'}]],
 'embeddings': None,
 'documents': [['\nThe AI revolution continues to transform industries and reshape the global economy. \nSignificant advancements in artificial intelligence have led to breakthroughs in healthcare, with AI-driven diagnostics improving patient outcomes and reducing costs. \nAutonomous systems are becoming increasingly prevalent in logistics and transportation, enhancing efficiency and safety.',
   "\nThe T20 World Cup 2024 is in full swing, bringing excitement and drama to cricket fans worldwide. \nIndia's team, captained by Rohit Sharma, is preparing for a crucial match against Ireland, with standout player Jasprit Bumrah expected to play a pivotal role in their campaign. \nThe tournament has already seen controversy, particularly concerning the pitch conditions at Nassau County International Cricket Stadium in New

In [7]:
# Number of records currently stored in the collection.
collection.count()

4

## CRUD operations on Vector Database

#### Add data

In [8]:
# Extra sample article, added to the collection in a later cell.
# The literal starts and ends with a line break (opening and closing triple
# quotes sit on their own lines).
blockchain_news = """
The blockchain industry continues to evolve rapidly, marked by significant technological advancements and regulatory developments.
This month, the spotlight is on the launch of Ethereum 3.0, which promises enhanced scalability and security features.
This upgrade is expected to drastically reduce transaction fees and increase processing speeds, making decentralized applications (dApps) more efficient and user-friendly.
"""

In [9]:
# Insert one more record under a new id; the lists stay length-1 because add()
# is called with list-valued arguments.
collection.add(ids=["id5"], metadatas=[{"source": "blockchain"}], documents=[blockchain_news])

In [10]:
# Re-check the record count after the extra add().
collection.count()

5

In [11]:
# Repeat the earlier "technology" query now that id5 is stored, to see whether
# the two nearest documents change.
query_text = "technology"
results = collection.query(n_results=2, query_texts=[query_text])

results

{'ids': [['id4', 'id1']],
 'distances': [[1.5124857425689697, 1.699276089668274]],
 'metadatas': [[{'source': 'ai revolution'}, {'source': 'cricket'}]],
 'embeddings': None,
 'documents': [['\nThe AI revolution continues to transform industries and reshape the global economy. \nSignificant advancements in artificial intelligence have led to breakthroughs in healthcare, with AI-driven diagnostics improving patient outcomes and reducing costs. \nAutonomous systems are becoming increasingly prevalent in logistics and transportation, enhancing efficiency and safety.',
   "\nThe T20 World Cup 2024 is in full swing, bringing excitement and drama to cricket fans worldwide. \nIndia's team, captained by Rohit Sharma, is preparing for a crucial match against Ireland, with standout player Jasprit Bumrah expected to play a pivotal role in their campaign. \nThe tournament has already seen controversy, particularly concerning the pitch conditions at Nassau County International Cricket Stadium in New

#### Read data

In [12]:
# get() with no arguments: read back the records without filtering by id.
res = collection.get()
res

{'ids': ['id1', 'id2', 'id3', 'id4', 'id5'],
 'embeddings': None,
 'metadatas': [{'source': 'cricket'},
  {'source': 'football'},
  {'source': 'election'},
  {'source': 'ai revolution'},
  {'source': 'blockchain'}],
 'documents': ["\nThe T20 World Cup 2024 is in full swing, bringing excitement and drama to cricket fans worldwide. \nIndia's team, captained by Rohit Sharma, is preparing for a crucial match against Ireland, with standout player Jasprit Bumrah expected to play a pivotal role in their campaign. \nThe tournament has already seen controversy, particularly concerning the pitch conditions at Nassau County International Cricket Stadium in New York, which came under fire after a low-scoring game between Sri Lanka and South Africa.\n",
  '\nThe world of football is buzzing with excitement as major tournaments and league matches continue to captivate fans globally. \nIn the UEFA Champions League, the semi-final matchups have been set, with defending champions Real Madrid set to fac

In [13]:
# Fetch only the records whose ids are listed.
wanted_ids = ["id1", "id3"]
res = collection.get(ids=wanted_ids)
res

{'ids': ['id1', 'id3'],
 'embeddings': None,
 'metadatas': [{'source': 'cricket'}, {'source': 'election'}],
 'documents': ["\nThe T20 World Cup 2024 is in full swing, bringing excitement and drama to cricket fans worldwide. \nIndia's team, captained by Rohit Sharma, is preparing for a crucial match against Ireland, with standout player Jasprit Bumrah expected to play a pivotal role in their campaign. \nThe tournament has already seen controversy, particularly concerning the pitch conditions at Nassau County International Cricket Stadium in New York, which came under fire after a low-scoring game between Sri Lanka and South Africa.\n",
  '\nAs election season heats up, the latest developments reveal a highly competitive atmosphere across several key races. \nThe presidential election has seen intense campaigning from all major candidates, with recent polls indicating a tight race. \nIncumbent President Jane Doe is seeking re-election on a platform of economic stability and healthcare re

#### Update data

In [14]:
# Overwrite both the text and the metadata stored under "id3" (until now the
# election article).
target_id = "id3"
collection.update(
    ids=[target_id],
    metadatas=[{"source": "gen ai"}],
    documents=["This is sample document about generative AI"],
)

In [15]:
# Read "id3" back to confirm the update replaced its document and metadata.
res = collection.get(ids=["id3"])
res

{'ids': ['id3'],
 'embeddings': None,
 'metadatas': [{'source': 'gen ai'}],
 'documents': ['This is sample document about generative AI'],
 'uris': None,
 'data': None}

In [16]:
# Read the records back again; only the "id3" entry should differ from the
# earlier unfiltered get().
res = collection.get()
res

{'ids': ['id1', 'id2', 'id3', 'id4', 'id5'],
 'embeddings': None,
 'metadatas': [{'source': 'cricket'},
  {'source': 'football'},
  {'source': 'gen ai'},
  {'source': 'ai revolution'},
  {'source': 'blockchain'}],
 'documents': ["\nThe T20 World Cup 2024 is in full swing, bringing excitement and drama to cricket fans worldwide. \nIndia's team, captained by Rohit Sharma, is preparing for a crucial match against Ireland, with standout player Jasprit Bumrah expected to play a pivotal role in their campaign. \nThe tournament has already seen controversy, particularly concerning the pitch conditions at Nassau County International Cricket Stadium in New York, which came under fire after a low-scoring game between Sri Lanka and South Africa.\n",
  '\nThe world of football is buzzing with excitement as major tournaments and league matches continue to captivate fans globally. \nIn the UEFA Champions League, the semi-final matchups have been set, with defending champions Real Madrid set to face 

#### Delete data

In [17]:
# Baseline before deleting anything: the two stored documents nearest to "sport".
query_text = "sport"
results = collection.query(n_results=2, query_texts=[query_text])

results

{'ids': [['id1', 'id2']],
 'distances': [[1.520041823387146, 1.5631263256072998]],
 'metadatas': [[{'source': 'cricket'}, {'source': 'football'}]],
 'embeddings': None,
 'documents': [["\nThe T20 World Cup 2024 is in full swing, bringing excitement and drama to cricket fans worldwide. \nIndia's team, captained by Rohit Sharma, is preparing for a crucial match against Ireland, with standout player Jasprit Bumrah expected to play a pivotal role in their campaign. \nThe tournament has already seen controversy, particularly concerning the pitch conditions at Nassau County International Cricket Stadium in New York, which came under fire after a low-scoring game between Sri Lanka and South Africa.\n",
   '\nThe world of football is buzzing with excitement as major tournaments and league matches continue to captivate fans globally. \nIn the UEFA Champions League, the semi-final matchups have been set, with defending champions Real Madrid set to face Manchester City, while Bayern Munich will t

In [18]:
# Record count before the delete.
collection.count()

5

In [19]:
# Remove the football article, stored under "id2".
ids_to_remove = ["id2"]
collection.delete(ids=ids_to_remove)

In [20]:
# Record count after the delete; it should be one lower than before.
collection.count()

4

In [21]:
# Same "sport" query as before the delete; "id2" can no longer be returned.
query_text = "sport"
results = collection.query(n_results=2, query_texts=[query_text])

results

{'ids': [['id1', 'id3']],
 'distances': [[1.520041823387146, 1.755448818206787]],
 'metadatas': [[{'source': 'cricket'}, {'source': 'gen ai'}]],
 'embeddings': None,
 'documents': [["\nThe T20 World Cup 2024 is in full swing, bringing excitement and drama to cricket fans worldwide. \nIndia's team, captained by Rohit Sharma, is preparing for a crucial match against Ireland, with standout player Jasprit Bumrah expected to play a pivotal role in their campaign. \nThe tournament has already seen controversy, particularly concerning the pitch conditions at Nassau County International Cricket Stadium in New York, which came under fire after a low-scoring game between Sri Lanka and South Africa.\n",
   'This is sample document about generative AI']],
 'uris': None,
 'data': None}

## Use an alternative embedding model

In [22]:
!pip install sentence_transformers -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.3/21.3 MB[0m [31m62.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [23]:
from sentence_transformers import SentenceTransformer

# Load the pretrained model by name; on first use its files are downloaded
# (the progress bars printed under this cell).
model_name = "all-mpnet-base-v2"
embedding_model = SentenceTransformer(model_name)

  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [24]:
# Encode both sports articles in a single call; the result holds one vector per
# input text, in the same order.
sports_articles = [cricket_news, football_news]
embeddings = embedding_model.encode(sports_articles)

In [25]:
# Display the encoded vectors (the array repr abbreviates the middle of each row).
embeddings

array([[-0.02901842,  0.01924439, -0.0181424 , ...,  0.00644326,
        -0.01740811, -0.01381657],
       [-0.00384662, -0.07271519, -0.00284145, ..., -0.02027755,
         0.02123847, -0.03015987]], dtype=float32)

In [26]:
# Length of one embedding vector, i.e. the dimensionality produced by the model.
first_vector = embeddings[0]
len(first_vector)

768

In [27]:
# get_or_create_collection keeps this cell re-runnable: create_collection would
# raise once "my_new_collection" already exists in the session.
# NOTE(review): no embedding_function is passed, so this collection does not
# know about `embedding_model`; vectors are supplied explicitly when adding and
# querying.
new_collection = client.get_or_create_collection(name="my_new_collection")

In [28]:
# Store the two sports articles together with their precomputed vectors, so the
# collection uses these embeddings instead of computing its own.
new_collection.add(
    ids=["id1", "id2"],
    metadatas=[{"source": "cricket"}, {"source": "football"}],
    embeddings=embeddings,
    documents=[cricket_news, football_news],
)

In [29]:
# Read back everything stored in the new collection.
new_collection.get()

{'ids': ['id1', 'id2'],
 'embeddings': None,
 'metadatas': [{'source': 'cricket'}, {'source': 'football'}],
 'documents': ["\nThe T20 World Cup 2024 is in full swing, bringing excitement and drama to cricket fans worldwide. \nIndia's team, captained by Rohit Sharma, is preparing for a crucial match against Ireland, with standout player Jasprit Bumrah expected to play a pivotal role in their campaign. \nThe tournament has already seen controversy, particularly concerning the pitch conditions at Nassau County International Cricket Stadium in New York, which came under fire after a low-scoring game between Sri Lanka and South Africa.\n",
  '\nThe world of football is buzzing with excitement as major tournaments and league matches continue to captivate fans globally. \nIn the UEFA Champions League, the semi-final matchups have been set, with defending champions Real Madrid set to face Manchester City, while Bayern Munich will take on Paris Saint-Germain. \nBoth ties promise thrilling encou

In [30]:
# NOTE(review): intentionally left disabled. Presumably a text query would be
# embedded by the collection's own embedding function rather than by
# `embedding_model`, whose vectors were stored in new_collection, so the two
# would not be comparable -- TODO confirm. Query with `query_embeddings`
# instead.
# results = new_collection.query(
#     query_texts=["test worldcup"],
#     n_results=1
# )

# results

In [31]:
# Encode the query with the same model that produced the stored vectors, then
# search by vector rather than by raw text.
query_vectors = embedding_model.encode(["test worldcup"])
results = new_collection.query(n_results=1, query_embeddings=query_vectors)

results

{'ids': [['id1']],
 'distances': [[0.9270880818367004]],
 'metadatas': [[{'source': 'cricket'}]],
 'embeddings': None,
 'documents': [["\nThe T20 World Cup 2024 is in full swing, bringing excitement and drama to cricket fans worldwide. \nIndia's team, captained by Rohit Sharma, is preparing for a crucial match against Ireland, with standout player Jasprit Bumrah expected to play a pivotal role in their campaign. \nThe tournament has already seen controversy, particularly concerning the pitch conditions at Nassau County International Cricket Stadium in New York, which came under fire after a low-scoring game between Sri Lanka and South Africa.\n"]],
 'uris': None,
 'data': None}