## Install necessary libraries

In [8]:
!pip install sentence_transformers -q
!pip install pinecone-client -q

## Load data

In [2]:
# Four short sample news articles, one per topic. Each is embedded and stored
# in the vector index as a separate record later in the notebook.

# Sports: cricket.
cricket_news = """
The T20 World Cup 2024 is in full swing, bringing excitement and drama to cricket fans worldwide.
India's team, captained by Rohit Sharma, is preparing for a crucial match against Ireland, with standout player Jasprit Bumrah expected to play a pivotal role in their campaign.
The tournament has already seen controversy, particularly concerning the pitch conditions at Nassau County International Cricket Stadium in New York, which came under fire after a low-scoring game between Sri Lanka and South Africa.
"""

# Sports: football.
football_news = """
The world of football is buzzing with excitement as major tournaments and league matches continue to captivate fans globally.
In the UEFA Champions League, the semi-final matchups have been set, with defending champions Real Madrid set to face Manchester City, while Bayern Munich will take on Paris Saint-Germain.
Both ties promise thrilling encounters, featuring some of the best talents in world football.
"""

# Politics: elections.
election_news = """
As election season heats up, the latest developments reveal a highly competitive atmosphere across several key races.
The presidential election has seen intense campaigning from all major candidates, with recent polls indicating a tight race.
Incumbent President Jane Doe is seeking re-election on a platform of economic stability and healthcare reform, while her main rival, Senator John Smith, focuses on education and climate change initiatives."""


# Technology: artificial intelligence.
ai_revolution_news = """
The AI revolution continues to transform industries and reshape the global economy.
Significant advancements in artificial intelligence have led to breakthroughs in healthcare, with AI-driven diagnostics improving patient outcomes and reducing costs.
Autonomous systems are becoming increasingly prevalent in logistics and transportation, enhancing efficiency and safety."""

## Generate embeddings for the data

In [3]:
from sentence_transformers import SentenceTransformer

# Load the "all-mpnet-base-v2" SentenceTransformer model; it is used for every
# embedding computed in this notebook (documents and queries alike).
embedding_model = SentenceTransformer("all-mpnet-base-v2")

  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [4]:
# Encode the four articles in one batch; row i of the result is the embedding
# of the i-th article in the list.
embeddings = embedding_model.encode([cricket_news, football_news, election_news, ai_revolution_news])

In [5]:
# Inspect the embedding matrix (one row per article).
embeddings

array([[-0.02901842,  0.0192444 , -0.0181424 , ...,  0.00644327,
        -0.01740812, -0.01381658],
       [-0.00384662, -0.07271519, -0.00284145, ..., -0.02027755,
         0.02123847, -0.03015987],
       [-0.02962372,  0.05711373,  0.01119961, ...,  0.0131924 ,
         0.02634867,  0.01807423],
       [-0.01667612,  0.05068192, -0.05662728, ..., -0.00878626,
        -0.02318501, -0.04949613]], dtype=float32)

In [6]:
# Length of a single embedding vector; the index dimension must match this value.
len(embeddings[0])

768

## Initialize Pinecone

In [10]:
import os
from getpass import getpass

from pinecone import Pinecone
from pinecone import ServerlessSpec

# Never hardcode the API key in the notebook: read it from the PINECONE_API_KEY
# environment variable and fall back to an interactive prompt when it is unset.
# NOTE(review): the key that was previously committed in this cell should be
# revoked and rotated, since it has been exposed in the notebook source.
pinecone_api_key = os.environ.get("PINECONE_API_KEY") or getpass("Pinecone API key: ")

# Client used for all index management calls below.
pc = Pinecone(api_key=pinecone_api_key)
# Serverless deployment target (cloud provider and region) used when creating the index.
spec = ServerlessSpec(cloud='aws', region='us-east-1')

## Create Index

In [11]:
import time

# Create the index only when it does not exist yet, so this cell can be re-run
# without failing because "example-index" is already present.
# dimension=768 matches the embedding length checked above; similarity is cosine.
if "example-index" not in pc.list_indexes().names():
    pc.create_index("example-index", dimension=768, metric="cosine", spec=spec)

# Block until the index reports that it is ready before any data operation uses it.
while not pc.describe_index("example-index").status["ready"]:
    time.sleep(1)

In [None]:
# List the indexes in the project to confirm that "example-index" was created.
pc.list_indexes()

{'indexes': [{'dimension': 768,
              'host': 'example-index-6rqbdt6.svc.aped-4627-b74a.pinecone.io',
              'metric': 'cosine',
              'name': 'example-index',
              'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
              'status': {'ready': True, 'state': 'Ready'}}]}

## Use Index

In [None]:
# Get a handle to "example-index"; all data operations below go through it.
index = pc.Index("example-index")

In [None]:
# Show the index statistics before any vectors are added.
index.describe_index_stats()

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

## Add data to Pinecone Index

In [None]:
# Upsert the four article embeddings, tagging each record with its topic.
# Every NumPy row is converted with .tolist() so the values are plain Python
# floats, consistent with how all other vectors in this notebook are passed
# to Pinecone.
index.upsert([
    {"id": "id1", "values": embeddings[0].tolist(), "metadata": {'source': 'cricket'}},
    {"id": "id2", "values": embeddings[1].tolist(), "metadata": {'source': 'football'}},
    {"id": "id3", "values": embeddings[2].tolist(), "metadata": {'source': 'election'}},
    {"id": "id4", "values": embeddings[3].tolist(), "metadata": {'source': 'ai_revolution'}},
])

{'upserted_count': 4}

In [None]:
# Show the index statistics right after the upsert.
index.describe_index_stats()

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

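## Similarity Search

**Note:** Pinecone serverless indexes are eventually consistent, so freshly written vectors may not be visible to `describe_index_stats()` or `query()` right away. That is why the stats above still report 0 vectors immediately after the upsert, and why the first query below returns no matches.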

In [None]:
# Free-text search query, embedded with the same model as the documents and
# converted to a plain Python list.
query = "technology"
query_embedding = embedding_model.encode(query).tolist()

In [None]:
# The query vector length should equal the index dimension.
len(query_embedding)

768

In [None]:
# Ask for the 2 nearest records to the query vector, including their metadata.
similar_docs = index.query(vector=query_embedding, top_k=2, include_metadata=True)
similar_docs

{'matches': [], 'namespace': '', 'usage': {'read_units': 1}}

## CRUD operations on Vector Database

#### Add data

In [None]:
# A fifth sample article (blockchain), used to demonstrate adding a record to
# an index that already holds data.
blockchain_news = """
The blockchain industry continues to evolve rapidly, marked by significant technological advancements and regulatory developments.
This month, the spotlight is on the launch of Ethereum 3.0, which promises enhanced scalability and security features.
This upgrade is expected to drastically reduce transaction fees and increase processing speeds, making decentralized applications (dApps) more efficient and user-friendly.
"""

In [None]:
# Embed the blockchain article and convert the result to a plain Python list.
embedding_query = embedding_model.encode(blockchain_news).tolist()

In [None]:
# Show the index statistics before the new record is added.
index.describe_index_stats()

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 4}},
 'total_vector_count': 4}

In [None]:
# Store the blockchain article embedding as a new record under the id "id5".
index.upsert(
    [
        {
            "id": "id5",
            "values": embedding_query,
            "metadata": {"source": "blockchain"},
        }
    ]
)

{'upserted_count': 1}

In [None]:
# Show the index statistics after the new record was upserted.
index.describe_index_stats()

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 4}},
 'total_vector_count': 4}

In [None]:
# Repeat the "technology" search: embed the query text again, then request the
# two closest records together with their metadata.
query_embedding = embedding_model.encode(
    "technology"
).tolist()
similar_docs = index.query(
    vector=query_embedding,
    top_k=2,
    include_metadata=True,
)
similar_docs

{'matches': [{'id': 'id4',
              'metadata': {'source': 'ai_revolution'},
              'score': 0.218479618,
              'values': []},
             {'id': 'id1',
              'metadata': {'source': 'cricket'},
              'score': 0.0995326,
              'values': []}],
 'namespace': '',
 'usage': {'read_units': 6}}

#### Read data

In [None]:
# Read two records back by id (the cricket and election articles).
results = index.fetch(ids=['id1', 'id3'])

In [None]:
# Print only the metadata of each fetched record.
for record in results["vectors"].values():
  print(record['metadata'])

{'source': 'cricket'}
{'source': 'election'}


#### Update data

In [None]:
# Update = upsert with an existing id: the record stored under "id3" is
# overwritten with a new vector and new metadata.
embedding_query = embedding_model.encode("This is sample document about generative AI").tolist()
# Use the same dict record format as every other upsert in this notebook.
index.upsert([{"id": "id3", "values": embedding_query, "metadata": {"source": "gen ai"}}])

{'upserted_count': 1}

In [None]:
# Fetch "id3" again to inspect what is stored under it after the update.
index.fetch(ids=['id3'])

{'namespace': '',
 'usage': {'read_units': 1},
 'vectors': {'id3': {'id': 'id3',
                     'metadata': {'source': 'election'},
                     'values': [-0.0296237152,
                                0.0571137294,
                                0.0111996075,
                                0.0148383053,
                                -0.00865601655,
                                -0.00182349479,
                                -0.141189873,
                                -0.00111300102,
                                -0.0202395171,
                                -0.0288040377,
                                0.0270527769,
                                0.00560142752,
                                0.0218389817,
                                0.0586290471,
                                -0.0187733117,
                                -0.0984891504,
                                0.0112384642,
                                0.0471920259,
                      

#### Delete data

In [None]:
# Show the index statistics before deleting a record.
index.describe_index_stats()

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 5}},
 'total_vector_count': 5}

In [None]:
# Delete the record stored under "id2" (the football article).
index.delete(ids=['id2'])

{}

In [None]:
# Show the index statistics after the delete request.
index.describe_index_stats()

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 5}},
 'total_vector_count': 5}

In [None]:
# Try to fetch the deleted id to see whether it is still returned.
index.fetch(ids=['id2'])

{'namespace': '',
 'usage': {'read_units': 1},
 'vectors': {'id2': {'id': 'id2',
                     'metadata': {'source': 'football'},
                     'values': [-0.00384662021,
                                -0.072715193,
                                -0.0028414533,
                                0.0574586503,
                                -0.00515252585,
                                -0.0170758776,
                                -0.10618224,
                                -0.0312843956,
                                -0.0369069651,
                                -0.0311232638,
                                -0.0203364436,
                                0.00393039174,
                                0.00282162614,
                                -0.0322650373,
                                0.0612160601,
                                -0.0162303261,
                                -0.00738591375,
                                -0.0254140906,
                  

## Delete Pinecone Index

In [12]:
# Clean up: remove "example-index" (and everything stored in it) from the project.
pc.delete_index('example-index')