# Pinecone
docs: https://docs.pinecone.io/home
- performant vector search
- highly scalable and **fully managed in the cloud**
- easy to configure underlying infrastructure (number of pods, pod size, replica for throughput.. )
- built-in monitoring
- data > embedding > Pinecone db < embedding < application query
- lots of embeddings: OpenAi, transformers (e.g. sentence_transformer.. )
- data updates immidiatly available (live vector update)
- spark connector
- index run at least on one pod (vitual unit) but can on multiple
- after index creation pod number + size, dimensions etc not changable but you can create a new index from collection (collection is a snapshot of your index)



In [1]:
!pip install pinecone-client

Collecting pinecone-client
  Downloading pinecone_client-4.0.0-py3-none-any.whl (214 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m214.5/214.5 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pinecone-client
Successfully installed pinecone-client-4.0.0


In [2]:
from pinecone import Pinecone, ServerlessSpec
from google.colab import userdata


In [3]:
pc = Pinecone(api_key=userdata.get('pinecone_api_key'))

In [7]:
index_name = 'my-index'

In [23]:

pc.create_index(index_name,
                dimension=5,
                metric='cosine',
                spec=ServerlessSpec(
                  cloud='aws',
                  region='us-east-1')
                )

In [8]:
index = pc.Index(index_name)

## add vectors to index
vectors are assigned to ids
-> tupel
(id, [vectorlist], {optional metadata})

In [52]:
index.upsert( [
    ('my-1', [2.,3.,1.,0.,9.], {'some_metadata': 'A' }),
    ('my-2', [5.,8.,12.,11.,21.], {'some_metadata': 'B'} )
  ]
)

{'upserted_count': 2}

In [36]:
index.fetch(['2'])

{'namespace': '',
 'usage': {'read_units': 1},
 'vectors': {'2': {'id': '2',
                   'metadata': {'some_metadata': 'B'},
                   'values': [5.0, 8.0, 12.0, 11.0, 21.0]}}}

In [37]:
index.describe_index_stats()

{'dimension': 5,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 2}},
 'total_vector_count': 2}

In [54]:
# lets create 10 more vectors with dimension 5
import numpy as np
import pandas as pd
import random


dimension = 5
n_vectors = 10
offset = index.describe_index_stats()['total_vector_count']

df = pd.DataFrame(
    {
        'id': [f'my-{i}' for i in range(offset, offset+n_vectors)],
        'values': np.random.normal(loc=10, scale=1, size=(10, 5)).tolist(),
        'metadata': [{'source': random.choice(['A', 'B', 'C'])} for _ in range(0, n_vectors)]
    }
)
#mistake in docstrings
#https://github.com/pinecone-io/pinecone-python-client/blob/06c69fbbe5c3fa57717ba71596d94f03ee50aaa3/pinecone/data/index.py#L211#
index.upsert_from_dataframe(df, show_progress=True)

sending upsert requests:   0%|          | 0/10 [00:00<?, ?it/s]

{'upserted_count': 10}

## query the index

In [36]:
query_vector = np.random.normal(loc=10, scale=1, size=(1, 5))

In [58]:
index.query(vector=query_vector.tolist(),
              top_k=2,
              include_values=True,
              include_metadata=True,)

{'matches': [{'id': 'my-4',
              'metadata': {'source': 'C'},
              'score': 0.998865068,
              'values': [10.1225519,
                         10.5818329,
                         9.66793251,
                         10.4530144,
                         10.9275446]},
             {'id': 'my-3',
              'metadata': {'source': 'B'},
              'score': 0.997716844,
              'values': [11.0463171,
                         9.62251,
                         9.61125374,
                         10.3891125,
                         9.76752567]}],
 'namespace': '',
 'usage': {'read_units': 6}}

In [59]:
index.query(vector=query_vector.tolist(),
              top_k=2,
              include_values=True,
              include_metadata=True,
              filter={'source': 'A'} )




{'matches': [{'id': 'my-7',
              'metadata': {'source': 'A'},
              'score': 0.996453702,
              'values': [9.36811638,
                         10.5229712,
                         9.26913738,
                         8.89493275,
                         9.12673473]},
             {'id': 'my-5',
              'metadata': {'source': 'A'},
              'score': 0.993597806,
              'values': [9.09589672,
                         10.3351488,
                         7.96700716,
                         10.933691,
                         9.64216]}],
 'namespace': '',
 'usage': {'read_units': 6}}

In [None]:
#index.delete(delete_all=True)

In [60]:
index.query(vector=query_vector.tolist(),
              top_k=2,
              include_values=True,
              include_metadata=True,
              filter={'source': {'$ne': 'A'}} ) # not A

{'matches': [{'id': 'my-4',
              'metadata': {'source': 'C'},
              'score': 0.998865068,
              'values': [10.1225519,
                         10.5818329,
                         9.66793251,
                         10.4530144,
                         10.9275446]},
             {'id': 'my-3',
              'metadata': {'source': 'B'},
              'score': 0.997716844,
              'values': [11.0463171,
                         9.62251,
                         9.61125374,
                         10.3891125,
                         9.76752567]}],
 'namespace': '',
 'usage': {'read_units': 6}}