# Pinecone CRUD

This is a step-by-step walkthrough that follows the article by [Manfye Goh](https://towardsdatascience.com/crud-with-pinecone-ee6b6f8b54e8).


In [2]:
from dotenv import load_dotenv

# Load the variables declared in the local .env file; the boolean returned by
# load_dotenv is displayed as the cell output.
load_dotenv(dotenv_path='.env')

True

In [4]:
%%capture

import pinecone
from os import getenv

# The API key and region are read from environment variables rather than
# being hard-coded in the notebook.
pinecone.init(
    environment=getenv('PINECONE_API_REGION'),
    api_key=getenv('PINECONE_API_KEY'),
)

Connect to the index


In [15]:
index_name = 'table-qa'

# One-time setup, left disabled: enable it only when the index does not exist.
# NOTE(review): this requests dimension=300, while the describe_index output
# recorded further down reports 768 for 'table-qa' - confirm which is intended.
# pinecone.create_index(index_name,
#                       dimension=300,
#                       metric="cosine")

# Handle to the index named above.
existing_index = pinecone.Index(index_name)

In [19]:
# Look up the index configuration and show it as a plain string.
index_description = pinecone.describe_index(index_name)
str(index_description)

"IndexDescription(name='table-qa', metric='cosine', replicas=1, dimension=768.0, shards=1, pods=1, pod_type='p1', status={'ready': True, 'state': 'Ready'}, metadata_config=None, source_collection='')"

In [20]:
# Fetch the index statistics and display them as the cell output.
index_stats = existing_index.describe_index_stats()
index_stats

{'dimension': 768,
 'index_fullness': 0.1,
 'namespaces': {'': {'vector_count': 20000}},
 'total_vector_count': 20000}

## C for CREATE


In [22]:
# Tickets reports

import pandas as pd

# Complaint texts, one per ticket, in ticket-number order.
complaint_texts = [
    'Broken navigation button on the website',
    'Incorrect pricing displayed for a product',
    'Unable to reset password',
    'App crashes on the latest iOS update',
    'Payment processing error during checkout',
    'Wrong product delivered',
    'Delayed response from customer support',
    'Excessive delivery time for an order',
    'Difficulty in finding a specific product',
    'Error in applying a discount coupon',
]

# Ticket numbers are consecutive, starting at 1001 (1001..1010).
first_ticket = 1001
data = {
    'ticketno': list(range(first_ticket, first_ticket + len(complaint_texts))),
    'complains': complaint_texts,
}

df = pd.DataFrame(data)

Before pushing the data into Pinecone, we need to turn each complaint into a vector using an embedding model.


In [23]:
from sentence_transformers import SentenceTransformer

# Load the pretrained embedding model by name.
model = SentenceTransformer("average_word_embeddings_glove.6B.300d")


def embed_text(text):
    """Encode ``text`` with ``model`` and return the embedding as a plain list."""
    return model.encode(str(text)).tolist()


# One embedding per complaint, stored next to the original text.
df["question_vector"] = df["complains"].map(embed_text)

Downloading (…)dc709/.gitattributes: 100%|██████████| 690/690 [00:00<?, ?B/s] 
Downloading pytorch_model.bin: 100%|██████████| 480M/480M [01:45<00:00, 4.54MB/s] 
Downloading (…)okenizer_config.json: 100%|██████████| 4.61M/4.61M [00:00<00:00, 6.10MB/s]
Downloading (…)mbedding_config.json: 100%|██████████| 164/164 [00:00<?, ?B/s] 
Downloading (…)_Pooling/config.json: 100%|██████████| 190/190 [00:00<?, ?B/s] 
Downloading (…)8744edc709/README.md: 100%|██████████| 2.15k/2.15k [00:00<?, ?B/s]
Downloading (…)ce_transformers.json: 100%|██████████| 122/122 [00:00<?, ?B/s] 
Downloading (…)4edc709/modules.json: 100%|██████████| 248/248 [00:00<?, ?B/s] 


Taking a look at those vectors

In [27]:
# Preview the first few embeddings.
df.question_vector.head()

0    [-0.2649574875831604, -0.17953598499298096, 0....
1    [0.10973000526428223, 0.3845505118370056, 0.12...
2    [-0.21170000731945038, 0.2875896692276001, -0....
3    [-0.008352994918823242, -0.13370579481124878, ...
4    [-0.17191250622272491, 0.37106096744537354, 0....
Name: question_vector, dtype: object

Now the upsert


In [None]:
import itertools


def chunks(iterable, batch_size=100):
    """Lazily split ``iterable`` into consecutive tuples of up to ``batch_size`` items.

    The final tuple may be shorter than ``batch_size``; an empty iterable
    yields nothing.
    """
    source = iter(iterable)
    # Two-argument iter(): keep calling the lambda until it returns the empty
    # tuple, i.e. until the source is exhausted.
    yield from iter(lambda: tuple(itertools.islice(source, batch_size)), ())


# Pair each ticket number (as a string ID) with its embedding.
ticket_vectors = [(str(t), v) for t, v in zip(df.ticketno, df.question_vector)]

# NOTE(review): the embeddings come from a "...300d" model while the recorded
# index description reports dimension 768 - confirm the index dimension first.
# The index handle in this notebook is `existing_index`; the bare name `index`
# is never defined, so using it raises NameError on a fresh kernel.
for batch in chunks(ticket_vectors):
    existing_index.upsert(vectors=batch)

Did anything get in?


In [None]:
# Re-check the index statistics after the upsert. Uses `existing_index`, the
# handle actually defined in this notebook (`index` is never assigned).
existing_index.describe_index_stats()

## R for READ


In [None]:
# Fetch vectors by ID; the IDs are the stringified ticket numbers upserted
# above. Uses `existing_index` because `index` is never defined here.

existing_index.fetch(ids=["1010", "1009"])

In [None]:
# Now query by vector, which is the whole point of a vector database: the
# query text is embedded with the same model as the stored complaints.
query_questions = [
    "navigation button",
]

query_vectors = [model.encode(str(question)).tolist()
                 for question in query_questions]

# `existing_index` is the handle defined earlier; `index` was never assigned.
query_results = existing_index.query(queries=query_vectors,
                                     top_k=5, include_values=False)

Other ways to query/match

In [None]:
# Matches returned for the first (and only) query in the batch
first_query_matches = query_results['results'][0]['matches']

# Keep just the ID and similarity score of every match
matches = [hit['id'] for hit in first_query_matches]
scores = [hit['score'] for hit in first_query_matches]
matches_df = pd.DataFrame({'id': matches, 'score': scores})

# Match IDs are strings, so cast the ticket numbers before joining
df["ticketno"] = df["ticketno"].astype(str)
matches_df.merge(df, left_on="id", right_on="ticketno")

## U for UPDATE

In [None]:
# Re-send vectors under IDs that were already upserted. `batch` is the loop
# variable left over from the CREATE cell, i.e. its last chunk of
# (id, vector) pairs. Uses `existing_index`; `index` is never defined.
existing_index.upsert(vectors=batch)

## D for DELETE

In [None]:
# By ID. The IDs are ticket numbers upserted by this notebook; the previous
# placeholders ("id-1", "id-2") matched none of them. Uses `existing_index`
# because `index` is never defined.

existing_index.delete(ids=["1001", "1002"], namespace='')

In [26]:
# Sort of a truncate: removes every vector in the default namespace.
# WARNING: the recorded stats for this index show 20000 vectors that this
# notebook did not create - make sure that is intended before running.
# Uses the client's boolean `delete_all` flag and the `existing_index` handle
# (`index` is never defined in this notebook).

existing_index.delete(delete_all=True, namespace="")