# An introduction to Pinecone vector DB

### Imports

In [None]:
import os
from pinecone import Pinecone
import pandas as pd
from dotenv import load_dotenv
load_dotenv()

PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")

### Initialize Pinecone

Get your API key from the Pinecone Console or in the [Quickstart guide](https://docs.pinecone.io/guides/get-started/quickstart).


In [68]:
#Initialize Pinecone Client
pc = Pinecone(api_key=PINECONE_API_KEY, pool_threads=30)

Next, we will create an index name, which is like a collection (or a DB)

In [101]:
# Create a dense index with integrated embedding
index_name = "healthnews"

if not pc.has_index(index_name):
    pc.create_index_for_model(
        name=index_name,
        cloud="aws",
        region="us-east-1",
        embed={
            "model":"llama-text-embed-v2",
            "field_map":{"text": "News"}
        }
    )

### Preparing the Data for Pinecone upsert

In this exercise, we're using the MovieLens 100k dataset.

In [80]:
# Load MovieLens Dataset
dtf = pd.read_csv("C:/Users/gurez/OneDrive/Área de Trabalho/ml-100k/u.item", 
                  sep='|',
                  names=str.split('id|movie title|release date|video release date|IMDb URL|unknown|Action|Adventure|Animation|Childrens|Comedy|Crime|Documentary|Drama|Fantasy|Film-Noir|Horror|Musical|Mystery|Romance|Sci-Fi|Thriller|War|Western',
                                  '|'),
                  encoding='latin-1')

# From integer to string
cols = ['unknown', 'Action', 'Adventure', 'Animation', 'Childrens', 'Comedy', 'Crime', 'Documentary','Drama',
        'Fantasy', 'Film-Noir', 'Horror','Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War','Western']

dtf = dtf.astype({col: str for col in cols})
dtf = dtf.astype({'id': str})

# Bind the columns with movie types to a single column with the binary values
dtf['categories'] = dtf[cols].apply(lambda x: ''.join(x), axis=1)

# Drop the original columns
dtf.drop(cols + ['release date', 'video release date', 'IMDb URL'], axis=1, inplace=True)

print(dtf.shape, '\n')

dtf.head()

(1682, 3) 



Unnamed: 0,id,movie title,categories
0,1,Toy Story (1995),1110000000000000
1,2,GoldenEye (1995),110000000000000100
2,3,Four Rooms (1995),100
3,4,Get Shorty (1995),100010010000000000
4,5,Copycat (1995),1010000000100


In [81]:
# Create the records for Pinecone
records = dtf.to_dict(orient="records")
records


[{'id': '1',
  'movie title': 'Toy Story (1995)',
  'categories': '0001110000000000000'},
 {'id': '2',
  'movie title': 'GoldenEye (1995)',
  'categories': '0110000000000000100'},
 {'id': '3',
  'movie title': 'Four Rooms (1995)',
  'categories': '0000000000000000100'},
 {'id': '4',
  'movie title': 'Get Shorty (1995)',
  'categories': '0100010010000000000'},
 {'id': '5',
  'movie title': 'Copycat (1995)',
  'categories': '0000001010000000100'},
 {'id': '6',
  'movie title': 'Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)',
  'categories': '0000000010000000000'},
 {'id': '7',
  'movie title': 'Twelve Monkeys (1995)',
  'categories': '0000000010000001000'},
 {'id': '8',
  'movie title': 'Babe (1995)',
  'categories': '0000110010000000000'},
 {'id': '9',
  'movie title': 'Dead Man Walking (1995)',
  'categories': '0000000010000000000'},
 {'id': '10',
  'movie title': 'Richard III (1995)',
  'categories': '0000000010000000010'},
 {'id': '11',
  'movie title': 'Seven (Se7en) (1995)',

### Data HealthNews

In [93]:
# Load data
records = pd.read_json("healthnews.json", lines=True)
# Add id column as it is needed
records = records.reset_index(names='_id')
# id must be string
records= records.astype({'_id': str,
                         'Date': str,})
# Format to list of dictionaries for Pinecone
records = records.to_dict(orient="records")
records

[{'_id': '0',
  'Date': '2011-11-17 00:46:58+00:00',
  'News': 'One in 12 Teens Engages in Self-Harm: Report:  http://on-msn.com/uULQjt'},
 {'_id': '1',
  'Date': '2014-09-16 22:58:15+00:00',
  'News': 'Well: Savory and Sweet Whole Wheat Focaccia http://nyti.ms/1oXFmKS'},
 {'_id': '2',
  'Date': '2013-11-27 16:38:24+00:00',
  'News': 'RT atceliadugger: Amazing interactive: if you or loved one has breast cancer, find women of same age and region, with same type, stage. http…'},
 {'_id': '3',
  'Date': '2014-05-22 20:36:32+00:00',
  'News': 'Well: Is Work Your Happy Place? http://nyti.ms/1maGPOT'},
 {'_id': '4',
  'Date': '2011-10-27 13:35:04+00:00',
  'News': 'Speech Therapy Key to Stroke Rehab, But Many Miss Out:  http://on-msn.com/sKoXA5'},
 {'_id': '5',
  'Date': '2014-05-27 16:57:49+00:00',
  'News': 'RT atpaula_span: Read atPerriKlass on her late, stubborn, but not really so "ungrateful," mother. http://newoldage.blogs.nytimes.com/2014/05/27/she-wasnt-so-ungrateful-after-all/'},
 {

### Upsert

In [None]:
# Create the index
# index = pc.Index(index_name)
# Upsert the records into a namespace under 1000 records
# dense_index.upsert_records("movies", records)

As the data is larger than 1000 records, let's upsert them in batches, using this function provided by Pinecone.

In [91]:
import itertools

def chunks(iterable, batch_size=200):
    """A helper function to break an iterable into chunks of size batch_size."""
    it = iter(iterable)
    chunk = tuple(itertools.islice(it, batch_size))
    while chunk:
        yield chunk
        chunk = tuple(itertools.islice(it, batch_size))

In [None]:
# Create the index
index = pc.Index(index_name)

# Upsert data with 90 vectors per upsert request
for ids_vectors_chunk in chunks(records, batch_size=90):
    index.upsert_records(namespace="healthnews", records=ids_vectors_chunk) 

In [109]:
# View stats for the index
stats = index.describe_index_stats()
print(stats)

{'dimension': 1024,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'helthnews': {'vector_count': 2000}},
 'total_vector_count': 2000,
 'vector_type': 'dense'}


### Searching the DB

In [119]:
%%time

# Define the query
query = "Quick Workout"

# Search the dense index
results = index.search(
    namespace="helthnews",
    query={
        "top_k": 3,
        "inputs": {
            'text': query
        }
    }
)

# Print the results
for hit in results['result']['hits']:
    print(f"id: {hit['_id']}, score: {round(hit['_score'], 2)}, text: {hit['fields']['News']}")



id: 1926, score: 0.32, text: Download the 7-Minute Workout app from atnytimeswell for iPhone and Android http://nyti.ms/ZQrfkz http://pbs.twimg.com/media/B0vj0v0CQAAl0dp.jpg
id: 519, score: 0.31, text: Well: The Advanced 7-Minute Workout http://nyti.ms/10rsibq
id: 1673, score: 0.26, text: Well: The Workout: Becoming Rocky Balboa http://nyti.ms/1nUBRK4
CPU times: total: 0 ns
Wall time: 548 ms


Next, we will re-rank the search

In [122]:
# Define the query
query = "Quick Workout"

# Search the dense index and rerank results
reranked_results = index.search(
    namespace="helthnews",
    query={
        "top_k": 5,
        "inputs": {
            'text': query
        }
    },
    rerank={
        "model": "bge-reranker-v2-m3",
        "top_n": 5,
        "rank_fields": ["News"]
    }   
)

# Print the reranked results
for hit in reranked_results['result']['hits']:
    print(f"id: {hit['_id']}, score: {round(hit['_score'], 4)}, text: {hit['fields']['News']}")


id: 1926, score: 0.3932, text: Download the 7-Minute Workout app from atnytimeswell for iPhone and Android http://nyti.ms/ZQrfkz http://pbs.twimg.com/media/B0vj0v0CQAAl0dp.jpg
id: 519, score: 0.0717, text: Well: The Advanced 7-Minute Workout http://nyti.ms/10rsibq
id: 1297, score: 0.029, text: Instead of a large all-out workout, have you tried exercising in snack-size portions?  http://nyti.ms/1jtdqKX
id: 1673, score: 0.0048, text: Well: The Workout: Becoming Rocky Balboa http://nyti.ms/1nUBRK4
id: 401, score: 0.0006, text: What kind of workout does it take to be a broadway star. atpatinamiller of atPippinMusical shows us her routine: http://nyti.ms/16gZkt0
