In [11]:
pip install pinecone-client openai wget

Collecting openai
  Downloading openai-1.33.0-py3-none-any.whl (325 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m325.5/325.5 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting wget
  Downloading wget-3.2.zip (10 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting httpx<1,>=0.23.0 (from openai)
  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai)
  Downloading httpcore-1.0.5-py3-none-any.whl (77 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.9/77.9 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->openai)
  Downloading h11-0.14.0-py3-none-any.whl (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0

https://cookbook.openai.com/examples/vector_databases/pinecone/using_pinecone_for_embeddings_search

# Get data

In [12]:
import openai

from typing import List, Iterator
import pandas as pd
import numpy as np
import os
import wget
from ast import literal_eval

# Pinecone's client library for Python
import pinecone

# I've set this to our new embeddings model, this can be changed to the embedding model of your choice
EMBEDDING_MODEL = "text-embedding-3-small"

# Ignore unclosed SSL socket warnings - optional in case you get these errors
import warnings

warnings.filterwarnings(action="ignore", message="unclosed", category=ResourceWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [13]:
embeddings_url = 'https://cdn.openai.com/API/examples/data/vector_database_wikipedia_articles_embedded.zip'
# Local file name of the archive (last path component of the URL)
embeddings_zip = os.path.basename(embeddings_url)

# The file is ~700 MB so this will take some time.
# Skip the download when the archive is already on disk: re-running the cell
# would otherwise fetch it again, and wget stores the new copy under a
# different name instead of overwriting the existing file.
if not os.path.exists(embeddings_zip):
    wget.download(embeddings_url)

'vector_database_wikipedia_articles_embedded.zip'

In [14]:
import zipfile
# Unpack every member of the downloaded archive into ../data
with zipfile.ZipFile("vector_database_wikipedia_articles_embedded.zip", mode="r") as archive:
    archive.extractall(path="../data")

In [15]:
article_df = pd.read_csv('../data/vector_database_wikipedia_articles_embedded.csv')

In [38]:
article_df.head()

Unnamed: 0,id,url,title,text,title_vector,content_vector,vector_id
0,1,https://simple.wikipedia.org/wiki/April,April,April is the fourth month of the year in the J...,"[0.001009464613161981, -0.020700545981526375, ...","[-0.011253940872848034, -0.013491976074874401,...",0
1,2,https://simple.wikipedia.org/wiki/August,August,August (Aug.) is the eighth month of the year ...,"[0.0009286514250561595, 0.000820168002974242, ...","[0.0003609954728744924, 0.007262262050062418, ...",1
2,6,https://simple.wikipedia.org/wiki/Art,Art,Art is a creative activity that expresses imag...,"[0.003393713850528002, 0.0061537534929811954, ...","[-0.004959689453244209, 0.015772193670272827, ...",2
3,8,https://simple.wikipedia.org/wiki/A,A,A or a is the first letter of the English alph...,"[0.0153952119871974, -0.013759135268628597, 0....","[0.024894846603274345, -0.022186409682035446, ...",3
4,9,https://simple.wikipedia.org/wiki/Air,Air,Air refers to the Earth's atmosphere. Air is a...,"[0.02224554680287838, -0.02044147066771984, -0...","[0.021524671465158463, 0.018522677943110466, -...",4


In [17]:
# Read vectors from strings back into a list.
# Values that are already parsed are left untouched so this cell can safely be
# re-run: literal_eval raises ValueError when it is handed a list instead of
# a string.
for vector_column in ('title_vector', 'content_vector'):
    article_df[vector_column] = article_df[vector_column].apply(
        lambda value: literal_eval(value) if isinstance(value, str) else value
    )

# Set vector_id to be a string
article_df['vector_id'] = article_df['vector_id'].apply(str)

In [58]:
article_df.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   id              200 non-null    int64 
 1   url             200 non-null    object
 2   title           200 non-null    object
 3   text            200 non-null    object
 4   title_vector    200 non-null    object
 5   content_vector  200 non-null    object
 6   vector_id       200 non-null    object
dtypes: int64(1), object(6)
memory usage: 11.1+ KB


# **Pinecone**

In [57]:
article_df=article_df.head(200)

In [None]:
from google.colab import userdata

# Secrets are read with userdata.get('<secret name>') at the point of use.
# Do not leave a bare userdata.get(...) as the last expression of a cell:
# the notebook would display the secret value in the cell output.
# NOTE(review): the previous placeholder call userdata.get('secretName') was
# removed; presumably no secret with that name exists — confirm.

In [26]:
import os
import pinecone
from pinecone import Pinecone, ServerlessSpec

# Keep the client under its own name so the imported `pinecone` module is not
# shadowed by the client instance.
pinecone_client = Pinecone(api_key=userdata.get('YOUR_API_KEY'))

In [27]:
# Models a simple batch generator that make chunks out of an input DataFrame
class BatchGenerator:
    """Splits a DataFrame into consecutive chunks of at most ``batch_size`` rows.

    Instances are callable: ``batcher(df)`` is equivalent to
    ``batcher.to_batches(df)``.
    """

    def __init__(self, batch_size: int = 10) -> None:
        """Store the maximum number of rows allowed in a single chunk.

        Args:
            batch_size: upper bound on the number of rows per yielded chunk.

        Raises:
            ValueError: if ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError(f"batch_size must be >= 1, got {batch_size}")
        self.batch_size = batch_size

    # Makes chunks out of an input DataFrame
    def to_batches(self, df: pd.DataFrame) -> Iterator[pd.DataFrame]:
        """Yield ``df`` in row order as chunks of at most ``batch_size`` rows.

        A frame that fits into a single batch (including an empty frame) is
        yielded once, unchanged.
        """
        total_rows = df.shape[0]
        if self.splits_num(total_rows) <= 1:
            yield df
            return
        # Positional slicing keeps every chunk a DataFrame and guarantees that
        # no chunk is larger than batch_size; it also avoids np.array_split on
        # a DataFrame, which relies on the deprecated DataFrame.swapaxes.
        for start in range(0, total_rows, self.batch_size):
            yield df.iloc[start:start + self.batch_size]

    # Determines how many chunks DataFrame contains
    def splits_num(self, elements: int) -> int:
        """Return the number of chunks needed for ``elements`` rows.

        Uses ceiling division so a partial final batch gets its own chunk
        rather than being merged into an oversized one.
        """
        return -(-elements // self.batch_size)

    __call__ = to_batches

df_batcher = BatchGenerator(300)

In [54]:
print(type(article_df['title_vector'][0]))
print(type(article_df['content_vector'][0]))


<class 'list'>
<class 'list'>


In [84]:
article_df['title'][0]

'April'

In [68]:

'''
import os
import pinecone
from pinecone import Pinecone, ServerlessSpec
import ast

pinecone_client = Pinecone(api_key=userdata.get('YOUR_API_KEY'))
# Initialize Pinecone
#pinecone_client = pinecone.Client(api_key=os.environ.get("YOUR_API_KEY"))

# Define index name
index_name = "wiki3"


# Connect to the index
index = pinecone_client.Index(index_name)


# Define df_batcher function
def df_batcher(df, batch_size=100):
    for i in range(0, len(df), batch_size):
        yield df.iloc[i:i + batch_size]

# Upsert title vectors in title namespace
print("Uploading vectors to title namespace..")
for batch_df in df_batcher(article_df):
    #vectors = [(str(id_), vector) for id_, vector in zip(batch_df['vector_id'], batch_df['title_vector'])]
    #index.upsert(vectors=vectors, namespace='title')
    index.upsert(vectors=zip(batch_df.vector_id, batch_df.content_vector), namespace='content')
    index.upsert(vectors=zip(batch_df.vector_id, batch_df.title_vector), namespace='title')

print("Data upserted successfully.")


'''

Uploading vectors to title namespace..
Data upserted successfully.


In [77]:
import os
import pinecone
from pinecone import Pinecone, ServerlessSpec
import ast

# Initialize the Pinecone client with the API key read from the Colab secrets
pinecone_client = Pinecone(api_key=userdata.get('YOUR_API_KEY'))

# Name of the index this notebook works with
index_name = "wiki4"

# Open a handle to that index
# NOTE(review): assumes the index already exists — nothing visible here creates it
index = pinecone_client.Index(index_name)


In [92]:
# Upload the title embeddings batch by batch into the 'title' namespace,
# attaching each article's title and text as metadata.
for batch_df in df_batcher(article_df):
    rows = zip(
        batch_df['vector_id'],
        batch_df['title_vector'],
        batch_df['title'],
        batch_df['text'],
    )
    vectors_with_metadata = []
    for id_, vector, title, text in rows:
        record = {
            "id": str(id_),
            "values": vector,
            "metadata": {"title": title, "text": text},
        }
        vectors_with_metadata.append(record)
    index.upsert(vectors=vectors_with_metadata, namespace='title')

print("Data upserted successfully.")

Data upserted successfully.


In [89]:
# Upsert content vectors in content namespace - this can take a few minutes
#print("Uploading vectors to content namespace..")
#for batch_df in df_batcher(article_df):
 #   index.upsert(vectors=zip(batch_df.vector_id, batch_df.content_vector), namespace='content')

Uploading vectors to content namespace..


In [88]:
# Upsert title vectors in title namespace - this can also take a few minutes
#print("Uploading vectors to title namespace..")
#for batch_df in df_batcher(article_df):
 #   index.upsert(vectors=zip(batch_df.vector_id, batch_df.title_vector), namespace='title')

Uploading vectors to title namespace..


In [80]:
# Check index size for each namespace to confirm all of our docs have loaded
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'content': {'vector_count': 200},
                'title': {'vector_count': 200}},
 'total_vector_count': 400}

In [71]:
# Lookup tables from vector ID to article title and article text, so that
# search results (which carry IDs) can be turned back into readable output
titles_mapped = {
    vec_id: title
    for vec_id, title in zip(article_df.vector_id, article_df.title)
}
content_mapped = {
    vec_id: text
    for vec_id, text in zip(article_df.vector_id, article_df.text)
}

In [100]:

# Perform the query
# NOTE(review): `query_vector` is not assigned in any cell visible in this part
# of the notebook — confirm it is defined before this cell runs on a fresh
# Restart & Run All. Presumably it must have the same dimension as the index
# (describe_index_stats above reports 1536) — verify.
response = index.query(
    vector=query_vector,
    top_k=3,  # Number of top results to return
    namespace='title',  # Specify the namespace
    include_metadata=True  # Ensure metadata is included in the response
)

# Print the response metadata
# For each match: its id, its score and, when present, every metadata
# key/value pair, followed by a blank separator.
print("Query Results:")
for match in response['matches']:
    print(f"ID: {match['id']}")
    print(f"Score: {match['score']}")
    # Metadata is only printed when the match actually carries it
    if 'metadata' in match:
        print("Metadata:")
        for key, value in match['metadata'].items():
            print(f"  {key}: {value}")
    print("\n")

Query Results:
ID: 133
Score: 515.064392
Metadata:
  text: Everything2 or E2 is a website.  It lets people make pages about many different things, and some people use it as a diary.

E2 users create pages called nodes and add stuff in writeups.  Only logged-in users can create writeups.  Only the person who created the writeup or someone who the website owners (called "gods") choose can edit the writeup.  On the other hand, on Wikipedia, anyone can edit pages, but on Everything2 only those who can edit the writeup can edit pages.

Everything2 does not require a neutral point of view like Wikipedia does. So, it is possible to have more than one article (writeups) under the same title (node), each by different authors, and presenting different points of view.

Other websites 
 Everything2 website
 Everything2.com article about Wikipedia

Websites
  title: Everything2


ID: 52
Score: 515.097351
Metadata:
  text: Bubonic plague is the best-known form of the disease plague, which is caused 