In [1]:
from dotenv import load_dotenv
load_dotenv()
import os
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec

  from tqdm.autonotebook import tqdm


In [14]:
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
pc.create_index(
    name="rag", dimension=1536, metric="cosine", spec=ServerlessSpec(cloud="aws", region="us-east-1")
)

In [15]:
import json

In [16]:
with open('sample.json', 'r') as file:
    data = json.load(file)

In [17]:
data

[{'country': 'US',
  'description': "A good everyday Pinot Noir, tart in acids and with some edgy tannins. But it has the variety's nice, silky texture, with rich cherry, rosehip tea and sandalwood flavors.",
  'price': 18.0,
  'title': 'San Simeon 2009 Pinot Noir (Monterey)'},
 {'country': 'France',
  'description': "Round and fruity, this is cleanly made, with its soft character coming from the ripe strawberry fruit. It's a good candidate to pair with a fruit dessert or to serve as an apéritif.",
  'price': 10.0,
  'title': 'Veuve du Vernay NV Brut Rosé Sparkling (Vin Mousseux)'},
 {'country': 'US',
  'description': "Full-bodied and dry, this has a very deep, saturated color and aromas of spicy oak, raspberries and cherries. With velvety texture, fine tannins and a lingering finish, it's well structured and youthful tasting. It should improve through 2017.",
  'price': 48.0,
  'title': 'Charles B. Mitchell 2012 Estate Grand Reserve Red (Fair Play)'},
 {'country': 'France',
  'descrip

In [21]:
data[0]["description"]

"A good everyday Pinot Noir, tart in acids and with some edgy tannins. But it has the variety's nice, silky texture, with rich cherry, rosehip tea and sandalwood flavors."

In [25]:
import unicodedata
def to_ascii(text):
    return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('ascii')

In [29]:
print(to_ascii("Veuve du Vernay NV Brut Ros\u00e9 Sparkling (Vin Mousseux)"))

Veuve du Vernay NV Brut Rose Sparkling (Vin Mousseux)


In [26]:
processed_data = []
client = OpenAI()

for item in data:
    response = client.embeddings.create(
        input=item["description"],
        model="text-embedding-3-small",
    )
    embedding = response.data[0].embedding
    ascii_title=to_ascii(item["title"])
    processed_data.append({
        "values": embedding,
        "id": ascii_title,
        "metadata": {
            "description": item["description"],
            "country": item["country"],
            "price": item["price"],
        }
    })

In [27]:
processed_data

[{'values': [-0.028531857,
   -0.027739305,
   -0.05336514,
   -0.010687438,
   0.021110693,
   -0.03078943,
   0.02376454,
   0.0075112274,
   0.031966247,
   -0.028651942,
   0.015442748,
   0.024016716,
   -0.0080876285,
   -0.007817441,
   0.033215117,
   -0.023788556,
   -0.0002932666,
   -0.0057940325,
   0.0062983837,
   0.042653687,
   -0.0028775027,
   -0.009042293,
   0.026034119,
   0.002860991,
   0.0047553093,
   -0.035929006,
   -0.044118706,
   -0.023188138,
   0.022791862,
   0.007355119,
   -0.0042539607,
   -0.0333352,
   0.032254446,
   -0.02010199,
   0.022791862,
   0.02273182,
   0.01893718,
   -0.021542992,
   0.0040167957,
   -0.015250614,
   -0.047312927,
   0.05182807,
   0.031990264,
   0.039075196,
   0.022359561,
   -0.06556563,
   0.006754701,
   0.015094506,
   -0.02105065,
   0.028916124,
   -0.041068584,
   -0.0008308282,
   -0.0007895495,
   0.0015595854,
   -0.05855275,
   0.012194487,
   -0.014386012,
   0.07238638,
   0.055094343,
   0.04311,
   0.0

In [28]:
index = pc.Index('rag')
index.upsert(
    vectors=processed_data,
    namespace="ns1"
)

{'upserted_count': 100}

In [30]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 100}},
 'total_vector_count': 100}