In [10]:
from dotenv import load_dotenv 
load_dotenv()
import os
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec 


In [11]:
# Initialize Pinecone
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
pc.create_index(name="rag", dimension=1536, metric="cosine", spec=ServerlessSpec(cloud="aws", region="us-east-1"))

# Create a Pinecone index

PineconeApiException: (409)
Reason: Conflict
HTTP response headers: HTTPHeaderDict({'content-type': 'text/plain; charset=utf-8', 'access-control-allow-origin': '*', 'vary': 'origin,access-control-request-method,access-control-request-headers', 'access-control-expose-headers': '*', 'x-pinecone-api-version': '2024-07', 'X-Cloud-Trace-Context': '23c24b385c745538fad7b537ebdfbc69', 'Date': 'Sun, 25 Aug 2024 07:00:00 GMT', 'Server': 'Google Frontend', 'Content-Length': '85', 'Via': '1.1 google', 'Alt-Svc': 'h3=":443"; ma=2592000,h3-29=":443"; ma=2592000'})
HTTP response body: {"error":{"code":"ALREADY_EXISTS","message":"Resource  already exists"},"status":409}


In [12]:
import json
data=json.load(open("reviews.json")) # Load the review data
data['reviews']

[{'professor': 'Dr. Alice Johnson',
  'subject': 'Computer Science',
  'stars': 4,
  'review': 'Great at explaining complex concepts, but her exams are tough.'},
 {'professor': 'Dr. Mark Evans',
  'subject': 'Mathematics',
  'stars': 5,
  'review': 'Very engaging lectures and always willing to help during office hours.'},
 {'professor': 'Dr. Lisa Wong',
  'subject': 'Physics',
  'stars': 3,
  'review': 'Interesting lectures but sometimes hard to follow.'},
 {'professor': 'Dr. Kevin Patel',
  'subject': 'Chemistry',
  'stars': 4,
  'review': 'He knows his stuff but expects students to be self-driven.'},
 {'professor': 'Dr. Susan Brown',
  'subject': 'Biology',
  'stars': 5,
  'review': 'Passionate about the subject, makes learning fun and interactive.'},
 {'professor': 'Dr. Robert Adams',
  'subject': 'History',
  'stars': 3,
  'review': 'A bit dry at times, but knowledgeable and fair in grading.'},
 {'professor': 'Dr. Emily Taylor',
  'subject': 'Economics',
  'stars': 4,
  'review': '

In [13]:
processed_data = []
client = OpenAI()

# Create embeddings for each review
for review in data['reviews']:
    response = client.embeddings.create(
        input=review['review'], #Takes each review in reviews
        model="text-embedding-3-small"
    )
    embedding = response.data[0].embedding
    processed_data.append(
        {
            "values": embedding,
            "id": review["professor"],
            "metadata":{
                "review": review["review"],
                "subject": review["subject"],
                "stars": review["stars"],
            }
        }
    )

In [14]:
processed_data[0]

{'values': [-0.013750226,
  -0.030419335,
  0.039548,
  0.00278832,
  0.031420913,
  0.011596835,
  -0.009693838,
  0.041608386,
  0.0039383452,
  0.013242284,
  0.0057340306,
  -0.016640492,
  -0.016282786,
  -0.04916314,
  -0.015882155,
  -0.0055158297,
  -0.038002707,
  -0.03136368,
  0.010910039,
  0.011146125,
  0.037945475,
  -0.0049435,
  0.039061517,
  0.02539714,
  0.016282786,
  -0.02541145,
  0.031563997,
  0.011904462,
  0.046215642,
  0.029288985,
  0.08235828,
  -0.019502142,
  0.031821545,
  -0.041637003,
  -0.028058475,
  0.067191534,
  0.003374958,
  0.04238103,
  0.038517803,
  0.0145157175,
  -0.0060130414,
  0.05414241,
  -0.04481343,
  -0.01254118,
  0.051652774,
  0.013371058,
  -0.06787833,
  -0.030819967,
  0.018843964,
  0.037344526,
  -0.058377653,
  0.046158407,
  0.04469897,
  0.009207358,
  -0.04272443,
  -0.00014207643,
  0.014200936,
  0.0056267185,
  0.003974116,
  -0.01968815,
  0.04281028,
  -0.027514761,
  0.013678685,
  -0.009686684,
  -0.050365034,


In [15]:
# Insert the embeddings into the Pinecone index
index = pc.Index("rag")
upsert_response = index.upsert(
    vectors=processed_data,
    namespace="ns1",
)

In [16]:
print(f"Upserted count: {upsert_response['upserted_count']}")

# Print index statistics
print(index.describe_index_stats())

Upserted count: 20
{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 20}},
 'total_vector_count': 20}
