In [5]:
from dotenv import load_dotenv
load_dotenv()
import os
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec

In [3]:
# Pinecone client. The API key is read from the environment (the imports cell
# calls load_dotenv() before this runs).
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# Create the "rag" index only when it does not exist yet. An unconditional
# create_index() fails once the index is there, which breaks re-running this
# cell and "Restart Kernel -> Run All".
# dimension must equal the length of the embedding vectors upserted later.
if "rag" not in pc.list_indexes().names():
    pc.create_index(
        name="rag",
        dimension=1536,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

In [4]:
import json

# Load the review fixture. The context manager closes the file handle as soon
# as parsing finishes (a bare open() inside json.load() leaves it open), and
# the explicit encoding keeps the read independent of the platform default.
with open("reviews.json", encoding="utf-8") as reviews_file:
    data = json.load(reviews_file)

# Show the parsed reviews as the cell output.
data['reviews']

[{'professor': 'Dr. Emily White',
  'subject': 'Introduction to Psychology',
  'stars': 4,
  'review': 'Engaging lectures and clear explanations, but exams were challenging.'},
 {'professor': 'Dr. John Smith',
  'subject': 'Calculus I',
  'stars': 3,
  'review': 'Great at explaining concepts, but the pace was too fast.'},
 {'professor': 'Dr. Ava Martinez',
  'subject': 'World History',
  'stars': 5,
  'review': 'Incredibly knowledgeable and makes history come alive!'},
 {'professor': 'Dr. Michael Johnson',
  'subject': 'Organic Chemistry',
  'stars': 2,
  'review': "Very difficult course, and the professor's teaching style wasn't helpful."},
 {'professor': 'Dr. Sophia Davis',
  'subject': 'English Literature',
  'stars': 5,
  'review': 'Passionate about the subject and always available for help.'},
 {'professor': 'Dr. David Lee',
  'subject': 'Microeconomics',
  'stars': 4,
  'review': 'Interesting course, but grading was tough.'},
 {'professor': 'Dr. Isabella Clark',
  'subject': 'Bio

In [8]:
# Build the Pinecone upsert payload: one record per review, holding the
# embedding of the review text, the professor name as the vector id, and the
# remaining review fields as metadata.
EMBEDDING_MODEL = "text-embedding-3-small"
EMBED_BATCH_SIZE = 100  # reviews sent per embeddings request

processed_data = []
client = OpenAI()
reviews = data['reviews']

# The professor name is used as the vector id, and an upsert overwrites any
# existing vector with the same id. Fail loudly instead of silently dropping
# reviews when two entries share a professor.
professor_ids = [review["professor"] for review in reviews]
assert len(set(professor_ids)) == len(professor_ids), (
    "duplicate professor names would overwrite each other on upsert"
)

# The embeddings endpoint accepts a list of inputs, so send the reviews in
# batches rather than paying one network round trip per review.
for start in range(0, len(reviews), EMBED_BATCH_SIZE):
    batch = reviews[start:start + EMBED_BATCH_SIZE]
    response = client.embeddings.create(
        input=[review["review"] for review in batch],
        model=EMBEDDING_MODEL,
    )
    # Each returned item carries the position of its input; sort on it so the
    # pairing with `batch` does not depend on the response ordering.
    ordered_items = sorted(response.data, key=lambda item: item.index)
    for review, item in zip(batch, ordered_items):
        processed_data.append({
            "values": item.embedding,
            "id": review["professor"],
            "metadata": {
                "review": review["review"],
                "subject": review["subject"],
                "stars": review["stars"]
            }
        })

In [9]:
# Inspect the first prepared record: its "values" (the full embedding vector),
# "id" and "metadata" entries, exactly as they will be sent to Pinecone.
processed_data[0]

{'values': [-0.027074821,
  0.00441639,
  -0.011801335,
  -0.029447015,
  0.004214289,
  0.020912416,
  -0.008899041,
  0.018182406,
  0.005854284,
  0.047152333,
  0.019136583,
  -0.0089388,
  -0.03880327,
  -0.013245855,
  -0.010482714,
  -0.0062617976,
  0.023019562,
  -0.0038167147,
  -0.01660536,
  0.02252922,
  0.04042007,
  -0.012722383,
  0.06552027,
  0.007255734,
  -0.036603354,
  -0.064407066,
  0.0016789238,
  -0.009270111,
  0.0038332804,
  0.028519342,
  0.05942413,
  -0.015849968,
  0.004015502,
  -0.024490587,
  -0.058522962,
  0.05900005,
  0.037451513,
  0.029128956,
  0.026120642,
  0.009005061,
  0.020899164,
  0.0357552,
  -0.031143334,
  -0.031143334,
  0.020064257,
  0.003314777,
  -0.012682625,
  -0.04309707,
  0.045800578,
  0.020382317,
  -0.04153328,
  0.053911097,
  0.0918132,
  0.013259108,
  -0.032813147,
  -0.040764634,
  0.0050094384,
  0.04436931,
  -0.03230955,
  -0.0054500834,
  0.071828455,
  -0.03970444,
  0.03498655,
  0.007587046,
  -0.0041414006,

In [10]:
# Handle to the "rag" index; later cells reuse `index`.
index = pc.Index('rag')

# Write every prepared record into the "ns1" namespace in one request and
# keep the service's reply so it is displayed as the cell output.
upsert_response = index.upsert(vectors=processed_data, namespace="ns1")
upsert_response

{'upserted_count': 20}

In [11]:
# Ask Pinecone for the index statistics and display them, as a check on what
# the index holds after the upsert.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 20}},
 'total_vector_count': 20}