In [17]:
from dotenv import load_dotenv
load_dotenv()
import os
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec

In [12]:
# Initialize the Pinecone client; the API key is read from the environment
# (populated from .env by load_dotenv() in the imports cell).
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# Create the "rag" index only when it does not exist yet. An unconditional
# create_index() raises once the index is there, which breaks re-running this
# cell (and Restart & Run All).
# dimension=1536 must equal the length of the embedding vectors upserted later.
if "rag" not in pc.list_indexes().names():
    pc.create_index(
        name="rag",
        dimension=1536,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

In [13]:
import json

# Read the reviews inside a context manager so the file handle is closed as
# soon as parsing finishes, and decode explicitly as UTF-8 rather than relying
# on the platform's default encoding.
with open("reviews.json", encoding="utf-8") as reviews_file:
    data = json.load(reviews_file)

# Display the list of review records stored under the "reviews" key.
data["reviews"]

[{'professor': 'Dr. Emily Thompson',
  'subject': 'Introduction to Psychology',
  'stars': 4,
  'review': 'Engaging lectures with lots of real-world examples. Exams are fair but study the textbook thoroughly.'},
 {'professor': 'Dr. Michael Richards',
  'subject': 'Advanced Calculus',
  'stars': 3,
  'review': 'The material is challenging, and the professor sometimes moves too quickly. Office hours are helpful.'},
 {'professor': 'Dr. Sarah Lee',
  'subject': 'Modern Art History',
  'stars': 5,
  'review': 'Passionate about the subject and encourages class discussions. The assignments are creative and enjoyable.'},
 {'professor': 'Dr. John Anderson',
  'subject': 'Organic Chemistry',
  'stars': 2,
  'review': 'Difficult to follow lectures, and the exams are tough. You really need to put in extra time outside of class.'},
 {'professor': 'Dr. Jessica Miller',
  'subject': 'Introduction to Sociology',
  'stars': 4,
  'review': "Interesting course with lots of group work. The professor is ap

In [19]:
# Build one Pinecone record per review: the embedding of the review text as
# the vector, the professor name as the id, and the remaining fields as metadata.
client = OpenAI()
reviews = data["reviews"]

processed_data = []
if reviews:  # skip the request entirely when there is nothing to embed
    # Send every review text in a single request instead of one HTTP round
    # trip per review; the embeddings endpoint accepts a list of inputs.
    # NOTE(review): the API caps how much can be sent per request - split the
    # list into chunks if reviews.json grows large.
    response = client.embeddings.create(
        input=[review["review"] for review in reviews],
        model="text-embedding-3-small",
    )
    # Each returned item records the position of its input; order by it so
    # every embedding is paired with the review it was computed from.
    embedded_items = sorted(response.data, key=lambda item: item.index)
    processed_data = [
        {
            "values": item.embedding,
            "id": review["professor"],
            "metadata": {
                "review": review["review"],
                "subject": review["subject"],
                "stars": review["stars"],
            },
        }
        for review, item in zip(reviews, embedded_items)
    ]


In [22]:
# Inspect the first prepared record as a sanity check.
# Note: this displays the complete embedding list, so the output is very long.
processed_data[0]

{'values': [-0.0112527795,
  0.0078066546,
  0.004631119,
  0.02264241,
  0.02843986,
  0.008503344,
  -0.016596138,
  0.013336628,
  0.010126879,
  0.06538928,
  0.06573763,
  0.00232178,
  -0.029360486,
  -0.0114020705,
  -0.036003917,
  0.014058199,
  -0.0023264452,
  0.020900685,
  0.00577257,
  -0.0053651314,
  0.044861827,
  0.0013436155,
  0.055386815,
  0.021062417,
  -0.02799199,
  -0.05379438,
  -0.010748924,
  0.032570235,
  -0.0010318158,
  0.010823569,
  0.056929484,
  -0.020129351,
  1.6255764e-05,
  -0.0328937,
  -0.037347533,
  0.094849296,
  0.04590686,
  0.020054705,
  0.02532964,
  -0.004656001,
  0.010394358,
  -0.012198287,
  -0.035133056,
  -0.012129862,
  0.03704895,
  -0.012098759,
  0.041577432,
  -0.0116135655,
  0.06917132,
  0.04264735,
  -0.024794681,
  0.0017215073,
  0.09624268,
  -0.031898428,
  -0.023003194,
  0.02562822,
  0.0049794638,
  0.05235124,
  -0.013908908,
  0.023177367,
  0.022679731,
  0.004562694,
  0.011227897,
  0.0043574194,
  -0.048444

In [23]:
# Get a handle to the "rag" index, then write all prepared records into the
# "ns1" namespace. The upsert result is the cell's displayed value.
index = pc.Index("rag")
index.upsert(vectors=processed_data, namespace="ns1")

{'upserted_count': 20}

In [24]:
# Ask the index for its statistics to check what has been stored after the upsert.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 20}},
 'total_vector_count': 20}