In [6]:
# load_dotenv() runs before the remaining imports so that later cells can read
# their credentials from the environment (e.g. os.getenv("PINECONE_API_KEY")).
# NOTE(review): presumably this picks up variables from a local .env file —
# confirm that file exists and is kept out of version control.
from dotenv import load_dotenv
load_dotenv()
import os
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec


In [4]:
# Pinecone client, authenticated with the key taken from the environment.
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# Create the "rag" index only when it does not exist yet, so that re-running
# this cell (e.g. Restart & Run All) does not fail on an already-existing index.
# The dimension must equal the length of the embedding vectors upserted later.
if "rag" not in pc.list_indexes().names():
    pc.create_index(
        name="rag",
        dimension=1536,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

In [5]:
import json

# Read the review records. The context manager closes the file handle as soon
# as parsing is done (the bare open() call previously left it open), and the
# explicit encoding keeps the result independent of the platform default.
with open("reviews.json", encoding="utf-8") as reviews_file:
    data = json.load(reviews_file)

# Display the loaded review records as the cell output.
data['reviews']

[{'professor': 'Dr. Alice Johnson',
  'subject': 'Calculus I',
  'stars': 4,
  'review': 'Very knowledgeable, but sometimes hard to follow. Great at answering questions during office hours.'},
 {'professor': 'Dr. Michael Smith',
  'subject': 'Introduction to Psychology',
  'stars': 5,
  'review': 'Amazing lecturer! Makes difficult concepts easy to understand. Highly recommended.'},
 {'professor': 'Dr. Emily Davis',
  'subject': 'Organic Chemistry',
  'stars': 3,
  'review': 'Challenging course, but the professor is fair. Lectures can be dry, but the labs are engaging.'},
 {'professor': 'Dr. Robert Wilson',
  'subject': 'World History',
  'stars': 2,
  'review': 'Lectures were unorganized, and the material was often confusing. Not very approachable outside of class.'},
 {'professor': 'Dr. Karen Thompson',
  'subject': 'Introduction to Programming',
  'stars': 4,
  'review': 'Good teacher with a lot of experience. Homework is tough but useful for learning the material.'},
 {'professor': 

In [7]:
client = OpenAI()

reviews = data['reviews']
processed_data = []

# Embed all review texts with a single request: the embeddings endpoint accepts
# a list of inputs, which avoids one network round-trip per review.
# The request is skipped for an empty list, leaving processed_data empty just
# as the per-review loop would have.
# NOTE(review): for much larger datasets, split the inputs into chunks to stay
# within the API's per-request limits.
if reviews:
    response = client.embeddings.create(
        input=[review['review'] for review in reviews],
        model="text-embedding-3-small",
    )
    # Every returned item records the position of the input it belongs to;
    # sorting on it guarantees the pairing with `reviews` below.
    embeddings = sorted(response.data, key=lambda item: item.index)

    # One Pinecone record per review: the vector, its id, and filterable metadata.
    # NOTE(review): the professor name is used as the vector id, so a second
    # review for the same professor would overwrite the first on upsert —
    # confirm names are unique in reviews.json.
    processed_data = [
        {
            "values": item.embedding,
            "id": review["professor"],
            "metadata": {
                "review": review["review"],
                "subject": review["subject"],
                "stars": review["stars"],
            },
        }
        for review, item in zip(reviews, embeddings)
    ]

In [8]:
# Sanity check: show the first prepared record (vector, id and metadata)
# before anything is written to the index.
first_record = processed_data[0]
first_record

{'values': [-0.009935268,
  -0.009652531,
  0.013834417,
  0.039741073,
  0.02331599,
  0.0021238197,
  0.011506763,
  0.020028343,
  0.010191705,
  0.029720327,
  -0.004530377,
  0.033875912,
  0.0068251546,
  -0.033349887,
  -0.0076273405,
  0.00086300727,
  -0.024538996,
  -0.008561032,
  0.01501797,
  0.013794965,
  0.051813312,
  -0.013203189,
  0.030509362,
  0.02307928,
  -0.040898323,
  -0.007620765,
  0.0041917497,
  0.021119842,
  0.022066684,
  0.015649198,
  0.063070215,
  -0.012769219,
  0.002819157,
  -0.014163182,
  -0.065384716,
  0.039977785,
  0.017700689,
  0.022277094,
  0.0335603,
  0.045448426,
  0.009211986,
  0.012887575,
  -0.01571495,
  -0.008567607,
  0.031193191,
  -0.045132812,
  -0.059072435,
  -0.018726436,
  0.053917404,
  0.044922404,
  -0.020606969,
  0.0099484185,
  0.07443232,
  0.035901103,
  -0.04970922,
  0.01884479,
  0.030772373,
  -0.023026677,
  0.04305502,
  -0.03724246,
  0.035375077,
  -0.016753847,
  0.022619009,
  -0.039320253,
  -0.01930

In [9]:
# Get a handle on the "rag" index and write every prepared record into the
# "ns1" namespace; the upsert response is displayed as the cell output.
index = pc.Index("rag")
index.upsert(namespace="ns1", vectors=processed_data)

{'upserted_count': 20}

In [10]:
# Fetch the index statistics and display them to verify the upsert.
index_stats = index.describe_index_stats()
index_stats

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 20}},
 'total_vector_count': 20}