In [24]:
from dotenv import load_dotenv
load_dotenv()
import os 
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec

In [20]:
# Pinecone client; the API key is read from the PINECONE_API_KEY environment
# variable (populated from .env by load_dotenv() in the imports cell).
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# Create the "rag" index only when it does not exist yet, so that re-running
# this cell (e.g. Restart Kernel -> Run All) does not fail on an index that
# was already created by an earlier run.
# dimension=1536 is the vector size this notebook upserts (see the index stats
# output further down); cosine is the similarity metric used for queries.
if "rag" not in pc.list_indexes().names():
    pc.create_index(
        name="rag",
        dimension=1536,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

In [23]:
import json

# Load the review dataset. The context manager closes the file handle as soon
# as parsing finishes (the bare open() call left it open for the kernel's
# lifetime), and the explicit encoding keeps the result independent of the
# platform's default locale encoding.
with open("reviews.json", encoding="utf-8") as reviews_file:
    data = json.load(reviews_file)

# Display the list of review records stored under the "reviews" key.
data['reviews']

[{'professor': 'Dr. Emily Roberts',
  'subject': 'Introduction to Psychology',
  'stars': 4,
  'review': 'Engaging lectures but a bit too fast-paced. Would recommend attending office hours.'},
 {'professor': 'Prof. Michael Tanaka',
  'subject': 'Advanced Calculus',
  'stars': 5,
  'review': 'Challenging course, but Prof. Tanaka explains everything clearly. Great professor!'},
 {'professor': 'Dr. Sarah Lemaire',
  'subject': 'Modern European History',
  'stars': 3,
  'review': 'Very knowledgeable, but the lectures can be dry. Lots of reading required.'},
 {'professor': 'Prof. Carlos Martinez',
  'subject': 'Organic Chemistry',
  'stars': 2,
  'review': 'Difficult material and not much support. Exams are tough and grading is harsh.'},
 {'professor': 'Dr. Rebecca Singh',
  'subject': 'Cognitive Science',
  'stars': 5,
  'review': 'Fascinating lectures with lots of real-world applications. Highly recommended!'},
 {'professor': 'Prof. John Smith',
  'subject': 'Introduction to Sociology',
 

In [25]:
# Build one Pinecone record per review: the embedding of the review text as the
# vector, the professor's name as the record id, and the remaining fields as
# metadata.
client = OpenAI()
reviews = data['reviews']

if reviews:
    # Embed every review text in a single batched request instead of one
    # network round trip per review.
    # NOTE(review): the API caps how many inputs/tokens one request may carry;
    # chunk the list if the dataset grows well beyond this small sample.
    response = client.embeddings.create(
        input=[review['review'] for review in reviews],
        model="text-embedding-3-small",
    )
    # Each returned item carries the position of its input in `index`; sort on
    # it so embeddings line up with `reviews` regardless of response order.
    embeddings = [
        item.embedding
        for item in sorted(response.data, key=lambda item: item.index)
    ]
else:
    # The endpoint is not called for an empty dataset; nothing to embed.
    embeddings = []

processed_data = [
    {
        "values": embedding,
        # NOTE(review): ids must be unique per record — two reviews for the
        # same professor would overwrite each other on upsert; confirm the
        # dataset has one review per professor.
        "id": review['professor'],
        "metadata": {
            "review": review["review"],
            "subject": review["subject"],
            "stars": review["stars"],
        },
    }
    for review, embedding in zip(reviews, embeddings)
]

In [27]:
# Inspect the first prepared record (keys: "values", "id", "metadata").
# NOTE(review): "values" holds the full embedding vector, so this output is
# very long — consider previewing a slice, e.g. processed_data[0]["values"][:5].
processed_data[0]

{'values': [-0.033247102,
  0.04624846,
  -0.007244398,
  -0.02345477,
  0.0007536205,
  0.0062389965,
  0.008580342,
  0.0116654085,
  0.03883879,
  0.0146953855,
  -0.0070205927,
  0.01571456,
  0.01196152,
  -0.02543803,
  0.025823662,
  -0.048286803,
  -0.014213343,
  -0.042392123,
  0.01530138,
  0.0066177435,
  0.002819944,
  0.0053919805,
  0.031869844,
  0.02733865,
  -0.05663301,
  -0.009675265,
  0.020796655,
  0.0050820964,
  0.03958251,
  0.031153666,
  0.049553886,
  -0.022215236,
  0.008291117,
  -0.05195032,
  -0.06440077,
  0.064015135,
  0.024115857,
  0.039141785,
  0.029225498,
  -0.0063595073,
  0.016320555,
  0.013283691,
  -0.00970281,
  -0.026567383,
  0.04120768,
  0.0038356746,
  0.023564951,
  -0.023633815,
  0.051757503,
  0.033632737,
  -0.07024036,
  0.0026873825,
  0.05624738,
  0.013435191,
  -0.017546317,
  -0.012960035,
  0.02370268,
  -0.010129762,
  0.023261955,
  0.015177427,
  0.08941185,
  -0.037158526,
  0.010453418,
  0.0012266239,
  0.0055847974

In [28]:
# Get a handle to the "rag" index, then write all prepared records into the
# "ns1" namespace in one call; the upsert response is shown as the cell output.
index = pc.Index('rag')
index.upsert(vectors=processed_data, namespace="ns1")

{'upserted_count': 20}

In [29]:
# Show the index statistics (dimension, per-namespace vector counts, total)
# to check that the upserted records are present in namespace "ns1".
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 20}},
 'total_vector_count': 20}