In [None]:
# Setup Cell - Run this after each kernel restart
from dotenv import load_dotenv
import os
import json
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec

# Pull settings from the project's .env.local file into the process
# environment (the file name is given explicitly because it is not ".env").
load_dotenv(dotenv_path='.env.local')

# Function to load JSON data
def load_json_data(file_path="reviews.json"):
    """Load the reviews JSON file and print a short summary of it.

    The file is opened with the ``utf-8-sig`` codec, so a leading byte-order
    mark is tolerated. The parsed document is expected to expose its reviews
    under a top-level ``"reviews"`` key.

    Args:
        file_path: Path of the JSON file to read. Defaults to
            ``"reviews.json"``.

    Returns:
        The parsed JSON document, or ``None`` when the file is missing, is
        not valid JSON, or any other error occurs while reading it (for
        example a missing ``"reviews"`` key). A message describing the
        problem is printed in every failure case.
    """
    try:
        with open(file_path, "r", encoding="utf-8-sig") as file:
            data = json.load(file)
        reviews = data["reviews"]
        print(f"Successfully loaded {len(reviews)} reviews.")
        # An empty review list is still a successful load: only preview the
        # first review when one exists, rather than hitting an IndexError
        # that the broad handler below would turn into a None result.
        if reviews:
            print(f"First review: {reviews[0]}")
        return data
    except FileNotFoundError:
        print(f"File '{file_path}' not found. Check the file path.")
    except json.JSONDecodeError as e:
        print(f"JSON decoding error: {e}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
    return None

# Load JSON data (None if loading failed; load_json_data prints the reason)
data = load_json_data()


def _describe_api_key(key):
    """Return a log-safe description of an API key.

    Gives "Not found" for a missing or empty key. Otherwise only the last
    four characters are revealed, and nothing at all for very short values,
    so saved notebook output never contains a usable part of the secret.
    """
    if not key:
        return "Not found"
    return f"...{key[-4:]}" if len(key) > 8 else "set (value hidden)"


# Report which API keys were loaded *before* building the clients, so a
# missing key is still reported even if client construction fails afterwards.
openai_key = os.getenv('OPENAI_API_KEY')
pinecone_key = os.getenv('PINECONE_API_KEY')
print(f"Loaded OpenAI API key: {_describe_api_key(openai_key)}")
print(f"Loaded Pinecone API key: {_describe_api_key(pinecone_key)}")

# Initialize OpenAI client (no key is passed explicitly here)
client = OpenAI()

# Initialize Pinecone (adjust as needed)
pinecone = Pinecone(api_key=pinecone_key)

print("Setup complete!")


In [3]:
# Embed every review and shape the results into Pinecone upsert records.
# Uses `data` and `client` from the setup cell.
if data is None:
    raise ValueError(
        "No review data loaded; fix the reviews file and re-run the setup cell."
    )

EMBEDDING_MODEL = "text-embedding-3-small"
# Reviews sent per embeddings request. Batching replaces one HTTP round-trip
# per review with one per batch.
EMBEDDING_BATCH_SIZE = 100

reviews = data["reviews"]
processed_data = []
for start in range(0, len(reviews), EMBEDDING_BATCH_SIZE):
    batch = reviews[start:start + EMBEDDING_BATCH_SIZE]
    response = client.embeddings.create(
        input=[review["review"] for review in batch],
        model=EMBEDDING_MODEL,
    )
    # Order the results by their reported index so that each embedding is
    # paired with the review at the same position in the batch.
    embeddings = sorted(response.data, key=lambda item: item.index)
    for review, item in zip(batch, embeddings):
        processed_data.append({
            "values": item.embedding,
            # NOTE(review): the id is the professor's name, so two reviews of
            # the same professor share an id - confirm that is intended.
            "id": review["professor"],
            "metadata": {
                "review": review["review"],
                "subject": review["subject"],
                "stars": review["stars"],
            },
        })


In [None]:
# Preview the first record without flooding the output with its full
# embedding vector: show the id, the metadata, the vector length and the
# first few values. Evaluates to None (no output) when nothing was embedded.
(
    {
        "id": processed_data[0]["id"],
        "metadata": processed_data[0]["metadata"],
        "values_dim": len(processed_data[0]["values"]),
        "values_head": processed_data[0]["values"][:5],
    }
    if processed_data
    else None
)

In [None]:
# Connect to the "rag" index and upsert the embedded reviews into namespace
# 'ns1'. NOTE(review): nothing visible in this notebook creates the index, so
# it presumably has to exist already - confirm.
pc = Pinecone()
index = pc.Index("rag")

# Send the vectors in fixed-size batches instead of one request, so the
# request size stays bounded as the number of reviews grows.
UPSERT_BATCH_SIZE = 100
index.upsert(
    vectors=processed_data,
    namespace='ns1',
    batch_size=UPSERT_BATCH_SIZE,
)

In [None]:
# Sanity check: print the first record. Slicing (rather than indexing) keeps
# this safe on an empty list. The record's whole embedding vector is printed.
print(processed_data[0:1])


In [None]:
# Display the statistics Pinecone reports for the index used in the upsert cell.
# NOTE(review): counts may lag briefly right after an upsert - confirm against
# the Pinecone documentation before treating a low number as a failure.
index.describe_index_stats()
