In [1]:
# Setup Cell - Run this after each kernel restart
from dotenv import load_dotenv
import os
import json
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec

# Load environment variables
load_dotenv('.env.local')  # Specify the file name if it's not just .env

# Function to load JSON data
def load_json_data(file_path="reviews.json"):
    try:
        with open(file_path, "r", encoding="utf-8-sig") as file:
            data = json.load(file)
        print(f"Successfully loaded {len(data['reviews'])} reviews.")
        print(f"First review: {data['reviews'][0]}")
        return data
    except FileNotFoundError:
        print(f"File '{file_path}' not found. Check the file path.")
    except json.JSONDecodeError as e:
        print(f"JSON decoding error: {e}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
    return None

# Load JSON data
data = load_json_data()

# Initialize OpenAI client
client = OpenAI()

# Initialize Pinecone (adjust as needed)
pinecone = Pinecone(api_key=os.getenv('PINECONE_API_KEY'))

# Print API keys to verify (be careful not to share this output)
openai_key = os.getenv('OPENAI_API_KEY')
pinecone_key = os.getenv('PINECONE_API_KEY')
print(f"Loaded OpenAI API key: {openai_key[:5]}...{openai_key[-5:] if openai_key else 'Not found'}")
print(f"Loaded Pinecone API key: {pinecone_key[:5]}...{pinecone_key[-5:] if pinecone_key else 'Not found'}")

print("Setup complete!")


Successfully loaded 20 reviews.
First review: {'professor': 'Dr. Emma Thompson', 'subject': 'Biology', 'stars': 5, 'review': "Dr. Thompson's lectures are engaging and her passion for biology is contagious. Best professor I've had!"}
Loaded OpenAI API key: sk-pr...J4QcA
Loaded Pinecone API key: 7401e...757a9
Setup complete!


  from tqdm.autonotebook import tqdm


In [2]:
processed_data = []
client = OpenAI()
for review in data["reviews"]:
    response = client.embeddings.create(
        input=review["review"],
        model="text-embedding-3-small",
    )
    embedding=response.data[0].embedding  
    processed_data.append({
        "values": embedding,
        "id": review["professor"],
        "metadata": {
        "review": review["review"],
        "subject": review["subject"],
        "stars": review["stars"],
        }

})


In [3]:
processed_data[0]

{'values': [0.03333573,
  -0.012095369,
  -0.016961727,
  0.058255248,
  -0.01783156,
  0.023367923,
  0.02830481,
  0.017608223,
  -0.027952174,
  -0.0036350756,
  0.038084544,
  0.013400117,
  -0.003925999,
  -0.019289115,
  -0.009215519,
  -0.014387494,
  0.03592172,
  -0.020887727,
  0.020041402,
  0.04847551,
  0.048898675,
  -0.037355766,
  0.025812857,
  -0.019618241,
  -0.036650497,
  -0.07899016,
  0.017666997,
  0.025648294,
  -0.01183677,
  -0.016820673,
  0.06902236,
  -0.021451943,
  0.0036938481,
  -0.03241888,
  -0.02668269,
  0.042245634,
  -0.012165896,
  0.0132238,
  0.005762638,
  0.016456284,
  0.054681882,
  0.014081878,
  -0.0050367983,
  0.03032658,
  0.0034822673,
  0.0013341345,
  -0.04802884,
  -0.03815507,
  0.02089948,
  0.025483731,
  -0.025084078,
  0.0095564,
  0.015751015,
  0.019054025,
  -0.07287783,
  0.022368792,
  0.0053100903,
  0.036650497,
  -0.016667865,
  -0.06629532,
  0.034017492,
  0.031102378,
  0.012730111,
  -0.026471108,
  -0.010291055,


In [4]:
pc=Pinecone()
index=pc.Index("rag")
index.upsert(
    vectors=processed_data,
    namespace='ns1'
)

{'upserted_count': 20}

In [5]:
print(processed_data[:1])  # Print the first item to check


[{'values': [0.03333573, -0.012095369, -0.016961727, 0.058255248, -0.01783156, 0.023367923, 0.02830481, 0.017608223, -0.027952174, -0.0036350756, 0.038084544, 0.013400117, -0.003925999, -0.019289115, -0.009215519, -0.014387494, 0.03592172, -0.020887727, 0.020041402, 0.04847551, 0.048898675, -0.037355766, 0.025812857, -0.019618241, -0.036650497, -0.07899016, 0.017666997, 0.025648294, -0.01183677, -0.016820673, 0.06902236, -0.021451943, 0.0036938481, -0.03241888, -0.02668269, 0.042245634, -0.012165896, 0.0132238, 0.005762638, 0.016456284, 0.054681882, 0.014081878, -0.0050367983, 0.03032658, 0.0034822673, 0.0013341345, -0.04802884, -0.03815507, 0.02089948, 0.025483731, -0.025084078, 0.0095564, 0.015751015, 0.019054025, -0.07287783, 0.022368792, 0.0053100903, 0.036650497, -0.016667865, -0.06629532, 0.034017492, 0.031102378, 0.012730111, -0.026471108, -0.010291055, -0.010179387, -0.04548987, 0.0041787205, -0.00528952, -0.008457354, 0.025789348, -0.018771918, -0.08453828, 0.0583963, -0.02276

In [6]:
index.describe_index_stats()


{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 20}},
 'total_vector_count': 20}