In [1]:
import os
import time

from dotenv import load_dotenv
load_dotenv()
from pinecone import Pinecone, ServerlessSpec
from openai import OpenAI

  from tqdm.autonotebook import tqdm


In [3]:
# Initialize the Pinecone client; the API key is read from the environment.
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# Create the "rag" index only when it does not exist yet. An unconditional
# create_index() fails on every re-run of this cell once the index is there,
# which breaks Restart Kernel -> Run All.
if "rag" not in pc.list_indexes().names():
    pc.create_index(
        name="rag",
        dimension=1536,  # must equal the length of the embedding vectors upserted later
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )


In [4]:
import json

# Load the review data. The context manager closes the file handle as soon as
# parsing finishes (a bare open() inside json.load() leaves it to the garbage
# collector), and the explicit encoding keeps the result independent of the
# platform's default locale.
with open("reviews.json", encoding="utf-8") as reviews_file:
    data = json.load(reviews_file)

# Display the loaded list of review dicts.
data['reviews']

[{'professor': 'Dr. Emily Winters',
  'subject': 'Biology',
  'stars': 4,
  'review': 'Dr. Winters is very knowledgeable and passionate about biology. Her lectures are engaging, but sometimes the workload can be overwhelming.'},
 {'professor': 'Prof. Michael Chen',
  'subject': 'Computer Science',
  'stars': 5,
  'review': "Prof. Chen is an excellent instructor. His explanations are clear, and he's always willing to help during office hours."},
 {'professor': 'Dr. Sarah Johnson',
  'subject': 'Psychology',
  'stars': 3,
  'review': "Dr. Johnson's content is interesting, but her teaching style can be a bit dry. More interactive sessions would be beneficial."},
 {'professor': 'Prof. Robert Taylor',
  'subject': 'Mathematics',
  'stars': 4,
  'review': 'Prof. Taylor makes complex math concepts accessible. His problem-solving approach is very helpful.'},
 {'professor': 'Dr. Lisa Martinez',
  'subject': 'History',
  'stars': 5,
  'review': 'Dr. Martinez brings history to life! Her lectures 

In [8]:
client = OpenAI()
reviews = data["reviews"]

# Create embeddings for every review in ONE request instead of one request per
# review: the embeddings endpoint accepts a list of inputs, and each returned
# item carries the `index` of the input it belongs to. Sorting by that index
# makes the pairing with `reviews` below explicit.
response = client.embeddings.create(
    input=[review["review"] for review in reviews],
    model="text-embedding-3-small",
)
embeddings = [
    item.embedding for item in sorted(response.data, key=lambda item: item.index)
]

# Build one Pinecone record per review: the vector, its id, and the metadata
# stored alongside it.
processed_data = [
    {
        "values": embedding,
        # NOTE(review): the professor name doubles as the vector id, so two
        # reviews for the same professor would overwrite each other on upsert.
        # Confirm names are unique in reviews.json.
        "id": review["professor"],
        "metadata": {
            "review": review["review"],
            "subject": review["subject"],
            "stars": review["stars"],
        },
    }
    for review, embedding in zip(reviews, embeddings)
]

# Insert the embeddings into the Pinecone index
index = pc.Index("rag")
upsert_response = index.upsert(
    vectors=processed_data,
    namespace="ns1",
)
print(f"Upserted count: {upsert_response['upserted_count']}")

# Print index statistics.
# Index stats can lag behind a successful upsert (reading them immediately can
# report total_vector_count == 0 right after the write), so poll for a few
# seconds until the new vectors are reflected. After the last attempt the most
# recent stats are printed regardless.
stats = index.describe_index_stats()
for _ in range(10):
    if stats.total_vector_count >= len(processed_data):
        break
    time.sleep(1)
    stats = index.describe_index_stats()
print(stats)

Upserted count: 20
{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}


In [9]:
# Inspect the first prepared record: a dict with "values", "id" and "metadata" keys.
# NOTE: "values" holds the complete embedding list, so the rendered output is very long.
processed_data[0]

{'values': [0.009808532,
  0.016675165,
  0.019300448,
  0.031160394,
  -0.024023319,
  -0.013363878,
  0.031054854,
  -0.0050889594,
  -0.0088784695,
  0.008199062,
  0.0024471858,
  0.06184586,
  -0.043323766,
  -0.014564384,
  -0.0019442266,
  0.028627457,
  7.188815e-05,
  -0.022189578,
  -0.020909917,
  0.028732996,
  0.02403651,
  -0.012855971,
  0.030342467,
  -0.020712031,
  -0.030949317,
  -0.013957535,
  0.017677786,
  -0.0019326832,
  0.056516144,
  0.0015542269,
  0.052584816,
  -0.01732159,
  0.041054677,
  -0.007935215,
  -0.014419268,
  0.022690888,
  -0.016173854,
  0.021015456,
  0.027466528,
  0.0022525983,
  0.014749077,
  0.018218672,
  -0.055144135,
  0.00860143,
  0.0026829997,
  0.0057980497,
  -0.025711942,
  0.00078659545,
  0.03585688,
  0.028944075,
  -0.008522276,
  0.04411531,
  0.022229156,
  -0.009023585,
  -0.053244434,
  0.035962418,
  0.013001087,
  0.005880502,
  -0.0040764445,
  -0.03828428,
  0.0706056,
  0.009920668,
  0.013020876,
  -0.044590235,
