In [10]:
from dotenv import load_dotenv #dotenv: invalid keyword"
load_dotenv()
import os
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec
import google.generativeai as genai
import time
from gemini import Gemini

In [11]:
# Connect to Pinecone and make sure the "rag" index exists.
# create_index() fails when an index with the same name already exists, so the
# call is guarded to keep this cell safe to re-run (Restart Kernel -> Run All).
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
if "rag" not in pc.list_indexes().names():
    pc.create_index(
        name="rag",
        dimension=768,  # must equal the length of the vectors upserted into this index
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

In [12]:
import json

# Load the review records from disk. The context manager closes the file handle
# as soon as parsing finishes instead of leaving it to the garbage collector, and
# the explicit encoding avoids depending on the platform's default locale.
with open("reviews.json", encoding="utf-8") as reviews_file:
    data = json.load(reviews_file)

# Last expression of the cell: show the list stored under the 'reviews' key.
data['reviews']

[{'professor': 'Dr. Emily Sanders',
  'subject': 'Computer Science',
  'stars': 5,
  'review': 'Dr. Sanders is an excellent professor! Her lectures are clear, and she is always willing to help students.'},
 {'professor': 'Dr. James Harrison',
  'subject': 'Mathematics',
  'stars': 4,
  'review': 'Dr. Harrison is knowledgeable and explains concepts well, but sometimes the pace is too fast.'},
 {'professor': 'Dr. Karen Lee',
  'subject': 'Physics',
  'stars': 3,
  'review': "Dr. Lee's classes are interesting, but her grading is quite tough."},
 {'professor': 'Dr. Michael Brown',
  'subject': 'Chemistry',
  'stars': 2,
  'review': "The lectures are somewhat disorganized, and it's hard to follow the material."},
 {'professor': 'Dr. Olivia Martinez',
  'subject': 'Biology',
  'stars': 5,
  'review': 'Dr. Martinez is amazing! She makes the subject so engaging and accessible.'},
 {'professor': 'Dr. William Johnson',
  'subject': 'History',
  'stars': 4,
  'review': "Dr. Johnson's lectures are

In [13]:
# Configure the Gemini client with the API key from the environment
# (raises KeyError if GOOGLE_API_KEY is not set).
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])

reviews = data['reviews']

# Embed every review text in one batched call instead of one request per review:
# when `content` is a list of strings, embed_content returns a list of embeddings
# (one per input, in input order) under the 'embedding' key.
response = genai.embed_content(
    model="models/text-embedding-004",
    content=[review['review'] for review in reviews],
)
embeddings = response['embedding']

# zip() would silently drop records on a length mismatch, so fail loudly instead.
if len(embeddings) != len(reviews):
    raise ValueError(
        f"expected {len(reviews)} embeddings, got {len(embeddings)}"
    )

# Build one record per review in the shape passed to index.upsert():
# the embedding as "values", the professor name as "id", and the review fields
# as "metadata".
# NOTE(review): the id is the professor name, so two reviews for the same
# professor would share an id and overwrite each other on upsert — confirm the
# names in reviews.json are unique.
processed_data = [
    {
        "values": embedding,
        "id": review['professor'],
        "metadata": {
            "review": review["review"],
            "subject": review["subject"],
            "stars": review["stars"],
        },
    }
    for review, embedding in zip(reviews, embeddings)
]

In [14]:
# Display the records built above (embedding values, id, metadata) for inspection.
processed_data

[{'values': [0.06284978,
   -0.010355489,
   -0.019314654,
   0.060124684,
   0.010580365,
   -0.021908684,
   0.00717726,
   0.03607081,
   -0.025046857,
   0.019935446,
   0.06459493,
   0.0322437,
   0.0857579,
   -0.014565524,
   -0.013010221,
   -0.049174856,
   -0.0018636091,
   0.012562665,
   -0.10490633,
   0.018167028,
   0.02010871,
   -0.04493666,
   -0.038230095,
   -0.068877734,
   -0.0024682041,
   -0.045556348,
   0.011384341,
   -0.04967896,
   0.049997292,
   -0.04422053,
   0.06689225,
   0.0005989617,
   -0.01697457,
   -0.080033466,
   -0.052313562,
   0.016916841,
   0.0071697417,
   -0.06122502,
   0.03571847,
   -0.073154315,
   -0.01936768,
   -0.055884294,
   -0.034590326,
   0.021831244,
   -0.00547692,
   0.010118377,
   -0.02389192,
   0.091172405,
   -0.008864059,
   0.071193285,
   -0.008407666,
   0.030583644,
   -0.054554995,
   0.034956712,
   -0.05193924,
   -0.07077292,
   -0.036745418,
   -0.040855177,
   0.041616566,
   -0.014313966,
   -0.03037526

In [15]:
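# NOTE: disabled alternative that builds the same records with OpenAI embeddings.
# text-embedding-3-small returns 1536-dimensional vectors by default, which does
# not match the 768-dimensional "rag" index created above; request
# dimensions=768 (or recreate the index) before re-enabling this cell.
# processed_data = []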
# client = OpenAI()

# for review in data['reviews']:
#     response = client.embeddings.create(
#         input=review['review'], 
#         model="text-embedding-3-small"
#     )
#     embedding = response.data[0].embedding
#     processed_data.append(
#         {
#             "values": embedding,
#             "id": review['professor'],
#             "metadata": {
#                 "review": review["review"],
#                 "subject": review["subject"],
#                 "stars" : review["stars"]
#             }
#         }
#     )
#     time.sleep(30)

In [16]:
# Get a handle to the 'rag' index and write the prepared records into namespace "ns1".
# The upsert call is the cell's last expression, so its response is shown as output.
index = pc.Index('rag')
index.upsert(namespace="ns1", vectors=processed_data)

{'upserted_count': 20}

In [17]:
# Show index statistics (dimension, per-namespace vector counts) to check the upsert.
index.describe_index_stats()

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 20}},
 'total_vector_count': 20}