In [1]:
# Imports used across the notebook, plus the call that loads .env settings.
from dotenv import load_dotenv
import google.generativeai as genai
# Runs before later cells read PINECONE_API_KEY / GEMINI_API_KEY via os.getenv;
# presumably populates the environment from a local .env file — confirm.
load_dotenv()
import os
import json
from pinecone import Pinecone, ServerlessSpec

  from .autonotebook import tqdm as notebook_tqdm


In [24]:
# Connect to Pinecone using the API key read from the environment.
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# Create the index only when it is missing: create_index fails if "rag2"
# already exists, which would break re-running this cell on an existing project.
if "rag2" not in pc.list_indexes().names():
    pc.create_index(
        name="rag2",
        dimension=768,  # must equal the length of the vectors upserted later
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

In [25]:
# Read the reviews file. The context manager closes the file handle as soon as
# parsing finishes (a bare open() would leave it open until garbage collection),
# and the explicit encoding avoids depending on the platform default.
# `json` is already imported in the notebook's first cell.
with open("reviews.json", encoding="utf-8") as reviews_file:
    data = json.load(reviews_file)

# Display the list of review records stored under the "reviews" key.
data['reviews']

[{'professor': 'Dr. John Smith',
  'subject': 'Physics',
  'stars': 4,
  'review': 'Very knowledgeable, but his lectures can be a bit dry at times.'},
 {'professor': 'Dr. Jane Doe',
  'subject': 'Chemistry',
  'stars': 5,
  'review': 'Excellent teacher! Makes complex concepts easy to understand.'},
 {'professor': 'Dr. Emily Johnson',
  'subject': 'Mathematics',
  'stars': 3,
  'review': 'Good professor but the exams are really tough.'},
 {'professor': 'Dr. Robert Brown',
  'subject': 'Biology',
  'stars': 5,
  'review': 'Great at engaging students and very approachable.'},
 {'professor': 'Dr. Linda Davis',
  'subject': 'History',
  'stars': 2,
  'review': 'Lectures are not well-organized and hard to follow.'},
 {'professor': 'Dr. Michael Miller',
  'subject': 'Computer Science',
  'stars': 4,
  'review': 'Good professor with a lot of real-world examples.'},
 {'professor': 'Dr. Karen Wilson',
  'subject': 'English',
  'stars': 3,
  'review': 'Decent lectures, but the grading is quite ha

In [26]:
# Configure the Gemini client with the key read from the environment.
genai.configure(api_key=os.getenv("GEMINI_API_KEY"))

# One record per review, each holding "values", "id" and "metadata".
processed_data = []

for review in data["reviews"]:
    review_text = review["review"]
    professor = review["professor"]

    # Embed the review text as a retrieval document; the title names the
    # professor the review is about.
    embed_result = genai.embed_content(
        model="models/text-embedding-004",
        content=review_text,
        task_type="retrieval_document",
        title=f"Embedding of review for {professor}",
    )

    # The professor name doubles as the vector id; the remaining review
    # fields are carried along as metadata.
    record = {
        "values": embed_result["embedding"],
        "id": professor,
        "metadata": {
            "review": review_text,
            "subject": review["subject"],
            "stars": review["stars"],
        },
    }
    processed_data.append(record)


In [27]:
# Inspect the first prepared record (values / id / metadata) as a sanity check.
# Note: this displays the full embedding list, so the rendered output is long.
processed_data[0]

{'values': [0.02554556,
  -0.029166522,
  -0.044988643,
  0.019385392,
  0.013778925,
  -0.00162117,
  0.028454341,
  0.041502453,
  -0.024089629,
  0.050912313,
  0.022050893,
  0.02676771,
  0.05312001,
  0.02725006,
  -0.02984122,
  -0.032080553,
  0.019756326,
  -0.0059958543,
  -0.09330726,
  0.008926071,
  -0.018015485,
  -0.0578612,
  0.07525423,
  -0.050331388,
  0.015292661,
  -0.030154014,
  0.030631727,
  -0.0396676,
  0.0043025906,
  -0.03779628,
  0.056916844,
  0.0023173413,
  -0.012780759,
  0.011209679,
  -0.0017548195,
  0.07655534,
  0.00010179218,
  0.0071604275,
  0.035100635,
  -0.03558813,
  -0.029997298,
  0.055852517,
  -0.019435402,
  0.042034436,
  -0.08532198,
  -0.023936637,
  -0.012008807,
  0.10208843,
  0.017739953,
  0.059046615,
  0.017239938,
  0.08583789,
  -0.06547643,
  0.06699775,
  -0.006965261,
  -0.032142628,
  -0.018118775,
  -0.05393876,
  0.006505425,
  -0.0055711144,
  -0.016927117,
  -0.004308358,
  -0.027541295,
  -0.042949665,
  0.0650031

In [28]:
# Get a handle to the "rag2" index and write every prepared record into the
# "ns1" namespace; the upsert response is the value this cell displays.
index = pc.Index('rag2')
index.upsert(namespace="ns1", vectors=processed_data)

{'upserted_count': 20}

In [29]:
# Fetch the index statistics to confirm the upserted vectors are counted
# under the "ns1" namespace.
index.describe_index_stats()

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 20}},
 'total_vector_count': 20}