In [None]:
# Populate os.environ from the local .env file first, so the API keys are
# available before the client libraries below are imported or used.
from dotenv import load_dotenv
load_dotenv()

import os
import openai
from pinecone import Pinecone, ServerlessSpec



In [None]:
# Initialize Pinecone. The key is read from PINECONE_API_KEY; the misspelled
# PINECON_API_KEY is still accepted as a fallback for existing .env files.
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY") or os.getenv("PINECON_API_KEY"))

index_name = "rag"

# Delete the existing index if it exists, so this cell can be re-run from a
# fresh kernel without failing on an "already exists" error.
if index_name in pc.list_indexes().names():
    pc.delete_index(index_name)

# Create a new Pinecone index with dimension 512
pc.create_index(
    name=index_name,
    dimension=512,  # Must match the length of the vectors that get upserted
    metric="cosine",
    spec=ServerlessSpec(cloud="aws", region="us-east-1"),
)


In [None]:
import json

# Load the review data. The context manager closes the file handle as soon as
# parsing finishes instead of leaving it open for the life of the kernel.
with open("review.json", encoding="utf-8") as review_file:
    data = json.load(review_file)

data["reviews"]

In [None]:
import requests  # the HTTP calls below need it; it was used without being imported

# Fetch the API key from the environment variables
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")

# Fail fast if the key was not loaded
if not GOOGLE_API_KEY:
    raise ValueError("API key not found")

# Define the API endpoint URL.
# NOTE(review): this URL calls the model's `generateContent` method, so the
# "values" stored below hold generated text rather than an embedding vector;
# confirm that this is the intended endpoint.
embedding_api_url = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash-latest:generateContent?key={GOOGLE_API_KEY}"

# One record per review that produced generated content
processed_data = []

for review in data["reviews"]:
    response = requests.post(
        embedding_api_url,
        json={"contents": [{"parts": [{"text": review["review"]}]}]},
        timeout=30,  # do not let one stalled request hang the whole cell
    )

    # Report failed requests and move on to the next review
    if response.status_code != 200:
        print(f"Error {response.status_code}: {response.text}")
        continue

    # Extract the generated content from the first candidate, if any
    candidates = response.json().get("candidates", [])
    content_parts = candidates[0].get("content", {}).get("parts", []) if candidates else []
    if not content_parts:
        print(f"No generated content for review of {review['professor']}; skipping")
        continue

    processed_data.append(
        {
            "values": content_parts[0].get("text", ""),
            "id": review["professor"],
            "metadata": {
                "course": review["course"],
                "professor": review["professor"],
                "rating": review["rating"],
                "review": review["review"],
            },
        }
    )

# processed_data now holds the generated content for each successfully processed review.

In [None]:
# Sanity check: display the first record in processed_data.
processed_data[0]

In [None]:
def convert_to_embedding(text, dimension: int = 512) -> list:
    """Return a placeholder embedding vector for ``text``.

    This is a stand-in: replace the body with real embedding logic (for
    example, the output of an embedding model). The placeholder ignores
    ``text`` and returns an alternating 0.0 / 1.0 pattern.

    Args:
        text: The content to embed (unused by the placeholder).
        dimension: Number of components in the returned vector. It must equal
            the dimension of the index the vector is upserted into, which is
            why the default is 512.

    Returns:
        A list of ``dimension`` floats.

    Raises:
        ValueError: If ``dimension`` is not a positive integer.
    """
    if dimension < 1:
        raise ValueError("dimension must be a positive integer")
    # The previous `[0, 1] * 512` produced 1024 values, twice the index dimension.
    return [float(position % 2) for position in range(dimension)]

# Build the upsert payload from copies of the processed records, so that
# processed_data is left untouched and re-running this cell starts from the
# same inputs every time.
valid_vectors = []
for item in processed_data:
    embedding = convert_to_embedding(item["values"])

    # Ensure the embedding is not a zero vector
    if any(embedding):
        valid_vectors.append({**item, "values": embedding})

# Index handles are obtained through `Index` (capital I) on the client
index = pc.Index("rag")

if valid_vectors:
    upsert_response = index.upsert(
        vectors=valid_vectors,
        namespace="ns1",
    )
    print(f"Upserted count: {upsert_response['upserted_count']}")
else:
    # Nothing usable was produced upstream; skip the upsert call entirely
    print("No valid vectors to upsert")

print(index.describe_index_stats())
