In [5]:
from huggingface_hub import InferenceClient
from dotenv import load_dotenv
load_dotenv()
import os
import json
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec

In [6]:
# Service clients. Credentials are read from the process environment
# (populated from a local .env file by load_dotenv() in the import cell),
# so no secret is ever written into the notebook itself.
client = InferenceClient(token=os.environ.get("HF_TOKEN"))  # Hugging Face inference
pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY"))  # Pinecone (gRPC client)

In [None]:
# Create the serverless "rag" index only if it does not exist yet.
# An unconditional create_index() fails once the index is already there,
# which would break a Restart Kernel -> Run All of this notebook.
if "rag" not in pc.list_indexes().names():
    pc.create_index(
        name="rag",
        dimension=1024,  # must equal the length of the embedding vectors upserted below
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

In [13]:
# Load the raw reviews. The context manager closes the file handle
# deterministically (a bare open() inside json.load() leaves it to the
# garbage collector), and the explicit encoding keeps the read independent
# of the platform's default locale.
with open("reviews.json", encoding="utf-8") as reviews_file:
    data = json.load(reviews_file)

# The file is expected to hold a top-level "reviews" list.
reviews = data["reviews"]

# Container for the Pinecone records built in the embedding cell.
processed_reviews = []

In [16]:

# Embed every review and build one Pinecone record per review.
#
# The list is reset here so that this cell is idempotent: re-running it
# rebuilds the records instead of appending a second copy of each one to
# the list created in the loading cell.
processed_reviews = []
for review in reviews:
    # One inference call per review text.
    output = client.feature_extraction(
        model="intfloat/multilingual-e5-large",
        text=review["review"]
    )
    # Convert the returned array into plain Python lists for the upsert payload.
    embedding = output.tolist()
    processed_reviews.append(
        {
            "values": embedding,
            # NOTE(review): the professor name doubles as the vector id, so two
            # reviews for the same professor would presumably overwrite each
            # other on upsert — confirm that names are unique in reviews.json.
            "id": review["professor"],
            "metadata": {
                "review": review["review"],
                "subject": review["subject"],
                "stars": review["stars"]
            }
        }
    )

In [20]:
# Handle to the "rag" index; reused by the stats cell below.
pinecone_index = pc.Index("rag")

# Write all prepared records into the "professor-reviews" namespace.
# Left as the final expression so the notebook displays the upsert result.
pinecone_index.upsert(vectors=processed_reviews, namespace="professor-reviews")

upserted_count: 20

In [21]:
# Sanity check: display the index statistics (dimension and per-namespace
# vector counts) to confirm that the upsert above landed.
pinecone_index.describe_index_stats()

{'dimension': 1024,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 0},
                'professor-reviews': {'vector_count': 20}},
 'total_vector_count': 20}