In [1]:
from dotenv import load_dotenv
load_dotenv()
import os
from groq import Groq
from pinecone import Pinecone, ServerlessSpec
from sentence_transformers import SentenceTransformer


  from tqdm.autonotebook import tqdm


In [2]:
pc = Pinecone(api_key= os.getenv('PINECONE_API_KEY'))
pc.create_index(
    name = "rag", dimension=1024 ,metric='cosine' , spec=ServerlessSpec(cloud='aws', region='us-east-1')
)

In [3]:
import json
data = json.load(open('reviews.json'))
data['reviews']

[{'professor': 'Dr. Emily Sharma',
  'subject': 'Computer Science',
  'stars': 4,
  'review': 'Dr. Sharma is an excellent professor. Her lectures are engaging, and she is always willing to help students during office hours. Highly recommended!'},
 {'professor': 'Prof. John Williamson',
  'subject': 'Economics',
  'stars': 3,
  'review': "Prof. Williamson's class was challenging, but I learned a lot. He could be a bit strict, but I appreciated his high standards."},
 {'professor': 'Dr. Olivia Martinez',
  'subject': 'Biology',
  'stars': 5,
  'review': "Dr. Martinez is hands down the best biology professor I've had. Her passion for the subject is contagious, and she makes the material easy to understand."},
 {'professor': 'Prof. Michael Chen',
  'subject': 'Mathematics',
  'stars': 2,
  'review': "I struggled a lot in Prof. Chen's class. His lectures were dry, and he didn't seem interested in helping students who were falling behind."},
 {'professor': 'Dr. Sarah Nguyen',
  'subject': 'E

In [4]:
# Load the correct multilingual E5 model
model = SentenceTransformer('intfloat/multilingual-e5-large')

processed_data = []

for review in data['reviews']:
    # Generate embeddings for the review using the Hugging Face model
    embedding = model.encode(review['review']).tolist()
    
    # Append the processed data
    processed_data.append({
        "values": embedding,
        "id": review['professor'],  
        "metadata": {
            "review": review["review"],
            "subject": review["subject"],
            "stars": review["stars"]
        }
    })

# # Check the embedding dimension (optional)
# embedding_dim = len(processed_data[0]["values"])
# print("Embedding Dimension:", embedding_dim)  # Should print 1024


In [5]:
processed_data[0]

{'values': [-0.014858780428767204,
  -0.015334798023104668,
  -0.016450606286525726,
  -0.032037388533353806,
  0.03843126446008682,
  -0.05087555944919586,
  -0.002738317707553506,
  0.03440488874912262,
  0.054305337369441986,
  -0.008762052282691002,
  0.02724512107670307,
  0.027922779321670532,
  -0.05334586650133133,
  -0.018113112077116966,
  -0.032637543976306915,
  0.007377806585282087,
  -0.010614591650664806,
  0.01990623027086258,
  0.022881103679537773,
  -0.03137196600437164,
  0.024258634075522423,
  0.00564871821552515,
  -0.07222767919301987,
  -0.03015150874853134,
  -0.04640179127454758,
  -0.01283810194581747,
  -0.012951786629855633,
  -0.030907204374670982,
  -0.027462517842650414,
  -0.025642525404691696,
  -0.014613768085837364,
  -0.004935133270919323,
  -0.02488398738205433,
  -0.018170686438679695,
  -0.039902541786432266,
  0.020615391433238983,
  0.029962733387947083,
  0.01944734901189804,
  -0.030829396098852158,
  0.030523553490638733,
  -0.0233776159584

In [6]:
index = pc.Index('rag')
index.upsert(
    vectors= processed_data, #collection
    namespace= "ns1"   #document in firebase
)

{'upserted_count': 20}

In [7]:
index.describe_index_stats()

{'dimension': 1024,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 20}},
 'total_vector_count': 20}