In [1]:
# Setup cell: load environment variables, then import the libraries used below.
from dotenv import load_dotenv
# Called before the remaining imports. A later cell reads PINECONE_API_KEY via
# os.getenv, so the environment must be populated before that cell runs.
# NOTE(review): keep this call ahead of the third-party imports in case any of
# them read environment variables at import time — confirm before reordering.
load_dotenv() 
import os
from sentence_transformers import SentenceTransformer
from pinecone import Pinecone, ServerlessSpec

  from tqdm.autonotebook import tqdm, trange


In [4]:

# Pinecone client; the API key is taken from the PINECONE_API_KEY environment variable.
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# Create the "rag" index only when it does not exist yet, so that re-running
# this cell (e.g. Restart Kernel -> Run All) does not fail on an index that
# was already created by a previous run.
if "rag" not in pc.list_indexes().names():
    pc.create_index(
        name="rag",
        # Must equal the length of the embedding vectors upserted into this index.
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )


In [5]:
import json

# Read the review dataset. The context manager closes the file handle
# deterministically (the original open() call was never closed), and the
# explicit encoding avoids depending on the platform's default locale.
with open("reviews.json", encoding="utf-8") as reviews_file:
    data = json.load(reviews_file)

# Last expression of the cell: display the loaded list of reviews.
data['reviews']

[{'professor': 'Dr. Emily Johnson',
  'subject': 'Psychology',
  'stars': 5,
  'review': "Dr. Johnson's lectures are insightful and engaging. She makes complex topics easy to understand."},
 {'professor': 'Prof. Michael Lee',
  'subject': 'Mathematics',
  'stars': 4,
  'review': 'Prof. Lee is very knowledgeable, but his lectures can be a bit fast-paced at times.'},
 {'professor': 'Dr. Susan Carter',
  'subject': 'Biology',
  'stars': 3,
  'review': "Dr. Carter's lectures are informative, but her grading criteria can be unclear."},
 {'professor': 'Prof. David Thompson',
  'subject': 'History',
  'stars': 5,
  'review': 'Prof. Thompson has a passion for history that makes his classes incredibly engaging.'},
 {'professor': 'Dr. Jessica White',
  'subject': 'Chemistry',
  'stars': 4,
  'review': 'Dr. White is very approachable and always willing to help with difficult concepts.'},
 {'professor': 'Prof. John Davis',
  'subject': 'Physics',
  'stars': 2,
  'review': 'Prof. Davis is knowledge

In [6]:
# Sentence-embedding model used to turn each review text into a vector.
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# One record per review in the shape used for the upsert below:
# the embedding as a plain list, the professor name as the vector id,
# and the remaining review fields as metadata.
processed_data = []
for review in data['reviews']:
    embedding = model.encode(review['review'])
    metadata = {
        "review": review["review"],
        "subject": review["subject"],
        "stars": review["stars"],
    }
    record = {
        "values": embedding.tolist(),
        "id": review["professor"],
        "metadata": metadata,
    }
    processed_data.append(record)



In [7]:
# Show the first record to sanity-check its structure (values / id / metadata).
first_record = processed_data[0]
first_record

{'values': [-0.11260172724723816,
  -0.2067752331495285,
  -0.20154356956481934,
  0.2864531874656677,
  -0.2876005172729492,
  0.39670422673225403,
  -0.28936707973480225,
  0.12330622971057892,
  -0.04254348576068878,
  0.04124194756150246,
  -0.05252169072628021,
  0.18020972609519958,
  -0.05709284916520119,
  0.3233307898044586,
  -0.2015521377325058,
  0.4123784899711609,
  0.5116992592811584,
  0.01579703576862812,
  -0.327656626701355,
  -0.2663812041282654,
  -0.4254342317581177,
  0.20099563896656036,
  0.2671351730823517,
  0.031201748177409172,
  -0.21057938039302826,
  -0.040881168097257614,
  0.12797759473323822,
  -0.2695575952529907,
  0.22025948762893677,
  -0.2071692794561386,
  -0.02558192051947117,
  0.34368661046028137,
  0.14719438552856445,
  0.11731182783842087,
  -0.223134383559227,
  0.049508966505527496,
  0.3103870451450348,
  0.2609415054321289,
  0.3485448658466339,
  0.1466878354549408,
  -0.1375439316034317,
  -0.09912081062793732,
  0.4110373258590698,


In [8]:
# Handle to the "rag" index created earlier in the notebook.
index = pc.Index('rag')

# Write all prepared records into namespace "ns1"; the response
# (displayed as the cell output) reports how many vectors were upserted.
upsert_response = index.upsert(namespace="ns1", vectors=processed_data)
upsert_response

{'upserted_count': 20}

In [9]:
# Fetch and display the index statistics to verify the upsert
# (per-namespace and total vector counts).
index_stats = index.describe_index_stats()
index_stats

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 20}},
 'total_vector_count': 20}