In [1]:
import pinecone
from pinecone import ServerlessSpec, Pinecone
import os
import time
import json
from dotenv import load_dotenv
import pandas as pd
from sentence_transformers import SentenceTransformer

load_dotenv()

PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')

if not PINECONE_API_KEY:
    raise ValueError("PINECONE_API_KEY environment variable not set")
# Initialize Pinecone
pc = Pinecone(api_key=PINECONE_API_KEY)

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Create an index
index_name = "movies-actors"
if index_name not in pc.list_indexes().names():  # Check if index exists
    pc.create_index(
        name=index_name,
        vector_type="dense",
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        ),
        deletion_protection="disabled",
        tags={
            "environment": "development"
        }
    )
    while index_name not in pc.list_indexes().names():  # Wait for index creation
        time.sleep(1)

# Connect to the index
index = pc.Index(index_name)

In [5]:

# Load embedding model
model = SentenceTransformer("all-MiniLM-L6-v2")

In [6]:
# Load movie dataset in chunks
#output_directory = 'constant'  # or whatever your directory is.
# Get the absolute path.

movie_file = ('/Users/mohitbhoir/Git/Movie_Recommendation_Chatbot/constant/movies_transformed_preprocessed.csv')

print(f"Checking for file: {movie_file}")

Checking for file: /Users/mohitbhoir/Git/Movie_Recommendation_Chatbot/constant/movies_transformed_preprocessed.csv


In [16]:
if not os.path.exists(movie_file):
    raise FileNotFoundError(f"File not found: {movie_file}")

In [None]:
import csv

with open(movie_file, 'r', newline='', encoding='utf-8') as f:
    reader = csv.reader(f, delimiter='^')
    for row in reader:
        print(row)

NameError: name 'row' is not defined

In [40]:
import csv

# Generate vectorized data
print("Generating vectorized data...")
reader = pd.read_csv(movie_file, chunksize=1000000, sep='^', low_memory=False, header=None, quoting=csv.QUOTE_NONE)

vectorized_data = []
total_size = 0  # Track total size of vectorized data

# Constants
VECTOR_SIZE = 384 * 4  # 384-dim vector * 4 bytes per float32
for chunk in reader:
    print(chunk.columns)  # Print the column names
    break  # Stop after printing the header
for chunk in reader:
    for _, row in chunk.iterrows():
        embedding = model.encode(row["primaryTitle"], convert_to_numpy=True).tolist()
        metadata = {
            "title": row["primaryTitle"],
            "originalTitle": row["originalTitle"],
            "startYear": str(row["startYear"]),
            "genres": row["genres"],
            "averageRating": str(row["averageRating"]),
            "numVotes": str(row["numVotes"]),
            "actor": row["actor"] if pd.notna(row["actor"]) else "",
            "actress": row["actress"] if pd.notna(row["actress"]) else "",
            "director": row["director"] if pd.notna(row["director"]) else "",
            "producer": row["producer"] if pd.notna(row["producer"]) else "",
            "writer": row["writer"] if pd.notna(row["writer"]) else ""
        }
        metadata_size = len(json.dumps(metadata).encode('utf-8'))
        record_size = VECTOR_SIZE + metadata_size

        vectorized_data.append((row["tconst"], embedding, metadata))
        total_size += record_size

print(f"Vectorized data generated. Total size: {total_size / (1024 * 1024):.2f} MB")

Generating vectorized data...


ParserError: Error tokenizing data. C error: Expected 16 fields in line 88514, saw 20


In [None]:
# Load the data
def upload_to_pinecone(index, data, max_batch_size=100, max_request_size=2_000_000):
    """Uploads vectorized data to Pinecone in chunks."""
    batch = []
    batch_size = 0
    vector_count = 0
    vector_size = 384 * 4

    for tconst, embedding, metadata in data:
        metadata_size = len(json.dumps(metadata).encode('utf-8'))
        record_size = vector_size + metadata_size

        if vector_count >= max_batch_size or batch_size + record_size > max_request_size:
            if batch:
                index.upsert(vectors=batch)
                batch = []
                batch_size = 0
                vector_count = 0

        batch.append((tconst, embedding, metadata))
        batch_size += record_size
        vector_count += 1

    if batch:
        index.upsert(vectors=batch)

    print("Upload to Pinecone completed successfully!")

print("Uploading vectorized data to Pinecone...")
upload_to_pinecone(index, vectorized_data)