In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from pinecone import Pinecone, ServerlessSpec, PodSpec
import torch
import os
from tqdm.auto import tqdm
import time

In [None]:
# Pinecone credentials are read from the process environment by the configuration cell.
# `setdefault` only fills in a value when the variable is not already set, so a real key
# exported in the shell / Colab secrets is never clobbered by the placeholder below.
# Do NOT commit a real API key in this cell.
os.environ.setdefault('PINECONE_API_KEY', "add-api-key")
os.environ.setdefault('PINECONE_ENVIRONMENT', "us-east-1")

In [None]:
# Configuration
# Every tunable value used by the cells below is collected here.

# --- Model and input data ---
FINETUNED_MODEL_PATH = 'output/finetuned-all-distilroberta-v1-2025-04-22_15-26-27'
INPUT_PARQUET_PATH = 'new_formatted_addresses.parquet'
ID_COLUMN = 'OID_'                       # becomes the Pinecone vector id
ADDRESS_COLUMN = 'FormattedFullAddress'  # text that gets embedded
LAT_COLUMN = 'Latitude'                  # stored as metadata when present
LON_COLUMN = 'Longitude'                 # stored as metadata when present

# --- Pinecone ---
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
# NOTE(review): PINECONE_ENVIRONMENT is read here but not referenced by the other
# cells shown in this notebook; the serverless region is set separately below.
PINECONE_ENVIRONMENT = os.getenv("PINECONE_ENVIRONMENT")
PINECONE_INDEX_NAME = 'address-data-index'
# Dimension the index is created with; presumably the output size of the
# fine-tuned model — verify if the model changes.
MODEL_EMBEDDING_DIMENSION = 768
PINECONE_METRIC = 'cosine'
PINECONE_SPEC = ServerlessSpec(cloud='aws', region='us-east-1')

# --- Batch sizes ---
UPSERT_BATCH_SIZE = 100  # rows sent per index.upsert call
ENCODE_BATCH_SIZE = 64   # batch_size passed to model.encode

# --- Compute device: CUDA when torch reports it, otherwise CPU ---
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(
    "GPU is available. Using CUDA."
    if device.type == "cuda"
    else "GPU not available. Using CPU."
)

GPU is available. Using CUDA.


In [None]:
# Initialize Pinecone
# Creates the target index on first run, then opens a handle to it.

print("\nInitializing Pinecone connection...")

pinecone = Pinecone(api_key=PINECONE_API_KEY)

# Look up the names of the indexes that already exist for this API key
existing_index_names = pinecone.list_indexes().names()

if PINECONE_INDEX_NAME in existing_index_names:
    print(f"Index '{PINECONE_INDEX_NAME}' already exists.")
else:
    print(f"Index '{PINECONE_INDEX_NAME}' does not exist. Creating...")
    pinecone.create_index(
        name=PINECONE_INDEX_NAME,
        dimension=MODEL_EMBEDDING_DIMENSION,
        metric=PINECONE_METRIC,
        spec=PINECONE_SPEC,
    )
    print(f"Created new index: {PINECONE_INDEX_NAME}")

# Open a handle to the index and show its current stats
index = pinecone.Index(PINECONE_INDEX_NAME)
print("Connected to Pinecone index.")
print(index.describe_index_stats())


Initializing Pinecone connection...
Index 'address-data-index' does not exist. Creating...
Created new index: address-data-index
Connected to Pinecone index.
{'dimension': 768,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {},
 'total_vector_count': 0,
 'vector_type': 'dense'}


In [None]:
# Loading Fine-tuned Model

model = SentenceTransformer(FINETUNED_MODEL_PATH, device=str(device))

# Smoke test: encode one string and confirm the vector length matches the dimension
# the Pinecone index is created with. A mismatch would otherwise only surface later,
# when vectors are upserted.
test_embedding = model.encode("test")
if test_embedding.shape[-1] != MODEL_EMBEDDING_DIMENSION:
    raise ValueError(
        f"Model produces {test_embedding.shape[-1]}-dimensional embeddings, but "
        f"MODEL_EMBEDDING_DIMENSION is {MODEL_EMBEDDING_DIMENSION}."
    )
print(f"Fine-tuned model loaded successfully to {model.device}.")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Fine-tuned model loaded successfully to cuda:0.


In [None]:
# Load Address Data
# Read the full parquet file, then keep only rows whose State is 'ma' (case-insensitive).

df_raw = pd.read_parquet(INPUT_PARQUET_PATH)
is_ma_row = df_raw['State'].str.lower().eq('ma')
df = df_raw.loc[is_ma_row]
print(f"Successfully loaded {len(df)} rows initially.")

Successfully loaded 10000 rows initially.


In [None]:
# Generate Embeddings and Upsert to Pinecone in Batches
# For each slice of UPSERT_BATCH_SIZE rows: embed the address text, pair each embedding
# with its id and metadata, and upsert the slice to the Pinecone index.

print(f"\nStarting embedding generation and upsert process for {len(df)} records...")
total_records = len(df)

# Whether the coordinate columns exist is the same for every row, so check it once.
has_lat_column = LAT_COLUMN in df.columns
has_lon_column = LON_COLUMN in df.columns

# Rows without an address cannot be embedded; they are skipped and counted.
skipped_records = 0

for i in tqdm(range(0, total_records, UPSERT_BATCH_SIZE), desc="Upserting Batches"):
    batch_df = df.iloc[i : i + UPSERT_BATCH_SIZE]

    # Drop rows whose address is missing so a single null does not abort the
    # run after earlier batches have already been uploaded.
    has_address = batch_df[ADDRESS_COLUMN].notna()
    skipped_records += int((~has_address).sum())
    batch_df = batch_df[has_address]
    if batch_df.empty:
        continue

    # Get address texts for the current batch
    texts_to_encode = batch_df[ADDRESS_COLUMN].tolist()

    # Generate embeddings for the batch (one row of `embeddings` per text)
    embeddings = model.encode(
        texts_to_encode,
        batch_size=ENCODE_BATCH_SIZE,
        show_progress_bar=False,
        device=str(device)
    )

    # Pull the needed columns out once instead of building a Series per row.
    # A missing coordinate column is represented as all-None, which pd.notna rejects.
    batch_ids = batch_df[ID_COLUMN].tolist()
    batch_lats = batch_df[LAT_COLUMN].tolist() if has_lat_column else [None] * len(batch_df)
    batch_lons = batch_df[LON_COLUMN].tolist() if has_lon_column else [None] * len(batch_df)

    # Prepare vectors for Pinecone upsert
    vectors_to_upsert = []
    for vector_id, address, embedding, lat, lon in zip(
        batch_ids, texts_to_encode, embeddings, batch_lats, batch_lons
    ):
        # Metadata always carries the address text; coordinates only when non-null
        metadata = {"address_text": str(address)}
        if pd.notna(lat):
            metadata["latitude"] = float(lat)
        if pd.notna(lon):
            metadata["longitude"] = float(lon)

        vectors_to_upsert.append({
            "id": str(vector_id),
            "values": embedding.tolist(),
            "metadata": metadata
        })

    # Upsert the batch to Pinecone
    index.upsert(vectors=vectors_to_upsert)


if skipped_records:
    print(f"Skipped {skipped_records} record(s) with a missing '{ADDRESS_COLUMN}' value.")

print("\n--- Upsert Process Complete ---")

final_stats = index.describe_index_stats()
print("Final Pinecone index stats:")
print(final_stats)


Starting embedding generation and upsert process for 10000 records...


Upserting Batches:   0%|          | 0/100 [00:00<?, ?it/s]


--- Upsert Process Complete ---
Final Pinecone index stats:
{'dimension': 768,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'': {'vector_count': 10000}},
 'total_vector_count': 10000,
 'vector_type': 'dense'}
