In [7]:

# Import necessary libraries
import os
import numpy as np
import pandas as pd
from pinecone import Pinecone
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from dotenv import load_dotenv

In [8]:
# Read Pinecone credentials from the process environment.
# load_dotenv() runs first so values kept in a local .env file can be picked up.
load_dotenv()
PINECONE_API_KEY, PINECONE_ENV = (
    os.environ.get(var_name) for var_name in ("PINECONE_API_KEY", "PINECONE_ENV")
)

In [11]:
# Name of the Pinecone index this notebook reads from
INDEX_NAME = "semantic-search-fast"

# Build the Pinecone client from the credentials loaded above
# (class-based client; used here in place of the older pinecone.init(...) call)
pinecone_instance = Pinecone(
    api_key=PINECONE_API_KEY,
    environment=PINECONE_ENV,
)

# Obtain a handle to the target index
index = pinecone_instance.Index(INDEX_NAME)

In [12]:
# Fetch embeddings from Pinecone
def fetch_embeddings(index, namespace=None, num_vectors=100, query_vector=None):
    """
    Fetch embeddings from Pinecone index.

    Args:
        index: Pinecone index object (anything exposing a compatible
            ``query(**kwargs)`` method whose result has a ``'matches'`` entry).
        namespace: Namespace to fetch embeddings from.
        num_vectors: Maximum number of embeddings to fetch (sent as ``top_k``).
        query_vector: Optional vector to query with. When omitted, an empty
            list is sent, which is what this function has always done.
    Returns:
        A tuple ``(embeddings, metadata)`` of two parallel lists: the vector
        values of every match, and one metadata dict per match (an empty dict
        when a match carries no metadata).
    """
    response = index.query(
        namespace=namespace,
        vector=[] if query_vector is None else query_vector,
        top_k=num_vectors,
        # Vector values are only returned when explicitly requested; without
        # this flag every match comes back with no values to plot.
        include_values=True,
        include_metadata=True,
    )

    embeddings, metadata = [], []
    for match in response['matches']:
        embeddings.append(match['values'])
        # A match may have no metadata at all (missing key/attribute or None).
        # Normalise to {} so downstream code can always call .get() on it.
        try:
            meta = match['metadata']
        except (KeyError, AttributeError):
            meta = None
        metadata.append(meta or {})
    return embeddings, metadata

# Pull the vectors and their metadata stored under the example namespace
embeddings, metadata = fetch_embeddings(index=index, namespace="example-namespace")

In [13]:
# Materialise the fetched vectors as a NumPy array for the reduction step
embeddings_array = np.array(embeddings, copy=True)

In [15]:
# Dimensionality Reduction for Visualization
def reduce_dimensions(embeddings, method="pca", n_components=2):
    """
    Reduce the dimensions of embeddings for visualization.

    Args:
        embeddings: Array of embeddings (one row per embedding).
        method: Dimensionality reduction method ('pca' or 'tsne').
        n_components: Number of dimensions for reduction.
    Returns:
        Reduced embeddings, as returned by the reducer's ``fit_transform``.
    Raises:
        ValueError: If ``method`` is not 'pca' or 'tsne', or if 'tsne' is
            requested with fewer than 2 embeddings.
    """
    if method == "pca":
        reducer = PCA(n_components=n_components)
    elif method == "tsne":
        n_samples = len(embeddings)
        if n_samples < 2:
            raise ValueError("t-SNE needs at least 2 embeddings.")
        # scikit-learn rejects perplexity >= n_samples. Keep the usual 30 for
        # large sets and shrink it for small ones instead of crashing.
        perplexity = min(30, n_samples - 1)
        reducer = TSNE(n_components=n_components, perplexity=perplexity, random_state=42)
    else:
        raise ValueError("Invalid method. Choose 'pca' or 'tsne'.")
    return reducer.fit_transform(embeddings)

# Project onto two principal components; when nothing was fetched, keep an
# empty array as a placeholder.
reduced_embeddings = (
    reduce_dimensions(embeddings_array, method="pca")
    if len(embeddings_array) > 0
    else np.array([])
)

In [17]:
# Visualize Embeddings
def plot_embeddings(reduced_embeddings, metadata, title="Embeddings Visualization"):
    """
    Plot embeddings in 2D space.

    Draws a scatter of the first two columns and annotates each point with
    the "label" entry of its metadata (blank when there is none).

    Args:
        reduced_embeddings: 2D array with at least two columns, one row per
            embedding, after dimensionality reduction.
        metadata: Metadata corresponding to the embeddings; each entry is a
            dict or None.
        title: Plot title.
    """
    plt.figure(figsize=(10, 8))
    plt.scatter(reduced_embeddings[:, 0], reduced_embeddings[:, 1], alpha=0.7)
    # zip() stops at the shorter input, so a metadata list whose length does
    # not match the number of points cannot index past the end of the array.
    for (x_pos, y_pos), meta in zip(reduced_embeddings[:, :2], metadata):
        # Entries may be None when a vector was stored without metadata.
        label = (meta or {}).get("label", "")
        plt.text(x_pos, y_pos, label, fontsize=9)
    plt.title(title)
    plt.xlabel("Dimension 1")
    plt.ylabel("Dimension 2")
    plt.grid(True)
    plt.show()

# Plot the embeddings
# plot_embeddings reads columns 0 and 1, so it needs a non-empty 2D array
# with at least two columns. Anything else (including the empty 1-D
# placeholder used when no vectors were fetched) is reported instead of being
# reshaped into a single column, which could not be plotted anyway.
if (
    reduced_embeddings.ndim == 2
    and reduced_embeddings.shape[0] > 0
    and reduced_embeddings.shape[1] >= 2
):
    plot_embeddings(reduced_embeddings, metadata, title="PCA Visualization of Embeddings")
else:
    print("No embeddings to plot")

No embeddings to plot


In [20]:
# Cleanup Pinecone
# WARNING: destructive. Per the Pinecone delete API, delete_all=True is
# expected to remove every vector in the targeted namespace. No namespace is
# passed here, while the fetch earlier in this notebook used
# "example-namespace".
# NOTE(review): confirm which namespace should be cleared, and whether a
# visualization notebook should delete index data on "Run All" at all.
index.delete(delete_all=True)

{}