# Vector DB: t-SNE visualisation

**Author:** Eva Rombouts  
**Date:** 2024-07-20  
**Version:** 1.0

### Description
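This notebook retrieves embeddings, metadata, and documents from the Chroma vector database, draws a random sample of them, reduces the dimensionality of the sampled embeddings to two dimensions using t-SNE, and visualizes the results in an interactive Plotly scatter plot with points colored by topic.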

In [1]:
# Environment setup
import os

def check_environment():
    """Report which runtime the notebook is executing in.

    Returns:
        str: "Google Colab" when the ``google.colab`` module can be imported,
        otherwise "Local Environment".
    """
    try:
        # Imported only to probe for availability; the module itself is unused here.
        import google.colab  # noqa: F401
    except ImportError:
        return "Local Environment"
    else:
        return "Google Colab"

env = check_environment()

if env == "Google Colab":
    print("Running in Google Colab")
    !pip install -q langchain langchain-community langchain_openai chromadb
    from google.colab import drive, userdata
    drive.mount('/content/drive')
    os.chdir('/content/drive/My Drive/Colab Notebooks/GenCareAI/scripts')
    OPENAI_API_KEY = userdata.get('GCI_OPENAI_API_KEY')
else:
    print("Running in Local Environment")
    # !pip install -q plotly chromadb nbformat
    from dotenv import load_dotenv
    load_dotenv()
    OPENAI_API_KEY = os.getenv('GCI_OPENAI_API_KEY')

Running in Local Environment


In [2]:
import numpy as np
import random
import plotly.express as px
import pandas as pd
from tqdm import tqdm
from sklearn.manifold import TSNE
from langchain.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

In [3]:
# Initialize the Chroma vector database.
# The embedding function is built first so its configuration stands out.
embedding_function = OpenAIEmbeddings(api_key=OPENAI_API_KEY, model='text-embedding-ada-002')

# Open the 'anonymous_notes' collection from the persisted database directory.
vectordb = Chroma(
    collection_name='anonymous_notes',
    persist_directory='../data/chroma_db_gcai_notes',
    embedding_function=embedding_function,
)

In [4]:
# Step 1: Extract embeddings and metadata from the vectordb
def get_embeddings_and_metadata(vectordb):
    """Fetch embeddings, metadata and documents from the vector store.

    Args:
        vectordb: Object exposing ``get(include=[...])`` that returns a mapping
            with the keys 'embeddings', 'metadatas' and 'documents'.

    Returns:
        tuple: ``(embeddings, metadata, documents)`` exactly as stored under
        those three keys of the returned mapping.
    """
    wanted_fields = ('embeddings', 'metadatas', 'documents')
    records = vectordb.get(include=list(wanted_fields))
    return tuple(records[field] for field in wanted_fields)

# Read the whole collection once; the results are kept as notebook-level
# variables because the sampling cell below works on them.
embeddings, metadata, documents = get_embeddings_and_metadata(vectordb)

In [9]:
def sample_embeddings(embeddings, metadata, documents, sample_size=5000, seed=None):
    """Draw one random sample (without replacement) from three aligned sequences.

    The same indices are applied to all three inputs, so position ``i`` of
    each returned sequence refers to the same item.

    Args:
        embeddings: Sequence or ndarray of vectors. Converted with
            ``np.asarray`` so plain lists work as well as arrays.
        metadata: Sequence indexed like ``embeddings``.
        documents: Sequence indexed like ``embeddings``.
        sample_size: Maximum number of items to draw. Capped at
            ``len(embeddings)`` so small collections no longer raise.
        seed: Optional seed for a private ``random.Random`` instance, giving a
            reproducible sample. ``None`` (default) keeps using the
            module-level ``random`` generator.

    Returns:
        tuple: ``(sampled_embeddings, sampled_metadata, sampled_documents)``
        where the first element is an ndarray and the other two are lists.

    Raises:
        ValueError: If ``sample_size`` is negative (raised by the underlying
            ``sample`` call).
    """
    # Index-array selection below requires an ndarray; a no-op for arrays.
    embeddings = np.asarray(embeddings)

    population = range(len(embeddings))
    # Never ask for more items than exist.
    draw_count = min(sample_size, len(population))

    rng = random if seed is None else random.Random(seed)
    indices = rng.sample(population, draw_count)

    sampled_embeddings = embeddings[indices]
    sampled_metadata = [metadata[i] for i in indices]
    sampled_documents = [documents[i] for i in indices]
    return sampled_embeddings, sampled_metadata, sampled_documents

# Convert the embeddings to an ndarray before sampling; sample_size is left at
# its default (5000).
sampled_embeddings, sampled_metadata, sampled_documents = sample_embeddings(np.array(embeddings), metadata, documents)

In [10]:
# Step 2: Dimensionality reduction with t-SNE with progress bar
def reduce_dimensions(embeddings):
    """Project embeddings onto two dimensions with t-SNE.

    Args:
        embeddings: Array-like of vectors handed to ``TSNE.fit_transform``.

    Returns:
        Whatever ``fit_transform`` returns for a 2-component t-SNE
        (presumably one 2-D point per input row).
    """
    # random_state is fixed at 6 for the fit; verbose=1 makes the estimator
    # print its progress log.
    projector = TSNE(verbose=1, random_state=6, n_components=2)
    return projector.fit_transform(embeddings)

# Add progress bar
# np.array() on the sampled embeddings yields the array handed to t-SNE.
embeddings_array = np.array(sampled_embeddings)
# The fit is one blocking call, so the bar has exactly one step: it jumps from
# 0 to 1 when reduce_dimensions returns and mainly reports the elapsed time.
with tqdm(total=1) as pbar:
    reduced_embeddings = reduce_dimensions(embeddings_array)
    pbar.update(1)

  0%|          | 0/1 [00:00<?, ?it/s]

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 5000 samples in 0.002s...
[t-SNE] Computed neighbors for 5000 samples in 0.605s...
[t-SNE] Computed conditional probabilities for sample 1000 / 5000
[t-SNE] Computed conditional probabilities for sample 2000 / 5000
[t-SNE] Computed conditional probabilities for sample 3000 / 5000
[t-SNE] Computed conditional probabilities for sample 4000 / 5000
[t-SNE] Computed conditional probabilities for sample 5000 / 5000
[t-SNE] Mean sigma: 0.126095
[t-SNE] KL divergence after 250 iterations with early exaggeration: 78.810669


100%|██████████| 1/1 [00:12<00:00, 12.67s/it]

[t-SNE] KL divergence after 1000 iterations: 1.466671





In [11]:
# Step 3: Create an interactive plot with Plotly
def create_interactive_plot(reduced_embeddings, metadata, documents):
    """Show an interactive Plotly scatter plot of the 2-D projection.

    Args:
        reduced_embeddings: Two-column array-like; the columns become ``x``
            and ``y``.
        metadata: Records expanded into extra columns via ``pd.DataFrame``.
            A ``topic`` column is required: it drives the colour and appears
            in the hover box.
        documents: Texts stored in the ``text`` column and shown on hover.

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    points = pd.DataFrame(reduced_embeddings, columns=['x', 'y']).assign(text=documents)

    # Expand the metadata records into columns and attach them by row index.
    plot_df = points.join(pd.DataFrame(metadata))

    fig = px.scatter(
        plot_df,
        x='x',
        y='y',
        color='topic',
        hover_data=['text', 'topic'],
        title='t-SNE Visualization of VectorDB',
        labels={'topic': 'Topic'},
        # Use a preset colour palette.
        color_discrete_sequence=px.colors.qualitative.Plotly,
    )
    fig.update_layout(xaxis_title='t-SNE x', yaxis_title='t-SNE y')
    fig.show()

# Plot the 2-D projection together with the metadata and documents of the same
# sample, so row i of every input describes the same item.
create_interactive_plot(reduced_embeddings, sampled_metadata, sampled_documents)