<a href="https://colab.research.google.com/github/ekrombouts/GenCareAI/blob/main/notebooks/104_VisualiseVectorDB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Vector DB: T-SNE visualisation

**Author:** Eva Rombouts  
**Date:** 2024-07-20  
**Updated:** 2024-10-14  
**Version:** Olympia

### Description
This notebook retrieves embeddings, metadata, and document texts from the Chroma vector database, draws a random sample of them, reduces the dimensionality of the sampled embeddings to two dimensions using t-SNE, and visualizes the result in an interactive Plotly scatter plot.

In [None]:
# Install the project helper package before importing it.
!pip install GenCareAIUtils
from GenCareAIUtils import GenCareAISetup

# `setup` is used by later cells to resolve file paths and to obtain the OpenAI key.
setup = GenCareAISetup()

# The remaining dependencies are only installed when the helper reports a Colab environment.
if setup.environment == 'Colab':
    !pip install -q langchain langchain-community langchain_openai chromadb langchain_chroma


In [2]:
import numpy as np
import random
import plotly.express as px
import pandas as pd
from tqdm import tqdm
from sklearn.manifold import TSNE
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings

In [3]:
# Open the 'Olympia' collection of the Chroma vector database stored under data/chroma_db_gcai.
# NOTE(review): the embedding model presumably has to match the one used when the
# collection was built - confirm against the notebook that populated it.
vectordb = Chroma(
    collection_name='Olympia',
    persist_directory=setup.get_file_path('data/chroma_db_gcai'),
    embedding_function=OpenAIEmbeddings(
        model='text-embedding-ada-002',
        api_key=setup.get_openai_key(),
    ),
)

In [4]:
# Pull the stored vectors, their metadata and the source texts out of the vectordb
def get_embeddings_and_metadata(vectordb):
    """Fetch embeddings, metadata and documents from a vector store.

    Args:
        vectordb: Object exposing ``get(include=[...])`` that returns a mapping
            with 'embeddings', 'metadatas' and 'documents' entries.

    Returns:
        A tuple ``(embeddings, metadata, documents)`` holding those three
        entries, in that order.
    """
    requested_fields = ['embeddings', 'metadatas', 'documents']
    result = vectordb.get(include=requested_fields)
    return tuple(result[field] for field in requested_fields)

# Read the embeddings, metadata and documents from the vector store (no filter is passed to the lookup).
embeddings, metadata, documents = get_embeddings_and_metadata(vectordb)

In [5]:
def sample_embeddings(embeddings, metadata, documents, sample_size=100, seed=None):
    """Draw a random, index-aligned sample of embeddings, metadata and documents.

    The same randomly chosen positions are used for all three inputs, so item
    ``k`` of each returned collection refers to the same stored record.

    Args:
        embeddings: Sequence or NumPy array with one embedding per row.
        metadata: Sequence of metadata entries, aligned with ``embeddings``.
        documents: Sequence of document texts, aligned with ``embeddings``.
        sample_size: Maximum number of items to draw. When it exceeds the number
            of embeddings, every item is returned (in random order) instead of
            raising an error.
        seed: Optional seed for a private random generator so the sample can be
            reproduced. ``None`` (default) draws from the global ``random`` state.

    Returns:
        Tuple ``(sampled_embeddings, sampled_metadata, sampled_documents)``; the
        embeddings are a NumPy array, the other two are lists.

    Raises:
        ValueError: If ``sample_size`` is negative.
    """
    # Accept plain lists as well: selecting rows by a list of positions needs an array.
    embeddings = np.asarray(embeddings)
    population = len(embeddings)

    # random.sample refuses to draw more items than exist, so cap the request.
    sample_size = min(sample_size, population)

    rng = random if seed is None else random.Random(seed)
    indices = rng.sample(range(population), sample_size)

    sampled_embeddings = embeddings[indices]
    sampled_metadata = [metadata[i] for i in indices]
    sampled_documents = [documents[i] for i in indices]
    return sampled_embeddings, sampled_metadata, sampled_documents

# Pass the embeddings as a NumPy array so rows can be selected by a list of indices;
# no sample_size is given, so the function's default of 100 applies.
sampled_embeddings, sampled_metadata, sampled_documents = sample_embeddings(np.array(embeddings), metadata, documents)

In [None]:
# Dimensionality reduction with t-SNE
def reduce_dimensions(embeddings, perplexity=30.0):
    """Project embeddings onto two dimensions with t-SNE.

    Args:
        embeddings: Array-like with one embedding per row.
        perplexity: Requested t-SNE perplexity. t-SNE requires it to be smaller
            than the number of samples, so it is lowered to ``n_samples - 1``
            when the input is too small for the requested value.

    Returns:
        The result of ``TSNE.fit_transform``: one 2-D point per input row.

    Raises:
        ValueError: If fewer than two embeddings are supplied.
    """
    embeddings = np.asarray(embeddings)
    n_samples = len(embeddings)
    if n_samples < 2:
        raise ValueError(
            f"t-SNE needs at least 2 embeddings, got {n_samples}"
        )

    # Keep the perplexity strictly below the sample count; otherwise TSNE rejects the input.
    effective_perplexity = min(perplexity, n_samples - 1)

    # random_state is fixed so repeated runs on the same input give the same layout.
    tsne = TSNE(
        n_components=2,
        random_state=6,
        verbose=1,
        perplexity=effective_perplexity,
    )
    reduced_embeddings = tsne.fit_transform(embeddings)
    return reduced_embeddings

# Add progress bar
# NOTE: the bar has a single step (total=1) that is only ticked after the t-SNE
# call returns, so it marks completion rather than per-iteration progress.
embeddings_array = np.array(sampled_embeddings)
with tqdm(total=1) as pbar:
    reduced_embeddings = reduce_dimensions(embeddings_array)
    pbar.update(1)

In [None]:
# Create an interactive plot with Plotly
def create_interactive_plot(reduced_embeddings, metadata, documents, color_column='category'):
    """Show an interactive scatter plot of the 2-D embeddings.

    Args:
        reduced_embeddings: Array-like with two values (x, y) per point.
        metadata: Metadata for the points, row-aligned with
            ``reduced_embeddings``. In a list or tuple of records, ``None``
            entries are treated as empty records.
        documents: Document texts, row-aligned with ``reduced_embeddings``;
            shown on hover.
        color_column: Name of the metadata column used to colour the points and
            shown on hover. Defaults to ``'category'``.

    Returns:
        None. The figure is displayed with ``fig.show()``.

    Raises:
        ValueError: If ``color_column`` is not present after the metadata has
            been joined onto the coordinates.
    """
    df = pd.DataFrame(reduced_embeddings, columns=['x', 'y'])
    df['text'] = documents

    # Records without metadata would otherwise not expand into columns.
    if isinstance(metadata, (list, tuple)):
        metadata = [entry if entry is not None else {} for entry in metadata]

    # Unpack metadata into separate columns and combine with the coordinates.
    # rsuffix keeps the join from failing when a metadata key is named 'x', 'y' or 'text'.
    df = df.join(pd.DataFrame(metadata), rsuffix='_meta')

    if color_column not in df.columns:
        raise ValueError(
            f"Column '{color_column}' not found in metadata; "
            f"available columns: {list(df.columns)}"
        )

    fig = px.scatter(df, x='x', y='y', hover_data=['text', color_column], color=color_column,
                     title='t-SNE Visualization of VectorDB',
                     labels={color_column: color_column.replace('_', ' ').capitalize()},
                     color_discrete_sequence=px.colors.qualitative.Plotly)  # Use a preset colour palette

    fig.update_layout(xaxis_title='t-SNE x', yaxis_title='t-SNE y')
    fig.show()

# Plot the sampled points; the sampled metadata and documents are passed so rows line up with reduced_embeddings.
create_interactive_plot(reduced_embeddings, sampled_metadata, sampled_documents)