# Visualize our RST documentation vectors

### Part A: Init

In [8]:
import os
import numpy as np
from dotenv import load_dotenv
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from sklearn.manifold import TSNE
import plotly.graph_objects as go

In [9]:
# Notebook-wide settings.
db_name = "./rst_doc_db"  # directory holding the persisted Chroma database
MODEL = "gpt-4.1-nano"  # model identifier kept alongside the other settings
# Read variables from a local .env file; with override=True, values from the
# file take precedence over variables already present in the environment.
load_dotenv(override=True)

True

In [12]:
# Embedding model used to talk to the store
embeddings = HuggingFaceEmbeddings(model="google/embeddinggemma-300M")

# Open the persisted Chroma collection that lives under `db_name`
vectorstore = Chroma(
    collection_name="symfony-docs-7.3",
    persist_directory=db_name,
    embedding_function=embeddings,
)
print(f"Vectorstore has {vectorstore._collection.count()} documents")

Vectorstore has 15780 documents


In [13]:
# Peek at the raw vectors held by the underlying Chroma collection

collection = vectorstore._collection
count = collection.count()

# One stored embedding is enough to learn the width of every vector
sample_embedding = collection.get(
    include=["embeddings"],
    limit=1,
)["embeddings"][0]
dimensions = len(sample_embedding)
print(f"There are {count:,} vectors with {dimensions:,} dimensions in the vector store")

There are 15,780 vectors with 768 dimensions in the vector store


### Part C: Visualize!

In [28]:
# Prework: pull every stored vector together with its text and metadata, then
# give each document a marker color based on its 'type' metadata field.

# Fixed type -> color table so a documentation section keeps the same color in
# every plot. Built once, instead of re-creating two lists and doing a linear
# `.index()` search for every document.
TYPE_COLORS = {
    'frontend': 'gray',
    'setup': 'darkolivegreen',
    'testing': 'saddlebrown',
    'console': 'darkslateblue',
    'getting_started': 'mediumseagreen',
    'components': 'darkkhaki',
    'reference': 'darkblue',
    'bundles': 'goldenrod',
    'introduction': 'indigo',
    'routing': 'sienna',
    'doctrine': 'red',
    'configuration': 'lime',
    'logging': 'blueviolet',
    'quick_tour': 'springgreen',
    'validation': 'crimson',
    'security': 'aqua',
    'controller': 'deepskyblue',
    'http_cache': 'blue',
    'service_container': 'greenyellow',
    'form': 'orchid',
    'serializer': 'coral',
    'create_framework': 'fuchsia',
    'messenger': 'dodgerblue',
    'contributing': 'gold',
    '_build': 'lightblue',
    'workflow': 'deeppink',
    'deployment': 'pink',
}
# Used for any type missing from the table, so a new documentation section
# shows up as an obviously "unmapped" color instead of crashing the cell.
FALLBACK_COLOR = 'black'

result = collection.get(include=['embeddings', 'documents', 'metadatas'])
vectors = np.array(result['embeddings'])
documents = result['documents']
metadatas = result['metadatas']

# Tolerate entries with no metadata or no 'type' key rather than raising.
doc_types = [(metadata or {}).get('type', 'unknown') for metadata in metadatas]

# One color per document, aligned index-for-index with `vectors` / `documents`.
colors = [TYPE_COLORS.get(doc_type, FALLBACK_COLOR) for doc_type in doc_types]

In [29]:
# We humans find it easier to visualize things in 2D!
# Reduce the dimensionality of the vectors to 2D using t-SNE
# (t-distributed stochastic neighbor embedding)

# random_state is pinned so re-running the cell gives the same layout
tsne = TSNE(n_components=2, random_state=42)
reduced_vectors = tsne.fit_transform(vectors)

# Create the 2D scatter plot: one marker per stored chunk, colored by doc type,
# with the type and the first 100 characters of the text shown on hover
fig = go.Figure(data=[go.Scatter(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    mode='markers',
    marker=dict(size=5, color=colors, opacity=0.8),
    text=[f"Type: {t}<br>Text: {d[:100]}..." for t, d in zip(doc_types, documents)],
    hoverinfo='text'
)])

# Axis titles for a 2D figure are set with xaxis_title / yaxis_title;
# `scene` only configures 3D plots, so it would leave these axes untitled.
fig.update_layout(
    title='2D Chroma Vector Store Visualization',
    xaxis_title='x',
    yaxis_title='y',
    width=800,
    height=600,
    margin=dict(r=20, b=10, l=10, t=40)
)

fig.show()

In [30]:
# Same idea with one more axis: project the embeddings down to 3D with t-SNE

tsne = TSNE(random_state=42, n_components=3)
reduced_vectors = tsne.fit_transform(vectors)

# Build the 3D scatter plot: one marker per stored chunk, colored by doc type,
# with the type and the first 100 characters of the text shown on hover
fig = go.Figure(
    data=[
        go.Scatter3d(
            x=reduced_vectors[:, 0],
            y=reduced_vectors[:, 1],
            z=reduced_vectors[:, 2],
            mode='markers',
            marker={'size': 5, 'color': colors, 'opacity': 0.8},
            text=[
                f"Type: {doc_type}<br>Text: {doc_text[:100]}..."
                for doc_type, doc_text in zip(doc_types, documents)
            ],
            hoverinfo='text',
        )
    ]
)

# For 3D traces the axis titles live under the `scene` layout key
fig.update_layout(
    title='3D Chroma Vector Store Visualization',
    scene={'xaxis_title': 'x', 'yaxis_title': 'y', 'zaxis_title': 'z'},
    width=900,
    height=700,
    margin={'r': 10, 'b': 10, 'l': 10, 't': 40},
)

fig.show()