### Installs

In [2]:
!pip install chromadb
!pip install sentence_transformers
!pip install plotly
!pip install nbformat



### Imports

In [3]:
import chromadb
import json
import random
import string
import numpy as np
import pandas as pd
import plotly.express as px
import time

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from chromadb.utils import embedding_functions
from datetime import datetime
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


### Set up the ChromaDB collection and model

In [4]:
# Create (or reuse) the "change_tickets" collection. Documents are embedded with the
# all-mpnet-base-v2 sentence-transformer and cosine is requested as the HNSW space.
try:
    emb_fn = embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name="all-mpnet-base-v2"
    )
    chroma_client = chromadb.Client()
    collection = chroma_client.get_or_create_collection(
        name="change_tickets",
        metadata={"hnsw:space": "cosine"},
        embedding_function=emb_fn,
    )
except ValueError as err:
    # Only ValueError is reported here; any other exception type propagates.
    print(f"Error: Could not create collection - {err}")
else:
    # Reached only when the try body completed without raising.
    print(f"Collection '{collection.name}' created successfully")

Collection 'change_tickets' created successfully


### Function to generate test change tickets

In [5]:
suffix_types = ['serv', 'nodeserv', 'nodeweb']
raw_change_tickets = [] # All generated tickets stored here
used_ticket_numbers = set()
app_names = set()

# Ticket numbers are "CHNG" followed by an integer from this inclusive range.
_TICKET_MIN, _TICKET_MAX = 100000, 999999


def generate_changes(ticket_count=1000, app_count=500, sample_size=5, seed=None):
    """Generate synthetic change tickets and print a small random sample.

    New application names and tickets are appended to the module-level
    ``app_names``, ``used_ticket_numbers`` and ``raw_change_tickets``
    containers, so repeated calls accumulate data.

    Args:
        ticket_count: Number of new tickets to create.
        app_count: Number of new unique application names to create first.
        sample_size: How many tickets to print; clamped to the number of
            tickets available so a small run does not fail.
        seed: Optional seed for a private random generator. When given,
            application names and ticket numbers are repeatable for the same
            starting state (release names still embed the current time).

    Returns:
        The module-level ``raw_change_tickets`` list, a list of dicts with
        ``ticket_number`` and ``ticket_description`` keys.

    Raises:
        ValueError: If ``ticket_count`` exceeds the number of unused ticket
            numbers, which would otherwise make the retry loop spin forever.
    """
    # Use an isolated generator when seeded; otherwise fall back to the
    # module-level functions, which expose the same method names.
    rng = random.Random(seed) if seed is not None else random

    free_numbers = (_TICKET_MAX - _TICKET_MIN + 1) - len(used_ticket_numbers)
    if ticket_count > free_numbers:
        raise ValueError(
            f'Cannot generate {ticket_count} unique ticket numbers; '
            f'only {free_numbers} remain'
        )

    for _ in range(app_count):
        while True:  # Keep generating names until a unique one is found
            prefix = ''.join(rng.choices(string.ascii_lowercase, k=5))
            app_name = f'{prefix}{rng.choice(suffix_types)}'
            if app_name not in app_names:
                app_names.add(app_name)
                break

    # Snapshot the names once instead of rebuilding a list per ticket. Sorting
    # makes seeded runs independent of set iteration order.
    app_pool = sorted(app_names)

    for _ in range(ticket_count):
        while True:  # Retry until the ticket number has not been used before
            ticket_number = f'CHNG{rng.randint(_TICKET_MIN, _TICKET_MAX)}'
            if ticket_number not in used_ticket_numbers:
                used_ticket_numbers.add(ticket_number)
                break

        app_name = rng.choice(app_pool)
        release_name = f'{app_name}-{datetime.now().strftime("%m%d%y%H%M%S%f")}'
        ticket_description = f'Recent deployment on {app_name} for manifest ID {release_name}'

        raw_change_tickets.append({
            'ticket_number': ticket_number,
            'ticket_description': ticket_description
        })

    # random.sample raises if asked for more items than exist, so clamp.
    shown = rng.sample(raw_change_tickets, min(sample_size, len(raw_change_tickets)))

    for ticket in shown:
        print(f'{ticket["ticket_number"]}: {ticket["ticket_description"]}')

    return raw_change_tickets

### Generate test data

In [6]:
# Build the synthetic ticket set used by the rest of the notebook.
# The cell's captured output shows this guard is satisfied when run in the notebook.
if __name__ == '__main__':
    change_data = generate_changes(
        ticket_count=1000,
        app_count=500,
        sample_size=5,
    )

CHNG748980: Recent deployment on gaczxserv for manifest ID gaczxserv-062623234552754078
CHNG870892: Recent deployment on ywoaynodeserv for manifest ID ywoaynodeserv-062623234552758217
CHNG420628: Recent deployment on uxtvinodeweb for manifest ID uxtvinodeweb-062623234552756085
CHNG624143: Recent deployment on pypxnnodeserv for manifest ID pypxnnodeserv-062623234552756316
CHNG653450: Recent deployment on tfymtnodeweb for manifest ID tfymtnodeweb-062623234552756392


### Transform test data for Chroma

In [7]:
# Split the generated tickets into two parallel lists:
#   change_ids     - the ticket numbers, used as record ids
#   change_tickets - "<ticket number>: <description>" strings, used as documents
change_ids = [entry['ticket_number'] for entry in change_data]
change_tickets = [
    f"{entry['ticket_number']}: {entry['ticket_description']}"
    for entry in change_data
]

# Both lists should report the same length.
print(len(change_ids))
print(len(change_tickets))

1000
1000


### Add change tickets to ChromaDB

In [8]:
# Store every ticket string under its ticket number; no per-document metadata is attached.
collection.add(ids=change_ids, documents=change_tickets)

# Report how many records the collection now holds.
print(collection.count())

1000


### Query against the database using our search string/vector

In [17]:
# Load the same sentence-transformer model name that the collection was configured with.
model_name = "all-mpnet-base-v2"  # Replace with the desired model name
model = SentenceTransformer(model_name)

# Embed the search string. encode() is given a one-item batch, so row 0 is
# the vector for search_text; convert it to a list of plain Python floats.
search_text = "hello"
search_vector = list(map(float, model.encode([search_text])[0]))

# Query by embedding and ask for documents, metadatas and embeddings to be
# returned; the embeddings are reused by the dimensionality-reduction cell.
results = collection.query(
    include=['documents', 'metadatas', 'embeddings'],
    n_results=5, # MUST BE GREATER THAN PERPLEXITY USED IN VISUALIZATION
    query_embeddings=[search_vector],
)

### How do we know this isn't just doing a text search?

In [10]:
# Plain substring scan over the stored ticket strings, for comparison with the vector query.
# NOTE(review): search_term ("otars") is not the search_text ("hello") used by the
# query cell; if this check is meant to cover the same query, the two should match - confirm.
search_term = "otars"
for ticket in (candidate for candidate in change_tickets if search_term in candidate):
    print(f"Found '{search_term}' in ticket '{ticket}'")

### Perform dimensionality reduction with PCA and t-SNE

In [19]:
# Get search result data
# Every field of the query result is a list with one entry per query; index 0
# selects the entry for the single search vector that was submitted.
data = {
    'embeddings': results["embeddings"][0],
    'documents': results["documents"][0],
    'metadatas': results["metadatas"][0],
    'ids': results["ids"][0]
}
df = pd.DataFrame(data["embeddings"])  # one row per returned document
n_samples, n_features = df.shape

# Perform dimensionality reduction with PCA
# PCA cannot produce more components than min(n_samples, n_features), so the
# requested 5 is clamped to what the result set supports.
pca_50 = PCA(n_components=min(5, n_samples, n_features), random_state=42)
pca_result_50 = pca_50.fit_transform(df)

# Perform dimensionality reduction with t-SNE
# Perplexity must stay below the number of samples, so it is clamped as well.
# A fixed random_state makes the layout repeatable between runs.
# NOTE(review): newer scikit-learn releases rename n_iter to max_iter (believed
# to be from 1.5 onward) - confirm against the installed version.
tsne = TSNE(
    n_components=3,
    verbose=0,
    perplexity=min(3, n_samples - 1),
    n_iter=300,
    random_state=42,
)
tsne_pca_results = tsne.fit_transform(pca_result_50)
tsne_pca_results = tsne_pca_results / 3  # shrink the coordinates by a fixed factor of 3


### Prepare data for visualization

In [68]:
# Prepare data for visualization
import plotly.graph_objects as go

# Each result is coloured by the index of its largest PCA component.
groups = np.argmax(pca_result_50, axis=1)

# One record per returned document: t-SNE coordinates plus the hover/label fields.
# The loop variable is doc_id rather than id so the builtin id() is not
# shadowed at notebook scope after this cell runs.
points = []
for position, document, metadata, doc_id, group in zip(
    tsne_pca_results.tolist(),
    data["documents"],
    data["metadatas"],
    data["ids"],
    groups.tolist(),
):
    points.append({
        'position_x': position[0],
        'position_y': position[1],
        'position_z': position[2],
        'document': document,
        'metadata': metadata,
        'id': doc_id,
        'group': group
    })

# Add a point to identify the search
# NOTE(review): these coordinates are the first three components of the raw
# search embedding, whereas every other point uses t-SNE output. The two are
# different coordinate spaces, so the marker's position relative to the results
# is not meaningful. Fixing it requires embedding the search vector together
# with the results before the reduction step.
search_point = {
    'position_x': search_vector[0],
    'position_y': search_vector[1],
    'position_z': search_vector[2],
    'document': search_text,
    'metadata': None,
    'id': f'<b style="font-size: 14px">{search_text}</b>',
    'group': 0
}

# The search point must stay last: the marker size list below relies on it.
points.append(search_point)

# Build the 3D scatter trace
fig = go.Figure(data=[
    go.Scatter3d(
        x=[point['position_x'] for point in points],
        y=[point['position_y'] for point in points],
        z=[point['position_z'] for point in points],
        mode='markers',
        marker=dict(
            # Results are coloured by group through the colorscale; the search point is orange.
            color=[point['group'] if point is not search_point else 'orange' for point in points],
            # Results are circles; the search point is a diamond.
            symbol=['circle' if point is not search_point else 'diamond' for point in points],
            # Results get size 12; the final (search) point gets 15.
            size=[12] * (len(points) - 1) + [15],
            colorscale='Viridis',
            colorbar=dict(title='Group')
        ),
        # Values referenced as customdata[0..2] by the hover template.
        customdata=[(point['id'], point['document'], point['group']) for point in points],
        hovertemplate=
            '<b>ID:</b> %{customdata[0]}<br>'
            '<b>Document:</b> %{customdata[1]}<br>'
            '<b>Group:</b> %{customdata[2]}<br>'
            '<extra></extra>',
    )
])

# Add annotations for ID labels
font_size = 10
annotations = [
    go.layout.scene.Annotation(
        x=point['position_x'],
        y=point['position_y'],
        z=point['position_z'],
        xanchor='center',
        yanchor='top',
        text=point['id'],
        showarrow=True,
        font=dict(size=font_size, color='white')
    )
    for point in points
]

# Set plot layout options
fig.update_layout(
    scene=dict(
        xaxis=dict(title='X'),
        yaxis=dict(title='Y'),
        zaxis=dict(title='Z'),
    ),
    width=1024,
    height=768,
    template='plotly_dark',
    scene_annotations=annotations,
)

fig.show()