### Installs

In [162]:
!pip install chromadb
!pip install sentence_transformers
!pip install plotly
!pip install nbformat
!pip install ipywidgets
!pip install faker-microservice
!pip install openai

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


### Imports

In [163]:
import json
import os
import random
import string
import time
from datetime import datetime

import chromadb
import faker_microservice
import ipywidgets
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from chromadb.utils import embedding_functions
from faker import Faker
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

### Set up the ChromaDB collection and model

In [164]:

# Local embedding model; kept available so the embedding backend can be swapped below.
local_emb_fn = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-mpnet-base-v2")

# SECURITY: never hardcode credentials in a notebook. The key is read from the
# environment instead. NOTE(review): the key that was previously committed in this
# cell must be treated as compromised and revoked.
openai_api_key = os.environ.get("OPENAI_API_KEY")
if not openai_api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Export it before starting Jupyter "
        "(the OpenAI embedding function requires an API key)."
    )

openai_emb_fn = embedding_functions.OpenAIEmbeddingFunction(
    api_key=openai_api_key,
    model_name="text-embedding-ada-002"
)

# Embedding function used by the collection and by later cells.
# It can be switched to local_emb_fn.
emb_fn = openai_emb_fn
try:
    chroma_client = chromadb.Client()
    # Cosine distance is used for nearest-neighbour search in this collection.
    collection = chroma_client.get_or_create_collection(
        name="change_tickets",
        embedding_function=emb_fn,
        metadata={"hnsw:space": "cosine"}
    )
except ValueError as err:
    print(f"Error: Could not create collection - {err}")
else:
    print(f"Collection '{collection.name}' created successfully")

Collection 'change_tickets' created successfully


### Function to generate test change tickets

In [165]:
# Module-level state shared by generate_changes(). Because it lives at module level it
# accumulates across calls; re-running this cell resets it.

# Suffixes appended to a generated microservice name to form an application name
suffix_types = ['serv', 'nodeserv', 'nodeweb']
raw_change_tickets = [] # All generated tickets stored here
used_ticket_numbers = set() # Ticket numbers already issued, so none is handed out twice
app_names = set() # Unique application names generated so far
fake = Faker()
# Register the provider that supplies fake.microservice(), used by generate_changes()
fake.add_provider(faker_microservice.Provider)

def generate_changes(ticket_count=1000, app_count=500, sample_size=5):
    """Generate synthetic change tickets and print a random sample of them.

    New application names and tickets are added to the module-level
    ``app_names``, ``used_ticket_numbers`` and ``raw_change_tickets``
    collections, so repeated calls accumulate data rather than starting over.

    Args:
        ticket_count: Number of change tickets to create in this call.
        app_count: Number of new, unique application names to create in this call.
        sample_size: Number of tickets to print as a preview. Clamped to the
            number of tickets available so an oversized value cannot raise.

    Returns:
        The module-level ``raw_change_tickets`` list (every ticket generated so
        far). Each ticket is a dict with ``ticket_number`` and
        ``ticket_description`` keys.

    Raises:
        ValueError: If the requested number of tickets cannot be given unique
            numbers in the CHNG100000-CHNG999999 range.
        IndexError: If tickets are requested while no application names exist.
    """
    # Ticket numbers are drawn from randint(100000, 999999); without this guard the
    # uniqueness loop below would spin forever once the range is exhausted.
    ticket_number_space = 999999 - 100000 + 1
    if len(used_ticket_numbers) + ticket_count > ticket_number_space:
        raise ValueError(
            f"Cannot generate {ticket_count} more unique ticket numbers: "
            f"only {ticket_number_space - len(used_ticket_numbers)} remain"
        )

    strip_punctuation = str.maketrans('', '', string.punctuation)
    for _ in range(app_count):
        # Keep generating names until a unique one is found.
        # NOTE(review): this never terminates if the provider cannot produce enough
        # distinct names for app_count - the size of its name pool is not visible here.
        while True:
            prefix = fake.microservice().translate(strip_punctuation)
            app_name = f'{prefix}{random.choice(suffix_types)}'
            if app_name not in app_names:
                app_names.add(app_name)
                break

    # The set of names no longer changes, so build the list once instead of per ticket.
    app_name_pool = list(app_names)

    for _ in range(ticket_count):
        while True:
            ticket_number = f'CHNG{random.randint(100000, 999999)}'
            if ticket_number not in used_ticket_numbers:
                used_ticket_numbers.add(ticket_number)
                break

        app_name = random.choice(app_name_pool)
        # Microsecond-resolution timestamp makes the manifest ID distinct per ticket.
        release_name = f'{app_name}-{datetime.now().strftime("%m%d%y%H%M%S%f")}'
        ticket_description = f'Recent deployment on {app_name} for manifest ID {release_name}'

        raw_change_tickets.append({
            'ticket_number': ticket_number,
            'ticket_description': ticket_description
        })

    # random.sample raises if asked for more items than exist (or a negative count).
    preview_size = max(0, min(sample_size, len(raw_change_tickets)))
    for ticket in random.sample(raw_change_tickets, preview_size):
        print(f'{ticket["ticket_number"]}: {ticket["ticket_description"]}')

    return raw_change_tickets

### Generate test data

In [166]:
# Build the synthetic dataset used by every later cell: 2500 application names and
# 1000 tickets, with 5 of them printed as a preview. The __main__ guard keeps this from
# running if the code is imported as a module rather than executed directly.
if __name__ == '__main__':
    change_data = generate_changes(ticket_count=1000, app_count=2500, sample_size=5)

CHNG572939: Recent deployment on legacymemcacheadaptornodeserv for manifest ID legacymemcacheadaptornodeserv-063023054624026991
CHNG830634: Recent deployment on legacyhelpuinodeweb for manifest ID legacyhelpuinodeweb-063023054624063333
CHNG701653: Recent deployment on legacyprintapinodeserv for manifest ID legacyprintapinodeserv-063023054624069754
CHNG932930: Recent deployment on newsqliteuiserv for manifest ID newsqliteuiserv-063023054623991276
CHNG495499: Recent deployment on elasticsearchnodeserv for manifest ID elasticsearchnodeserv-063023054623999042


### Transform test data for Chroma

In [167]:
# Chroma takes parallel lists: one id per document, in the same order.
change_ids = [ticket['ticket_number'] for ticket in change_data]
change_tickets = [
    f"{ticket['ticket_number']}: {ticket['ticket_description']}"
    for ticket in change_data
]

# Both lists must be the same length; print them as a quick sanity check.
print(len(change_ids))
print(len(change_tickets))

1000
1000


### Add change tickets to ChromaDB

In [168]:
# Store every ticket string in the collection under its ticket number.
# Only documents are passed, so embeddings come from the collection's embedding function.
collection.add(
    documents=change_tickets,
    # No metadata is attached; a `metadatas=[{...}, ...]` list (one dict per document) could be supplied here.
    ids=change_ids
)
# Number of documents now stored in the collection
print(collection.count())

1000


### Query against the database using our search string/vector

In [169]:

# Query text to search the change tickets with.
search_text = "Increase in FCIs on /api/v1/redis/status over the past 5 minutes"
# NOTE(review): an earlier remark here said n_results must exceed the t-SNE perplexity.
# That does not hold (n_results is 5, the visualization uses perplexity=30); the
# perplexity limit applies to the total number of points t-SNE is fitted on.
# Embeddings are included because the visualization cells reuse them.
search_results = collection.query(
    query_texts=[search_text],
    n_results=5, # Number of nearest tickets to return
    include=['documents', 'metadatas', 'embeddings', 'distances'],
    # Optional filters, left disabled:
    #where={"metadata_field": "is_equal_to_this"},
    #where_document={"$contains":"search_string"}
)

# Show only the matched documents and their distances (the embeddings are too large to print)
print(json.dumps({'documents':search_results['documents'], 'distances': search_results['distances']}, indent=2))

{
  "documents": [
    [
      "CHNG272853: Recent deployment on redisfenodeweb for manifest ID redisfenodeweb-063023054623990462",
      "CHNG788134: Recent deployment on redisbackendserv for manifest ID redisbackendserv-063023054624070282",
      "CHNG369276: Recent deployment on redisapinodeweb for manifest ID redisapinodeweb-063023054623985574",
      "CHNG134333: Recent deployment on redisdbnodeserv for manifest ID redisdbnodeserv-063023054624059797",
      "CHNG898469: Recent deployment on newredisdashboardnodeweb for manifest ID newredisdashboardnodeweb-063023054624021865"
    ]
  ],
  "distances": [
    [
      0.20368337631225586,
      0.20677727460861206,
      0.2085297703742981,
      0.20867490768432617,
      0.2111315131187439
    ]
  ]
}


### How do we know this isn't just doing a text search?

In [170]:
search_term = "redis"
# Plain substring scan over the raw ticket strings, for comparison with the vector query.
keyword_hits = [ticket for ticket in change_tickets if search_term in ticket]
for ticket in keyword_hits:
    print(f"Found '{search_term}' in ticket '{ticket}'")

Found 'redis' in ticket 'CHNG369276: Recent deployment on redisapinodeweb for manifest ID redisapinodeweb-063023054623985574'
Found 'redis' in ticket 'CHNG272853: Recent deployment on redisfenodeweb for manifest ID redisfenodeweb-063023054623990462'
Found 'redis' in ticket 'CHNG844456: Recent deployment on oldredisnodeserv for manifest ID oldredisnodeserv-063023054623998702'
Found 'redis' in ticket 'CHNG507000: Recent deployment on redismanagerserv for manifest ID redismanagerserv-063023054624012776'
Found 'redis' in ticket 'CHNG793099: Recent deployment on newredisfrontendserv for manifest ID newredisfrontendserv-063023054624017667'
Found 'redis' in ticket 'CHNG225057: Recent deployment on legacyredisfenodeserv for manifest ID legacyredisfenodeserv-063023054624020097'
Found 'redis' in ticket 'CHNG898469: Recent deployment on newredisdashboardnodeweb for manifest ID newredisdashboardnodeweb-063023054624021865'
Found 'redis' in ticket 'CHNG148642: Recent deployment on redisdbnodeserv fo

### Perform dimensionality reduction with PCA & t-SNE

In [171]:

# Seed for PCA / t-SNE so the projection is reproducible across runs.
RANDOM_STATE = 42

# Get all documents
collection_data = collection.get(include=["documents", "metadatas", "embeddings"])

# Create a 'document' for the search query.
# The embedding function expects a list of documents; a bare string can be treated as a
# sequence of single-character documents, so the query is wrapped in a list.
search_query_embeddings = emb_fn([search_text])[0]
search_query_data = {
    'embeddings': [search_query_embeddings],
    'documents': [search_text],
    'metadatas': [None],
    'ids': ['SEARCH_QUERY']
}

# Get search result documents (query results are nested one list per query text,
# so take the first item)
search_results_data = {
    'embeddings': search_results["embeddings"][0],
    'documents': search_results["documents"][0],
    'metadatas': search_results["metadatas"][0],
    'ids': search_results["ids"][0],
    'distances': search_results["distances"][0]
}

# Combine all docs, search query, and search results into one structure by joining the
# arrays (THIS IS A HACK SHOULD BE IMPROVED).
# Row order is: every collection document, then the search query, then the search results.
# list(...) around the embeddings keeps `+` a concatenation even if the client returns
# array-like objects instead of plain lists.
collection_size = len(collection_data["ids"])
data = {
    'embeddings': list(collection_data["embeddings"]) + search_query_data["embeddings"] + list(search_results_data["embeddings"]),
    'documents': collection_data["documents"] + search_query_data["documents"] + search_results_data["documents"],
    'metadatas': collection_data["metadatas"] + search_query_data["metadatas"] + search_results_data["metadatas"],
    'ids': collection_data["ids"] + search_query_data["ids"] + search_results_data["ids"],
    # Only search results have a distance. Pad with one None per collection row plus one
    # for the query so this column stays aligned with the others.
    'distances': [None] * collection_size + [None] + list(search_results_data["distances"]),
}

# Every column must describe the same rows; a mismatch would silently mislabel points later.
assert all(len(column) == len(data["ids"]) for column in data.values()), "data columns are misaligned"

# Extract only embeddings from data
df = pd.DataFrame(data["embeddings"])

# Perform dimensionality reduction with PCA.
# NOTE: despite the "_50" in the names, this keeps 5 components; the names are left
# unchanged because later cells reference pca_result_50.
pca_50 = PCA(n_components=5, random_state=RANDOM_STATE)
pca_result_50 = pca_50.fit_transform(df)

# Perform dimensionality reduction with t-SNE.
# Perplexity must be less than the total number of rows in the dataset being fitted.
# NOTE(review): newer scikit-learn releases rename n_iter to max_iter - confirm against
# the installed version.
tsne = TSNE(n_components=3, verbose=0, perplexity=30, n_iter=600, random_state=RANDOM_STATE)
tsne_pca_results = tsne.fit_transform(pca_result_50)
# Shrink the coordinates by a constant factor for plotting.
tsne_pca_results = tsne_pca_results / 3


### Prepare data for visualization

In [172]:
# Prepare data for visualization
# Group label per point: index of the PCA component with the largest value for that point.
groups = np.argmax(pca_result_50, axis=1)

# Hover text shared by every marker trace.
_HOVER_TEMPLATE = (
    '<b>ID:</b> %{customdata[0]}<br>'
    '<b>Document:</b> %{customdata[1]}<br>'
    '<b>Group:</b> %{customdata[2]}<br>'
    '<extra></extra>'
)


def _build_point(position, document, metadata, point_id, group):
    """Bundle one projected point's 3D coordinates and descriptive fields into a dict."""
    return {
        'position_x': position[0],
        'position_y': position[1],
        'position_z': position[2],
        'document': document,
        'metadata': metadata,
        'id': point_id,
        'group': group
    }


def _marker_trace(points, color):
    """Build a Scatter3d marker trace for `points`; the last point is drawn slightly larger."""
    return go.Scatter3d(
        x=[point['position_x'] for point in points],
        y=[point['position_y'] for point in points],
        z=[point['position_z'] for point in points],
        mode='markers',
        marker=dict(
            color=color,
            symbol='circle',
            size=[12] * (len(points) - 1) + [15],
            colorscale='Viridis',
            colorbar=dict(title='Group')
        ),
        customdata=[(point['id'], point['document'], point['group']) for point in points],
        hovertemplate=_HOVER_TEMPLATE,
    )


# Distances exist only for the search-result rows, which are the last rows of `data`.
# Left-pad with None so the list lines up with the other columns: zip() stops at its
# shortest input, so a short distances list would pair distances with the wrong rows.
distances = list(data["distances"])
distances = [None] * (len(data["ids"]) - len(distances)) + distances

# One tuple per row: (position, document, metadata, id, distance, group)
rows = list(zip(
    tsne_pca_results.tolist(),
    data["documents"],
    data["metadatas"],
    data["ids"],
    distances,
    groups.tolist(),
))

# Define search result points (the only rows that carry a distance)
search_result_points = [
    _build_point(position, document, metadata, point_id, group)
    for position, document, metadata, point_id, distance, group in rows
    if distance is not None
]

# Define initial search point, used if no SEARCH_QUERY row is present
search_point = {
    'position_x': 0,
    'position_y': 0,
    'position_z': 0,
    'document': "NO SEARCH INPUT FOUND",
    'metadata': None,
    'id': '<b style="font-size: 14px">"NO SEARCH INPUT FOUND"</b>',
    'distance': 0,
    'group': 0
}

# Extract search point from dataset based on id
for position, _document, _metadata, point_id, _distance, _group in rows:
    if point_id == "SEARCH_QUERY":
        search_point = {
            'position_x': position[0],
            'position_y': position[1],
            'position_z': position[2],
            'document': search_text,
            'metadata': None,
            'id': f'<b style="font-size: 14px">{search_text}</b>',
            'distance': 0,
            'group': 0
        }

# Define rest of collection points (everything except the search query itself)
collection_points = [
    _build_point(position, document, metadata, point_id, group)
    for position, document, metadata, point_id, _distance, group in rows
    if point_id != "SEARCH_QUERY"
]

# Define traces: search results, the rest of the collection, then the search point
traces = [
    _marker_trace(search_result_points, 'orange'),
    _marker_trace(collection_points, 'gray'),
    go.Scatter3d(
        x=[search_point['position_x']],
        y=[search_point['position_y']],
        z=[search_point['position_z']],
        mode='markers',
        marker=dict(
            color='orange',
            symbol='diamond',
        ),
        customdata=[[search_point['id'], search_point['document'], search_point['group']]],
        hovertemplate=_HOVER_TEMPLATE,
    ),
]

# Add relationships trace: one line from the search point to each search result
traces.extend(
    go.Scatter3d(
        x=[search_point['position_x'], point['position_x']],
        y=[search_point['position_y'], point['position_y']],
        z=[search_point['position_z'], point['position_z']],
        name="V0",
        legendgroup="V0",
        showlegend=False,
        mode="lines",
        line=dict(color="orange"),
    )
    for point in search_result_points
)

# Add traces to figure
fig = go.Figure(data=traces)

# Add annotations for ID labels on the search results
font_size = 10
annotations = [
    go.layout.scene.Annotation(
        x=point['position_x'],
        y=point['position_y'],
        z=point['position_z'],
        xanchor='center',
        yanchor='top',
        text=point['id'],
        showarrow=True,
        font=dict(size=font_size, color='white')
    )
    for point in search_result_points
]

# Set plot layout options. update_layout returns the figure, so leaving it as the
# last expression of the cell is what renders the plot.
fig.update_layout(
    scene=dict(
        xaxis=dict(title='X'),
        yaxis=dict(title='Y'),
        zaxis=dict(title='Z'),
    ),
    width=1024,
    height=768,
    template='plotly_dark',
    scene_annotations=annotations,
)
