In [None]:
!pip install pandas cohere datasets altair topically umap-learn bertopic

In [None]:
import os
from typing import Optional, List

import altair as alt
import cohere
import numpy as np
import pandas as pd
import topically
import umap
from bertopic import BERTopic
from datasets import load_dataset

# Load the en-US training split of the AmazonScience/massive dataset
dataset = load_dataset("AmazonScience/massive", "en-US", split="train")

# For a simple demo, keep only 100 randomly chosen records.
# The fixed random_state makes the draw repeatable, so a fresh
# "Restart & Run All" works on the same rows every time.
df = pd.DataFrame(dataset).sample(100, random_state=42)


In [None]:
# Initialize the Cohere client.
# The key is read from the COHERE_API_KEY environment variable so it never has
# to be pasted into (and saved with) the notebook.
api_key = os.environ.get("COHERE_API_KEY", "")
if not api_key:
    raise ValueError(
        "No Cohere API key found: set the COHERE_API_KEY environment variable "
        "before running this cell."
    )
co = cohere.Client(api_key)

# Embed every utterance with Cohere's embedding model and keep the result as a
# numpy array (one row per utterance, in the same order as df).
embeddings = np.array(
    co.embed(texts=list(df['utt']), truncate="RIGHT").embeddings
)

# Chart title used by the plotting cell
title = "Commands to AI personal assistant"

In [None]:
from sklearn.cluster import KMeans

n_clusters = 10

# Configure BERTopic to cluster with KMeans, using n_clusters clusters.
# The fixed random_state removes KMeans' own run-to-run randomness
# (other steps inside BERTopic may still vary between runs).
cluster_model = KMeans(n_clusters=n_clusters, random_state=42)
topic_model = BERTopic(hdbscan_model=cluster_model)

# df['utt'] is the column of text we're modeling. It is passed as a plain list,
# in the same order used to build `embeddings`, so topics line up row by row.
df['topic'], probabilities = topic_model.fit_transform(list(df['utt']), embeddings)



In [None]:
# Ask the fitted topic model for its topic labels, then attach to every row
# the label found at the position given by that row's topic id.
keywords = topic_model.generate_topic_labels()
df['cluster_keywords'] = df['topic'].map(
    lambda topic_id: keywords[topic_id]
)


In [None]:
# Display the dataframe, which by this point has the 'topic' and
# 'cluster_keywords' columns added by the cells above.
# NOTE(review): the saved output also lists 'x' and 'y' columns, which are only
# created in the plotting cell further down — presumably this cell was last
# executed after that one; re-run top to bottom to confirm.
df

Unnamed: 0,id,locale,partition,scenario,intent,utt,annot_utt,worker_id,slot_method,judgments,topic,cluster_keywords,x,y
8591,12757,en-US,train,1,54,send a taxi to come pick me up from my location,send a [transport_type : taxi] to come pick me...,169,"{'slot': [], 'method': []}","{'worker_id': [], 'intent_score': [], 'slots_s...",0,0_train_ticket_me,17.128611,7.657280
3597,5337,en-US,train,16,23,alexa provide me the alarm times you have set,alexa provide me the alarm times you have set,571,"{'slot': [], 'method': []}","{'worker_id': [], 'intent_score': [], 'slots_s...",3,3_alarm_at_am,14.907675,4.417607
8130,12097,en-US,train,6,6,let me see cultural events for this weekend in...,let me see cultural events for [date : this we...,646,"{'slot': [], 'method': []}","{'worker_id': [], 'intent_score': [], 'slots_s...",7,7_in_events_there,16.387617,6.045707
8488,12612,en-US,train,1,2,get me a train ticket from mumbai to goa,get me a [transport_type : train] ticket from ...,328,"{'slot': [], 'method': []}","{'worker_id': [], 'intent_score': [], 'slots_s...",0,0_train_ticket_me,17.109755,7.924650
228,342,en-US,train,8,56,get a cup of coffee ready now,get a cup of coffee ready now,203,"{'slot': [], 'method': []}","{'worker_id': [], 'intent_score': [], 'slots_s...",2,2_coffee_room_of,13.210105,5.472404
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11326,16894,en-US,train,7,44,how me my last two emails,how me my last two emails,521,"{'slot': [], 'method': []}","{'worker_id': [], 'intent_score': [], 'slots_s...",8,8_reminders_email_new,14.339766,6.101216
5310,7856,en-US,train,2,50,set a meeting with the h. r. department for th...,set a [event_name : meeting] with the [person ...,246,"{'slot': [], 'method': []}","{'worker_id': [], 'intent_score': [], 'slots_s...",5,5_meeting_with_calendar,15.343475,5.758651
616,922,en-US,train,3,45,play music from my favorite pandora artist sta...,play music from my favorite pandora artist sta...,683,"{'slot': [], 'method': []}","{'worker_id': [], 'intent_score': [], 'slots_s...",1,1_play_music_from,11.992180,8.217563
1911,2811,en-US,train,8,18,make this room brighter,make this [house_place : room] brighter,250,"{'slot': [], 'method': []}","{'worker_id': [], 'intent_score': [], 'slots_s...",2,2_coffee_room_of,11.887174,6.187707


In [None]:
def interactive_clusters_scatterplot(
        df: pd.DataFrame,
        fields_in_tooltip: Optional[List[str]] = None,
        title: str = '',
        title_column: str = 'keywords'
):
    """Build an interactive Altair scatterplot of points colored by cluster.

    Each row of ``df`` is drawn as a circle positioned by its ``x`` and ``y``
    columns and colored by the nominal value in ``title_column``. The legend is
    bound to a selection on ``title_column``: points in the selected legend
    entries are drawn at full opacity, all others at opacity 0.2.

    Args:
        df: Data to plot. Must contain the columns ``x``, ``y``,
            ``title_column`` and every field named in ``fields_in_tooltip``.
        fields_in_tooltip: Column names to show in the hover tooltip.
        title: Chart title.
        title_column: Column holding the cluster label used for color and legend.

    Returns:
        The configured Altair chart (600x400, axes hidden, ``#F6f6f6``
        background, ``category20`` color scheme).
    """
    if fields_in_tooltip is None:
        # NOTE(review): this fallback (a single empty field name) is kept from
        # the original; confirm it renders as intended when no tooltip fields
        # are supplied.
        fields_in_tooltip = ['']

    # Prefer the newer selection_point API when this Altair version has it,
    # falling back to selection_multi on older releases.
    make_selection = getattr(alt, 'selection_point', None) or alt.selection_multi
    selection = make_selection(fields=[title_column], bind='legend')

    chart = alt.Chart(df).mark_circle(
        size=20, stroke='#666', strokeWidth=1, opacity=0.1
    ).encode(
        x=alt.X('x',
                scale=alt.Scale(zero=False),
                axis=alt.Axis(labels=False, ticks=False, domain=False)),
        y=alt.Y('y',
                scale=alt.Scale(zero=False),
                axis=alt.Axis(labels=False, ticks=False, domain=False)),
        color=alt.Color(f'{title_column}:N',
                        legend=alt.Legend(columns=1,
                                          symbolLimit=0,
                                          orient='right',
                                          labelFontSize=12)),
        opacity=alt.condition(selection, alt.value(1), alt.value(0.2)),
        tooltip=fields_in_tooltip
    ).properties(
        width=600,
        height=400,
        title=title
    )

    # add_params is the newer name for add_selection; use whichever exists.
    attach_selection = getattr(chart, 'add_params', None) or chart.add_selection
    chart = attach_selection(selection)

    # configure() replaces the chart's entire config object, so it has to run
    # BEFORE the configure_*() calls; otherwise the legend and view settings
    # below would be thrown away.
    return chart.configure(
        background="#F6f6f6"
    ).configure_legend(
        labelLimit=0
    ).configure_view(
        strokeWidth=0
    ).configure_range(
        category={'scheme': 'category20'}
    )






# Reduce the embeddings with UMAP so they can be plotted; the first two output
# columns become the x/y coordinates. The fixed random_state keeps the layout
# the same from run to run.
n_neighbors = 15
reducer = umap.UMAP(n_neighbors=n_neighbors, random_state=42)
umap_embeds = reducer.fit_transform(embeddings)
df['x'] = umap_embeds[:, 0]
df['y'] = umap_embeds[:, 1]

# Specify the names of columns to plot
title_column = 'cluster_keywords'
fields_in_tooltip = ['utt', 'topic', 'cluster_keywords']
title = "Commands to AI personal assistant"

chart = interactive_clusters_scatterplot(
    df,
    fields_in_tooltip=fields_in_tooltip,
    title=f"{title} - {n_clusters} clusters",
    title_column=title_column,
)
chart

