In [96]:
import numpy as np
from transformers import AutoTokenizer, AutoModel
from datasets import load_dataset
from sklearn.model_selection import train_test_split
import torch
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity, cosine_distances

# Load Model & Dataset

In [5]:
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient

# Read the Hugging Face token stored in Kaggle secrets under the label
# "hf-token" and authenticate this session with it.
secret_label = "hf-token"
secrets_client = UserSecretsClient()
secret_value = secrets_client.get_secret(secret_label)
login(secret_value)

In [6]:
# Chatbot Arena conversations: questions, responses, and winner info.
DATASET_NAME = "lmsys/chatbot_arena_conversations"
ds = load_dataset(DATASET_NAME)

README.md:   0%|          | 0.00/7.00k [00:00<?, ?B/s]

(…)-00000-of-00001-cced8514c7ed782a.parquet:   0%|          | 0.00/41.6M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/33000 [00:00<?, ? examples/s]

In [7]:
from sentence_transformers import SentenceTransformer

# Sentence encoder used to embed the questions and search queries in this notebook.
EMBEDDING_MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

# Create embeddings for conversation

In [8]:
# Number of leading training records to scan for questions.
N_RECORDS = 5000

# Take the opening turn of conversation_a from each scanned record, keeping it
# only when the conversation is non-empty and that turn has role "user".
train_split = ds["train"]
# Clamp to the split size so a shorter split does not make select() fail.
sample = train_split.select(range(min(N_RECORDS, len(train_split))))
questions = [
    record["conversation_a"][0]["content"]
    for record in sample
    if record["conversation_a"] and record["conversation_a"][0]["role"] == "user"
]

# De-duplicate while keeping first-seen order (dict keys preserve insertion order).
unique_questions = list(dict.fromkeys(questions))
print(f'Remaining records after dedup: {len(unique_questions)}')

# Encode the unique questions; embeddings are produced in the same order as
# unique_questions, so position i in one lines up with position i in the other.
embeddings = model.encode(unique_questions, show_progress_bar=True)

Remaining records after dedup: 4299


Batches:   0%|          | 0/135 [00:00<?, ?it/s]

# Similarity Search
## Encode a set of records, then find the most similar ones for a new query

In [9]:
def find_similar_question(embeddings,texts, query, top_k=1):
    """Return the ``top_k`` entries of ``texts`` most similar to ``query``.

    The query is embedded with the notebook-level ``model`` and compared with
    every row of ``embeddings`` using cosine similarity.

    Args:
        embeddings: Corpus embeddings; row ``i`` is expected to be the
            embedding of ``texts[i]``.
        texts: Sequence of strings aligned with ``embeddings``.
        query: Text to search for.
        top_k: Maximum number of matches to return. Values <= 0 yield an
            empty list.

    Returns:
        A list of dicts ordered from most to least similar, each holding
        ``'rank'`` (1-based), ``'similarity_score'`` (cosine similarity
        formatted as a string with four decimals) and ``'search_result'``
        (the matching text). Empty when the corpus is empty or ``top_k <= 0``.
    """
    # Guard before encoding: a negative top_k would otherwise slice off the
    # *last* k results instead of limiting the output, and an empty corpus
    # would fail inside cosine_similarity.
    if top_k is not None and top_k <= 0:
        return []
    if len(embeddings) == 0 or len(texts) == 0:
        return []

    # encode the new question as a single-row batch
    query_embedding = model.encode([query], show_progress_bar=False)

    # similarity of the query against every corpus row
    scores = cosine_similarity(query_embedding, embeddings)[0]
    # ascending argsort, reversed to descending, truncated to the best top_k
    top_indices = np.argsort(scores)[::-1][:top_k]

    return [
        {
            'rank': rank,
            'similarity_score': f"{scores[index]:.4f}",
            'search_result': texts[index]
        }
        for rank, index in enumerate(top_indices, start=1)
    ]

In [16]:
# Example query: the five stored questions closest to a camera recommendation request.
find_similar_question(
    embeddings=embeddings,
    texts=unique_questions,
    query="recommend me a camera",
    top_k=5,
)

[{'rank': 1,
  'similarity_score': '0.5590',
  'search_result': 'Can you suggest a good mobile for longer battery life and good camera'},
 {'rank': 2,
  'similarity_score': '0.4983',
  'search_result': 'You are a media technology professor with 20 years of experience. Explain to me like I am five, how a camera works.'},
 {'rank': 3,
  'similarity_score': '0.4400',
  'search_result': 'best practice for using the iphone camera'},
 {'rank': 4,
  'similarity_score': '0.4228',
  'search_result': 'How to film at night with a professional camera'},
 {'rank': 5,
  'similarity_score': '0.4015',
  'search_result': 'Im bored. What do you recommend me to do?'}]

# Clustering

## K-means

In [57]:
from sklearn.cluster import KMeans

# Partition the question embeddings into a fixed number of clusters.
# random_state is pinned so repeated runs use the same seed.
num_clusters = 15
kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=42).fit(embeddings)
# Cluster id assigned to each row of `embeddings`.
cluster_labels = kmeans.labels_

In [58]:
# Tally how many questions were assigned to each cluster label.
values, counts = np.unique(cluster_labels, return_counts=True)
cluster_sizes = dict(zip(values, counts))

for val, count in cluster_sizes.items():
    print(f"Value: {val}, Count: {count}")

Value: 0, Count: 396
Value: 1, Count: 268
Value: 2, Count: 247
Value: 3, Count: 364
Value: 4, Count: 213
Value: 5, Count: 367
Value: 6, Count: 214
Value: 7, Count: 185
Value: 8, Count: 196
Value: 9, Count: 250
Value: 10, Count: 251
Value: 11, Count: 383
Value: 12, Count: 242
Value: 13, Count: 466
Value: 14, Count: 257


In [126]:
# Peek at the first five questions that were assigned to cluster 4.
indices_1 = np.flatnonzero(cluster_labels == 4)[:5]
selected_questions = [unique_questions[pos] for pos in indices_1]
selected_questions

['Write a python one-line lambda function that calculates dot product between two lists without using imported libraries. The entire function must fit on a single line and should begin like this: dot = lambda A, B: ',
 'Write a python one line lambda function that calculates mean of two lists, without using any imported libraries. The entire function should fit on a single line, start with this. mean = lambda A:',
 'write go code that calulates the first n prime numbers as fast as possible. n can be given as a command line parameter.',
 'Can you write code?',
 'write a bubble sort in python']

## Auto-Label each cluster

## Find the most representative questions in each cluster

In [123]:
# For every cluster, keep the questions whose embeddings lie closest (by cosine
# distance) to that cluster's k-means centroid, nearest first.
rep_records = {}
for i, center in enumerate(kmeans.cluster_centers_):
    # positions (into unique_questions / embeddings) of this cluster's members
    member_idx = np.flatnonzero(cluster_labels == i)
    # cosine distance from each member to the centroid, as a flat vector
    distance = cosine_distances(embeddings[member_idx], center.reshape(1, -1)).ravel()
    # the 20 nearest members, translated back to positions in unique_questions
    nearest_idx = member_idx[np.argsort(distance)[:20]]
    rep_records[f'cluster_{i}'] = [unique_questions[idx] for idx in nearest_idx]

In [127]:
# Preview the first five stored questions for cluster 0; each list in
# rep_records is ordered by ascending cosine distance to the cluster centroid.
rep_records['cluster_0'][:5]

['give me some tech related business ideas',
 'What would Melissa Perri suggest to a company operating as a feature factory?',
 '\n"Create a prompt that encourages the model to adopt a personal assistant-like mindset, complete a task, and present the solution in a clear and concise manner with a focus on results, innovation and efficiency." task : prepare a brochure about isec wealth management portfolio management services. the end result is the best low-key stylish and informative brochure that will entice potential investors that are looking for long term commitment, with fixed income portfolios among others available',
 "Hi I'm developing a startup. Give me five tips to create an attractive brand.",
 "Rewrite this marketing email to be more catchy for clients: Subject: Unlock Valuable Insights with ABC Consulting's Data & Analytics Services\n\nDear {Recipient},\n\nAre you aware of the untapped potential within your organization's data? In today's data-driven world, harnessing valua