# Generate Data

Generates sentence embeddings, 2-D UMAP coordinates, a fitted UMAP model, and LLM-labeled clusters from a HuggingFace dataset or a JSON file.

In this notebook, we use the [Wikipedia sentences](https://huggingface.co/datasets/sentence-transformers/wikipedia-en-sentences) (HuggingFace) + [CHI 2024 paper titles](https://observablehq.com/@john-guerra/chi2024-papers) (JSON) datasets as examples. You may need to adjust the code to fit your needs.

## Helpers and imports

In [109]:
from datasets import load_dataset
import pandas as pd
import os
import json
import pickle
import numpy as np

import torch
from transformers import AutoModel, AutoTokenizer, PreTrainedTokenizer, PreTrainedModel
import vec2text

import umap
import plotly.express as px
from sklearn.cluster import KMeans

from openai import OpenAI
import ast

In [96]:
# Make sure the folder for pickled models exists. exist_ok=True replaces the
# check-then-create pattern, so the cell is race-free and safe to re-run.
os.makedirs("../models", exist_ok=True)

In [None]:
# load the api keys from the secrets file
# Default to None so `open_ai_key` is always defined: without this, a missing
# secrets file only printed a message and the next cell failed with a NameError.
# NOTE(review): with api_key=None the OpenAI client is documented to fall back
# to the OPENAI_API_KEY environment variable — confirm for the installed version.
open_ai_key = None
try:
    with open("../secrets.json", encoding="utf-8") as f:
        secrets = json.load(f)
    open_ai_key = secrets["openai"]
    print("API keys loaded.")
except FileNotFoundError:
    print("Secrets file not found. YOU NEED THEM TO RUN THIS.")

In [55]:
# Chat model used to name the clusters, and the API client that calls it.
OPENAI_MODEL = "gpt-4o-mini"
llm = OpenAI(api_key=open_ai_key)

In [112]:
# Notebook-wide settings.
# random_state: seed handed to UMAP, KMeans and NumPy below for reproducibility.
random_state = 42
# sample_size: sample size used by the dataset cells below.
sample_size = 1000
# presumably silences the HuggingFace tokenizers parallelism/fork warning — confirm
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [None]:
# check for cuda: use the GPU when available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# last expression of the cell, so the notebook displays the selected device
device

### Generate embeddings

In [None]:
def get_gtr_embeddings(text_list,
                       encoder: PreTrainedModel,
                       tokenizer: PreTrainedTokenizer) -> torch.Tensor:
    """Embed ``text_list`` with ``encoder`` and return the mean-pooled output.

    The texts are tokenized with truncation and padding to 128 tokens, moved to
    the module-level ``device``, and run through ``encoder`` with gradient
    tracking disabled. The last hidden state is then pooled with vec2text's
    ``mean_pool`` using the attention mask.
    """
    tokenizer_kwargs = dict(
        return_tensors="pt",
        max_length=128,
        truncation=True,
        padding="max_length",
    )
    batch = tokenizer(text_list, **tokenizer_kwargs).to(device)
    mask = batch['attention_mask']

    with torch.no_grad():
        token_states = encoder(input_ids=batch['input_ids'], attention_mask=mask).last_hidden_state
        pooled = vec2text.models.model_utils.mean_pool(token_states, mask)

    return pooled

# Load the GTR-T5 encoder (moved to `device`) and its matching tokenizer once;
# generate_embeddings() below reuses these module-level objects for every batch.
encoder = AutoModel.from_pretrained("sentence-transformers/gtr-t5-base").encoder.to(device)
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/gtr-t5-base")

In [94]:
def generate_embeddings(sentences, dataset_name, batch_size=100):
    """Embed ``sentences`` in batches, L2-normalize the rows and save them.

    Args:
        sentences: list of strings to embed.
        dataset_name: used both as the output folder and as the file prefix;
            the tensor is written to
            ``{dataset_name}/{dataset_name}_embeddings.pt``.
        batch_size: number of sentences passed to the encoder per call
            (defaults to 100, the value that used to be hard-coded).

    Returns:
        The normalized embeddings tensor (batches concatenated along dim 0).
    """
    # batch embedding process
    batches = [
        get_gtr_embeddings(sentences[start:start + batch_size], encoder, tokenizer)
        for start in range(0, len(sentences), batch_size)
    ]

    # combine embeddings into one tensor
    embeddings = torch.cat(batches, dim=0)

    # normalize each row to unit length; F.normalize clamps the norm with a
    # small eps, so an all-zero row stays zero instead of turning into NaNs
    embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)

    # save embeddings to file, creating the dataset folder if it is missing
    os.makedirs(dataset_name, exist_ok=True)
    torch.save(embeddings, f"{dataset_name}/{dataset_name}_embeddings.pt")

    print(f"Embeddings for {dataset_name} generated and saved to file.")
    return embeddings

### UMAP

In [122]:
def run_umap(embeddings, sentences, dataset_name, n_neighbors=100, min_dist=0.1):
    """Fit a 2-component cosine UMAP on ``embeddings`` and pickle the fitted model.

    Args:
        embeddings: tensor of embeddings; it is copied to the CPU before fitting.
        sentences: texts stored in the ``sentence`` column (presumably aligned
            row-for-row with ``embeddings`` — verify against caller).
        dataset_name: prefix of the pickled model file written under ``../models``.
        n_neighbors: forwarded to ``umap.UMAP``.
        min_dist: forwarded to ``umap.UMAP``.

    Returns:
        DataFrame with the columns ``sentence``, ``umap_x`` and ``umap_y``.
    """
    # copy to host memory once instead of once per UMAP call
    cpu_embeddings = embeddings.cpu()

    umap_model = umap.UMAP(n_neighbors=n_neighbors, min_dist=min_dist, n_components=2, metric='cosine', random_state=random_state)
    reducer = umap_model.fit(cpu_embeddings)
    umap_embeddings = reducer.transform(cpu_embeddings)

    # save results to df
    df_res = pd.DataFrame()
    df_res['sentence'] = sentences
    df_res['umap_x'] = umap_embeddings[:,0]
    df_res['umap_y'] = umap_embeddings[:,1]

    # save umap model; the context manager closes the file even if pickling fails
    umap_path = f'../models/{dataset_name}_umap_reducer'
    with open(umap_path, 'wb') as umap_pickle:
        pickle.dump(reducer, umap_pickle)

    print(f"UMAP model for {dataset_name} saved to file.")
    return df_res

### Cluster

In [100]:
def _parse_label_map(raw):
    """Turn the LLM's textual reply into a ``{cluster_number: phrase}`` dict."""
    if raw is None:
        raise ValueError("LLM returned no content to parse as cluster labels.")

    text = raw.strip()
    # Chat models often wrap the dict in a Markdown code fence even when told
    # to output only the dict; drop the fence so literal_eval sees just the dict.
    if text.startswith("```"):
        _, _, text = text.partition("\n")
        text = text.rsplit("```", 1)[0]

    parsed = ast.literal_eval(text.strip())
    if not isinstance(parsed, dict):
        raise ValueError(f"Expected a dict of cluster labels, got {type(parsed).__name__}.")

    label_map = {}
    for key, phrase in parsed.items():
        # Replies sometimes use string keys ("0"); the cluster ids these labels
        # are mapped onto are integers, so convert numeric strings to int.
        if isinstance(key, str):
            try:
                key = int(key.strip())
            except ValueError:
                pass
        label_map[key] = phrase
    return label_map


def label_clusters(cluster_samples):
    """Ask the LLM for a short descriptive phrase for every cluster.

    Args:
        cluster_samples: dict mapping a cluster number to a list of example
            sentences from that cluster.

    Returns:
        Dict mapping cluster number to the phrase chosen by the LLM.

    Raises:
        ValueError / SyntaxError: if the reply cannot be parsed as a dict literal.
    """
    # input to LLM: one titled section per cluster, one example per line
    sections = []
    for cluster, samples in cluster_samples.items():
        lines = [f'Cluster {cluster}', '----------------', *samples]
        sections.append('\n'.join(lines) + '\n\n')
    cluster_examples = ''.join(sections)

    system_prompt = """Given the example 10 sentences in each cluster, please assign a short phrase to describe each cluster. 
                    Format your answer as a dict where the key is the cluster number and the value is the phrase.
                    Just output the dict, nothing else."""
    user_prompt = f"{cluster_examples}"
    temperature = 0.1

    # get completion from LLM (ask it to label the clusters)
    completion = llm.chat.completions.create(
        model=OPENAI_MODEL,
        temperature=temperature,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ],
    )
    res = completion.choices[0].message.content

    # read res as a dictionary (tolerating code fences and string keys)
    label_map = _parse_label_map(res)
    print("Cluster labels generated.")
    return label_map


In [121]:
def get_clusters(df_res, embeddings, dataset_name, num_clusters=10):
    """Cluster the embeddings with KMeans, label the clusters and save the result.

    Args:
        df_res: DataFrame with a ``sentence`` column (plus the UMAP coordinates).
        embeddings: embeddings to cluster; a torch tensor is copied to the CPU
            and converted to NumPy first, anything else is passed through.
        dataset_name: output folder and file prefix; the records are written to
            ``{dataset_name}/{dataset_name}_data.json``.
        num_clusters: number of KMeans clusters.

    Returns:
        Copy of ``df_res`` with a ``cluster`` column holding the LLM labels.
    """
    # create copy of df_res
    df_cluster = df_res.copy()

    # scikit-learn works on host arrays; a CUDA tensor cannot be converted
    # implicitly, so move it over explicitly (run_umap does the same CPU copy)
    if isinstance(embeddings, torch.Tensor):
        features = embeddings.detach().cpu().numpy()
    else:
        features = embeddings

    # create and fit the kmeans model
    kmeans = KMeans(n_clusters=num_clusters, random_state=random_state)
    kmeans.fit(features)

    # get the cluster labels
    df_cluster['cluster'] = kmeans.labels_

    # sample up to 10 sentences from each cluster into a dictionary keyed by
    # cluster number; capping at the cluster size avoids the ValueError that
    # sample(10) raises on small clusters, and the seed keeps runs reproducible
    cluster_samples = {}
    for cluster in range(num_clusters):
        members = df_cluster.loc[df_cluster['cluster'] == cluster, 'sentence']
        n_examples = min(10, len(members))
        cluster_samples[cluster] = members.sample(n_examples, random_state=random_state).tolist()

    # get the labels for the clusters
    label_map = label_clusters(cluster_samples)

    # map df_cluster['cluster'] to the labels
    df_cluster['cluster'] = df_cluster['cluster'].map(label_map)

    # save data to json
    df_cluster.to_json(f'{dataset_name}/{dataset_name}_data.json', orient='records')

    print(f"Cluster info and umap coords for {dataset_name} saved to file.")
    return df_cluster

In [104]:
def visualize_clusters(df_cluster):
    """Show an interactive scatter plot of the UMAP coordinates, colored by cluster.

    Expects ``df_cluster`` to provide the columns ``umap_x``, ``umap_y``,
    ``sentence`` (shown on hover) and ``cluster``.
    """
    # title and fixed-size layout applied to the figure
    layout = {
        'title': 'UMAP of sentence embeddings with clusters',
        'autosize': False,
        'width': 1000,
        'height': 400,
    }

    scatter = px.scatter(
        df_cluster,
        x='umap_x',
        y='umap_y',
        color='cluster',
        hover_data=['sentence'],
    )
    scatter.update_layout(**layout)

    # render the plot
    scatter.show()

## Wikipedia Sentences (HuggingFace Example)

In [106]:
# Load the example dataset through the HuggingFace `datasets` library.
dataset_to_load = "sentence-transformers/wikipedia-en-sentences" # TODO: REPLACE WITH ANY HUGGINGFACE DATASET
data = load_dataset(dataset_to_load)

# Short name used as the output folder and as the prefix of the output files.
dataset_name = "wiki" # TODO: REPLACE WITH YOUR DATASET NAME

In [92]:
# Make sure the dataset's output folder exists; exist_ok=True replaces the
# check-then-create pattern and keeps the cell safe to re-run.
os.makedirs(dataset_name, exist_ok=True)

In [7]:
# sample n rows from the dataset (fewer if the split has less than n rows, so
# the cell also works after swapping in a smaller HuggingFace dataset)
n = 10000
train_split = data['train']
# reuse the notebook-wide seed instead of a second hard-coded 42
sample = train_split.shuffle(seed=random_state).select(range(min(n, len(train_split))))

In [None]:
# keep the first `sample_size` rows of the shuffled sample as a DataFrame
df = pd.DataFrame(sample[:sample_size])
# rename the column to 'Sentence'
df = df.rename(columns={'sentence': 'Sentence'})
# last expression of the cell: preview the first rows in the notebook output
df.head()

In [32]:
# plain Python list of the sentences; this is what gets embedded below
sentences = df['Sentence'].to_list() # get sentences from df

In [None]:
# generate embeddings (the function also saves them under the dataset folder)
embeddings = generate_embeddings(sentences, dataset_name)

In [None]:
# get umap coords; the fitted reducer is pickled under ../models as a side effect
df_res = run_umap(embeddings, sentences, dataset_name)

In [None]:
# cluster the data with the default num_clusters=10; this calls the OpenAI API
# to label the clusters and writes the result to the dataset's JSON file
df_cluster = get_clusters(df_res, embeddings, dataset_name)

In [None]:
# visualize the clusters as an interactive scatter plot of the UMAP coordinates
visualize_clusters(df_cluster)

## CHI 2024 Papers (JSON Example)

In [107]:
# load json data
file_name = "chi2024data.json" # TODO: REPLACE WITH YOUR FILE NAME
dataset_name = "chi" # TODO: REPLACE WITH YOUR DATASET NAME

# JSON text is UTF-8 by convention; being explicit avoids locale-dependent
# decoding errors on non-ASCII characters in the titles
with open(file_name, "r", encoding="utf-8") as f:
    data = json.load(f)

In [117]:
# Make sure the dataset's output folder exists; exist_ok=True replaces the
# check-then-create pattern and keeps the cell safe to re-run.
os.makedirs(dataset_name, exist_ok=True)

In [None]:
# the paper records are read from the top-level "contents" key of the JSON file
papers = data["contents"]

# get title from each paper (surrounding whitespace removed)
titles = [paper["title"].strip() for paper in papers]

# last expression of the cell: display how many titles were loaded
len(titles)

In [113]:
# randomly sample up to `sample_size` titles from this dataset; the cap keeps
# np.random.choice(replace=False) from raising when there are fewer titles
np.random.seed(random_state)
sampled_indices = np.random.choice(len(titles), min(sample_size, len(titles)), replace=False)

# apply the sample: previously the indices were computed but never used, so
# every title was embedded regardless of sample_size
sentences = [titles[i] for i in sampled_indices]

In [None]:
# preview the first five entries in the notebook output
sentences[:5]

In [None]:
# generate embeddings (the function also saves them under the dataset folder)
embeddings = generate_embeddings(sentences, dataset_name)

In [None]:
# get umap coords; the fitted reducer is pickled under ../models as a side effect
df_res = run_umap(embeddings, sentences, dataset_name)

In [None]:
# cluster the data into 25 clusters (instead of the default 10); this calls the
# OpenAI API to label the clusters and writes the result to the dataset's JSON file
df_cluster = get_clusters(df_res, embeddings, dataset_name, num_clusters=25)

In [None]:
# visualize the clusters as an interactive scatter plot of the UMAP coordinates
visualize_clusters(df_cluster)