In [6]:
import os
from pathlib import Path
from huggingface_hub import login
from datasets import load_dataset

from dotenv import load_dotenv

load_dotenv()

True

In [7]:
login(token = os.getenv("HF_TOKEN"))
dataset = load_dataset("nvidia/Nemotron-Personas")

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [9]:
import pandas as pd

df = pd.read_parquet("../Nemotron_Personas.parquet").iloc[:3000,:]

FileNotFoundError: [Errno 2] No such file or directory: '../Nemotron_Personas.parquet'

In [None]:
df.info()

In [None]:
df.columns

In [None]:
not_required_columns=['zipcode', 'country']
persona_columns=['persona', 'professional_persona', 'sports_persona',
                'arts_persona', 'travel_persona', 'culinary_persona']
categorical_columns=['sex', 'marital_status','education_level', 
                     'bachelors_field', 'occupation', 'city', 'state']

In [None]:
df[persona_columns].head()

## expertise_list

-TF-IDF 

In [None]:
import ast
df['hobbies_and_interests_list'] = df['hobbies_and_interests_list'].apply(ast.literal_eval)

In [None]:
total_hobbies = set()
for hobbies in df['hobbies_and_interests_list']:
    total_hobbies.update(hobbies)


len(total_hobbies)

In [None]:
total_hobbies


In [None]:
import json

JSON_FILE = '../all_hobbies.json'

try:
    with open(JSON_FILE, 'r') as f:
        total_hobbies = json.load(f)
    print("Successfully loaded 'total_hobbies' from JSON file.")

except FileNotFoundError:
    print(f"JSON file '{JSON_FILE}' not found.")
    with open(JSON_FILE, 'w') as f:
        json.dump(list(total_hobbies), f)

except json.JSONDecodeError:
    print(f"Error decoding JSON from file '{JSON_FILE}'.")


In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd

import networkx as nx

model = SentenceTransformer("all-MiniLM-L6-v2")

hobby_list = list(total_hobbies)
embeddings = model.encode(hobby_list, show_progress_bar=True)

similarity_matrix = cosine_similarity(embeddings)

threshold = 0.75



G = nx.Graph()
G.add_nodes_from(range(len(hobby_list)))

for i in range(len(hobby_list)):
    for j in range(i + 1, len(hobby_list)):
        if similarity_matrix[i, j] > threshold:
            G.add_edge(i, j)

merged_hobbies = []
for component in nx.connected_components(G):
    group = [hobby_list[idx] for idx in component]
    merged_hobbies.append(group)

canonical_hobbies = [g[0] for g in merged_hobbies]

df_merged = pd.DataFrame({
    "canonical_hobby": canonical_hobbies,
    "merged_group": [", ".join(g) for g in merged_hobbies]
})

df_merged.to_csv("semantically_merged_hobbies.csv", index=False)
print(f"✅ {len(canonical_hobbies)} unique semantic hobby groups created and saved to semantically_merged_hobbies.csv")


In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm


model_name = "Qwen/Qwen3-Embedding-0.6B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)


df_merged = pd.read_csv("semantically_merged_hobbies.csv")
hobbies = df_merged["canonical_hobby"].tolist()


def get_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=128)
    with torch.no_grad():
        output = model(**inputs)
    return output.last_hidden_state.mean(dim=1).squeeze().numpy()


embeddings = []
for hobby in tqdm(hobbies, desc="Creating embeddings"):
    embeddings.append(get_embedding(hobby))

embeddings = np.array(embeddings)
np.save("canonical_embeddings_qwen.npy", embeddings)

print(f"✅ Embeddings created for {len(hobbies)} hobbies and saved as canonical_embeddings_qwen.npy")

In [None]:
%pip install hdbscan scikit-learn

In [None]:
import hdbscan
import numpy as np
import pandas as pd

# Load embeddings if not in memory (optional, assuming they are from previous cell)
# embeddings = np.load("canonical_embeddings_qwen.npy")

clusterer = hdbscan.HDBSCAN(min_cluster_size=3, min_samples=1, metric='euclidean')
cluster_labels = clusterer.fit_predict(embeddings)

df_hdbscan = pd.DataFrame({
    "hobby": hobbies,
    "cluster": cluster_labels
})

num_clusters = len(set(cluster_labels)) - (1 if -1 in cluster_labels else 0)
num_noise = list(cluster_labels).count(-1)

print(f"HDBSCAN found {num_clusters} clusters and {num_noise} noise points.")
# df_hdbscan.to_csv("hdbscan_clusters.csv", index=False)

In [None]:
%pip install umap-learn matplotlib seaborn


In [None]:
from sklearn.metrics import silhouette_score, davies_bouldin_score

# Filter out noise points (-1) for metric calculation if desired, 
# but often we want to know how well the clustered points are separated.
# Here we calculate metrics on clustered data only.
clustered_mask = cluster_labels != -1
if clustered_mask.sum() > 1:
    sil_score = silhouette_score(embeddings[clustered_mask], cluster_labels[clustered_mask])
    db_score = davies_bouldin_score(embeddings[clustered_mask], cluster_labels[clustered_mask])
    print(f"Silhouette Score: {sil_score:.3f} (closer to 1 is better)")
    print(f"Davies-Bouldin Index: {db_score:.3f} (lower is better)")
else:
    print("Not enough clustered points to calculate metrics.")

In [None]:
import umap
import matplotlib.pyplot as plt
import seaborn as sns

reducer = umap.UMAP(n_neighbors=15, min_dist=0.1, metric='cosine', random_state=42)
embedding_2d = reducer.fit_transform(embeddings)

plt.figure(figsize=(12, 8))
# Plot noise in grey
noise_mask = cluster_labels == -1
plt.scatter(embedding_2d[noise_mask, 0], embedding_2d[noise_mask, 1], c='grey', s=10, alpha=0.3, label='Noise')

# Plot clusters
clustered_mask = ~noise_mask
plt.scatter(embedding_2d[clustered_mask, 0], embedding_2d[clustered_mask, 1], 
            c=cluster_labels[clustered_mask], cmap='Spectral', s=20, alpha=0.8)

plt.title('UMAP Projection of Hobby Clusters', fontsize=16)
plt.xlabel('UMAP 1')
plt.ylabel('UMAP 2')
plt.colorbar(label='Cluster Label')
plt.legend()
plt.show()

In [None]:
print("Random samples from top clusters:")
top_clusters = df_hdbscan['cluster'].value_counts().head(5).index.tolist()
if -1 in top_clusters:
    top_clusters.remove(-1)

for cluster_id in top_clusters:
    print(f"\nCluster {cluster_id}:")
    sample = df_hdbscan[df_hdbscan['cluster'] == cluster_id]['hobby'].sample(min(5, len(df_hdbscan[df_hdbscan['cluster'] == cluster_id])))
    for item in sample:
        print(f" - {item}")