# Embedding Model
- Notebook to test the embedding model used to classify user prompts into cardio data analysis categories
- Model: all-MiniLM-L6-v2

In [69]:
# Import Libraries
import json
from pathlib import Path

import faiss
import numpy as np
from sentence_transformers import SentenceTransformer


In [70]:
# Load the sentence-embedding model by name
model = SentenceTransformer(model_name_or_path='all-MiniLM-L6-v2')

In [71]:
# Load prompt categories and labels from JSON.
# The data folder is looked up under the current working directory first,
# then under its parent; `json` and `Path` come from the import cell at the top.
base = Path.cwd()
data_root = base / 'data' / 'prompts'
if not data_root.exists():
    data_root = base.parent / 'data' / 'prompts'

# Fail early with both searched locations instead of a bare error on one file.
if not data_root.exists():
    raise FileNotFoundError(
        f"Could not find a 'data/prompts' folder under {base} or {base.parent}"
    )

# Category name -> example prompts (flattened in the next cell).
prompt_categories = json.loads((data_root / 'prompt_examples.json').read_text(encoding='utf-8'))
# Contents of prompt_labels.json, kept as parsed.
prompt_labels_map = json.loads((data_root / 'prompt_labels.json').read_text(encoding='utf-8'))


In [72]:
# Flatten the category -> examples mapping into two parallel lists:
# category_examples[i] is a prompt and category_labels[i] is its category.
all_prompt_categories = prompt_categories

category_examples = [
    prompt
    for prompts in all_prompt_categories.values()
    for prompt in prompts
]
category_labels = [
    label
    for label, prompts in all_prompt_categories.items()
    for _ in range(len(prompts))
]

print(f"Total example prompts: {len(category_examples)}")


Total example prompts: 105


In [73]:
# Embed every example prompt; normalized vectors let inner product act as cosine similarity
example_embeddings = model.encode(
    sentences=category_examples,
    normalize_embeddings=True,
)


In [74]:
# Flat inner-product FAISS index; with normalized embeddings the inner product
# is the cosine similarity.
dimension = example_embeddings.shape[1]  # 384 for all-MiniLM-L6-v2
index_ip = faiss.IndexFlatIP(dimension)
index_ip.add(example_embeddings)

# Short summary of what was indexed
print(
    f"Created FAISS IP index with {len(category_examples)} example prompts",
    f"Embedding dimension: {dimension}",
    f"Categories available: {list(all_prompt_categories.keys())}",
    sep="\n",
)


Created FAISS IP index with 105 example prompts
Embedding dimension: 384
Categories available: ['volume_consistency', 'distance_progression', 'pace_trends', 'heart_rate_analysis', 'workout_type', 'recovery_patterns', 'performance_metrics']


In [75]:
def classify_user_prompt(user_prompt, k=5, min_top_similarity=0.35, temperature=0.07, other_category_count=3):
    """
    Classify a user prompt into cardio data analysis categories.

    The prompt is embedded with the notebook-level `model`, its `k` nearest
    example prompts are looked up in `index_ip`, and every neighbor votes for
    its category (taken from `category_labels`) with a softmax weight computed
    over the neighbor similarities.

    Parameters:
    - user_prompt: str, the question/prompt from the user
    - k: int, number of nearest neighbors to consider
    - min_top_similarity: float or None, reject if top similarity is below this
      (None disables the threshold)
    - temperature: float, softmax temperature for similarity weighting
      (values below 1e-6 are clamped to 1e-6)
    - other_category_count: int, number of alternative categories to return

    Returns:
    - dict with classification results:
      - 'predicted_category': best-scoring category, or "other" when rejected
      - 'confidence': share of the total vote weight held by the best-scoring
        category; this value is reported unchanged when the prompt is rejected
      - 'rejected': True when the top similarity is below `min_top_similarity`
        or when the index returned no neighbors at all
      - 'top_similarity': similarity of the closest neighbor (0.0 if none)
      - 'other_categories': up to `other_category_count` dicts with keys
        'category' and 'confidence', sorted by confidence descending; on
        rejection the best-scoring category is listed here first
    """
    # Encode the user prompt
    prompt_embedding = model.encode([user_prompt], normalize_embeddings=True)

    # Search for k nearest neighbors
    similarities, indices = index_ip.search(prompt_embedding, k)

    # FAISS pads the result with label -1 when fewer than k vectors are
    # available. Drop those slots: category_labels[-1] would otherwise
    # silently resolve to the last category and add a spurious vote.
    found = indices[0] >= 0
    neighbor_ids = indices[0][found]
    sims = np.asarray(similarities[0][found], dtype=float)

    # No neighbor at all (e.g. empty index): nothing can be predicted.
    if sims.size == 0:
        return {
            'predicted_category': "other",
            'confidence': 0.0,
            'rejected': True,
            'top_similarity': 0.0,
            'other_categories': [],
        }

    # Get the categories of nearest neighbors
    nearest_categories = [category_labels[idx] for idx in neighbor_ids]

    # Distance-aware class scoring using softmax over similarities.
    # Subtracting the max keeps the exponent <= 0, so np.exp cannot overflow.
    temp = max(float(temperature), 1e-6)
    weights = np.exp((sims - sims.max()) / temp)

    scores = {}
    for cat, weight in zip(nearest_categories, weights):
        scores[cat] = scores.get(cat, 0.0) + float(weight)

    best_category = max(scores, key=scores.get)
    # The closest neighbor always contributes exp(0) == 1, so the total is >= 1.
    total_score = sum(scores.values())
    confidence = scores[best_category] / total_score

    # Rejection logic based on similarity (the search returns best match first)
    top_similarity = float(sims[0])
    rejected = min_top_similarity is not None and top_similarity < min_top_similarity
    predicted_category = "other" if rejected else best_category

    # Alternative categories: everything except the reported prediction. When
    # rejected, "other" matches no real category, so the best one stays listed.
    other_categories = sorted(
        (
            {'category': cat, 'confidence': score / total_score}
            for cat, score in scores.items()
            if cat != predicted_category
        ),
        key=lambda alt: alt['confidence'],
        reverse=True,
    )[:other_category_count]

    return {
        'predicted_category': predicted_category,
        'confidence': float(confidence),
        'rejected': rejected,
        'top_similarity': top_similarity,
        'other_categories': other_categories,
    }


In [79]:
# Try your own query: classify one prompt and print a short report
user_query = "poop"
result = classify_user_prompt(user_query)

report_lines = [
    f"Prompt: '{user_query}'",
    f"Predicted category: {result['predicted_category']}",
    f"Confidence: {result['confidence']:.3f}",
    "Other categories:",
]
# One bullet per alternative category, in the order the classifier returned them
report_lines += [
    f"- {alt['category']}: {alt['confidence']:.3f}"
    for alt in result['other_categories']
]
print("\n".join(report_lines))


Prompt: 'poop'
Predicted category: other
Confidence: 0.721
Other categories:
- workout_type: 0.721
- pace_trends: 0.150
- recovery_patterns: 0.129
