# Embedding Model
- Notebook to test an embedding model for classifying user prompts into cardio-analysis categories via FAISS nearest-neighbour search
- Model: all-MiniLM-L6-v2

In [23]:
# Import Libraries
from collections import Counter

import faiss
from sentence_transformers import SentenceTransformer

In [24]:
# Instantiate Model
# The checkpoint name is kept in one place so it is easy to swap for another model.
MODEL_NAME = 'all-MiniLM-L6-v2'
model = SentenceTransformer(MODEL_NAME)

In [None]:
# Reference prompts for each analysis category.
# Keys are the category names returned by the classifier; each value is a list
# of example phrasings for that category. The lists are flattened and embedded
# in the following cells to form the nearest-neighbour reference set, so adding
# or rewording an example here changes what user prompts get matched against.
prompt_categories = {
    "volume_consistency": [
        "How many runs per week?",
        "What's the workout frequency?",
        "How consistent is the training?",
        "Show me workouts per week",
        "Training volume analysis"
    ],
    "distance_progression": [
        "Total distance covered",
        "Weekly mileage trends",
        "How far did they run?",
        "Distance over time",
        "Progression in distance"
    ],
    "pace_trends": [
        "How fast are they running?",
        "Pace improvement over time",
        "Average running speed",
        "Is pace getting better?",
        "Speed trends"
    ],
    "heart_rate_analysis": [
        "Heart rate zones",
        "Average heart rate trends",
        "Cardiovascular intensity",
        "HR during workouts",
        "Training intensity"
    ],
    "workout_type": [
        "What types of cardio?",
        "Running vs cycling preference",
        "Cardio activity breakdown",
        "Exercise type distribution"
    ],
    "recovery_patterns": [
        "Rest days between workouts",
        "Recovery time analysis",
        "How often do they rest?",
        "Training frequency gaps"
    ],
    "performance_metrics": [
        "Calories burned",
        "Elevation gain",
        "Overall performance",
        "Training effectiveness"
    ]
}

In [26]:
# Flatten the category -> examples mapping into two parallel lists:
# category_examples[i] is an example prompt and category_labels[i] is the
# category it came from. Both walk the dict in the same order, so they stay aligned.
category_examples = [
    text
    for texts in prompt_categories.values()
    for text in texts
]
category_labels = [
    label
    for label, texts in prompt_categories.items()
    for _text in texts
]


In [27]:
# Create embeddings for all example prompts.
# The result is used as a 2-D array below (its second axis is read as the
# embedding dimension). Row i is expected to correspond to category_examples[i]
# and therefore to category_labels[i]; the classifier relies on that alignment
# when it maps FAISS hit positions back to examples and labels.
example_embeddings = model.encode(category_examples)

In [28]:
# Build a flat L2 FAISS index holding one vector per example prompt
dimension = example_embeddings.shape[1]  # 384 for all-MiniLM-L6-v2
index = faiss.IndexFlatL2(dimension)
index.add(example_embeddings)

# Summary of what was indexed (single print call, same three lines of output)
print(
    f"Created FAISS index with {len(category_examples)} example prompts",
    f"Embedding dimension: {dimension}",
    f"\nCategories available: {list(prompt_categories.keys())}",
    sep="\n",
)

Created FAISS index with 32 example prompts
Embedding dimension: 384

Categories available: ['volume_consistency', 'distance_progression', 'pace_trends', 'heart_rate_analysis', 'workout_type', 'recovery_patterns', 'performance_metrics']


In [29]:
def classify_user_prompt(user_prompt: str, k: int = 3) -> dict:
    """
    Classify a user prompt into cardio data analysis categories.

    The prompt is embedded with the notebook-level ``model``, its nearest
    example prompts are looked up in the notebook-level FAISS ``index``, and
    the category held by the most neighbours wins (k-nearest-neighbour vote).

    Parameters:
    - user_prompt: str, the question/prompt from the user
    - k: int, number of nearest neighbors to consider (must be >= 1)

    Returns:
    - dict with classification results:
        - 'predicted_category': the most common category among the neighbours
          found; on a tie, the tied category whose example is nearest wins.
          None if the index returned no neighbours at all.
        - 'confidence': share of the found neighbours that belong to the
          predicted category (0.0 if no neighbours were found).
        - 'nearest_matches': list of dicts, nearest first, each with
          'example', 'category' and 'distance' (as reported by the index).
          It holds fewer than k entries when the index contains fewer than
          k vectors.

    Raises:
    - ValueError: if k is smaller than 1
    """
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k!r}")

    # Encode the user prompt as a one-element batch
    prompt_embedding = model.encode([user_prompt])

    # Search for k nearest neighbors
    distances, indices = index.search(prompt_embedding, k)

    # FAISS pads the result with index -1 when it cannot supply k neighbours
    # (e.g. k is larger than the number of indexed examples). Those slots must
    # be dropped: used as a Python index, -1 would silently select the *last*
    # example and distort both the vote and the reported matches.
    nearest_matches = [
        {
            'example': category_examples[idx],
            'category': category_labels[idx],
            'distance': float(dist),
        }
        for idx, dist in zip(indices[0], distances[0])
        if idx >= 0
    ]

    if not nearest_matches:
        return {
            'predicted_category': None,
            'confidence': 0.0,
            'nearest_matches': [],
        }

    # Majority vote. Counter.most_common lists equal counts in first-seen
    # order, and matches are ordered nearest-first, so ties go to the closest.
    category_counts = Counter(match['category'] for match in nearest_matches)
    predicted_category, votes = category_counts.most_common(1)[0]

    # Normalise by the neighbours actually found rather than the requested k,
    # so the score stays a true vote share when the index is small.
    confidence = votes / len(nearest_matches)

    return {
        'predicted_category': predicted_category,
        'confidence': confidence,
        'nearest_matches': nearest_matches,
    }


In [30]:
# Smoke-test the classifier on a handful of paraphrased questions
test_prompts = [
    "Show me how many times they ran each week",
    "Are they getting faster?",
    "What's their average heart rate?",
    "Do they prefer running or biking?",
    "How many miles per week?"
]

print("Testing prompt classification:\n")
for prompt in test_prompts:
    result = classify_user_prompt(prompt)
    top_match = result['nearest_matches'][0]  # matches are ordered nearest first
    # One print call per prompt; the trailing "\n" leaves a blank line between prompts
    print(
        f"Prompt: '{prompt}'",
        f"Category: {result['predicted_category']} (confidence: {result['confidence']:.2f})",
        f"Top match: {top_match['example']}\n",
        sep="\n",
    )

Testing prompt classification:

Prompt: 'Show me how many times they ran each week'
Category: volume_consistency (confidence: 0.67)
Top match: How many runs per week?

Prompt: 'Are they getting faster?'
Category: pace_trends (confidence: 1.00)
Top match: How fast are they running?

Prompt: 'What's their average heart rate?'
Category: heart_rate_analysis (confidence: 1.00)
Top match: Average heart rate trends

Prompt: 'Do they prefer running or biking?'
Category: pace_trends (confidence: 0.67)
Top match: Running vs cycling preference

Prompt: 'How many miles per week?'
Category: volume_consistency (confidence: 0.67)
Top match: Weekly mileage trends



In [33]:
# Classify a single ad-hoc prompt and show every neighbour that voted
user_prompt = "How many calories do i burn per mile?"

result = classify_user_prompt(user_prompt)

# Assemble the report first, then emit it with one print call
report_lines = [
    f"Predicted Category: {result['predicted_category']}",
    f"Confidence: {result['confidence']}",
    "\nNearest Matches:",
]
report_lines.extend(
    f"  - {match['example']} (Category: {match['category']}, Distance: {match['distance']:.4f})"
    for match in result['nearest_matches']
)
print("\n".join(report_lines))

Predicted Category: performance_metrics
Confidence: 0.3333333333333333

Nearest Matches:
  - Calories burned (Category: performance_metrics, Distance: 1.0320)
  - Weekly mileage trends (Category: distance_progression, Distance: 1.1621)
  - Average running speed (Category: pace_trends, Distance: 1.3168)
