In [6]:
# Import Libraries
import faiss
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

In [7]:
# Load the sentence-embedding model that every later cell uses via `model.encode(...)`.
EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

### Major Categories

In [8]:
# Top-level question categories; the note above each entry lists the topics it covers.
CATEGORIES = [
    # Duration, distance, calories, sessions
    "volume",
    # How often, rest days, consistency
    "frequency",
    # HR, pace, speed, effort, zones
    "intensity",
    # Improvements over time, PRs
    "progression",
    # Pace per distance, splits, elevation
    "performance",
    # Cardio types, equipment, time of day
    "distribution",
    # Time between sessions, streaks, load
    "recovery",
]

In [10]:
# Short natural-language questions used to exercise the classifier below.
test_queries = [
    # pace / performance style questions
    "How is Sarah's pace?",
    # counting and totals
    "How many runs this week?",
    "What's the total distance?",
    # trend and intensity
    "Is her heart rate improving?",
    "How many calories burned?",
    # lookups and comparisons
    "Show me the last workout",
    "Is she running more often?",
    "What's her fastest 5K?",
]

# Classification function

# Keyword description that stands in for each category when embedding.
_CATEGORY_DESCRIPTIONS = {
    "volume": "total duration distance calories sessions workouts",
    "frequency": "how often how many runs per week sessions consistency",
    "intensity": "heart rate pace speed effort zones hard easy",
    "progression": "improvement over time getting faster better PRs records",
    "performance": "pace per distance splits fastest times elevation",
    "distribution": "cardio types equipment indoor outdoor variety",
    "recovery": "rest days time between sessions streaks recovery",
}

# Category embeddings are the same for every query, so they are encoded once
# and reused. The cache remembers which model object produced them so that
# rebinding `model` to a different encoder triggers a re-encode.
_CATEGORY_EMBEDDING_CACHE = {"model": None, "embeddings": None}


def _category_embeddings():
    """Return {category: embedding} for the current `model`, encoding on first use."""
    if _CATEGORY_EMBEDDING_CACHE["model"] is not model:
        _CATEGORY_EMBEDDING_CACHE["embeddings"] = {
            category: model.encode(description)
            for category, description in _CATEGORY_DESCRIPTIONS.items()
        }
        _CATEGORY_EMBEDDING_CACHE["model"] = model
    return _CATEGORY_EMBEDDING_CACHE["embeddings"]


def classify_query(query):
    """Assign a free-text question to the closest category.

    The query is embedded with the notebook-level `model` and compared, by
    cosine similarity, against one embedded keyword description per category.

    Args:
        query: The question text to classify.

    Returns:
        A 3-tuple ``(best_category, confidence, scores)`` where
        ``best_category`` is the category name with the highest similarity,
        ``confidence`` is that similarity value, and ``scores`` maps every
        category name to its similarity with the query.
    """
    category_embeddings = _category_embeddings()
    query_embedding = model.encode(query)

    # One pairwise call scores the query against all categories at once;
    # row 0 holds the similarities in the same order as the dict's keys.
    similarities = cosine_similarity(
        [query_embedding], list(category_embeddings.values())
    )[0]
    scores = dict(zip(category_embeddings, similarities))

    best_category = max(scores, key=scores.get)
    confidence = scores[best_category]

    return best_category, confidence, scores

# Run every sample question through the classifier and show the winner plus the top three scores.
for query in test_queries:
    category, confidence, all_scores = classify_query(query)
    # Rank all categories from most to least similar.
    ranked = sorted(all_scores.items(), key=lambda pair: pair[1], reverse=True)
    print(f"Q: '{query}'")
    print(f"   → {category} ({confidence:.2f})")
    print(f"   Top 3: {ranked[:3]}")
    print()

Q: 'How is Sarah's pace?'
   → performance (0.35)
   Top 3: [('performance', np.float32(0.34701702)), ('intensity', np.float32(0.3133096)), ('progression', np.float32(0.20224018))]

Q: 'How many runs this week?'
   → frequency (0.52)
   Top 3: [('frequency', np.float32(0.52472126)), ('performance', np.float32(0.27364898)), ('recovery', np.float32(0.2276604))]

Q: 'What's the total distance?'
   → volume (0.42)
   Top 3: [('volume', np.float32(0.41819614)), ('performance', np.float32(0.34916878)), ('intensity', np.float32(0.116717346))]

Q: 'Is her heart rate improving?'
   → intensity (0.38)
   Top 3: [('intensity', np.float32(0.3794356)), ('performance', np.float32(0.24710076)), ('progression', np.float32(0.2201459))]

Q: 'How many calories burned?'
   → volume (0.46)
   Top 3: [('volume', np.float32(0.45537084)), ('distribution', np.float32(0.142205)), ('performance', np.float32(0.10238564))]

Q: 'Show me the last workout'
   → volume (0.44)
   Top 3: [('volume', np.float32(0.4386619

### Templates

In [15]:
# Keyword prompt embedded for each category; used for the first (category) stage of matching.
_CATEGORY_PROMPTS_2 = {
    "volume": "total duration distance calories sessions workouts",
    "frequency": "how often how many runs per week sessions consistency",
    "intensity": "heart rate pace speed effort zones",
    "progression": "improvement over time getting faster better PRs",
    "performance": "last workout fastest times elevation best",
    "distribution": "cardio types equipment variety breakdown",
    "recovery": "rest days time between sessions streaks",
}

# Category name -> embedding of its prompt, in the same order as the prompt table.
CATEGORIES_2 = {
    label: model.encode(prompt)
    for label, prompt in _CATEGORY_PROMPTS_2.items()
}


In [16]:
# SQL templates per category.
#
# Layout: {category: {template_name: sql_text}}. Every sql_text is filled in
# with str.format(client_id=..., days=...); templates that do not mention
# {days} cover the client's whole history. The SQL uses SQLite functions
# (DATE('now', ...), julianday, strftime) and targets a table named Cardio.
SQL_TEMPLATES = {
    "volume": {
        "total_distance": "SELECT SUM(distance) FROM Cardio WHERE client_id = {client_id} AND cardio_date >= DATE('now', '-{days} days')",
        "total_duration": "SELECT SUM(duration) FROM Cardio WHERE client_id = {client_id} AND cardio_date >= DATE('now', '-{days} days')",
        "total_calories": "SELECT SUM(calories_burned) FROM Cardio WHERE client_id = {client_id} AND cardio_date >= DATE('now', '-{days} days')",
        "avg_distance_per_session": "SELECT AVG(distance) FROM Cardio WHERE client_id = {client_id} AND cardio_date >= DATE('now', '-{days} days')",
    },
    
    "frequency": {
        "total_sessions": "SELECT COUNT(*) FROM Cardio WHERE client_id = {client_id} AND cardio_date >= DATE('now', '-{days} days')",
        "sessions_per_week": "SELECT COUNT(*)/({days}/7.0) FROM Cardio WHERE client_id = {client_id} AND cardio_date >= DATE('now', '-{days} days')",
        # NOTE(review): the date filter is inclusive at both ends, so up to
        # days + 1 distinct dates can match; confirm whether -1 is acceptable here.
        "rest_days": "SELECT {days} - COUNT(DISTINCT DATE(cardio_date)) FROM Cardio WHERE client_id = {client_id} AND cardio_date >= DATE('now', '-{days} days')",
    },
    
    "intensity": {
        "avg_heart_rate": "SELECT AVG(avg_heart_rate) FROM Cardio WHERE client_id = {client_id} AND cardio_date >= DATE('now', '-{days} days') AND avg_heart_rate IS NOT NULL",
        "max_heart_rate": "SELECT MAX(max_heart_rate) FROM Cardio WHERE client_id = {client_id} AND cardio_date >= DATE('now', '-{days} days') AND max_heart_rate IS NOT NULL",
        "avg_pace": "SELECT AVG(avg_pace) FROM Cardio WHERE client_id = {client_id} AND cardio_date >= DATE('now', '-{days} days') AND avg_pace IS NOT NULL",
        "avg_speed": "SELECT AVG(avg_speed) FROM Cardio WHERE client_id = {client_id} AND cardio_date >= DATE('now', '-{days} days') AND avg_speed IS NOT NULL",
    },
    
    "progression": {
        "pace_trend": "SELECT cardio_date, avg_pace FROM Cardio WHERE client_id = {client_id} AND cardio_date >= DATE('now', '-{days} days') AND avg_pace IS NOT NULL ORDER BY cardio_date",
        "distance_trend": "SELECT cardio_date, distance FROM Cardio WHERE client_id = {client_id} AND cardio_date >= DATE('now', '-{days} days') ORDER BY cardio_date",
        # Relies on SQLite's bare-column rule: with a single MIN()/MAX(), the
        # non-aggregated cardio_date comes from the row holding that extreme.
        "fastest_pace": "SELECT MIN(avg_pace) as fastest_pace, cardio_date FROM Cardio WHERE client_id = {client_id} AND avg_pace IS NOT NULL",
    },
    
    "performance": {
        "last_workout": "SELECT * FROM Cardio WHERE client_id = {client_id} ORDER BY cardio_date DESC LIMIT 1",
        "elevation_gain": "SELECT SUM(elevation_gain) FROM Cardio WHERE client_id = {client_id} AND cardio_date >= DATE('now', '-{days} days')",
        # Same SQLite bare-column rule as fastest_pace.
        "best_distance": "SELECT MAX(distance) as longest_run, cardio_date FROM Cardio WHERE client_id = {client_id}",
    },
    
    "distribution": {
        "cardio_type_breakdown": "SELECT cardio_type, COUNT(*) as count FROM Cardio WHERE client_id = {client_id} AND cardio_date >= DATE('now', '-{days} days') GROUP BY cardio_type",
        "workout_variety": "SELECT COUNT(DISTINCT cardio_type) as variety FROM Cardio WHERE client_id = {client_id} AND cardio_date >= DATE('now', '-{days} days')",
    },
    
    "recovery": {
        # A window function cannot be an argument of an aggregate in the same
        # SELECT (SQLite reports "misuse of window function"), so the per-row
        # gap is computed in a subquery and averaged outside. The first row's
        # gap is NULL and AVG skips it.
        "avg_rest_between": "SELECT AVG(rest_days) as avg_rest_days FROM (SELECT julianday(cardio_date) - LAG(julianday(cardio_date)) OVER (ORDER BY cardio_date) as rest_days FROM Cardio WHERE client_id = {client_id})",
        # Longest run of consecutive calendar days with at least one session.
        # Subtracting a running row number from each distinct day's julian day
        # gives the same value for every day in an unbroken run, so grouping
        # on that value yields one group per streak.
        "longest_streak": "SELECT MAX(streak) FROM (SELECT COUNT(*) as streak FROM (SELECT julianday(workout_day) - ROW_NUMBER() OVER (ORDER BY workout_day) as island FROM (SELECT DISTINCT DATE(cardio_date) as workout_day FROM Cardio WHERE client_id = {client_id}) WHERE workout_day IS NOT NULL) GROUP BY island)",
    }
}

In [17]:
# Embed each template's name (underscores turned into spaces) so a query can be
# matched to a template inside its category: {category: {template_name: embedding}}.
TEMPLATE_EMBEDDINGS = {
    category: {
        name: model.encode(name.replace("_", " "))
        for name in templates
    }
    for category, templates in SQL_TEMPLATES.items()
}

def query_to_sql(query, client_id=123, days=21):
    """Turn a natural-language question into a filled-in SQL template.

    Matching is two-stage: the query embedding is first compared with the
    category embeddings in ``CATEGORIES_2``, then with the template-name
    embeddings of the winning category in ``TEMPLATE_EMBEDDINGS``. The chosen
    template from ``SQL_TEMPLATES`` is filled in with ``client_id`` and ``days``.

    Args:
        query: The question text.
        client_id: Client identifier substituted for ``{client_id}``. It is
            converted with ``int()`` before use.
        days: Look-back window substituted for ``{days}``. It is converted
            with ``int()`` (fractions are truncated) and must not be negative.

    Returns:
        A dict with keys ``"category"`` (winning category name),
        ``"template"`` (winning template name), ``"sql"`` (the formatted SQL
        string) and ``"confidence"`` (cosine similarity between the query and
        the winning template name).

    Raises:
        TypeError, ValueError: If ``client_id`` or ``days`` cannot be
            converted to an integer, or if ``days`` is negative.
    """
    # Both values are pasted directly into the SQL text (the templates use
    # them unquoted / inside a date modifier), so restrict them to integers
    # to rule out SQL injection through these parameters.
    client_id = int(client_id)
    days = int(days)
    if days < 0:
        raise ValueError(f"days must be non-negative, got {days}")

    # Step 1: Classify category
    q_emb = model.encode(query)
    cat_scores = {
        cat: cosine_similarity([q_emb], [emb])[0][0]
        for cat, emb in CATEGORIES_2.items()
    }
    best_cat = max(cat_scores, key=cat_scores.get)

    # Step 2: Match template within the winning category only
    temp_scores = {
        name: cosine_similarity([q_emb], [emb])[0][0]
        for name, emb in TEMPLATE_EMBEDDINGS[best_cat].items()
    }
    best_temp = max(temp_scores, key=temp_scores.get)

    # Step 3: Get SQL
    sql = SQL_TEMPLATES[best_cat][best_temp].format(client_id=client_id, days=days)

    return {
        "category": best_cat,
        "template": best_temp,
        "sql": sql,
        "confidence": temp_scores[best_temp]
    }

# Try the query -> SQL pipeline on a handful of sample questions.
queries = [
    "How is Sarah's pace?",
    "How many runs this week?",
    "What's the total distance?",
    "Show me the last workout",
]

for q in queries:
    result = query_to_sql(q)
    # One report per question: the question, what it matched, and the SQL produced.
    report = (
        f"\nQ: {q}",
        f"Category: {result['category']} | Template: {result['template']}",
        f"SQL: {result['sql']}",
    )
    print(*report, sep="\n")


Q: How is Sarah's pace?
Category: intensity | Template: avg_pace
SQL: SELECT AVG(avg_pace) FROM Cardio WHERE client_id = 123 AND cardio_date >= DATE('now', '-21 days') AND avg_pace IS NOT NULL

Q: How many runs this week?
Category: frequency | Template: sessions_per_week
SQL: SELECT COUNT(*)/(21/7.0) FROM Cardio WHERE client_id = 123 AND cardio_date >= DATE('now', '-21 days')

Q: What's the total distance?
Category: volume | Template: total_distance
SQL: SELECT SUM(distance) FROM Cardio WHERE client_id = 123 AND cardio_date >= DATE('now', '-21 days')

Q: Show me the last workout
Category: performance | Template: last_workout
SQL: SELECT * FROM Cardio WHERE client_id = 123 ORDER BY cardio_date DESC LIMIT 1
