# Human Action Classification v2.0 - Quick Start

This notebook demonstrates the basic usage of the modernized action classification system.

In [None]:
import glob
import sys
import time

# Make the local `hac` package importable before anything else is resolved.
sys.path.insert(0, '../src')

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from PIL import Image

from hac import ActionPredictor
from hac.models.classifier import create_model

## 1. Basic Inference

Let's start with the simplest use case - predicting actions from an image.

In [None]:
# Initialize predictor
# Prefer the GPU when one is present, otherwise fall back to CPU so the
# notebook still runs end-to-end on machines without CUDA.
# NOTE(review): assumes ActionPredictor accepts 'cpu' as a device string — confirm.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

predictor = ActionPredictor(
    model_path=None,  # Using pretrained backbone
    device=device,
    use_pose_estimation=True
)

print("✓ Predictor initialized")

In [None]:
# Load and predict
image_path = 'path/to/your/image.jpg'
# Single source of truth for how many predictions are requested and how the
# results header is labelled, so the two cannot drift apart.
top_k = 5

result = predictor.predict_image(
    image_path,
    return_pose=True,
    top_k=top_k
)

# Display results: the pose class first, then the ranked action predictions.
print(f"\nPose: {result['pose']['class']}")
print(f"\nTop {top_k} Actions:")
for rank, pred in enumerate(result['action']['predictions'], start=1):
    print(f"  {rank}. {pred['class']}: {pred['confidence']:.3f}")

In [None]:
# Visualize pose detection (only when the prediction carries an annotated image)
if 'pose_image' in result:
    # Convert from BGR to RGB channel order before handing the array to matplotlib.
    pose_img = cv2.cvtColor(result['pose_image'], cv2.COLOR_BGR2RGB)

    fig, ax = plt.subplots(figsize=(12, 8))
    ax.imshow(pose_img)
    ax.set_title(f"Detected Pose: {result['pose']['class']}")
    ax.axis('off')
    plt.show()

## 2. Model Comparison

Compare parameter count and inference latency across different model architectures.

In [None]:
# Models to test
model_names = [
    'mobilenetv3_small_100',
    'mobilenetv3_large_100',
    'efficientnet_b0',
    'resnet18',
]

NUM_CLASSES = 40     # size of the classification head passed to create_model
WARMUP_ITERS = 10    # untimed forward passes before measuring
TIMED_ITERS = 100    # forward passes averaged into the reported latency

# Fall back to CPU so the comparison still runs on machines without a GPU.
bench_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def _sync_if_cuda(device):
    """Wait for queued CUDA work to finish; does nothing for non-CUDA devices."""
    if device.type == 'cuda':
        torch.cuda.synchronize()


def _measure_latency(model, inputs, warmup_iters=WARMUP_ITERS, timed_iters=TIMED_ITERS):
    """Return the mean wall-clock seconds per forward pass of `model` on `inputs`.

    Runs `warmup_iters` untimed passes, then averages `timed_iters` timed passes.
    Gradient tracking is disabled so autograd bookkeeping does not inflate the
    measurement, and CUDA work is synchronized around the timed region because
    GPU kernels are launched asynchronously.
    """
    device = inputs.device
    with torch.no_grad():
        for _ in range(warmup_iters):
            model(inputs)

        _sync_if_cuda(device)
        start = time.perf_counter()  # monotonic, high-resolution clock
        for _ in range(timed_iters):
            model(inputs)
        _sync_if_cuda(device)
        elapsed = time.perf_counter() - start

    return elapsed / timed_iters


results = {}

for model_name in model_names:
    print(f"\nTesting {model_name}...")

    # Create model
    model = create_model(
        model_type='action',
        model_name=model_name,
        num_classes=NUM_CLASSES
    )
    model = model.to(bench_device).eval()

    # Count parameters
    num_params = sum(p.numel() for p in model.parameters())

    # Measure inference time on a single 224x224 RGB dummy image
    dummy_input = torch.randn(1, 3, 224, 224, device=bench_device)
    elapsed = _measure_latency(model, dummy_input)

    results[model_name] = {
        'params': num_params / 1e6,  # Millions
        'latency': elapsed * 1000  # Milliseconds
    }

    print(f"  Params: {results[model_name]['params']:.2f}M")
    print(f"  Latency: {results[model_name]['latency']:.2f}ms")

# Plot comparison: one bar chart for model size, one for inference latency.
models = list(results)
params = [results[name]['params'] for name in models]
latencies = [results[name]['latency'] for name in models]

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# Each row describes one panel: (axes, bar heights, y-axis label, title).
panels = (
    (ax1, params, 'Parameters (Millions)', 'Model Size'),
    (ax2, latencies, 'Latency (ms)', 'Inference Speed'),
)
for axis, values, ylabel, title in panels:
    axis.bar(models, values)
    axis.set_ylabel(ylabel)
    axis.set_title(title)
    # Model names are long; tilt them so they do not overlap.
    axis.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

## 3. Video Processing

Process a video and aggregate predictions.

In [None]:
video_path = 'path/to/video.mp4'

# Stored under its own name so the image prediction held in `result` is not
# overwritten; the pose-visualization cell reads `result` and must stay
# re-runnable after this cell has executed.
video_result = predictor.predict_video(
    video_path,
    sample_rate=5,  # Sample every 5 frames
    aggregate_method='voting'
)

print(f"Video: {video_result['video_path']}")
print(f"Total frames: {video_result['total_frames']}")
print(f"Sampled: {video_result['sampled_frames']}")
print(f"\nPredicted action: {video_result['prediction']}")
print(f"Confidence: {video_result['confidence']:.3f}")

## 4. Pose Keypoint Analysis

Extract and visualize pose keypoints.

In [None]:
from hac.inference.pose_extractor import PoseExtractor

# Initialize pose extractor
pose_extractor = PoseExtractor(
    static_image_mode=True,
    model_complexity=1
)

# cv2.imread does not raise on a missing or unreadable file; it returns None.
# Fail here with a clear message instead of passing None to the extractor.
image = cv2.imread(image_path)
if image is None:
    raise FileNotFoundError(f"Could not read image: {image_path}")

# Extract keypoints
keypoints = pose_extractor.extract_keypoints(image)

if keypoints is None:
    print("No pose detected")
else:
    print(f"Detected {len(keypoints)} keypoints")
    print("Keypoint format: [x, y, visibility]")
    print("\nFirst 5 keypoints:")
    print(keypoints[:5])

    # Get normalized keypoints (scale/translation invariant)
    normalized = pose_extractor.extract_normalized_keypoints(image)
    print(f"\nNormalized keypoints shape: {normalized.shape}")

## 5. Custom Model Training (Setup)

Example of how to set up training on your own dataset. This cell only builds the datasets and data loaders; the training loop itself is not shown here.

In [None]:
from hac.data.dataset import ActionDataset
from hac.utils.transforms import get_training_transforms, get_inference_transforms
from torch.utils.data import DataLoader

# Expected layout, one sub-directory per class under each split:
#   data/train/class1/, data/train/class2/, ...
#   data/val/class1/,   data/val/class2/,   ...
data_root = 'path/to/your/data'

# Datasets: the training split uses the training transforms, the validation
# split uses the inference transforms.
train_dataset = ActionDataset(root_dir=data_root, split='train', transform=get_training_transforms())
val_dataset = ActionDataset(root_dir=data_root, split='val', transform=get_inference_transforms())

# Loader settings shared by both splits; only shuffling differs between them.
loader_kwargs = dict(batch_size=32, num_workers=4, pin_memory=True)
train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
val_loader = DataLoader(val_dataset, shuffle=False, **loader_kwargs)

print(f"Training samples: {len(train_dataset)}")
print(f"Validation samples: {len(val_dataset)}")
print(f"Classes: {train_dataset.class_names}")

## 6. Batch Inference

Process multiple images efficiently.

In [None]:
# Get all images in directory
image_dir = 'path/to/images/'
# glob returns matches in arbitrary order, so sort them to make the
# "first 10 images" selection reproducible between runs and machines.
image_paths = sorted(glob.glob(f"{image_dir}/*.jpg"))

results_list = []

for img_path in image_paths[:10]:  # First 10 images
    # Use a dedicated name so the single-image `result` from the earlier
    # cell is not overwritten by this loop.
    batch_result = predictor.predict_image(img_path, return_pose=False)
    results_list.append({
        'image': img_path,
        'action': batch_result['action']['top_class'],
        'confidence': batch_result['action']['top_confidence']
    })

# Display results. Explicit columns keep the table shape stable even when no
# images were found; the bare last expression renders the frame as a rich table.
df = pd.DataFrame(results_list, columns=['image', 'action', 'confidence'])
df

## Next Steps

1. **Train on your data**: Use `python -m hac.training.train`
2. **Export to ONNX**: For production deployment
3. **Optimize for edge**: Test on Jetson/mobile
4. **Integrate with autonomous vehicles (AVs)**: Add pedestrian prediction logic

See the README for more details!