# Performance Optimization

This notebook demonstrates how to optimize numta performance using different backends and best practices.

In [None]:
import numpy as np
import time
from numta import SMA, EMA, RSI, get_available_backends, HAS_NUMBA
from numta import SMA_auto, SMA_cumsum
from numta.benchmark import PerformanceMeasurement

# Report the Numba flag and the backend list exposed by numta so the reader
# knows which of the benchmarks below can run in this environment.
print(f"Numba available: {HAS_NUMBA}")
print(f"Available backends: {get_available_backends()}")

## Backend Selection

numta supports multiple computation backends:

| Backend | Speed | Requirements | Notes |
|---------|-------|--------------|-------|
| numpy (default) | 1x | None | Pure NumPy, works everywhere |
| cumsum | ~3x | None | Optimized algorithm, no dependencies |
| numba | 5-10x | numba | JIT compilation for best performance |

### NumPy Backend (Default)

In [None]:
# Synthetic price series (seeded, so reruns see the same values)
np.random.seed(42)
data = np.random.uniform(100, 200, 10000)

# Time the default SMA and average the wall-clock cost over the runs
n_runs = 100
start = time.perf_counter()
for _ in range(n_runs):
    result = SMA(data, timeperiod=30)
elapsed = time.perf_counter() - start
numpy_time = elapsed / n_runs

print(f"NumPy backend: {numpy_time*1000:.3f} ms per call")

### Cumsum Optimization

The cumsum backend uses an optimized algorithm that's ~3x faster than the default implementation.

In [None]:
# Same measurement as the NumPy cell, but for the cumsum implementation
start = time.perf_counter()
for _ in range(100):
    result = SMA_cumsum(data, timeperiod=30)
elapsed = time.perf_counter() - start
cumsum_time = elapsed / 100

print(f"Cumsum backend: {cumsum_time*1000:.3f} ms per call")
# Ratio > 1 means cumsum was faster than the default implementation
print(f"Speedup: {numpy_time/cumsum_time:.2f}x")

### Numba JIT Compilation

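Numba provides the best performance through Just-In-Time compilation. The first call to a Numba function includes the compilation itself, so the cell below makes one warm-up call before timing.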

In [None]:
if not HAS_NUMBA:
    print("Numba not available. Install with: pip install numba")
else:
    from numta import SMA_numba

    # Run once before timing: the first call triggers compilation
    SMA_numba(data, timeperiod=30)

    # Timed section: average over 100 already-compiled calls
    start = time.perf_counter()
    for _ in range(100):
        result = SMA_numba(data, timeperiod=30)
    elapsed = time.perf_counter() - start
    numba_time = elapsed / 100

    print(f"Numba backend: {numba_time*1000:.3f} ms per call")
    print(f"Speedup vs NumPy: {numpy_time/numba_time:.2f}x")
    print(f"Speedup vs cumsum: {cumsum_time/numba_time:.2f}x")

### Automatic Backend Selection

Use `SMA_auto` to automatically select the best available backend.

In [None]:
# Let SMA_auto pick the backend
result = SMA_auto(data, backend='auto', timeperiod=30)
print(f"Result shape: {result.shape}")

# A backend can also be requested explicitly by name
result_cumsum = SMA_auto(data, backend='cumsum', timeperiod=30)
print(f"Cumsum result shape: {result_cumsum.shape}")

## Benchmarking

numta includes a built-in benchmarking tool for comparing implementations.

In [None]:
# Create the benchmark harness
bench = PerformanceMeasurement()

# Collect the implementations to compare; Numba joins only when installed
candidates = [("NumPy SMA", SMA), ("Cumsum SMA", SMA_cumsum)]
if HAS_NUMBA:
    from numta import SMA_numba
    candidates.append(("Numba SMA", SMA_numba))

# Register every candidate with identical inputs so the timings are comparable
for label, func in candidates:
    bench.add_function(label, func, data, timeperiod=30)

# Execute the benchmark and print its report
results = bench.run(iterations=100, warmup=10)
bench.print_results(results)

### Benchmark Across Data Sizes

In [None]:
# Compare performance across different data sizes
sizes = [1000, 5000, 10000, 50000, 100000]


def time_sma_ms(func, values, repeats=50, warmup=False):
    """Return the mean wall-clock time of ``func(values, timeperiod=30)`` in ms.

    Parameters
    ----------
    func : callable
        SMA implementation called as ``func(values, timeperiod=30)``.
    values : array-like
        Input series passed to ``func`` unchanged.
    repeats : int
        Number of timed calls to average over.
    warmup : bool
        When True, make one untimed call first (used for the Numba backend).

    Returns
    -------
    float
        Average milliseconds per call over ``repeats`` calls.
    """
    if warmup:
        func(values, timeperiod=30)
    start = time.perf_counter()
    for _ in range(repeats):
        func(values, timeperiod=30)
    return (time.perf_counter() - start) / repeats * 1000


# (column label, implementation, needs warm-up call) in column order
backends = [("NumPy", SMA, False), ("Cumsum", SMA_cumsum, False)]
if HAS_NUMBA:
    # Import once here instead of on every loop iteration
    from numta import SMA_numba
    backends.append(("Numba", SMA_numba, True))

print("Performance by data size (ms):\n")
print(" | ".join([f"{'Size':>10}"] + [f"{label:>10}" for label, _, _ in backends]))
print("-" * 50)

for size in sizes:
    test_data = np.random.uniform(100, 200, size)

    # Time every backend on the same array so each row is comparable
    timings = [
        time_sma_ms(func, test_data, repeats=50, warmup=warmup)
        for _, func, warmup in backends
    ]
    print(" | ".join([f"{size:>10}"] + [f"{t:>10.3f}" for t in timings]))

## Memory Optimization

Tips for reducing memory usage with large datasets.

### Use Appropriate Data Types

In [None]:
# float64 (default) - 8 bytes per element
# np.random.uniform already returns float64; copy=False keeps the dtype
# explicit without allocating a second 8 MB buffer.
data_64 = np.random.uniform(100, 200, 1000000).astype(np.float64, copy=False)
print(f"float64 size: {data_64.nbytes / 1024 / 1024:.2f} MB")

# float32 - 4 bytes per element (often sufficient)
# NOTE(review): confirm the indicators you use keep float32 inputs as float32;
# if they upcast internally the saving applies only to the stored input.
data_32 = data_64.astype(np.float32)
print(f"float32 size: {data_32.nbytes / 1024 / 1024:.2f} MB")

### Process in Chunks

In [None]:
def process_in_chunks(data, chunk_size=100000, timeperiod=30, indicator=None):
    """Apply a rolling-window indicator to ``data`` one chunk at a time.

    Consecutive chunks overlap by ``timeperiod - 1`` points, so every output
    value after the first chunk is computed from a full window. Each indicator
    call sees at most ``chunk_size`` points; the full-length result is still
    assembled and returned.

    Parameters
    ----------
    data : array-like
        One-dimensional input series (must support ``len`` and slicing).
    chunk_size : int
        Maximum number of points passed to the indicator per call. Must be
        greater than ``timeperiod - 1``.
    timeperiod : int
        Window length forwarded to the indicator as ``timeperiod=``.
    indicator : callable, optional
        Function called as ``indicator(chunk, timeperiod=timeperiod)`` that
        returns an array the same length as ``chunk``. Defaults to ``SMA``.
        Only use indicators whose output at an index depends on no more than
        the ``timeperiod`` most recent inputs; otherwise chunk boundaries
        change the result.

    Returns
    -------
    numpy.ndarray
        Concatenated indicator output with one value per input point.

    Raises
    ------
    ValueError
        If ``timeperiod < 1`` or ``chunk_size <= timeperiod - 1``.
    """
    func = SMA if indicator is None else indicator

    if timeperiod < 1:
        raise ValueError("timeperiod must be >= 1")
    overlap = timeperiod - 1
    if chunk_size <= overlap:
        # A non-positive step would make the range below fail or stay empty.
        raise ValueError("chunk_size must be greater than timeperiod - 1")

    n = len(data)
    if n == 0:
        # np.concatenate rejects an empty list; return an empty result instead.
        return np.empty(0, dtype=np.float64)

    step = chunk_size - overlap
    pieces = []
    for start in range(0, n, step):
        if start > 0 and start + overlap >= n:
            # This chunk holds only points already covered by the previous
            # one and would contribute no new output.
            break
        chunk = data[start:start + chunk_size]
        values = func(chunk, timeperiod=timeperiod)
        # After the first chunk, drop the leading overlap region.
        pieces.append(values if start == 0 else values[overlap:])

    return np.concatenate(pieces)

# Run the chunked SMA on a 500k-point series and compare input/output lengths
large_data = np.random.uniform(100, 200, 500000)
result = process_in_chunks(large_data)
n_in, n_out = len(large_data), len(result)
print(f"Processed {n_in} points -> {n_out} results")

### Use Streaming for Real-Time Data

In [None]:
from numta.streaming import StreamingSMA

# Streaming uses O(timeperiod) memory instead of O(data_length)
sma = StreamingSMA(timeperiod=20)

# Process one point at a time
for price in large_data[:100]:
    sma.update(price)

# Plain string: there is nothing to interpolate, so no f-prefix is needed
print("StreamingSMA uses minimal memory regardless of data length")

## Best Practices for Large Datasets

### 1. Pre-allocate Arrays

In [None]:
# Bad: every indicator result lives in its own separately allocated array
def slow_multi_indicator(data):
    """Return SMA(20), EMA(20) and RSI(14) of ``data`` in a dict keyed by name."""
    return {
        'sma': SMA(data, timeperiod=20),
        'ema': EMA(data, timeperiod=20),
        'rsi': RSI(data, timeperiod=14),
    }

# Good: one (3, n) float64 block allocated up front, one row per indicator
def fast_multi_indicator(data):
    """Return SMA(20), EMA(20) and RSI(14) of ``data`` as rows 0, 1, 2 of one array."""
    indicators = ((SMA, 20), (EMA, 20), (RSI, 14))
    results = np.empty((len(indicators), len(data)), dtype=np.float64)
    for row, (func, period) in enumerate(indicators):
        # Each result is written into its preallocated row
        results[row] = func(data, timeperiod=period)
    return results

# Time one call of each variant on the same 100k-point series
test = np.random.uniform(100, 200, 100000)

elapsed = {}
for label, build in (("dict", slow_multi_indicator), ("array", fast_multi_indicator)):
    start = time.perf_counter()
    _ = build(test)
    elapsed[label] = time.perf_counter() - start

slow_time, fast_time = elapsed["dict"], elapsed["array"]

print(f"Dict approach: {slow_time*1000:.2f} ms")
print(f"Array approach: {fast_time*1000:.2f} ms")

### 2. Avoid Repeated Calculations

In [None]:
# Bad: the 20-period SMA is computed twice
def bad_strategy(data):
    """Return where SMA(20) is above both SMA(50) and the price itself."""
    fast_above_slow = SMA(data, 20) > SMA(data, 50)
    fast_above_price = SMA(data, 20) > data  # second, redundant SMA(20) call
    return fast_above_slow & fast_above_price

# Good: each moving average is computed once and reused
def good_strategy(data):
    """Return where SMA(20) is above both SMA(50) and the price itself."""
    sma20 = SMA(data, 20)
    sma50 = SMA(data, 50)
    return (sma20 > sma50) & (sma20 > data)

# Time 100 calls of each strategy; the reported figures are totals for the run
totals = []
for strategy in (bad_strategy, good_strategy):
    start = time.perf_counter()
    for _ in range(100):
        strategy(test)
    totals.append(time.perf_counter() - start)

bad_time, good_time = totals

print(f"Bad approach: {bad_time*1000:.2f} ms")
print(f"Good approach: {good_time*1000:.2f} ms")
print(f"Speedup: {bad_time/good_time:.2f}x")

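### 3. Use Contiguous Arrays

Slicing with a step returns a strided view rather than a contiguous block of memory. Strided input may be slower to process, so convert it once with `np.ascontiguousarray` before calling indicators on it repeatedly.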

In [None]:
# A step-2 slice is a strided view, so it is not C-contiguous
data_nc = test[::2]
is_contiguous = data_nc.flags['C_CONTIGUOUS']
print(f"Contiguous: {is_contiguous}")

# Copy the view into a contiguous buffer
data_c = np.ascontiguousarray(data_nc)
is_contiguous = data_c.flags['C_CONTIGUOUS']
print(f"Now contiguous: {is_contiguous}")

## Summary

| Tip | Impact |
|-----|--------|
| Use Numba backend | 5-10x faster |
| Use cumsum backend | ~3x faster |
| Pre-allocate arrays | Reduces memory fragmentation |
| Cache repeated calculations | Avoids redundant work |
| Use streaming for real-time | Constant memory usage |
| Process in chunks | Handles large datasets |
| Use float32 if precision allows | 50% memory reduction |