diff --git a/README.md b/README.md
index ae4b250..4453981 100644
--- a/README.md
+++ b/README.md
@@ -51,15 +51,39 @@ cd viteo
 pip install -v -e .
 ```
 
-#### Rebuilding
-
+To rebuild after changes:
 ```bash
-rm -rf dist/
 pip install -e . --force-reinstall --no-deps
 ```
 
-### Requirements
+## Configuration
+### Logging
+
+You can enable debug logging with the `VITEO_DEBUG` environment variable:
+```bash
+$ VITEO_DEBUG=1 python example.py video_1080p.mp4
+
+[viteo] Closed video resources
+[viteo] Loaded asset from: tests/test-data/video_1080p.mp4
+[viteo] Found 1 video track(s)
+[viteo] Video metadata: 1920x1080 @ 23.976 fps, 267 total frames
+[viteo] Allocated batch buffer for 8 frames
+[viteo] Created track output with hardware acceleration
+[viteo] Reader initialized successfully
+ ...
+```
+
+### Batch size
+
+Internally, `viteo` passes frames to Python in batches for performance.
+The default batch size is 8 frames, but you can change it by passing the `batch_size` argument:
+```python
+# Values between 2 and 16 typically work best
+with viteo.open("video.mp4", batch_size=2) as frames:
+    for frame in frames:
+        process(frame)
+```
+
+## Performance
 
-- macOS with Apple Silicon (M1/M2/M3/M4)
-- Python 3.8+
-- MLX framework
+benchmark_comparison
diff --git a/src/native/include/frame_extractor.h b/src/native/include/frame_extractor.h
index a2d324b..c55ba80 100644
--- a/src/native/include/frame_extractor.h
+++ b/src/native/include/frame_extractor.h
@@ -10,7 +10,7 @@ namespace viteo {
 /// High-performance video frame extractor for Apple Silicon
 class FrameExtractor {
 public:
-  FrameExtractor();
+  FrameExtractor(size_t batch_size = 8);
   ~FrameExtractor();
 
   /// Open video file for extraction
diff --git a/src/native/src/bindings.cpp b/src/native/src/bindings.cpp
index 8d2437c..cf8641f 100644
--- a/src/native/src/bindings.cpp
+++ b/src/native/src/bindings.cpp
@@ -30,7 +30,7 @@ NB_MODULE(_viteo, m) {
   m.doc() = "Hardware-accelerated video frame extraction for Apple Silicon";
 
   nb::class_<FrameExtractor>(m, "FrameExtractor")
-      .def(nb::init<>(), "Create new frame extractor")
+      .def(nb::init<size_t>(), nb::arg("batch_size") = 8, "Create new frame extractor")
       .def("open", &FrameExtractor::open, nb::arg("path"),
            "Open video file for extraction")
       .def("next_frame",
@@ -40,6 +40,7 @@ NB_MODULE(_viteo, m) {
             nb::gil_scoped_release release;
             frame_data = self.next_frame();
           }
+          if (!frame_data) return nb::none();
           return create_mlx_array(frame_data, self.height(), self.width());
         },
         "Get next frame as MLX array (None when done)")
diff --git a/src/native/src/frame_extractor.mm b/src/native/src/frame_extractor.mm
index 1754a5b..44d464f 100644
--- a/src/native/src/frame_extractor.mm
+++ b/src/native/src/frame_extractor.mm
@@ -3,6 +3,14 @@
 #import <AVFoundation/AVFoundation.h>
 #import <VideoToolbox/VideoToolbox.h>
 #include "frame_extractor.h"
+#include <cstdlib>
+#include <iostream>
+
+#define DEBUG_LOG(msg) do { \
+    if (debugLogging) { \
+        std::cerr << "[viteo] " << msg << std::endl; \
+    } \
+} while(0)
 
 namespace viteo {
 
@@ -21,20 +29,27 @@
     int64_t currentFrame = 0;
 
     // Internal batch buffer for performance
-    static constexpr size_t BATCH_SIZE = 16;
+    size_t batch_size;
     std::vector<uint8_t> batch_buffer;
     size_t batch_count = 0;
     size_t batch_index = 0;
 
     bool isOpen = false;
+    bool debugLogging = false;
 
-    Impl() {}
+    Impl(size_t batch_size_param) : batch_size(batch_size_param) {
+        if (std::getenv("VITEO_DEBUG")) {
+            debugLogging = true;
+        }
+        DEBUG_LOG("Setting batch size to " << batch_size);
+    }
 
     ~Impl() {
         close();  // ARC handles cleanup automatically
     }
 
+    /// Releases all resources and resets state
     void close() {
         @autoreleasepool {
             if (reader) {
@@ -47,46 +62,116 @@ void close() {
             isOpen = false;
             currentFrame = 0;
         }
+        DEBUG_LOG("Closed video resources");
     }
 
-    bool open(const std::string& path) {
-        close();
+    /// Loads asset from file path
+    AVAsset* loadAsset(const std::string& path) {
+        NSString* nsPath = [NSString stringWithUTF8String:path.c_str()];
+        NSURL* url = [NSURL fileURLWithPath:nsPath];
+        AVAsset* loadedAsset = [AVAsset assetWithURL:url];
 
-        @autoreleasepool {
-            NSString* nsPath = [NSString stringWithUTF8String:path.c_str()];
-            NSURL* url = [NSURL fileURLWithPath:nsPath];
+        if (loadedAsset) {
+            DEBUG_LOG("Loaded asset from: " << path);
+        } else {
+            DEBUG_LOG("Failed to load asset from: " << path);
+        }
 
-            asset = [AVAsset assetWithURL:url];
-            if (!asset) return false;
+        return loadedAsset;
+    }
 
-            #pragma clang diagnostic push
-            #pragma clang diagnostic ignored "-Wdeprecated-declarations"
-            NSArray* tracks = [asset tracksWithMediaType:AVMediaTypeVideo];
-            #pragma clang diagnostic pop
+    /// Extracts video track from asset
+    AVAssetTrack* extractVideoTrack(AVAsset* videoAsset) {
+        #pragma clang diagnostic push
+        #pragma clang diagnostic ignored "-Wdeprecated-declarations"
+        NSArray* tracks = [videoAsset tracksWithMediaType:AVMediaTypeVideo];
+        #pragma clang diagnostic pop
 
-            if (tracks.count == 0) return false;
+        if (tracks.count == 0) {
+            DEBUG_LOG("No video tracks found");
+            return nil;
+        }
 
-            videoTrack = tracks[0];
+        DEBUG_LOG("Found " << tracks.count << " video track(s)");
+        return tracks[0];
+    }
 
-            CGSize size = [videoTrack naturalSize];
-            cachedWidth = static_cast<int>(size.width);
-            cachedHeight = static_cast<int>(size.height);
-            cachedFPS = [videoTrack nominalFrameRate];
+    /// Caches video metadata from track
+    void cacheMetadata(AVAssetTrack* track, AVAsset* videoAsset) {
+        CGSize size = [track naturalSize];
+        cachedWidth = static_cast<int>(size.width);
+        cachedHeight = static_cast<int>(size.height);
+        cachedFPS = [track nominalFrameRate];
+
+        CMTime duration = [videoAsset duration];
+        cachedTotalFrames = static_cast<int64_t>(
+            CMTimeGetSeconds(duration) * cachedFPS
+        );
+
+        DEBUG_LOG("Video metadata: " << cachedWidth << "x" << cachedHeight
+                  << " @ " << cachedFPS << " fps, "
+                  << cachedTotalFrames << " total frames");
+    }
 
-            CMTime duration = [asset duration];
-            cachedTotalFrames = static_cast<int64_t>(
-                CMTimeGetSeconds(duration) * cachedFPS
-            );
+    /// Opens video file and initializes extraction
+    bool open(const std::string& path) {
+        close();
+
+        @autoreleasepool {
+            asset = loadAsset(path);
+            if (!asset) return false;
+
+            videoTrack = extractVideoTrack(asset);
+            if (!videoTrack) return false;
+
+            cacheMetadata(videoTrack, asset);
 
             // Allocate batch buffer
             size_t frame_size = cachedWidth * cachedHeight * 4;
-            batch_buffer.resize(BATCH_SIZE * frame_size);
+            batch_buffer.resize(batch_size * frame_size);
+            DEBUG_LOG("Allocated batch buffer for " << batch_size << " frames");
 
             isOpen = true;
             return setupReader(0);
         }
     }
 
+    /// Creates output settings dictionary for hardware accelerated decoding
+    NSDictionary* createOutputSettings() {
+        return @{
+            (id)kCVPixelBufferPixelFormatTypeKey: @(kCVPixelFormatType_32BGRA),
+            (id)kCVPixelBufferMetalCompatibilityKey: @YES,
+            (id)kCVPixelBufferIOSurfacePropertiesKey: @{},
+            AVVideoDecompressionPropertiesKey: @{
+                (id)kVTDecompressionPropertyKey_UsingHardwareAcceleratedVideoDecoder: @YES,
+                (id)kVTDecompressionPropertyKey_PropagatePerFrameHDRDisplayMetadata: @NO,
+            },
+        };
+    }
+
+    /// Configures track output for optimal performance
+    AVAssetReaderTrackOutput* createTrackOutput(AVAssetTrack* track, NSDictionary* settings) {
+        AVAssetReaderTrackOutput* trackOutput = [[AVAssetReaderTrackOutput alloc]
+            initWithTrack:track outputSettings:settings];
+
+        trackOutput.alwaysCopiesSampleData = NO;
+        trackOutput.supportsRandomAccess = YES;
+
+        DEBUG_LOG("Created track output with hardware acceleration");
+        return trackOutput;
+    }
+
+    /// Applies time range for seeking to specific frame
+    void applyTimeRange(AVAssetReader* videoReader, int64_t startFrame) {
+        if (startFrame > 0) {
+            CMTime startTime = CMTimeMake(startFrame, cachedFPS);
+            CMTime duration = CMTimeSubtract([asset duration], startTime);
+            videoReader.timeRange = CMTimeRangeMake(startTime, duration);
+            DEBUG_LOG("Seeking to frame " << startFrame);
+        }
+    }
+
+    /// Initializes reader for frame extraction
     bool setupReader(int64_t startFrame) {
         @autoreleasepool {
             if (reader) {
@@ -97,43 +182,26 @@ bool setupReader(int64_t startFrame) {
 
             NSError* error = nil;
             reader = [[AVAssetReader alloc] initWithAsset:asset error:&error];
-            if (error || !reader) return false;
-
-            // Configure for maximum performance with BGRA output
-            NSDictionary* outputSettings = @{
-                (id)kCVPixelBufferPixelFormatTypeKey: @(kCVPixelFormatType_32BGRA),
-                (id)kCVPixelBufferMetalCompatibilityKey: @YES,
-                (id)kCVPixelBufferIOSurfacePropertiesKey: @{},
-                // Add VideoToolbox hardware acceleration hints
-                AVVideoDecompressionPropertiesKey: @{
-                    (id)kVTDecompressionPropertyKey_UsingHardwareAcceleratedVideoDecoder: @YES,
-                    (id)kVTDecompressionPropertyKey_PropagatePerFrameHDRDisplayMetadata: @NO,
-                },
-            };
-
-            output = [[AVAssetReaderTrackOutput alloc]
-                initWithTrack:videoTrack outputSettings:outputSettings];
-
-            // Critical performance settings
-            output.alwaysCopiesSampleData = NO;  // Avoid unnecessary copies
-            output.supportsRandomAccess = YES;   // Enable seeking
+            if (error || !reader) {
+                DEBUG_LOG("Failed to create reader: " << (error ? [[error localizedDescription] UTF8String] : "unknown error"));
+                return false;
+            }
+
+            NSDictionary* outputSettings = createOutputSettings();
+            output = createTrackOutput(videoTrack, outputSettings);
 
             if (![reader canAddOutput:output]) {
+                DEBUG_LOG("Cannot add output to reader");
                 reader = nil;
                 output = nil;
                 return false;
             }
 
             [reader addOutput:output];
-
-            // Set time range if seeking
-            if (startFrame > 0) {
-                CMTime startTime = CMTimeMake(startFrame, cachedFPS);
-                CMTime duration = CMTimeSubtract([asset duration], startTime);
-                reader.timeRange = CMTimeRangeMake(startTime, duration);
-            }
+            applyTimeRange(reader, startFrame);
 
             if (![reader startReading]) {
+                DEBUG_LOG("Failed to start reading");
                 reader = nil;
                 output = nil;
                 return false;
@@ -142,10 +210,46 @@ bool setupReader(int64_t startFrame) {
             currentFrame = startFrame;
             batch_count = 0;
             batch_index = 0;
+            DEBUG_LOG("Reader initialized successfully");
             return true;
         }
     }
 
+    /// Copies frame from pixel buffer to destination
+    void copyFrameData(CVImageBufferRef imageBuffer, uint8_t* dst) {
+        uint8_t* src = (uint8_t*)CVPixelBufferGetBaseAddress(imageBuffer);
+        size_t bytesPerRow = CVPixelBufferGetBytesPerRow(imageBuffer);
+        size_t data_width = cachedWidth * 4;
+        size_t data_size = cachedHeight * data_width;
+
+        if (bytesPerRow == data_width) {
+            memcpy(dst, src, data_size);
+        } else {
+            for (int y = 0; y < cachedHeight; y++) {
+                memcpy(dst + y * data_width,
+                       src + y * bytesPerRow,
+                       data_width);
+            }
+        }
+    }
+
+    /// Processes single sample buffer and adds to batch
+    bool processSampleBuffer(CMSampleBufferRef sampleBuffer, size_t frame_size) {
+        CVImageBufferRef imageBuffer = CMSampleBufferGetImageBuffer(sampleBuffer);
+        if (!imageBuffer) return false;
+
+        CVPixelBufferLockBaseAddress(imageBuffer, kCVPixelBufferLock_ReadOnly);
+
+        uint8_t* dst = batch_buffer.data() + (batch_count * frame_size);
+        copyFrameData(imageBuffer, dst);
+
+        CVPixelBufferUnlockBaseAddress(imageBuffer, kCVPixelBufferLock_ReadOnly);
+        batch_count++;
+        currentFrame++;
+
+        return true;
+    }
+
     /// Load next batch of frames into internal buffer
     void loadBatch() {
         if (!reader || !output || !isOpen) {
@@ -157,51 +261,39 @@ void loadBatch() {
         batch_count = 0;
 
         @autoreleasepool {
-            while (batch_count < BATCH_SIZE) {
-                if (reader.status != AVAssetReaderStatusReading) break;
+            while (batch_count < batch_size) {
+                if (reader.status != AVAssetReaderStatusReading) {
+                    DEBUG_LOG("Reader stopped, loaded " << batch_count << " frames");
+                    break;
+                }
 
                 CMSampleBufferRef sampleBuffer = [output copyNextSampleBuffer];
-                if (!sampleBuffer) break;
-
-                CVImageBufferRef imageBuffer = CMSampleBufferGetImageBuffer(sampleBuffer);
-                if (imageBuffer) {
-                    CVPixelBufferLockBaseAddress(imageBuffer, kCVPixelBufferLock_ReadOnly);
-
-                    uint8_t* src = (uint8_t*)CVPixelBufferGetBaseAddress(imageBuffer);
-                    size_t bytesPerRow = CVPixelBufferGetBytesPerRow(imageBuffer);
-                    uint8_t* dst = batch_buffer.data() + (batch_count * frame_size);
-
-                    if (bytesPerRow == cachedWidth * 4) {
-                        memcpy(dst, src, frame_size);
-                    } else {
-                        size_t copy_width = cachedWidth * 4;
-                        for (int y = 0; y < cachedHeight; y++) {
-                            memcpy(dst + y * copy_width,
-                                   src + y * bytesPerRow,
-                                   copy_width);
-                        }
-                    }
-
-                    CVPixelBufferUnlockBaseAddress(imageBuffer, kCVPixelBufferLock_ReadOnly);
-                    batch_count++;
-                    currentFrame++;
+                if (!sampleBuffer) {
+                    DEBUG_LOG("No more sample buffers, loaded " << batch_count << " frames");
+                    break;
                 }
+                processSampleBuffer(sampleBuffer, frame_size);
                 CFRelease(sampleBuffer);
             }
         }
 
         batch_index = 0;
+        if (batch_count > 0) {
+            DEBUG_LOG("Loaded batch of " << batch_count << " frames");
+        }
     }
 
-    /// Get pointer to next frame from batch
+    /// Returns pointer to next frame from batch
     uint8_t* nextFrame() {
         if (!isOpen) return nullptr;
 
-        // Load new batch if needed
         if (batch_index >= batch_count) {
             loadBatch();
-            if (batch_count == 0) return nullptr;
+            if (batch_count == 0) {
+                DEBUG_LOG("No more frames available");
+                return nullptr;
+            }
         }
 
         size_t frame_size = cachedWidth * cachedHeight * 4;
@@ -210,14 +302,16 @@ void loadBatch() {
         return frame_ptr;
     }
 
+    /// Resets reader to specified frame index
     void reset(int64_t frameIndex) {
         if (!isOpen) return;
+        DEBUG_LOG("Resetting to frame " << frameIndex);
         setupReader(frameIndex);
     }
 };
 
 // Public interface implementation
-FrameExtractor::FrameExtractor() : impl(new Impl()) {}
+FrameExtractor::FrameExtractor(size_t batch_size_param) : impl(new Impl(batch_size_param)) {}
 FrameExtractor::~FrameExtractor() { delete impl; }
 
 bool FrameExtractor::open(const std::string& path) {
diff --git a/src/viteo/__init__.py b/src/viteo/__init__.py
index 85f6c7d..1777c2a 100644
--- a/src/viteo/__init__.py
+++ b/src/viteo/__init__.py
@@ -26,9 +26,16 @@
 class FrameExtractor(_FrameExtractor):
     """Hardware-accelerated video frame extractor for Apple Silicon."""
 
-    def __init__(self, path: Optional[str | pathlib.Path] = None):
-        """Initialize extractor and optionally open a video file."""
-        super().__init__()
+    def __init__(self, path: Optional[str | pathlib.Path] = None, batch_size: int = 8):
+        """
+        Initialize extractor and optionally open a video file.
+
+        Args:
+            path: Optional path to video file
+            batch_size: Number of frames to buffer internally (default: 8)
+        """
+        super().__init__(batch_size)
+        self.batch_size = batch_size
         if path:
             if not super().open(str(path)):
                 raise RuntimeError(f"Failed to open video: {path}")
@@ -40,19 +47,20 @@ def __exit__(self, *args):
         pass
 
 
-def open(path: str | pathlib.Path) -> FrameExtractor:
+def open(path: str | pathlib.Path, batch_size: int = 8) -> FrameExtractor:
     """
     Open a video file for frame extraction.
 
     Args:
         path: Path to video file
+        batch_size: Number of frames to buffer internally (default: 8)
 
     Returns:
         FrameExtractor configured for iteration
 
     Example:
-        with viteo.open("video.mp4") as frames:
+        with viteo.open("video.mp4", batch_size=16) as frames:
             for frame in frames:
                 process_frame(frame)
     """
-    return FrameExtractor(path)
+    return FrameExtractor(path, batch_size=batch_size)
diff --git a/tests/test_viteo.py b/tests/test_viteo.py
index 92c0b7c..2e98ff6 100644
--- a/tests/test_viteo.py
+++ b/tests/test_viteo.py
@@ -123,6 +123,39 @@ def test_iterator(sample_video):
     assert count == 10
 
 
+def test_run_to_end(sample_video):
+    """Test running through all frames to the end."""
+    path = sample_video["path"]
+    if not path.exists():
+        pytest.skip(f"Test video not found: {path}")
+
+    with viteo.open(path) as video:
+        frame_count = 0
+        for frame in video:
+            frame_count += 1
+
+        assert frame_count == video.total_frames
+
+
+def test_last_frame_is_none(sample_video):
+    """Test that after all frames are read, the next frame is None and the frame count matches total_frames."""
+    path = sample_video["path"]
+    if not path.exists():
+        pytest.skip(f"Test video not found: {path}")
+
+    i = 0
+    with viteo.open(path) as video:
+        while True:
+            frame = video.next_frame()
+            if not isinstance(frame, mx.array):
+                break
+
+            i += 1
+
+        assert frame is None
+        assert i == video.total_frames
+
+
 def test_reset(sample_video):
     """Test reset functionality."""
     path = sample_video["path"]
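A quick way to exercise the new `batch_size` argument and the `None`-at-end behavior introduced in this diff is a small comparison loop. This is only a sketch, not part of the change: the clip path and the batch sizes tried here are illustrative, and it relies solely on the API shown above (`viteo.open(path, batch_size=...)`, `next_frame()`, `total_frames`, and the `VITEO_DEBUG` environment variable).

```python
import os
import time

import viteo

# Illustrative test clip; substitute any local video file.
CLIP = "tests/test-data/video_1080p.mp4"

# Optional: surface the [viteo] debug logging added in this diff.
os.environ["VITEO_DEBUG"] = "1"

for batch_size in (2, 8, 16):
    start = time.perf_counter()
    with viteo.open(CLIP, batch_size=batch_size) as video:
        decoded = 0
        while True:
            frame = video.next_frame()
            if frame is None:  # end of stream per the new binding behavior
                break
            decoded += 1
        elapsed = time.perf_counter() - start
        print(f"batch_size={batch_size}: {decoded}/{video.total_frames} "
              f"frames in {elapsed:.2f}s")
```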