If you use this work, please cite:
@inproceedings{fiastre2026captionformer,
  author    = {Fiastre, Gabriel and Yang, Antoine and Schmid, Cordelia},
  title     = {{CaptionFormer}: Learning to Jointly Segment and Caption Object Trajectories in Videos},
  booktitle = {Proceedings of the {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  year      = {2026},
}