## Creating a Video Dataset from YouTube Videos

Use `pytube` to download a handful of YouTube videos, then load them into a FiftyOne video dataset.

In [None]:
!pip install fiftyone pytube

In [2]:
import os
import shutil
from urllib.parse import parse_qs, urlparse

import fiftyone as fo
import eta.core.utils as etau

from pytube import YouTube

In [3]:
# Name of the FiftyOne dataset; also used to derive the on-disk data directory.
# NOTE(review): "baskeball" looks like a misspelling of "basketball"; it is kept
# unchanged here because the recorded outputs reference this exact name.
DATASET_NAME = "baskeball-clips"

In [4]:
# Downloaded videos go in a `data/` subfolder of the dataset's default
# directory; create the folder up front (no error if it already exists).
data_dir = os.path.join(
    fo.get_default_dataset_dir(DATASET_NAME),
    "data",
)
os.makedirs(data_dir, exist_ok=True)

In [5]:
# Videos to download. Both URL shapes used below are supported:
# `/shorts/<id>` links and `watch?v=<id>` links.
VIDEO_URLS = [
    # Shorts
    "https://www.youtube.com/shorts/3QjfVU5kZXA",
    "https://www.youtube.com/shorts/W_Y34ep90XA",
    # Regular videos
    "https://www.youtube.com/watch?v=fn2BmgQno4c",
    "https://www.youtube.com/watch?v=1FLPquyyuk8",
]

In [6]:
def _url_to_path(url):
    """Map a YouTube URL to the local ``.mp4`` path used to store the video.

    The video ID is read from the ``v`` query parameter when present
    (``watch?v=<id>`` links) and otherwise from the last path segment
    (``/shorts/<id>`` links). Other query parameters (e.g. ``&t=30s`` or
    ``?feature=share``) and a trailing slash are ignored, so they can no
    longer end up as the filename.

    Args:
        url: a YouTube video URL

    Returns:
        the path ``<data_dir>/<video id>.mp4``
    """
    parsed = urlparse(url)
    ids_from_query = parse_qs(parsed.query).get("v")
    if ids_from_query:
        video_id = ids_from_query[0]
    else:
        # No `v` parameter: fall back to the final path component.
        video_id = parsed.path.rstrip("/").split("/")[-1]
    return os.path.join(data_dir, f"{video_id}.mp4")

In [7]:
def download_video(url):
    """Download the highest-resolution MP4 stream of ``url`` into ``data_dir``.

    If the target file already exists the download is skipped, so re-running
    the notebook does not fetch the same video twice. Failures are reported
    with ``print`` instead of being raised, so one bad URL does not abort the
    rest of a batch.

    Args:
        url: a YouTube video URL

    Returns:
        the local path for the video. When the stream lookup or the download
        fails, the path is still returned but no file exists there; callers
        should check ``os.path.exists()`` before relying on it.
    """
    # _url_to_path() already returns a path inside data_dir.
    video_path = _url_to_path(url)
    if os.path.exists(video_path):
        return video_path

    try:
        yt = YouTube(url)
        # The stream query lives inside the try as well, so errors raised
        # while looking up the available streams are reported, not propagated.
        mp4_streams = (
            yt.streams.filter(file_extension="mp4", mime_type="video/mp4")
            .order_by("resolution")
            .desc()
        )
    except Exception as e:
        print(f"Connection Error for {url}: {e}")
        return video_path

    # Streams are sorted by resolution, descending, so the first is the largest.
    d_video = mp4_streams.first()
    if d_video is None:
        print(f"Error downloading {url}: no MP4 video streams available")
        return video_path

    # Download into a temporary directory and move the result into place, so
    # video_path only appears once the download call has returned.
    with etau.TempDir() as tmp_dir:
        try:
            tmp_path = d_video.download(tmp_dir)
            shutil.move(tmp_path, video_path)
        except Exception as e:
            print(f"Error downloading {url}: {e}")

    return video_path

In [8]:
# Create the dataset under DATASET_NAME. overwrite=True is passed so that,
# presumably, a dataset left over from an earlier run does not block a rerun.
dataset = fo.Dataset(
    name=DATASET_NAME,
    overwrite=True,
)

In [9]:
# Build one sample per successfully downloaded video, keeping the source URL
# as a custom `url` field on the sample.
samples = []
for url in VIDEO_URLS:
    video_path = download_video(url)
    # download_video() only prints on failure, so verify the file is really
    # on disk before creating a sample that would point at a missing video.
    if not video_path or not os.path.exists(video_path):
        print(f"Skipping {url}: no video file available")
        continue
    samples.append(fo.Sample(filepath=video_path, url=url))

dataset.add_samples(samples)

 100% |█████████████████████| 4/4 [56.3ms elapsed, 0s remaining, 71.1 samples/s]      


['6642a931cbd9a95f4892cca7',
 '6642a931cbd9a95f4892cca8',
 '6642a931cbd9a95f4892cca9',
 '6642a931cbd9a95f4892ccaa']

In [10]:
# Start an App session for the dataset. With auto=False the App is not opened
# automatically; run `session.show()` to open it in a cell output.
session = fo.launch_app(
    dataset,
    auto=False,
)

Session launched. Run `session.show()` to open the App in a cell output.


![Video Dataset](../assets/video_dataset.gif)

In [11]:
# Make sure each video sample has its frame entries, so frames can be looked
# up by number in the cells below.
# NOTE(review): the recorded run printed "Computing metadata..." for this call.
dataset.ensure_frames()

Computing metadata...
 100% |█████████████████████| 4/4 [134.1ms elapsed, 0s remaining, 29.8 samples/s] 


In [17]:
# Take the first sample in the dataset and print its fields; `sample` is
# reused by the frame-indexing cell that follows.
sample = dataset.first()

print(sample)

<Sample: {
    'id': '6642a931cbd9a95f4892cca7',
    'media_type': 'video',
    'filepath': '/Users/jacobmarks/fiftyone/baskeball-clips/data/3QjfVU5kZXA.mp4',
    'tags': [],
    'metadata': <VideoMetadata: {
        'size_bytes': 9377284,
        'mime_type': 'video/mp4',
        'frame_width': 1080,
        'frame_height': 1920,
        'frame_rate': 30.0,
        'total_frame_count': 591,
        'duration': 19.7,
        'encoding_str': 'avc1',
    }>,
    'url': 'https://www.youtube.com/shorts/3QjfVU5kZXA',
    'frames': <Frames: 591>,
}>


In [22]:
# Frame numbers are 1-indexed: the first frame is 1 and the last frame is
# numbered `total_frame_count`.
last_frame_number = sample.metadata.total_frame_count
print(sample.frames[1])  # first frame
print(sample.frames[last_frame_number])  # last frame

<Frame: {'id': '6642a965c3ebe644a163ddcd', 'frame_number': 1}>
<Frame: {'id': '6642a965c3ebe644a163e01b', 'frame_number': 591}>
