## Creating a Video Dataset from YouTube Videos

Use `pytube` to download videos from YouTube, then load them into a FiftyOne video dataset.

In [1]:
!pip install fiftyone pytube --quiet

In [2]:
import os
import shutil
from urllib.parse import parse_qs, urlparse

import fiftyone as fo
import eta.core.utils as etau

from pytube import YouTube

In [3]:
# Name under which the FiftyOne dataset is created
DATASET_NAME = "basketball-clips"

In [4]:
# Downloaded videos go into the `data` subfolder of the directory that
# `fo.get_default_dataset_dir()` returns for this dataset name
data_dir = os.path.join(
    fo.get_default_dataset_dir(DATASET_NAME),
    "data",
)
os.makedirs(data_dir, exist_ok=True)  # no error if the folder already exists

In [5]:
# YouTube videos to download: two Shorts URLs followed by two regular watch URLs
VIDEO_URLS = [
    "https://www.youtube.com/shorts/3QjfVU5kZXA",
    "https://www.youtube.com/shorts/W_Y34ep90XA",
    "https://www.youtube.com/watch?v=fn2BmgQno4c",
    "https://www.youtube.com/watch?v=1FLPquyyuk8",
]

In [6]:
def _url_to_path(url):
    """Return the local ``.mp4`` path inside ``data_dir`` for a YouTube URL.

    The file is named after the video ID, which is read from the ``v`` query
    parameter (``watch?v=<id>``) or, when there is none, from the last path
    segment (``shorts/<id>``). Other query parameters (e.g. ``&t=10s``) and
    trailing slashes are ignored so they cannot end up in the filename.

    Args:
        url: the URL of the YouTube video

    Returns:
        the path ``<data_dir>/<video id>.mp4``
    """
    parsed = urlparse(url)
    video_ids = parse_qs(parsed.query).get("v")
    if video_ids:
        video_id = video_ids[0]
    else:
        # No `v` parameter: the ID is the last non-empty path segment
        video_id = parsed.path.rstrip("/").split("/")[-1]

    return os.path.join(data_dir, f"{video_id}.mp4")

In [7]:
def download_video(url):
    """Download the highest-resolution MP4 stream of a YouTube video.

    The download is skipped when the target file already exists. Failures
    are reported with ``print`` instead of being raised, so the returned
    path is not guaranteed to exist; callers should check it before use.

    Args:
        url: the URL of the YouTube video

    Returns:
        the path the video is (or would have been) stored at
    """
    # `_url_to_path` already returns a path inside `data_dir`
    video_path = _url_to_path(url)
    if os.path.exists(video_path):
        return video_path

    try:
        # Sorted by descending resolution, so the first stream is the best
        # one; `first()` returns None when no stream matches the filter
        stream = (
            YouTube(url)
            .streams.filter(file_extension="mp4", mime_type="video/mp4")
            .order_by("resolution")
            .desc()
            .first()
        )
    except Exception as e:
        print(f"Connection Error for {url}: {e}")
        return video_path

    if stream is None:
        print(f"Error downloading {url}: no mp4 video stream available")
        return video_path

    # Download into a temporary directory and move the file afterwards, so
    # nothing is written to `video_path` unless the download completed
    with etau.TempDir() as tmp_dir:
        try:
            tmp_path = stream.download(tmp_dir)
            shutil.move(tmp_path, video_path)
        except Exception as e:
            print(f"Error downloading {url}: {e}")

    return video_path

In [8]:
# Create the dataset as persistent; overwrite=True replaces an existing
# dataset of the same name, which keeps this cell re-runnable
dataset = fo.Dataset(
    name=DATASET_NAME,
    overwrite=True,
    persistent=True,
)

In [9]:
samples = []
for url in VIDEO_URLS:
    video_path = download_video(url)

    # download_video() only prints when a download fails and still returns a
    # path, so skip videos whose file is missing rather than adding samples
    # that point to nonexistent media
    if not video_path or not os.path.exists(video_path):
        print(f"Skipping {url}: video was not downloaded")
        continue

    # Store the source URL as a custom field on the sample
    samples.append(fo.Sample(filepath=video_path, url=url))

dataset.add_samples(samples)

 100% |█████████████████████| 4/4 [66.6ms elapsed, 0s remaining, 60.1 samples/s]      


['6642b45f7dd46326a6e6ccce',
 '6642b45f7dd46326a6e6cccf',
 '6642b45f7dd46326a6e6ccd0',
 '6642b45f7dd46326a6e6ccd1']

In [10]:
# With auto=False the App is not opened on launch; run `session.show()` to
# open it in a cell output
session = fo.launch_app(
    dataset,
    auto=False,
)

Session launched. Run `session.show()` to open the App in a cell output.


![Video Dataset](../assets/video_dataset.gif)

In [None]:
# Per the FiftyOne docs, this ensures that each sample has a frame instance
# for every frame of its source video (missing frames are added empty), so
# frames can be looked up by frame number
dataset.ensure_frames()

In [None]:
# Take the first sample of the dataset
sample = dataset.first()

# Print the sample to inspect it
print(sample)

In [None]:
# Frames are 1-indexed: frame numbers start at 1, not 0
# NOTE(review): relies on `sample.metadata` being populated — presumably done
# by `dataset.ensure_frames()`; confirm
print(sample.frames[1])  # frame 1
print(sample.frames[sample.metadata.total_frame_count])  # last frame

In [None]:
# Reuse the constant instead of repeating the literal, so the dataset name
# is defined in exactly one place
dataset.name = DATASET_NAME