## This notebook explores the data (TFRecord format) using a subsample of the YouTube-8M video-level dataset.
## To work with the entire dataset, please refer to the Starter code on the [YouTube-8M github repo](https://github.com/google/youtube-8m).

## Import the necessary dependencies

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

# Standard library
import os
import sys
from typing import Optional
from urllib.request import urlopen

# linear algebra
import numpy as np

# data processing, CSV file I/O (e.g. pd.read_csv)
import pandas as pd

# Loading libraries & datasets
import tensorflow as tf
import youtube_dl

# Input data files should be available in the "/input/" directory.

# Any results you write to the current directory are saved as output.

## Create helper functions

In [2]:
# This function collects the data provided by youtube-dl, such as rendition tables, number of views, etc.
def get_metadata(video_id: str) -> Optional[dict]:
    """Fetch youtube-dl's metadata for a YouTube video without downloading it.

    Args:
        video_id: The real YouTube video id (the value after ``watch?v=``).

    Returns:
        Whatever ``YoutubeDL.extract_info`` returns for the video (callers in
        this notebook index it like a dict), or ``None`` when youtube-dl
        raises ``DownloadError`` for it.
    """
    url = 'https://www.youtube.com/watch?v=' + video_id
    # NOTE(review): 'outtmpl' is the output filename template; it is presumably
    # irrelevant here because extract_info is called with download=False.
    ydl = youtube_dl.YoutubeDL({'outtmpl': '%(id)s%(ext)s'})
    try:
        with ydl:
            return ydl.extract_info(url, download=False)
    except youtube_dl.utils.DownloadError:
        # Treat an unavailable video as "no metadata" so callers can skip it.
        return None

In [3]:
# For privacy reasons, the video IDs in the dataset are provided in an obfuscated (encoded) form.
# Instructions and further information are available here:
#      https://research.google.com/youtube8m/video_id_conversion.html
def get_real_id(random_id: str) -> str:
    """Resolve an obfuscated YouTube-8M id to the real YouTube video id.

    Performs an HTTP GET against the data.yt8m.org lookup service; the first
    two characters of ``random_id`` select the directory of the lookup file.

    Args:
        random_id: The id as stored in the dataset records.

    Returns:
        The substring of the response that sits between the first comma and
        the first closing parenthesis, with one surrounding character stripped
        on each side (the quotes around the real id).

    Raises:
        Any exception raised by ``urlopen`` (e.g. for an id with no lookup
        file) propagates to the caller.
    """
    url = 'http://data.yt8m.org/2/j/i/{}/{}.js'.format(random_id[0:2], random_id)
    # Use the response as a context manager so the connection is closed
    # promptly instead of leaking one socket per looked-up video.
    with urlopen(url) as response:
        payload = response.read().decode()
    # NOTE(review): the payload is expected to look like
    #   i("<random_id>","<real_id>");
    # so skip the comma and opening quote, and stop before the closing quote.
    start = payload.find(',') + 2
    end = payload.find(')') - 1
    return payload[start:end]

In [4]:
# We need this function to filter out the fields of metadata we won't be using about each video
def without_keys(d, keys=None):
    """Return a new dict holding only the entries of ``d`` whose key is wanted.

    Note that, despite its name, this function *keeps* the listed keys and
    drops everything else.

    Args:
        d: Mapping to filter (e.g. one youtube-dl metadata dictionary).
        keys: Collection of keys to keep. When omitted, the module-level
            ``wanted_data`` list is used, which must already be defined at
            call time.

    Returns:
        A dict with the subset of ``d`` whose keys appear in ``keys``.
    """
    if keys is None:
        # Backward-compatible default: fall back to the notebook-level setting.
        keys = wanted_data
    return {key: value for key, value in d.items() if key in keys}

## Bring in the TensorFlow records

These records are organized in chunks or "shards" on the YT8M website. 
Instructions on how to get them are available here: https://research.google.com/youtube8m/download.html
As the full dataset takes almost 96GB of our valuable hard disk, we have only run experiments over three of the shards for this article.

In [5]:
# The path to the TensorFlow record (a single shard) that the extraction loop reads.
# It is a relative path, so it is resolved against the notebook's working directory.
video_lvl_record = "input/train00.tfrecord"

## Iterate over the records to obtain the video-level labels

In [None]:
# Parallel accumulators: the obfuscated id and the label list of every record
# read from the shard, whether or not its metadata could be fetched.
vid_ids = []
labels = []

# Default key list used by the without_keys() helper.
wanted_data = ['format', 'quality']

# One dict per video whose metadata was fetched successfully. The DataFrame is
# built once after the loop: growing a frame row by row copies it on every
# iteration, and DataFrame.append is no longer available in recent pandas.
records = []

# Iterate the contents of the TensorFlow record
# NOTE(review): tf.python_io is the TF 1.x namespace; under TF 2.x this iterator
# is presumably reachable as tf.compat.v1.io.tf_record_iterator -- confirm.
for example in tf.python_io.tf_record_iterator(video_lvl_record):

    # A TensorFlow Example is a mostly-normalized data format for storing data for
    # training and inference.  It contains a key-value store (features); where
    # each key (string) maps to a Feature message (which is oneof packed BytesList,
    # FloatList, or Int64List). Features for this data set are:
    #     -id
    #     -labels
    #     -mean_audio
    #     -mean_rgb
    tf_example = tf.train.Example.FromString(example)
    features = tf_example.features.feature

    # Once we have the structured data, we can extract the relevant features (id and labels)
    pseudo_id = features['id'].bytes_list.value[0].decode(encoding='UTF-8')
    # Copy the labels into a plain list so each row owns its own values.
    video_labels = list(features['labels'].int64_list.value)
    vid_ids.append(pseudo_id)
    labels.append(video_labels)

    # The id provided from the TensorFlow example needs some processing in order to build a valid link to a
    # YouTube video
    real_id = get_real_id(pseudo_id)

    # Get the youtube-dl valuable metadata
    data_video = get_metadata(real_id)

    # Skip videos whose metadata could not be fetched. Recording a row for them
    # would either fail (first record) or silently repeat the previous video's
    # title, creator, views, duration and ladder.
    if not data_video:
        continue

    # youtube-dl library supplies data regarding formats mixed for both audio and video.
    # We are only interested in mp4 inputs, so we need to separate them.
    # Entries lacking 'ext', 'format' or 'tbr' are skipped explicitly instead of
    # relying on a bare except to hide the KeyError.
    ladder = [
        {fmt['format']: fmt['tbr']}
        for fmt in (data_video.get('formats') or [])
        if fmt.get('ext') == 'mp4' and 'format' in fmt and 'tbr' in fmt
    ]

    # We are interested in expanding the labels information with features such as title,
    # creator, number of views and duration. Fields that youtube-dl did not
    # provide are stored as None rather than aborting the whole run.
    records.append({'id': real_id,
                    'ladder': ladder,
                    'title': data_video.get('title'),
                    'creator': data_video.get('creator'),
                    'views': data_video.get('view_count'),
                    'duration': data_video.get('duration'),
                    'labels': video_labels})

# Collect the data in the dataframe. Passing the columns explicitly keeps the
# schema stable even when no video could be resolved.
data = pd.DataFrame(records,
                    columns=['id', 'ladder', 'title', 'creator',
                             'views', 'duration', 'labels'])


[youtube] eguZ69v_vlQ: Downloading webpage
[youtube] eguZ69v_vlQ: Downloading video info webpage
[youtube] eguZ69v_vlQ: Downloading MPD manifest
[youtube] eguZ69v_vlQ: Downloading MPD manifest
[youtube] ER9Hdp04tWs: Downloading webpage
[youtube] ER9Hdp04tWs: Downloading video info webpage
[youtube] ETF2-Zz3J18: Downloading webpage
[youtube] ETF2-Zz3J18: Downloading video info webpage
[youtube] jtvbLq9bYRc: Downloading webpage
[youtube] jtvbLq9bYRc: Downloading video info webpage
[youtube] 6BPXQMxdHog: Downloading webpage
[youtube] 6BPXQMxdHog: Downloading video info webpage
[youtube] -j989rqetQE: Downloading webpage
[youtube] -j989rqetQE: Downloading video info webpage
[youtube] F-4h2WwVr3g: Downloading webpage
[youtube] F-4h2WwVr3g: Downloading video info webpage
[youtube] UZt7rP0poxs: Downloading webpage
[youtube] UZt7rP0poxs: Downloading video info webpage
[youtube] UZt7rP0poxs: Downloading MPD manifest
[youtube] UZt7rP0poxs: Downloading MPD manifest
[youtube] kGFuNGexHJY: Downloadi

## Check the data before using it

In [None]:
# Render the collected DataFrame with the notebook's rich display.
display(data)
# Summary statistics; as the last expression of the cell, the result is shown as its output.
data.describe()

## Export the data for others to use

In [None]:
# Write the collected metadata to a CSV file in the current working directory.
# The DataFrame index is written as the first column, since index=False is not passed.
data.to_csv('yt8m_data-3.csv')