In [None]:
!pip install youtube_dl

In [115]:
import youtube_dl
import re
import os
import pandas as pd
#import urllib.request
import requests
import time

# Sandbox for Experimenting

### Translate Video Ids from 8M Data to URLs

The ID field in the TensorFlow record files is a 4-character string (e.g. ABCD). To get the YouTubeID, you can construct a URI like /AB/ABCD.js (note: first 2 characters are repeated!), and append it to the URL data.yt8m.org/2/j/i. As a real example, the ID nXSc can be converted to a video ID via the URL data.yt8m.org/2/j/i/nX/nXSc.js. The format of the file is JSONP, and should be self-explanatory.

From: https://research.google.com/youtube8m/video_id_conversion.html

In [17]:
# Sample 4-character video ID taken from the 8M dataset

videoId = 'nXSc'

# Build the lookup path "<first two characters>/<id>.js" and append it to the decoder base URL

code = '{}/{}.js'.format(videoId[:2], videoId)
decoder = 'http://data.yt8m.org/2/j/i/{}'.format(code)

# Fetch the .js file; str() of the bytes body keeps the b'...' wrapper around the JSONP text

response = requests.get(decoder)
YoutubeId = str(response.content)

In [18]:
# Inspect the raw value: the repr of the JSONP bytes, b'...' wrapper included
YoutubeId

'b\'i("nXSc","0sf943sWZls");\''

In [19]:
# Strip the JSONP wrapper -- the leading b'i("<8M id>"," and the trailing ");' --
# so that only the YouTube ID is left

YoutubeId = re.compile(
    r'(b\'i\(\"[A-Za-z0-9]{4}\",\")|(\"\);\')'
).sub('', YoutubeId)

In [20]:
# Confirm that only the YouTube video ID remains after stripping the wrapper
YoutubeId

'0sf943sWZls'

### Given the Video's URL, Pull the Closed Captions of the Video

In [123]:
# Logger handed to youtube-dl through the 'logger' option below.
# It is defined in this cell because ydl_opts instantiates it; without a definition
# ahead of this point the cell raises NameError on a fresh kernel.
class MyLogger(object):
    """Logger that ignores debug and warning messages and prints errors."""

    def debug(self, msg):
        """Ignore debug messages."""
        pass

    def warning(self, msg):
        """Ignore warnings."""
        pass

    def error(self, msg):
        """Print error messages to stdout."""
        print(msg)

# Settings that feed into the YoutubeDL object that downloads ONLY the captions

ydl_opts = {
    'outtmpl' : 'Captions',
    'writesubtitles' : True,
    'writeautomaticsub' : True,
    #'listsubtitles' : True,
    #'allsubtitles' : True,
    'subtitleslangs': ['en'],
    'skip_download' : True,
    'logger': MyLogger()
}

In [144]:
# Download the captions file to the local directory.
# ydl_opts sets 'skip_download', so only subtitle data is requested, not the video;
# the following cell reads the result back from 'Captions.en.vtt'.

URL = 'https://www.youtube.com/watch?v=ZSGhU2gqkvM'

with youtube_dl.YoutubeDL(ydl_opts) as ydl:
    ydl.download([URL])

In [145]:
# Read the downloaded captions file into a string.
# The with-block closes the file even if read() raises, and the explicit encoding
# avoids depending on the platform default (WebVTT files are UTF-8).

with open('Captions.en.vtt', 'r', encoding='utf-8') as file:
    captions = file.read()

In [141]:
# Show the raw WebVTT text, including the cue timing lines and inline <c> word tags
print(captions)

WEBVTT
Kind: captions
Language: en

00:00:08.500 --> 00:00:17.450 align:start position:0%
 
you

00:00:17.450 --> 00:00:17.460 align:start position:0%
 
 

00:00:17.460 --> 00:00:20.160 align:start position:0%
 
then<00:00:18.460><c> it</c><00:00:18.610><c> goes</c><00:00:18.820><c> that's</c><00:00:19.330><c> my</c><00:00:19.510><c> call</c><00:00:19.840><c> when</c><00:00:20.050><c> I</c><00:00:20.080><c> can</c>

00:00:20.160 --> 00:00:20.170 align:start position:0%
then it goes that's my call when I can
 

00:00:20.170 --> 00:00:25.810 align:start position:0%
then it goes that's my call when I can
take<00:00:20.560><c> it</c><00:00:20.619><c> all</c>

00:00:25.810 --> 00:00:25.820 align:start position:0%
 
 

00:00:25.820 --> 00:00:28.120 align:start position:0%
 
then<00:00:26.820><c> it</c><00:00:26.940><c> goes</c><00:00:27.180><c> this</c><00:00:27.660><c> one</c>

00:00:28.120 --> 00:00:28.130 align:start position:0%
then it goes this one
 

00:00:28.130 --> 00:00:30.020 align

In [147]:
# Turn the WebVTT text into one paragraph by applying three substitutions in order:
#   1. drop the file header and plain "start --> end" timing lines
#   2. drop timing lines that carry align/position settings, plus inline <time><c> and </c> tags
#   3. replace every remaining newline with a space

for pattern, replacement in (
    (r'(WEBVTT\nKind: captions\nLanguage: en\n)|(\n\d\d:\d\d:\d\d.\d\d\d --> \d\d:\d\d:\d\d.\d\d\d\n)', ''),
    (r'(\n\d\d:\d\d:\d\d.\d\d\d --> \d\d:\d\d:\d\d.\d\d\d align:start position:\d+%\n)|(<\d+:\d+:\d+.\d+><c>)|(</c>)', ''),
    (r'(\n)', ' '),
):
    captions = re.sub(pattern, replacement, captions)

print(captions)

  you       then it goes that's my call when I can then it goes that's my call when I can   then it goes that's my call when I can take it all       then it goes this one then it goes this one   then it goes this one when I can take it over when I can take it over   when I can take it over  you       your son to me feel it in the air you your son to me feel it in the air you   your son to me feel it in the air you can't put my finger on but it's right can't put my finger on but it's right   can't put my finger on but it's right there next to the shades thanks to the there next to the shades thanks to the   there next to the shades thanks to the colors of my shoes in the dinner so I'm colors of my shoes in the dinner so I'm   colors of my shoes in the dinner so I'm wasting yeah it'll never seen it like wasting yeah it'll never seen it like   wasting yeah it'll never seen it like this better when you look I'm eating so this better when you look I'm eating so   this better when you look I

In [121]:
# Now that we're done with the captions file, delete it from the directory.
# Guarded so that re-running this cell after the file is already gone is a no-op
# instead of raising FileNotFoundError.

if os.path.exists('Captions.en.vtt'):
    os.remove('Captions.en.vtt')

# Actual Code for the Project

**Issues:**
1. Many of the videos are in languages other than English, and the automatic English closed captions for these are terrible
2. Many of these videos don't have any talking
3. Automatic closed captions vs. preloaded English closed captions
4. Automatic closed captions sometimes pick up the lyrics of a song

### Iterate the Process

In [148]:
# Sample of 4-character video IDs from the 8M dataset, used to exercise the pipeline
VideoIds = [
    'op00', 'O900', 'Oq00', 'Li00', '1300', 'gG00', 'xI00', 'i900',
    'R100', 'sg00', '0900', 'gk00', 'VK00', 'UL00', 'R900', 'FF00',
    'Jw00', 'nm00', 't600', '1J00', 'LT00', '4i00', 'L700', 'QA00',
    'Ra00', 'rC00', 'pf00', '8h00', 'J400', 'fd00', 'Ut00', '0H00',
    'dH00', 'Rc00', 'ie00', 'Ly00', 'mQ00', 'mY00',
]

# VideoIds = ['gG00']

In [158]:
# Logger for youtube-dl
class MyLogger:
    """Logger object for youtube-dl: drops debug and warning messages, prints errors."""

    def debug(self, msg):
        """Discard debug output."""
        return None

    def warning(self, msg):
        """Discard warnings."""
        return None

    def error(self, msg):
        """Echo the error message to stdout."""
        print(msg)

# Options for the YoutubeDL object, configured to fetch ONLY the captions

ydl_opts = dict(
    outtmpl='Captions',          # fixed output name; the captions are later read from 'Captions.en.vtt'
    writesubtitles=True,
    writeautomaticsub=True,
    # listsubtitles=True,
    # allsubtitles=True,
    subtitleslangs=['en'],
    skip_download=True,          # do not download the video itself
    logger=MyLogger(),
)

In [150]:
def ObtainURL(videoId):
    '''
    Convert a 4-character video ID from the 8M data into the video's YouTube URL.

    The ID is looked up in the JSONP file at
    http://data.yt8m.org/2/j/i/<first two characters>/<id>.js, whose body has
    the form i("<8M id>","<YouTube id>");

    Parameters
    ----------
    videoId : str
        Video ID from the 8M data, e.g. 'nXSc'.

    Returns
    -------
    str
        'https://www.youtube.com/watch?v=' followed by the YouTube ID.

    Raises
    ------
    ValueError
        If the response body does not contain an i("...","...") record, for
        example because the lookup returned an error page. Previously the
        whole body was pasted into the returned URL in that case.
    '''
    # Convert videoId to decoder to pull the YouTubeId
    code = videoId[:2]+'/'+videoId+'.js'
    decoder = 'http://data.yt8m.org/2/j/i/'+code

    # Use the .js URL to pull the YouTubeId; the timeout keeps one stalled
    # lookup from hanging a whole batch run
    response = requests.get(decoder, timeout=10)

    # Decode the bytes instead of parsing their repr, then capture the second
    # quoted field of the JSONP call, which is the YouTube ID
    payload = response.content.decode('utf-8', errors='replace')
    match = re.search(r'i\("[^"]*","([^"]+)"\)', payload)
    if match is None:
        raise ValueError(
            'No YouTube ID found for videoId {!r} at {}'.format(videoId, decoder)
        )

    # Convert YoutubeId to URL
    return 'https://www.youtube.com/watch?v=' + match.group(1)

In [159]:
# How long does this take?
start_time = time.time()

# NOTE(review): this flag is not read anywhere in this cell; kept in case another cell uses it.
subtitlesdownloaded = False

# Collect one record per video and build the DataFrame once after the loop.
# DataFrame.append was removed in pandas 2.0, and it copied the whole frame on
# every call, which made the loop quadratic.
records = []

for videoId in VideoIds:
    # Convert videoId to URL
    URL = ObtainURL(videoId)

    # Download the captions file to the local directory
    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        ydl.download([URL])

    try:
        # Read the downloaded captions file; the with-block closes it even if
        # read() raises, and WebVTT files are UTF-8 so decode explicitly
        with open('Captions.en.vtt', 'r', encoding='utf-8') as file:
            captions = file.read()

    # If there is no captions file to download
    except FileNotFoundError:
        captions = None

    # If we successfully downloaded the captions file
    else:
        # Use RegEx to remove timestamps and convert the captions to a paragraph of text.
        captions = re.sub(r'(WEBVTT\nKind: captions\nLanguage: en\n)|(\n\d\d:\d\d:\d\d.\d\d\d --> \d\d:\d\d:\d\d.\d\d\d\n)', '', captions)
        captions = re.sub(r'(\n\d\d:\d\d:\d\d.\d\d\d --> \d\d:\d\d:\d\d.\d\d\d align:start position:\d+%\n)|(<\d+:\d+:\d+.\d+><c>)|(</c>)', '', captions)
        captions = re.sub(r'(\n)', ' ', captions)

        # Delete the file so it cannot be mistaken for the next video's captions
        os.remove('Captions.en.vtt')

    # Record the videoId and its captions (None when no captions were available)
    records.append({'videoId': videoId, 'captions': captions})

df = pd.DataFrame(records, columns=['videoId','captions'])

# How long does this take?
print('Runtime: {} seconds'.format(time.time() - start_time))

Runtime: 41.22976517677307 seconds


In [160]:
# Display the collected results; videos with no captions file have None in 'captions'
df

Unnamed: 0,videoId,captions
0,op00,
1,O900,
2,Oq00,
3,Li00,the only chance is very dynamic get here th...
4,1300,
5,gG00,you then it goes that's my call when I...
6,xI00,
7,i900,
8,R100,
9,sg00,
