In [3]:
!pip install youtube_dl

Collecting youtube_dl
[?25l  Downloading https://files.pythonhosted.org/packages/67/99/977114c1c11cb3afdcc685f166ce844d052baae3ba7aa5d9f26da140c409/youtube_dl-2019.11.22-py2.py3-none-any.whl (1.8MB)
[K     |████████████████████████████████| 1.8MB 3.4MB/s eta 0:00:01
[?25hInstalling collected packages: youtube-dl
Successfully installed youtube-dl-2019.11.22


In [40]:
import youtube_dl
import re
import os
import pandas as pd
import numpy as np
#import urllib.request
import requests
import time

# Sandbox for Experimenting

### Translate Video Ids from 8M Data to URLs

The ID field in the TensorFlow record files is a 4-character string (e.g. ABCD). To get the YouTubeID, you can construct a URI like /AB/ABCD.js (note: first 2 characters are repeated!), and append it to the URL data.yt8m.org/2/j/i. As a real example, the ID nXSc can be converted to a video ID via the URL data.yt8m.org/2/j/i/nX/nXSc.js. The format of the file is JSONP, and should be self-explainatory. 

From: https://research.google.com/youtube8m/video_id_conversion.html

In [2]:
# Given videoId from 8m dataset

videoId = 'nXSc'

# Convert videoId to decoder to pull the YouTubeId

code = videoId[:2]+'/'+videoId+'.js'
decoder = 'http://data.yt8m.org/2/j/i/'+code

# Use the .js URL to pull the YouTubeId

YoutubeId = str(requests.get(decoder).content)

In [3]:
YoutubeId

'b\'i("nXSc","0sf943sWZls");\''

In [4]:
# Extract the YoutubeId

YoutubeId = re.sub(r'(b\'i\(\"[A-Za-z0-9]{4}\",\")|(\"\);\')', '', YoutubeId)

In [5]:
YoutubeId

'0sf943sWZls'

### Given the Video's URL, Pull the Closed Captions of the Video

In [2]:
# Settings that feed into the YoutubeDL object that downloads ONLY the captions
# Logger for youtube-dl
class MyLogger(object):

    def __init__(self):
        self.text = []
        
    def debug(self, msg):
        self.text.append(msg)

    def warning(self, msg):
        pass

    def error(self, msg):
        self.text.append(msg)
logger = MyLogger()
        
ydl_opts = {
    'outtmpl' : 'Captions',
    'quiet' : True,
    'forcetitle' : True,
    'writesubtitles' : True,
    'writeautomaticsub' : True,
    #'listsubtitles' : True,
    #'allsubtitles' : True,
    'subtitleslangs': ['en'],
    'skip_download' : True,
    'logger': logger
}

In [4]:
# Download the captions file to the local directory

#URL = 'https://www.youtube.com/watch?v=ZSGhU2gqkvM'
#URL = 'https://www.youtube.com/watch?v=Ye8mB6VsUHw' # Cookie Monster
#URL = 'https://www.youtube.com/watch?v=0sf943sWZls'
#URL = 'https://www.youtube.com/watch?v=FBQ00Vk7Obs' # No Comments
#URL = 'https://www.youtube.com/watch?v=0pPg_Cth0OQ'
URL = 'https://www.youtube.com/watch?v=mfbSqTB74xM'

with youtube_dl.YoutubeDL(ydl_opts) as ydl:
    ydl.download([URL])

In [13]:
# Pull the title of the video from the Logger
title = ""
for text in logger.text:
    if text[0:9] != '[youtube]':
        if text[0:6] != '[info]':
            title = text
title

'Sesame Street: Cookie Monster Sings C is for Cookie'

In [5]:
# Open the downloaded captions file, extract the text, then close the file

file = open('Captions.en.vtt','r')
captions = file.readlines()
#captions = file.read()
file.close()

In [6]:
print(captions)

['WEBVTT\n', 'Kind: captions\n', 'Language: en\n', '\n', '00:00:00.310 --> 00:00:08.200 align:start position:0%\n', ' \n', '[Music]\n', '\n', '00:00:08.200 --> 00:00:08.210 align:start position:0%\n', ' \n', ' \n', '\n', '00:00:08.210 --> 00:00:09.740 align:start position:0%\n', ' \n', 'hey<00:00:09.210><c> what</c><00:00:09.360><c> is</c><00:00:09.450><c> up</c><00:00:09.510><c> guys</c>\n', '\n', '00:00:09.740 --> 00:00:09.750 align:start position:0%\n', 'hey what is up guys\n', ' \n', '\n', '00:00:09.750 --> 00:00:13.339 align:start position:0%\n', 'hey what is up guys\n', 'mkbhd<00:00:10.230><c> here</c><00:00:10.620><c> this</c><00:00:11.280><c> is</c><00:00:11.880><c> the</c><00:00:12.090><c> pixel</c><00:00:12.269><c> for</c><00:00:12.690><c> excel</c>\n', '\n', '00:00:13.339 --> 00:00:13.349 align:start position:0%\n', 'mkbhd here this is the pixel for excel\n', ' \n', '\n', '00:00:13.349 --> 00:00:14.839 align:start position:0%\n', 'mkbhd here this is the pixel for excel\n', '

In [7]:
# Use RegEx to remove timestamps and convert the captions to a paragraph of text.

captions = re.sub(r'(WEBVTT\nKind: captions\nLanguage: en\n)|(\n\d\d:\d\d:\d\d.\d\d\d --> \d\d:\d\d:\d\d.\d\d\d\n)', '', captions)
captions = re.sub(r'(\n\d\d:\d\d:\d\d.\d\d\d --> \d\d:\d\d:\d\d.\d\d\d align:start position:\d+%\n)|(<\d+:\d+:\d+.\d+><c>)|(</c>)', '', captions)
captions = re.sub(r'(\n)', ' ', captions)
print(captions)

TypeError: expected string or bytes-like object

In [8]:
# Use RegEx to remove timestamps and convert the captions to a paragraph of text.
subtitles = np.array([])
for line in captions:
    line = re.sub(r'WEBVTT\n','',line)
    line = re.sub(r'Kind: captions\n','',line)
    line = re.sub(r'Language: en\n','',line)
    line = re.sub(r'\d\d:\d\d:\d\d.\d\d\d --> \d\d:\d\d:\d\d.\d\d\d\n', '', line)
    line = re.sub(r'\d\d:\d\d:\d\d.\d\d\d --> \d\d:\d\d:\d\d.\d\d\d align:start position:\d+%\n','',line)
    line = re.sub(r'<\d+:\d+:\d+.\d+><c>','',line)
    line = re.sub(r'</c>', '', line)
    line = re.sub(r'(\n)', '', line)
    if (line != '') and (line != ' '):
        if line not in list(subtitles):
            subtitles = np.append(subtitles, line)
            #print(line)

In [9]:
caption = ''
for i in subtitles:
    caption = caption + ' ' + i

In [10]:
caption

" [Music] hey what is up guys mkbhd here this is the pixel for excel this is one of the phones I was looking forward to the most this entire year for 2019 for a couple of reasons and so now I've been using it daily since its unveiling this is my honest review so let's just start with the wait looks I honestly think it's a pretty decent looking phone it's low-key really clean there's almost no markings along the back just the Google G at the bottom matte black aluminum rails as part of the design all the way around the phone speaker slots at the bottom lined up with the USB type-c port and the colored power button of course on every version and the whole back of the phone on two of the colors is this soft touch matte finish which does a great job of not showing fingerprints it doesn't seem to scratch very easily at all so of the three colors available I'm gonna say this white this Panda version is the best one that power buttons pretty sweet then I'm gonna say oh so orange is in second 

In [12]:
# Now that we're done with the captions file, delete it from the directory

os.remove('Captions.en.vtt')

### Pull the description of each video

In [8]:
# Logger for youtube-dl
class MyLogger(object):

    def __init__(self):
        self.text = []
        
    def debug(self, msg):
        self.text.append(msg)

    def warning(self, msg):
        pass

    def error(self, msg):
        self.text.append(msg)
logger = MyLogger()        
ydl_opts = {
    'outtmpl' : 'Captions',
    'quiet' : True,
    'forcetitle' : True,
    'writesubtitles' : True,
    'writeautomaticsub' : True,
    #'listsubtitles' : True,
    #'allsubtitles' : True,
    'subtitleslangs': ['en'],
    'skip_download' : True,
    'logger': logger,
    'writedescription' : True
    }

In [13]:
#URL = ObtainURL('0H00')
URL = 'https://www.youtube.com/watch?v=Ye8mB6VsUHw'
print(URL)

https://www.youtube.com/watch?v=Ye8mB6VsUHw


In [14]:
with youtube_dl.YoutubeDL(ydl_opts) as ydl:
    ydl.download([URL])

In [23]:
# If we successfully download the captions.description file
try:    
    # Open the downloaded description file, extract the text, then close the file
    file = open('Captions.description','r')
    description = file.read()
    file.close()

    print(description)
        
    # Now that we're done with the captions file, delete it from the directory
    os.remove('Captions.description')
except:
    print('error')

Cookie's favorite food starts with the letter C -- that's good enough for him!

For more fun games and videos for your preschooler in a safe, child-friendly environment, visit us at http://www.sesamestreet.org

Sesame Street is a production of Sesame Workshop, a nonprofit educational organization which also produces Pinky Dinky Doo, The Electric Company, and other programs for children around the world.


# Actual Code for the Project

**Issues:**
1. Lots of the videos are from different languages and the english automatic closed captions for these are terrible
2. Lots of these videos don't have any talking
3. Automatic closed captions vs. preloaded english closed captions
4. Automatic closed captions sometimes absorb lyrics to a song

### Iterate the Process

In [24]:
# Test video Ids from 8M dataset
VideoIds = ['op00','O900','Oq00','Li00','1300','gG00','xI00','i900','R100','sg00','0900','gk00','VK00',
'UL00','R900','FF00','Jw00','nm00','t600','1J00','LT00','4i00','L700','QA00','Ra00','rC00',
'pf00','8h00','J400','fd00','Ut00','0H00','dH00','Rc00','ie00','Ly00','mQ00','mY00']

# VideoIds = ['gG00']

In [25]:
# Logger for youtube-dl
class MyLogger(object):

    def __init__(self):
        self.text = []
        
    def debug(self, msg):
        self.text.append(msg)

    def warning(self, msg):
        pass

    def error(self, msg):
        self.text.append(msg)
        
# logger = MyLogger()
# # Settings that feed into the YoutubeDL object that downloads ONLY the captions
# ydl_opts = {
#     'outtmpl' : 'Captions',
#     'quiet' : True,
#     'forcetitle' : True,
#     'writesubtitles' : True,
#     'writeautomaticsub' : True,
#     #'listsubtitles' : True,
#     #'allsubtitles' : True,
#     'subtitleslangs': ['en'],
#     'skip_download' : True,
#     'logger': logger
# }

In [26]:
def ObtainURL(videoId):
    '''
    This function takes in the VideoId from 8M data and converts it to the video's URL
    '''
    # Convert videoId to decoder to pull the YouTubeId
    code = videoId[:2]+'/'+videoId+'.js'
    decoder = 'http://data.yt8m.org/2/j/i/'+code

    # Use the .js URL to pull the YouTubeId
    YoutubeId = str(requests.get(decoder).content)
    
    # Extract the YoutubeId
    YoutubeId = re.sub(r'(b\'i\(\"[A-Za-z0-9]{4}\",\")|(\"\);\')', '', YoutubeId)
    
    # Convert YoutubeId to URL
    URL = 'https://www.youtube.com/watch?v='+YoutubeId
    
    return URL

In [28]:
# How long does this take?
start_time = time.time()

df = pd.DataFrame(columns=['videoId','captions'])
subtitlesdownloaded = False

for videoId in VideoIds:
    #Initialize logger variable
    logger = MyLogger()
    # Settings that feed into the YoutubeDL object that downloads ONLY the captions
    ydl_opts = {
        'outtmpl' : 'Captions',
        'quiet' : True,
        'forcetitle' : True,
        'writesubtitles' : True,
        'writeautomaticsub' : True,
        #'listsubtitles' : True,
        #'allsubtitles' : True,
        'subtitleslangs': ['en'],
        'skip_download' : True,
        'logger': logger,
        'writedescription' : True
    }
    
    # Convert videoId to URL
    URL = ObtainURL(videoId)
    print(URL)
    #URL = 'https://www.youtube.com/watch?v=Ye8mB6VsUHw'
    #URL = 'https://www.youtube.com/watch?v=0sf943sWZls'
    
    # Download the captions file to the local directory
    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        ydl.download([URL])
        
    # Pull the title of the video from the Logger
    title = ""
    for text in logger.text:
        if text[0:9] != '[youtube]':
            if text[0:6] != '[info]':
                title = text
    print(title)
    
    # If we successfully download a description file
    try:
        # Open the downloaded description file, extract the text, then close the file
        file = open('Captions.description','r')
        description = file.read()
        file.close()
        # Now that we're done with the captions file, delete it from the directory
        os.remove('Captions.description')
    except:
        description = ''
        
    
    # If we successfully download the captions file
    try:    
        # Open the downloaded captions file, extract the text, then close the file
        file = open('Captions.en.vtt','r')
        captions = file.read()
        file.close()
        
        # Use RegEx to remove timestamps and convert the captions to a paragraph of text.
        captions = re.sub(r'(WEBVTT\nKind: captions\nLanguage: en\n)|(\n\d\d:\d\d:\d\d.\d\d\d --> \d\d:\d\d:\d\d.\d\d\d\n)', '', captions)
        captions = re.sub(r'(\n\d\d:\d\d:\d\d.\d\d\d --> \d\d:\d\d:\d\d.\d\d\d align:start position:\d+%\n)|(<\d+:\d+:\d+.\d+><c>)|(</c>)', '', captions)
        captions = re.sub(r'(\n)', ' ', captions)
        
        captions = title + " " + description + " " + captions
        
        # Now that we're done with the captions file, delete it from the directory
        os.remove('Captions.en.vtt')
    
    # If there is no captions file to download
    except FileNotFoundError:
        captions = title + " " + description
    
    # Append the videoId and captions into the dataframe
    df = df.append({'videoId': videoId, 'captions': captions}, ignore_index=True)

# How long does this take?
print('Runtime: {} seconds'.format(time.time() - start_time))

https://www.youtube.com/watch?v=FBQ00Vk7Obs
Kai Jing Leong 2011 Singapore National Figure Skating Senior - Short Program
https://www.youtube.com/watch?v=1Cb84yXZgZs
Ivana Hong - Montage Gymnastique ❤
https://www.youtube.com/watch?v=ibKjl4ZyA9w
Бездомните кучета в София
https://www.youtube.com/watch?v=FePY0rISaIk
Montage #3 / Infestation / DeathZ
https://www.youtube.com/watch?v=2Y6LifcmLgw
Cross Uringe - 5 Aug 2012
https://www.youtube.com/watch?v=ZSGhU2gqkvM
Opel Vectra B Irmscher Tuning
https://www.youtube.com/watch?v=fwnMW7A1uw8
2004 470 Catalina sold by NWYachtnet
https://www.youtube.com/watch?v=Y3x25oucpFU
РЕЗИНОВАЯ КОКА КОЛА И ПЕПСИ
https://www.youtube.com/watch?v=96u8pQ2dpfQ
Madolche - Colombia Top 8 Regional Omar Hernandez - Yugioh Deck Profile September 2013
https://www.youtube.com/watch?v=56vazFgwvlo
Forest walk 2
https://www.youtube.com/watch?v=uZiuGmdpW4M
Vio-Lence - World In a World [HQ]
https://www.youtube.com/watch?v=YLZbTWmvVyc
WNF UMvC3(12-14-11) m23 Infrit vs Flocker
ht

In [32]:
df.captions[30]

"Naruto: Uzumaki Chronicles 2 - Story mode ch10 p6/6 Naruto: Uzumaki Chronicles 2 - Story mode ch10 p6/6   you       Oh       route you Chevy       I am immortal I am in java       you       you       ah so these are those yes I was able to ah so these are those yes I was able to   ah so these are those yes I was able to retrieve them from the remains of their retrieve them from the remains of their   retrieve them from the remains of their hideout how do you plan to dispose of hideout how do you plan to dispose of   hideout how do you plan to dispose of them well it's not like it'll do any them well it's not like it'll do any   them well it's not like it'll do any harm just to leave them alone but if we harm just to leave them alone but if we   harm just to leave them alone but if we get rid of them uh certain someone wants get rid of them uh certain someone wants   get rid of them uh certain someone wants to do it himself hmm very well it'll be to do it himself hmm very well it'll be

### Fixed Repeating Sentences Issue

In [122]:
# How long does this take?
start_time = time.time()

df = pd.DataFrame(columns=['videoId','captions'])
subtitlesdownloaded = False

for videoId in VideoIds:
    #Initialize logger variable
    logger = MyLogger()
    # Settings that feed into the YoutubeDL object that downloads ONLY the captions
    ydl_opts = {
        'outtmpl' : 'Captions',
        'quiet' : True,
        'forcetitle' : True,
        'writesubtitles' : True,
        'writeautomaticsub' : True,
        #'listsubtitles' : True,
        #'allsubtitles' : True,
        'subtitleslangs': ['en'],
        'skip_download' : True,
        'logger': logger,
        'writedescription' : True
    }
    
    # Convert videoId to URL
    URL = ObtainURL(videoId)
    print(URL)
    #URL = 'https://www.youtube.com/watch?v=Ye8mB6VsUHw'
    #URL = 'https://www.youtube.com/watch?v=0sf943sWZls'
    
    # Download the captions file to the local directory
    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        ydl.download([URL])
        
    # Pull the title of the video from the Logger
    title = ""
    for text in logger.text:
        if text[0:9] != '[youtube]':
            if text[0:6] != '[info]':
                title = text
    print(title)
    
    # If we successfully download a description file
    try:
        # Open the downloaded description file, extract the text, then close the file
        file = open('Captions.description','r')
        description = file.read()
        file.close()
        description = re.sub(r'\n+',' ',description)
        # Now that we're done with the captions file, delete it from the directory
        os.remove('Captions.description')
    except:
        description = ''
        
    
    # If we successfully download the captions file
    try:    
        # Open the downloaded captions file, extract the text, then close the file
        file = open('Captions.en.vtt','r')
        captions = file.readlines()
        file.close()
        
        # Use RegEx to remove timestamps and convert the captions to a paragraph of text.
        subtitles = np.array([])
        for line in captions:
            line = re.sub(r'WEBVTT\n','',line)
            line = re.sub(r'Kind: captions\n','',line)
            line = re.sub(r'Language: en\n','',line)
            line = re.sub(r'\d\d:\d\d:\d\d.\d\d\d --> \d\d:\d\d:\d\d.\d\d\d\n', '', line)
            line = re.sub(r'\d\d:\d\d:\d\d.\d\d\d --> \d\d:\d\d:\d\d.\d\d\d align:start position:\d+%\n','',line)
            line = re.sub(r'<\d+:\d+:\d+.\d+><c>','',line)
            line = re.sub(r'</c>', '', line)
            line = re.sub(r'(\n)', '', line)
            if (line != '') and (line != ' '):
                if line not in list(subtitles):
                    subtitles = np.append(subtitles, line)
        captions = ''
        for i in subtitles:
            captions = captions + ' ' + i
        
        captions = title + " " + description + " " + captions
        
        # Now that we're done with the captions file, delete it from the directory
        os.remove('Captions.en.vtt')
    
    # If there is no captions file to download
    except FileNotFoundError:
        captions = title + " " + description
    
    # Append the videoId and captions into the dataframe
    df = df.append({'videoId': videoId, 'captions': captions}, ignore_index=True)

# How long does this take?
print('Runtime: {} seconds'.format(time.time() - start_time))

https://www.youtube.com/watch?v=FBQ00Vk7Obs
Kai Jing Leong 2011 Singapore National Figure Skating Senior - Short Program
https://www.youtube.com/watch?v=1Cb84yXZgZs
Ivana Hong - Montage Gymnastique ❤
https://www.youtube.com/watch?v=ibKjl4ZyA9w
Бездомните кучета в София
https://www.youtube.com/watch?v=FePY0rISaIk
Montage #3 / Infestation / DeathZ
https://www.youtube.com/watch?v=2Y6LifcmLgw
Cross Uringe - 5 Aug 2012
https://www.youtube.com/watch?v=ZSGhU2gqkvM
Opel Vectra B Irmscher Tuning
https://www.youtube.com/watch?v=fwnMW7A1uw8
2004 470 Catalina sold by NWYachtnet
https://www.youtube.com/watch?v=Y3x25oucpFU
РЕЗИНОВАЯ КОКА КОЛА И ПЕПСИ
https://www.youtube.com/watch?v=96u8pQ2dpfQ
Madolche - Colombia Top 8 Regional Omar Hernandez - Yugioh Deck Profile September 2013
https://www.youtube.com/watch?v=56vazFgwvlo
Forest walk 2
https://www.youtube.com/watch?v=uZiuGmdpW4M
Vio-Lence - World In a World [HQ]
https://www.youtube.com/watch?v=YLZbTWmvVyc
WNF UMvC3(12-14-11) m23 Infrit vs Flocker
ht

In [127]:
df.captions[8]

'Madolche - Colombia Top 8 Regional Omar Hernandez - Yugioh Deck Profile September 2013 Information From: https://www.facebook.com/YGOCastellano --------------------------- Want To Find Me!? My Cardfight Vanguard Channel: http://www.youtube.com/user/VanKohl40 No Limit Gaming on Yugioh: http://www.youtube.com/user/NoLimitGamingTcg Want a Text When I Upload a Video? http://motube.us/mkohl40 My Store: http://www.mkohlgames.com/ My FaceBook: http://www.facebook.com/mkohl40 FaceBook Fan Page: http://www.facebook.com/robbie.kohl FaceBook Group: http://www.facebook.com/groups/487364107996181/ Dueling Network and DevPro: Mkohl40 Pojo.biz Message Board: Robbie Skype: robbie-kohl'

## New code for project

In [6]:
file = open('videoUrls.txt','r')
URLS = file.readlines()
file.close()
URLS = [line.rstrip('\n') for line in URLS]

In [9]:
# Logger for youtube-dl
class MyLogger(object):

    def __init__(self):
        self.text = []
        
    def debug(self, msg):
        self.text.append(msg)

    def warning(self, msg):
        pass

    def error(self, msg):
        self.text.append(msg)

In [12]:
# How long does this take?
start_time = time.time()

df = pd.DataFrame(columns=['title','captions','url'])
subtitlesdownloaded = False

for URL in URLS:
    #Initialize logger variable
    logger = MyLogger()
    # Settings that feed into the YoutubeDL object that downloads ONLY the captions
    ydl_opts = {
        'outtmpl' : 'Captions',
        'quiet' : True,
        'forcetitle' : True,
        'writesubtitles' : True,
        'writeautomaticsub' : True,
        #'listsubtitles' : True,
        #'allsubtitles' : True,
        'subtitleslangs': ['en'],
        'skip_download' : True,
        'logger': logger
    }
    
    # Download the captions file to the local directory
    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        ydl.download([URL])
        
    # Pull the title of the video from the Logger
    title = ""
    for text in logger.text:
        if text[0:9] != '[youtube]':
            if text[0:6] != '[info]':
                title = text
    print(title)
        
    
    # If we successfully download the captions file
    try:    
        # Open the downloaded captions file, extract the text, then close the file
        file = open('Captions.en.vtt','r')
        captions = file.readlines()
        file.close()
        
        # Use RegEx to remove timestamps and convert the captions to a paragraph of text.
        subtitles = np.array([])
        for line in captions:
            line = re.sub(r'WEBVTT\n','',line)
            line = re.sub(r'Kind: captions\n','',line)
            line = re.sub(r'Language: en\n','',line)
            line = re.sub(r'\d\d:\d\d:\d\d.\d\d\d --> \d\d:\d\d:\d\d.\d\d\d\n', '', line)
            line = re.sub(r'\d\d:\d\d:\d\d.\d\d\d --> \d\d:\d\d:\d\d.\d\d\d align:start position:\d+%\n','',line)
            line = re.sub(r'<\d+:\d+:\d+.\d+><c>','',line)
            line = re.sub(r'</c>', '', line)
            line = re.sub(r'(\n)', '', line)
            if (line != '') and (line != ' '):
                if line not in list(subtitles):
                    subtitles = np.append(subtitles, line)
        captions = ''
        for i in subtitles:
            captions = captions + ' ' + i
        
        # Now that we're done with the captions file, delete it from the directory
        os.remove('Captions.en.vtt')
    
    # If there is no captions file to download
    except FileNotFoundError:
        captions = ""
    
    # Append the videoId and captions into the dataframe
    df = df.append({'url':URL, 'title':title, 'captions':captions}, ignore_index=True)

# How long does this take?
print('Runtime: {} seconds'.format(time.time() - start_time))

Google Pixel 4 Review: Inside the Hype Machine!
Unboxing Every Google Pixel 4
Google Pixelbook Go review: Function over form
GoPro Max review: the most accessible 360 camera
Runtime: 4.854249954223633 seconds


In [34]:
import youtube_dl
import re
import os
import numpy as np

# Logger for youtube-dl
class MyLogger(object):

    def __init__(self):
        self.text = []
        
    def debug(self, msg):
        self.text.append(msg)

    def warning(self, msg):
        pass

    def error(self, msg):
        self.text.append(msg)

def pullYoutubeCaptions(URL):
    logger = MyLogger()
    
    # Settings that feed into the YoutubeDL object that downloads ONLY the captions
    ydl_opts = {
        'outtmpl' : 'Captions',
        'quiet' : True,
        'forcetitle' : True,
        'writesubtitles' : True,
        'writeautomaticsub' : True,
        #'listsubtitles' : True,
        #'allsubtitles' : True,
        'subtitleslangs': ['en'],
        'skip_download' : True,
        'logger' : logger
    }
    
    # Download the captions file to the local directory
    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        ydl.download([URL])
        
    # Pull the title of the video from the Logger
    title = ""
    for text in logger.text:
        if text[0:9] != '[youtube]':
            if text[0:6] != '[info]':
                title = text
    print(title)
        
    # If we successfully download the captions file
    try:    
        # Open the downloaded captions file, extract the text, then close the file
        file = open('Captions.en.vtt','r')
        captions = file.readlines()
        file.close()
        
        # Use RegEx to remove timestamps and convert the captions to a paragraph of text.
        subtitles = np.array([])
        for line in captions:
            line = str(line)
            line = re.sub(r'WEBVTT\n',"",line)
            line = re.sub(r'Kind: captions\n',"",line)
            line = re.sub(r'Language: en\n',"",line)
            line = re.sub(r'\d\d:\d\d:\d\d.\d\d\d --> \d\d:\d\d:\d\d.\d\d\d\n', "", line)
            line = re.sub(r'\d\d:\d\d:\d\d.\d\d\d --> \d\d:\d\d:\d\d.\d\d\d align:start position:\d+%\n',"",line)
            line = re.sub(r'<\d+:\d+:\d+.\d+><c>',"",line)
            line = re.sub(r'</c>', "", line)
            line = re.sub(r'(\n)', "", line)
            if (line != '') and (line != ' '):
                if line not in list(subtitles):
                    subtitles = np.append(subtitles, line)
        captions = ""
        for i in subtitles:
            captions = captions + " " + i
        # Now that we're done with the captions file, delete it from the directory
        os.remove('Captions.en.vtt')
    
    # If there is no captions file to download
    except FileNotFoundError:
        captions = ""

    return captions


In [35]:
pullYoutubeCaptions('https://www.youtube.com/watch?v=iNk5Res9bB0')

iPhone 11 Pro review: the BEST camera on a phone
