#### Imports

In [1]:
import pytube
import librosa
import ffmpy
import os
import time
import subprocess
import platform
import IPython

The code is developed and tested on Linux. For Windows, the necessary adaptations for handling paths must be made. 

In [2]:
platform.system() == 'Linux'

True

#### Download YouTube Videos using `pytube`

For downloading YouTube Videos we use the `pytube` library. In the context of `pytube`, we select to download both the audio and video tracks seperately (and then post-process them with software like `FFmpeg` to merge them).

In [3]:
def get_video_itag(stream_lst,
                  res,
                  subtype='mp4'):
    '''Return the `itag` of a YouTube video with specified resolution and subtype.
    If the desired resolution does not exist, the user is prompt to input a new one from a list.
    
    Input:
      stream_lst:  a list of available media formats
      res:         desired resolution, string of the form `xxxp`, where `x` is a number
      subtype:     desired subtype, string -- available options are `mp4` (default) and `webm`
    Output:
      `itag` of YouTube video
    '''
    video_streams = [stream for stream in stream_lst if stream.includes_audio_track == False]
    resolutions = [stream.resolution for stream in video_streams 
                   if stream.resolution != None and stream.subtype == subtype]
    if res not in resolutions:
        print('Select a new video resolution from the list: ', resolutions)
        new_res = input()
        return get_video_itag(stream_lst, new_res, subtype)
    itag = [stream.itag for stream in video_streams if stream.resolution == res and stream.subtype == subtype]
    video_itag = itag[0]
    return video_itag

In [4]:
def get_audio_itag(stream_lst,
                  abr,
                  subtype='mp4'):
    '''Return the `itag` of a YouTube video with specified audio bitrate (`abr`) and subtype.
    If the desired `arb` does not exist, the user is prompt to input a new one from a list.
    
    Input:
      stream_lst:  list of available media formats
      abr:         desired bit rate, string of the form `xxxkpbs`, where `x` is a number
      subtype:     desired subtype, string -- available options are `mp4` (default) and `webm`
    Output:
      `itag` of YouTube video
    '''
    audio_streams = [stream for stream in stream_lst if stream.includes_audio_track == True
                    and stream.includes_video_track == False]
    audio_abrs = [stream.abr for stream in audio_streams if stream.subtype == subtype]
    if abr not in audio_abrs:
        print('Select a new abr variable from the following list: ', audio_abrs)
        new_abr = input()
        return get_audio_itag(stream_lst, new_abr, subtype)
    itag = [stream.itag for stream in audio_streams if stream.abr == abr]
    audio_itag = itag[0]
    return audio_itag

In [5]:
def download_medium(youtube_url,
                   out_dir,
                   audio_filename,
                   video_filename,
                   res,
                   abr,
                   subtype='mp4'):
    '''Download the audio and video from a requested YouTube object.
    Audio and video are downloaded seperately and stored in seperate folders.
    
    Input:
      youtube_url:       url address of requested YouTube video
      out_dir:           parent directory where audio and video files will be stored 
      audio_name:        output audio name
      video_name:        output video name
      res, abr, subtype: arguments of `get_audio_itag` and `get_video_itag` functions
    Output:
      None    
    '''
    yt_obj = pytube.YouTube(youtube_url)                     # YouTube object
    streams = yt_obj.streams.all()                           # list of available media formats
    # [a] video
    # create path
    path_name=os.path.join(out_dir, 'video')
    os.makedirs(path_name, mode=0o777, exist_ok=True)        # create directory
    # get `itag`
    video_itag=get_video_itag(stream_lst=streams, res=res, subtype='mp4')
    # download video
    yt_obj.streams.get_by_itag(video_itag).download(output_path=path_name, filename=video_filename, filename_prefix=None)
    # [b] audio
    # create path
    path_name=os.path.join(out_dir, 'audio')
    os.makedirs(path_name, mode=0o777, exist_ok=True)
    # get `itag`
    audio_itag=get_audio_itag(stream_lst=streams, abr=abr, subtype='mp4')
    # download audio
    yt_obj.streams.get_by_itag(audio_itag).download(output_path=path_name, filename=audio_filename, filename_prefix=None)

In [6]:
def download_media(doc_path,
                  out_dir,
                  res='360p',
                  abr='128kbps',
                  subtype='mp4'
                  ):            
    '''Download audio and video from a YouTube object.
    The url addresses are provided by a text file, with each line being a single url address.
    
    Input:
      doc_path:          text file containg url addresses--each line is a single address
      out_dir:           parent directory where audio and video files will be stored
      res, abr, subtype: see `get_audio_itag` and `get_video_itag` functions
    Output
      None
    '''
    with open(doc_path, 'r') as f:
        yt_urls = f.read().splitlines()
    
    # print(yt_urls)
    
    for idx,url in enumerate(yt_urls):
        audio_name = 'audio_'+str(idx)
        video_name = 'video_'+str(idx)
        download_medium(youtube_url=url, out_dir=out_dir, audio_filename=audio_name, video_filename=video_name, 
                       res=res, abr=abr,subtype='mp4')

In [7]:
download_media(doc_path='../yt_dummy.txt',
              out_dir='../data')

#### Join audio & video files using `FFmpeg` through `ffmpy` 

This is the post-processing step for merging audio and video tracks into a single object, called Video or medium.

In [17]:
def av_merge(in_dir = '../data',
            out_dir = '../data'):
    '''
    Merges a video file with its associated audio file creating a single medium (Video), 
    which it is stored in a directory `out_dir/media/`
    
    Input:
      in_dir:  the directory containing the `audio` and `video` folders
      out_dir: the directory containing the `media` folder where the merged media will be stored
    Output:
      None
    '''
    # [1] match associated audio and video
    # e.g. audio_k is matched with video_k
    
    audio_path = os.path.join(in_dir, 'audio', '')
    video_path = os.path.join(in_dir, 'video', '')
    
    audio_files = os.listdir(audio_path)
    video_files = os.listdir(video_path)
    
    matched_pairs = [(video_name, audio_name)
                    for video_name in video_files for audio_name in audio_files
                    if video_name.split('.')[0].split('_')[-1] == audio_name.split('.')[0].split('_')[-1]]
    
    print(matched_pairs)
    
    matched_pairs = [(pair_1, pair_2) for (pair_1, pair_2) in matched_pairs if 'ipynb' not in pair_1 and 'ipynb' not in pair_2]
    print(matched_pairs)
    
    # [2] preparing the output folder and merging audio and video into a single medium
    
    path_name = os.path.join(out_dir, 'media', '')
    os.makedirs(path_name, mode=0o777, exist_ok=True)

    for idx in range(len(matched_pairs)):
        video = os.path.join(in_dir, 'video', matched_pairs[idx][0])
        audio = os.path.join(in_dir, 'audio', matched_pairs[idx][1])
        output_name = 'medium_'+str(idx)+'.mp4'
        output = os.path.join(path_name, output_name)
        if 'ipynb' in audio or 'ipynb' in video:
            pass
        else:
            inp = {audio:None, video:None}
            oup = {output:['-c', 'copy']}
            ff = ffmpy.FFmpeg(inputs=inp, outputs=oup)
            # print(ff.cmd)
            ff.run()

In [18]:
av_merge()

[('.ipynb_checkpoints', '.ipynb_checkpoints'), ('video_1.mp4', 'audio_1.mp4'), ('video_0.mp4', 'audio_0.mp4'), ('video_2.mp4', 'audio_2.mp4')]
[('video_1.mp4', 'audio_1.mp4'), ('video_0.mp4', 'audio_0.mp4'), ('video_2.mp4', 'audio_2.mp4')]


#### Segmet media files

In this step we step, we segment YouTuve Videos into specific parts. The start/stop points are determined using `librosa`'s `effect.split()` function on the audio tracks. The actual segmentation is performed by `FFmpeg`.

__Segment audio files__

In [None]:
# paths
audio_path = '../data/audio'
audio_filenames = os.listdir(audio_path)
audio_filename = os.path.join(audio_path, audio_filenames[1])
video_path = '../data/audio'
video_filenames = os.listdir(video_path)
video_filename = os.path.join(video_path, video_filenames[1])
media_path = '../data/media'
media_filenames = os.listdir(media_path)
media_filename = os.path.join(media_path, media_filenames[1])

# print(audio_filename)

In [19]:
def segment_audio(audio_dir,
                  audio_fn,
                  top_db=50,
                  thres_duration=100):
    '''
    Determining the cutoff points in an audio track.
    Using `librosa.effects.split()` we determine the non-silent intervals in an audio track based on a threshold value `top_db`.
    The end points of the concurrent intervals may be merged if the duration of the associated non-silent audio parts do not exceed
    a reference threshold value `thes_duration`. 
    
    Input:
      audio_fn: the filename of the audio track, e.g. audio_1.mp4
      top_db:   the threshold (in decibels) below reference to consider as silence
      duration: the threshold duration (in seconds) of a part between cutoff points;
                if None no merging of cutoff points takes place
    Output:
      a list of cutoff points (in seconds)
    '''
    # load audio track
    audio_path = os.path.join(audio_dir, audio_fn)
    audio_track, sr = librosa.load(audio_path, sr=None)
    audio_len = len(audio_track)
    
    # split on silence
    parts = librosa.effects.split(audio_track, top_db=top_db)
    parts_no = len(parts)
    
    if thres_duration != None and parts_no > 1:
        # duration of each non-silent parts
        parts_duration = []
        for part in parts:
            w = list(range(part[0], part[1]+1))                                     # find samples of audio track covered by a part
            # alternatively,
            # w = [idx for idx in range(audio_len) if segment[0] <= idx <= segment[1]]           
            track = audio_track[w]                                                  # restrict audio track to desired samples
            dur = librosa.core.get_duration(track, sr=sr)                           # get duration of cropped track
            # print(dur)
            parts_duration.append(dur)                                              # update: append to list
        # print(parts_duration)
        
        # breakpoints
        parts_ = list(range(parts_no))
        breakpoints = []
        sample = 0                                      # start at the beginning of the list
        while sample in parts_:
            duration = parts_duration[sample]           # initialize: duration of current sample
            if duration > thres_duration:               # if the duration of a sample is larger than the threshold value
                sample += 1                             # move to the next sample - this will be a breakpoint
            while duration <= thres_duration:           # while the threshold duration is not exceeded - breakpoints can be merged
                if sample == parts_[-1]:                # we have reached the last element of the list
                    break                               # break inner while
                sample += 1                             # update sample index
                duration += parts_duration[sample]      # update duration
            breakpoints.append(sample - 1)              # append breakpoint
            if sample == parts_[-1]:                    # we have reached the last element of the list
                break                                   # break outer while    
        # print(breakpoints)
        
        # duration of concurrent parts (parts inbetween breakpoints)
        breakpoints_ = [breakpoint+1 for breakpoint in breakpoints]
        aug_breakpoints_ = [0] + breakpoints_ + [parts_no]
        # print(aug_breakpoints_)
        bps_len = len(aug_breakpoints_)
        durations_count = []

        for idx in range(1, bps_len):                                     # for all elements of `breakpoints` 
            count = 0
            # print(aug_breakpoints_[idx - 1], aug_breakpoints_[idx])
            for jdx in range(aug_breakpoints_[idx - 1], aug_breakpoints_[idx]):
                count += parts_duration[jdx]
            durations_count.append(count)
        # print(durations_count)
        
        # is the last breakpoint legit?
        if durations_count[-2] + durations_count[-1] <= thres_duration:            # if not legit
            # print(durations_count[-2] + durations_count[-1])
            # print(durations_count[-2], durations_count[-1])
            breakpoints = breakpoints[:-1]                                         # remove it
        # print(breakpoints)
        
        # find the segmentation points: the indices of the `audio_track` where we can cut it
        segmentation_points = []
        for breakpoint in breakpoints:
            segmentation_points.append(parts[breakpoint][1])                     # the right-hand extremity of the part
        aug_segmentation_points = [0] + segmentation_points + [audio_len]
        # print(aug_segmentation_points)
        
        # now we can determine the breakpoints in seconds (or minutes)
        cutoffs = [librosa.core.samples_to_time(elem, sr=sr) for elem in aug_segmentation_points]
        return cutoffs
    
    elif thres_duration != None and segments_no == 1:
        print('There are no cutoff points. Consider lowering the top_db variable.')
    else:
        for idx in range(1, parts_no):
            # if idx == 0:
            #    parts[idx][0] = 0
            # if idx == parts_no - 1:
            #    parts[idx][0] = parts[idx-1][1] + 1
            #    part[idx][1] = audio_track[-1]
            # else:
            #    part[idx][0] = parts[idx][1] + 1
            parts[idx][0] = parts[idx-1][1] + 1
        breakpoints = [parts[idx][1] for idx in range(parts_no)]
        aug_breakpoints = [0] + breakpoints + [audio_len]
        cutoffs = [librosa.core.samples_to_time(elem, sr=sr) for elem in aug_breakpoints]
        return cutoffs
    

In [21]:
# segment_audio(audio_dir='../data/audio', audio_fn='audio_2.mp4')

In [22]:
def segment_medium(audio_fn,
                  audio_dir='../data/audio', 
                  media_dir='../data/media',
                  out_dir='../data/segmented',
                  top_db=50,
                  thres_duration=100):
    '''
    Segment a YouTube Video to parts , where the endpoints (start/stop points) are determined by `segment_audio()` fubction.
    The segmented Videos are stored locally in the `out_dir` directory.
    
    Input:
      audio_fn:       see `segment_audio()`
      media_path:     path to where the media files (Video files) are located
      top_db:         see `segment_audio()`
      thres_duration: see `segment_audio()`
    Output:
      None
    '''
    # cutoff points of audio track
    cutoffs = segment_audio(audio_dir=audio_dir, audio_fn=audio_fn, top_db=top_db, thres_duration=thres_duration)
    
    # match medium (Video) to audio track
    media_filenames = os.listdir(media_dir)
    matched_medium = [medium_fn for medium_fn in media_filenames
                     if medium_fn.split('.')[0].split('_')[-1] == audio_fn.split('.')[0].split('_')[-1]]
    matched_medium = matched_medium[0]
    print(matched_medium)
    
    # paths & directories
    medium_path = os.path.join(media_dir, matched_medium)
    medium_ = matched_medium.split('.')[0]
    os.makedirs(out_dir, mode=0o777, exist_ok=True)
    
    # segment Video
    for idx in range(1, len(cutoffs)):
        start = time.strftime('%H:%M:%S', time.gmtime(cutoffs[idx - 1]))
        end = time.strftime('%H:%M:%S', time.gmtime(cutoffs[idx]))
        output_name = medium_+'_'+str(idx)+'.mp4'
        output = os.path.join(out_dir, output_name)

        inp = {medium_path:['-ss', start]}
        oup = {output:['-to', end, '-c', 'copy']}

        ff = ffmpy.FFmpeg(inputs=inp, outputs=oup)
        # print(ff.cmd)
        ff.run()

In [23]:
segment_medium(audio_fn='audio_2.mp4')



medium_2.mp4


In [None]:
# A step-by-step execution

In [None]:
# loading an audio track
audio_path = '../data/audio'
audio_filenames = os.listdir(audio_path)

filename = os.path.join(audio_path, audio_filenames[1])
print(filename)

audio_track, sr = librosa.load(filename, sr=None)           

In [None]:
# basic info
print(audio_track.shape)
audio_len = audio_track.shape[0]
print(audio_len)
print(sr)
print(librosa.core.get_duration(audio_track, sr=sr))
print(librosa.core.get_duration(audio_track, sr=sr)/60)

In [None]:
# playing the audio track
# IPython.display.Audio(data=audio_track[:], rate=sr)

In [None]:
# slicing the audio track into non-silent intervals
# `segments` contain the [start, stop] positions (indices of `audio_track`) of non-silent intervals 
segments = librosa.effects.split(audio_track, top_db=50)         
segments_no = len(segments)
print(segments_no)
print(segments[:20])

In [None]:
# duration of each non-silent segment
segments_duration = []
for segment in segments:
    w = list(range(segment[0], segment[1]+1))                           # find samples of audio track covered by a segment
    # alternatively,
    # w = [idx for idx in range(audio_len) if segment[0] <= idx <= segment[1]]           
    track = audio_track[w]                                              # crop audio track to desired samples
    dur = librosa.core.get_duration(track, sr=sr)                       # get duration of cropped track
    # print(dur)
    segments_duration.append(dur)                                       # update: append to list
print(segments_duration)

In [None]:
# what we would like to do next is to determine how should we merge the elements of the `segments` list
# into concurrent audio pieces, in such a way that the duration of the concurrent audio pieces does not
# exceed a user-defined threshold duration
# in effect we just need to determine where to break the `segments` list so that inbetween breakpoints 
# the list's elements will be consider as a single piece of audio
# need to exclude the last element of the `breakpoints` list

segments_ = list(range(segments_no))

thres = 100
breakpoints = []
accumulated_duration = []
sample = 0                                              # start at the beginning of the list
while sample in segments_:
    duration = segments_duration[sample]                # initialize: duration of current sample
    if duration > thres:                                # if the duration of a sample is larger than the threshold value
        sample += 1                                     # move to the next sample - this will be a breakpoint
    while duration <= thres:                            # while the threshold duration is not exceeded - segments that can be merged
        if sample == segments_[-1]:                     # we have reached the last element of the list
            break                                       # break inner while
        sample += 1                                     # update sample index
        duration += segments_duration[sample]           # update duration
    breakpoints.append(sample - 1)                      # append breakpoint
    if sample == segments_[-1]:                         # we have reached the last element of the list
        break                                           # break outer while    

print(breakpoints)

In [None]:
# find the duration of concurrent segments (segments inbetween breakpoints)

breakpoints_ = [breakpoint+1 for breakpoint in breakpoints]
aug_breakpoints_ = [0] + breakpoints_ + [segments_no]
print(aug_breakpoints_)

bps_len = len(aug_breakpoints_)

durations_count = []

for idx in range(1, bps_len):                                     # for all elements of `breakpoints` 
    count = 0
    # print(aug_breakpoints_[idx - 1], aug_breakpoints_[idx])
    for jdx in range(aug_breakpoints_[idx - 1], aug_breakpoints_[idx]):
        count += segments_duration[jdx]
    durations_count.append(count)
        
print(durations_count)

In [None]:
# alternative way of computating the duration between breakpoints

breakpoints_ = [breakpoint+1 for breakpoint in breakpoints]
aug_breakpoints_ = [0] + breakpoints_ + [segments_no]
print(aug_breakpoints_)

bps_len = len(aug_breakpoints_)

durations_count_ = [sum(segments_duration[aug_breakpoints_[idx - 1] : aug_breakpoints_[idx]]) for idx in range(1, bps_len)]

print(durations_count_)

In [None]:
# is the last breakpoint legit?

if durations_count[-2] + durations_count[-1] <= thres:
    print(durations_count[-2] + durations_count[-1])
    print(durations_count[-2], durations_count[-1])
    
    breakpoints = breakpoints[:-1]

print(breakpoints)

In [None]:
# find the segmentation points: the indices of the `audio_track` where we can cut it

segmentation_points = []

for breakpoint in breakpoints:
    segmentation_points.append(segments[breakpoint][1])                     # the right-hand extremity of the segment

aug_segmentation_points = [0] + segmentation_points + [audio_len]
print(aug_segmentation_points)

In [None]:
# now we can determine the breakpoints in seconds (or minutes)
cutoffs = [librosa.core.samples_to_time(elem, sr=sr) for elem in aug_segmentation_points]
cutoffs_minutes = [elem/60 for elem in cutoffs]
    
print(cutoffs)
print(cutoffs_minutes)

In [None]:
# play segments
audio_segments = []
for idx in range(1, len(aug_segmentation_points)):
    w = list(range(aug_segmentation_points[idx - 1], aug_segmentation_points[idx]))
    audio = audio_track[w]
    audio_segments.append(audio)

In [None]:
audio = audio_segments[0]
IPython.display.Audio(data=audio[:], rate=sr)

In [None]:
audio = audio_segments[1]
IPython.display.Audio(data=audio[:], rate=sr)

In [None]:
audio = audio_segments[2]
IPython.display.Audio(data=audio[:], rate=sr)

In [None]:
audio = audio_segments[3]
IPython.display.Audio(data=audio[:], rate=sr)

In [None]:
audio = audio_segments[4]
IPython.display.Audio(data=audio[:], rate=sr)

In [None]:
audio = audio_segments[5]
IPython.display.Audio(data=audio[:], rate=sr)

In [None]:
audio = audio_segments[6]
IPython.display.Audio(data=audio[:], rate=sr)

In [None]:
audio = audio_segments[7]
IPython.display.Audio(data=audio[:], rate=sr)

In [None]:
new_segments = segments
for idx in range(1, segments_no):
    # if idx == 0:
    #    segments[idx][0] = 0
    # if idx == segments_no - 1:
        #    segments[idx][0] = segments[idx-1][1] + 1
        #    segments[idx][1] = audio_track[-1]
    # else:
        #    segments[idx][0] = segments[idx][1] + 1
    new_segments[idx][0] = segments[idx-1][1] + 1
breakpoints = [new_segments[idx][1] for idx in range(segments_no)]
aug_breakpoints = [0] + breakpoints + [audio_len]
cutoffs = [librosa.core.samples_to_time(elem, sr=sr) for elem in aug_breakpoints]
print(len(cutoffs))
print(cutoffs[:20])
print(cutoffs[-1])

In [None]:
# match medium to audio
audio_fn = audio_filenames[1]                                     # audio_2.mp4

media_path = '../data/media'
media_filenames = os.listdir(media_path)
matched_medium = [medium_fn for medium_fn in media_filenames
                 if medium_fn.split('.')[0].split('_')[-1] == audio_fn.split('.')[0].split('_')[-1]]
matched_medium[0]

In [None]:
# segment medium: ffmpeg in action

# ffmpeg -i /home/nantembo/VideoPerl/1.mp4 -f avi -vcodec copy -acodec copy -ss 0:14:47 -to 0:58:55 /home/nantembo/VideoPerl/2.mp4
# ffmpeg -i movie.mp4 -ss 00:00:03 -t 00:00:08 -async 1 cut.mp4
#  ffmpeg -ss 00:01:00 -i input.mp4 -to 00:02:00 -c copy output.mp4

medium = os.path.join('../data/media', matched_medium[0])
medium_ = matched_medium[0].split('.')[0]
path = '../data/segmented'
os.makedirs(path, mode=0o777, exist_ok=True)

print(medium)
print(medium_)
print(path)

In [None]:
for idx in range(1, len(cutoffs)):
    start = time.strftime('%H:%M:%S', time.gmtime(cutoffs[idx - 1]))
    end = time.strftime('%H:%M:%S', time.gmtime(cutoffs[idx]))
    output_name = medium_+'_'+str(idx)+'.mp4'
    output = os.path.join(path, output_name)

    inp = {medium:['-ss', start]}
    oup = {output:['-to', end, '-c', 'copy']}

    ff = ffmpy.FFmpeg(inputs=inp, outputs=oup)
    print(ff.cmd)
    ff.run()

In [None]:
start = time.strftime('%H:%M:%S', time.gmtime(cutoffs[0]))
print(start)
end = time.strftime('%H:%M:%S', time.gmtime(cutoffs[1]))
print(end)
output_name = medium_+'_'+str(idx)+'.mp4'
print(output_name)
output = os.path.join(path, output_name)
print(output)
inp = {medium:['-ss', start]}
oup = {output:['-to', end, '-c', 'copy']}

ff = ffmpy.FFmpeg(inputs=inp, outputs=oup)
print(ff.cmd)
# ff.run()