In [15]:
from pytube import YouTube
from pydub import AudioSegment
import os

def download_audio_from_youtube(video_url, output_path):
    try:
        # Download video
        yt = YouTube(video_url)
        video_stream = yt.streams.filter(only_audio=True).first()
        download_path = video_stream.download(output_path=output_path)

        # Convert to MP3 using pydub
        audio = AudioSegment.from_file(download_path)
        mp3_path = os.path.splitext(download_path)[0] + '.mp3'
        audio.export(mp3_path, format="mp3")

        # Remove the original file
        os.remove(download_path)

        print(f"Downloaded and converted audio saved at: {mp3_path}")
    except Exception as e:
        print(f"An error occurred: {e}")

if __name__ == "__main__":
    video_url = "https://www.youtube.com/watch?v=6dKiEY0UOtA&t=5s"
    output_path = "test4.mp3"
    download_audio_from_youtube(video_url, output_path)


Downloaded and converted audio saved at: g:\Projects\AudioScraping\test4.mp3\I Visited the Worlds Busiest Train Station.mp3


In [None]:
#Importing the AudioSegment
from pydub import AudioSegment
from pydub.silence import split_on_silence

#Normalise apmplitude
def match_target_amplitude(aChunk, target_dBFS):
    ''' Normalize given audio chunk '''
    change_in_dBFS = target_dBFS - aChunk.dBFS
    return aChunk.apply_gain(change_in_dBFS)

# Load your audio.
song = AudioSegment.from_mp3("test2.mp3/test2.mp3")

#Chunk creation
thresh = float('-inf')
chunks = split_on_silence (song,min_silence_len = 500 ,silence_thresh = thresh)

# Process each chunk with your parameters
for i, chunk in enumerate(chunks):
    silence_chunk = AudioSegment.silent(duration=10)
    audio_chunk = silence_chunk + chunk + silence_chunk
    normalized_chunk = match_target_amplitude(audio_chunk, -20.0)

    print("Exporting chunk{0}.mp3.".format(i))
    normalized_chunk.export(
        "./output8/chunk{0}.mp3".format(i),
        bitrate = "192k",
        format = "mp3"
    )

Exporting chunk0.mp3.
Exporting chunk1.mp3.
Exporting chunk2.mp3.
Exporting chunk3.mp3.
Exporting chunk4.mp3.
Exporting chunk5.mp3.
Exporting chunk6.mp3.
Exporting chunk7.mp3.
Exporting chunk8.mp3.
Exporting chunk9.mp3.
Exporting chunk10.mp3.
Exporting chunk11.mp3.
Exporting chunk12.mp3.
Exporting chunk13.mp3.
Exporting chunk14.mp3.
Exporting chunk15.mp3.
Exporting chunk16.mp3.
Exporting chunk17.mp3.
Exporting chunk18.mp3.
Exporting chunk19.mp3.
Exporting chunk20.mp3.
Exporting chunk21.mp3.
Exporting chunk22.mp3.
Exporting chunk23.mp3.
Exporting chunk24.mp3.
Exporting chunk25.mp3.
Exporting chunk26.mp3.
Exporting chunk27.mp3.
Exporting chunk28.mp3.
Exporting chunk29.mp3.
Exporting chunk30.mp3.


In [None]:
from pydub import AudioSegment
import numpy as np
import os

def calculate_decibels(frame):
    # Convert audio frame to numpy array
    frame_array = np.array(frame.get_array_of_samples())
    # Calculate RMS (Root Mean Square) value
    rms = np.sqrt(np.mean(frame_array**2))
    # Convert RMS to decibels
    if rms > 0:
        decibels = 20 * np.log10(rms)
    else:
        decibels = -float('inf')  # -inf if there is complete silence
    return decibels

def get_decibel_readings(audio_file_path, frame_duration_ms=100):
    audio = AudioSegment.from_file(audio_file_path)
    duration_ms = len(audio)
    decibel_readings = []

    for start_ms in range(0, duration_ms, frame_duration_ms):
        end_ms = start_ms + frame_duration_ms
        frame = audio[start_ms:end_ms]
        decibels = calculate_decibels(frame)
        decibel_readings.append((start_ms, decibels))

    return decibel_readings

if __name__ == "__main__":
    audio_file_path = "test2.mp3/test2.mp3"
    frame_duration_ms = 500

    if not os.path.exists(audio_file_path):
        print("File does not exist.")
    else:
        readings = get_decibel_readings(audio_file_path, frame_duration_ms)
        for start_ms, decibels in readings:
            print(f"Time: {start_ms} ms, Decibels: {decibels:.2f} dB")


Time: 0 ms, Decibels: 27.00 dB
Time: 500 ms, Decibels: -inf dB
Time: 1000 ms, Decibels: 34.71 dB
Time: 1500 ms, Decibels: 38.36 dB
Time: 2000 ms, Decibels: 36.78 dB
Time: 2500 ms, Decibels: 30.63 dB
Time: 3000 ms, Decibels: 26.50 dB
Time: 3500 ms, Decibels: 29.24 dB
Time: 4000 ms, Decibels: 37.25 dB
Time: 4500 ms, Decibels: 28.96 dB
Time: 5000 ms, Decibels: 24.91 dB
Time: 5500 ms, Decibels: 30.73 dB
Time: 6000 ms, Decibels: 24.67 dB
Time: 6500 ms, Decibels: 21.95 dB
Time: 7000 ms, Decibels: 36.21 dB
Time: 7500 ms, Decibels: 36.35 dB
Time: 8000 ms, Decibels: 32.05 dB
Time: 8500 ms, Decibels: 28.89 dB
Time: 9000 ms, Decibels: -inf dB
Time: 9500 ms, Decibels: 17.19 dB
Time: 10000 ms, Decibels: 35.93 dB
Time: 10500 ms, Decibels: 26.97 dB
Time: 11000 ms, Decibels: 29.53 dB
Time: 11500 ms, Decibels: 31.09 dB
Time: 12000 ms, Decibels: 14.39 dB
Time: 12500 ms, Decibels: 24.50 dB
Time: 13000 ms, Decibels: 28.03 dB
Time: 13500 ms, Decibels: 31.97 dB
Time: 14000 ms, Decibels: 37.29 dB
Time: 14500

  rms = np.sqrt(np.mean(frame_array**2))


In [None]:
from pydub import AudioSegment

def split_mp3(file_path, split_length=3000):
    # Load the MP3 file
    audio = AudioSegment.from_mp3(file_path)
    
    # Calculate the number of splits
    total_length = len(audio)
    start = 0
    
    # Split the audio every `split_length` milliseconds
    for i in range(0, total_length, split_length):
        split_audio = audio[start:start+split_length]
        split_filename = f"output6/split_part_{i//split_length}.mp3"
        split_audio.export(split_filename, format="mp3")
        start += split_length

# Example usage
split_mp3("test1.mp3/test1.mp3")

In [None]:
##USING GOOGLE API CLIENT DISCOVERY

from googleapiclient.discovery import build
from googleapiclient.http import MediaIoBaseDownload
import io

# Initialize the YouTube API client
youtube = build('youtube', 'v3', developerKey='YOUR_API_KEY')

def download_captions(caption_id):
    request = youtube.captions().download(
        id=caption_id,
        tfmt='srt'  # Specifies the format of the caption file to download, e.g., 'srt' for SubRip subtitle format.
    )
    
    fh = io.BytesIO()
    downloader = MediaIoBaseDownload(fh, request)
    done = False
    while not done:
        status, done = downloader.next_chunk()
    
    fh.seek(0)
    return fh.read().decode('utf-8')

# Example usage
caption_id = 'YOUR_CAPTION_ID' 
transcript = download_captions(caption_id)
print(transcript)

Using the Youtube Transcript API

In [None]:
from pytube import YouTube

def download_video(url, path='videos/'):
    yt = YouTube(url)
    video = yt.streams.filter(file_extension='mp4').first()
    video.download(output_path=path, filename='video.mp4')

In [None]:
from moviepy.editor import *

def extract_audio(video_path, audio_path='audio/'):
    video = VideoFileClip(video_path)
    audio = video.audio
    audio.write_audiofile(audio_path + 'audio.mp3')

In [None]:
from youtube_transcript_api import YouTubeTranscriptApi

def download_transcript(video_id):
    transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
    transcript = transcript_list.find_transcript(['en'])  # Assuming English transcripts
    return transcript.fetch()

#USING YT - DLP LIBRARY

In [None]:
import yt_dlp

def download_srt(video_url):
    ydl_opts = {
        'writesubtitles': True,  # Write subtitle file
        'subtitlesformat': 'srt',  # Subtitles format (srt/vtt/ass/...)
        'subtitleslangs': ['en'],  # Languages of subtitles to download (use ['all'] to download all available subtitles)
        'outtmpl': '%(id)s.%(ext)s',  # Save file using the video ID
    }

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([video_url])

# Example usage
video_url = 'https://www.youtube.com/watch?v=QoKpQMJnBHY&ab_channel=SpeakEnglishSmartly'
download_srt(video_url)

[youtube] Extracting URL: https://www.youtube.com/watch?v=QoKpQMJnBHY&ab_channel=SpeakEnglishSmartly
[youtube] QoKpQMJnBHY: Downloading webpage
[youtube] QoKpQMJnBHY: Downloading ios player API JSON
[youtube] QoKpQMJnBHY: Downloading player 5352eb4f
[youtube] QoKpQMJnBHY: Downloading m3u8 information
[info] QoKpQMJnBHY: Downloading 1 format(s): 616+251
[info] There are no subtitles for the requested languages
[hlsnative] Downloading m3u8 manifest
[hlsnative] Total fragments: 93
[download] Destination: QoKpQMJnBHY.f616.mp4
[download] 100% of   73.11MiB in 00:00:36 at 2.00MiB/s                 
[download] Destination: QoKpQMJnBHY.f251.webm
[download] 100% of    7.09MiB in 00:00:01 at 4.23MiB/s   
[Merger] Merging formats into "QoKpQMJnBHY.webm"
Deleting original file QoKpQMJnBHY.f616.mp4 (pass -k to keep)
Deleting original file QoKpQMJnBHY.f251.webm (pass -k to keep)


YT-DLP FOR A YOUTUBE CHANNEL

In [None]:
import subprocess
import yt_dlp

def get_video_ids(channel_url):
    command = ['yt-dlp', '--get-id', channel_url]
    result = subprocess.run(command, stdout=subprocess.PIPE, text=True)
    video_ids = result.stdout.strip().split('\n')
    return video_ids

def download_srt(video_id):
    ydl_opts = {
        'writesubtitles': True,
        'subtitlesformat': 'srt',
        'subtitleslangs': ['en-CA','en'],
        'outtmpl': '%(id)s.%(ext)s',
    }

    video_url = f'https://www.youtube.com/watch?v={video_id}'
    '''with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([video_url])'''

# Example usage
channel_url = 'https://www.youtube.com/c/NotJustBikes/videos'
video_ids = get_video_ids(channel_url)

for video_id in video_ids:
    download_srt(video_id)

YT-DLP REFINED ---- 5/7/24

In [None]:
import yt_dlp

def extract_transcript(video_url):
    ydl_opts = {
        'skip_download': True,  # Skip downloading the video
        'writeautomaticsub': True,  # Download automatic subtitles
        'subtitleslangs': ['en','en-CA'],  # Preferred subtitle language
        'subtitlesformat': 'vtt',  # Subtitle format
        'outtmpl': 'subtitles/%(id)s.%(ext)s',  # Output template
        'quiet': True,  # Suppress verbose output
    }

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([video_url])

# Example usage
video_url = 'https://youtu.be/6dKiEY0UOtA'
extract_transcript(video_url)

                                                                       

SUBTITLE METADATA

In [None]:
import yt_dlp

def get_subtitle_info(video_url):
    ydl_opts = {
        'skip_download': True,  # We only want to extract info
        'writesubtitles': True,
        'subtitleslangs': [],  # Leave empty to get all available languages
    }

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(video_url, download=False)
        return info

def print_subtitle_info(info):
    if 'subtitles' in info:
        print(f"Subtitles available for {info['title']} ({info['id']}):")
        for lang, subtitles in info['subtitles'].items():
            print(f"  Language: {lang}")
            for subtitle in subtitles:
                print(f"    Format: {subtitle['ext']}")
                print(f"    URL: {subtitle['url']}")
    else:
        print(f"No subtitles available for {info['title']} ({info['id']}).")

if __name__ == "__main__":
    video_url = "https://youtu.be/6dKiEY0UOtA"
    info = get_subtitle_info(video_url)
    print_subtitle_info(info)


[youtube] Extracting URL: https://youtu.be/6dKiEY0UOtA
[youtube] 6dKiEY0UOtA: Downloading webpage
[youtube] 6dKiEY0UOtA: Downloading ios player API JSON
[youtube] 6dKiEY0UOtA: Downloading m3u8 information
[info] 6dKiEY0UOtA: Downloading subtitles: en-CA
Subtitles available for I Visited the World's Busiest Train Station (6dKiEY0UOtA):
  Language: en-CA
    Format: json3
    URL: https://www.youtube.com/api/timedtext?v=6dKiEY0UOtA&ei=MY2HZv7XK762z7sPhtCO2AQ&caps=asr&opi=112496729&xoaf=5&hl=en&ip=0.0.0.0&ipbits=0&expire=1720184737&sparams=ip%2Cipbits%2Cexpire%2Cv%2Cei%2Ccaps%2Copi%2Cxoaf&signature=7FBE3C624F3F7847E415ADB2A5B5A888000400AE.7116F0FEA90FEF76FFD969168FF86890A24C4800&key=yt8&lang=en-CA&fmt=json3
    Format: srv1
    URL: https://www.youtube.com/api/timedtext?v=6dKiEY0UOtA&ei=MY2HZv7XK762z7sPhtCO2AQ&caps=asr&opi=112496729&xoaf=5&hl=en&ip=0.0.0.0&ipbits=0&expire=1720184737&sparams=ip%2Cipbits%2Cexpire%2Cv%2Cei%2Ccaps%2Copi%2Cxoaf&signature=7FBE3C624F3F7847E415ADB2A5B5A888000400A

EXTRACTING THE TIMESTAMPS INTO AN ARRAY

In [None]:
import re
from datetime import datetime, timedelta

def extract_timestamps_from_vtt(filename):
    # Regular expression to match timestamps
    timestamp_regex = r'(\d{2}:\d{2}:\d{2}\.\d{3}) --> (\d{2}:\d{2}:\d{2}\.\d{3})'
    
    timestamps = []
    
    with open(filename, 'r', encoding='utf-8') as file:
        content = file.read()
        matches = re.findall(timestamp_regex, content)
        for start, end in matches:
            # Parse timestamps and convert to milliseconds
            start_ms = int(timedelta(hours=int(start[0:2]), minutes=int(start[3:5]), seconds=int(start[6:8]), milliseconds=int(start[9:])).total_seconds() * 1000)
            end_ms = int(timedelta(hours=int(end[0:2]), minutes=int(end[3:5]), seconds=int(end[6:8]), milliseconds=int(end[9:])).total_seconds() * 1000)
            timestamps.append((start_ms, end_ms))
    
    return timestamps

# Example usage
filename = 'subtitles/6dKiEY0UOtA.en.vtt'
timestamps = extract_timestamps_from_vtt(filename)
arr = timestamps
print(timestamps)


[(320, 2709), (2709, 2719), (2719, 4510), (4510, 4520), (4520, 6869), (6869, 6879), (6879, 8750), (8750, 8760), (8760, 10589), (10589, 10599), (10599, 12310), (12310, 12320), (12320, 15589), (15589, 15599), (15599, 17590), (17590, 17600), (17600, 19510), (19510, 19520), (19520, 20990), (20990, 21000), (21000, 22349), (22349, 22359), (22359, 24349), (24349, 24359), (24359, 26349), (26349, 26359), (26359, 29990), (29990, 30000), (30000, 31790), (31790, 31800), (31800, 34430), (34430, 34440), (34440, 37270), (37270, 37280), (37280, 39229), (39229, 39239), (39239, 42110), (42110, 42120), (42120, 43630), (43630, 43640), (43640, 45470), (45470, 45480), (45480, 47229), (47229, 47239), (47239, 50750), (50750, 50760), (50760, 52869), (52869, 52879), (52879, 54990), (54990, 55000), (55000, 57910), (57910, 57920), (57920, 59950), (59950, 59960), (59960, 62150), (62150, 62160), (62160, 64069), (64069, 64080), (64080, 66030), (66030, 66040), (66040, 73030), (73030, 73040), (73040, 74950), (74950, 7

In [None]:
def combine_timestamps(timestamps):
    combined_timestamps = []
    i = 0
    while i < len(timestamps) - 6:
        combined_timestamps.append((timestamps[i][0], timestamps[i + 6][1]))
        i += 7
    while i < len(timestamps):
        combined_timestamps.append(timestamps[i])
        i += 1

    return combined_timestamps

combined_timestamps = combine_timestamps(arr)
print(combined_timestamps)
len(combined_timestamps)

[(320, 8750), (8750, 15599), (15599, 22349), (22349, 30000), (30000, 39229), (39229, 45480), (45480, 54990), (54990, 62160), (62160, 74950), (74950, 81799), (81799, 89910), (89910, 97560), (97560, 106190), (106190, 113600), (113600, 122590), (122590, 128239), (128239, 137390), (137390, 144480), (144480, 154110), (154110, 160760), (160760, 168430), (168430, 175159), (175159, 182309), (182309, 187840), (187840, 197630), (197630, 203560), (203560, 210670), (210670, 215760), (215760, 223670), (223670, 228400), (228400, 236110), (236110, 240640), (240640, 248949), (248949, 255000), (255000, 262069), (262069, 269680), (269680, 277629), (277629, 284120), (284120, 292189), (292189, 298720), (298720, 307469), (307469, 313880), (313880, 321830), (321830, 327960), (327960, 337830), (337830, 344360), (344360, 352550), (352550, 358960), (358960, 367110), (367110, 372639), (372639, 380270), (380270, 385919), (385919, 394110), (394110, 399880), (399880, 410110), (410110, 416240), (416240, 425510), (4

171

AUDIO SEGMENTATION USING TIMESTAMPS

In [16]:
from pydub import AudioSegment
import os

# Example timestamps and files (start and end times in milliseconds)
timestamps = {
    'test4.mp3/test4.mp3': combined_timestamps}

def chunk_audio_files(timestamps, output_folder):
    
    for file, chunks in timestamps.items():
        audio = AudioSegment.from_file(file)
        for i, (start, end) in enumerate(chunks):
            # Slice the audio segment for the given start and end times
            chunk = audio[start:end]
            # Include the output folder in the chunk filename
            chunk_filename = os.path.join(output_folder, f"{os.path.basename(file).rsplit('.', 1)[0]}_chunk{i}.mp3")
            chunk.export(chunk_filename, format="mp3")
            print(f"Exported {chunk_filename}")

# Specify the output folder
output_folder = 'output10'
chunk_audio_files(timestamps, output_folder)

Exported output10\test4_chunk0.mp3
Exported output10\test4_chunk1.mp3
Exported output10\test4_chunk2.mp3
Exported output10\test4_chunk3.mp3
Exported output10\test4_chunk4.mp3
Exported output10\test4_chunk5.mp3
Exported output10\test4_chunk6.mp3
Exported output10\test4_chunk7.mp3
Exported output10\test4_chunk8.mp3
Exported output10\test4_chunk9.mp3
Exported output10\test4_chunk10.mp3
Exported output10\test4_chunk11.mp3
Exported output10\test4_chunk12.mp3
Exported output10\test4_chunk13.mp3
Exported output10\test4_chunk14.mp3
Exported output10\test4_chunk15.mp3
Exported output10\test4_chunk16.mp3
Exported output10\test4_chunk17.mp3
Exported output10\test4_chunk18.mp3
Exported output10\test4_chunk19.mp3
Exported output10\test4_chunk20.mp3
Exported output10\test4_chunk21.mp3
Exported output10\test4_chunk22.mp3
Exported output10\test4_chunk23.mp3
Exported output10\test4_chunk24.mp3
Exported output10\test4_chunk25.mp3
Exported output10\test4_chunk26.mp3
Exported output10\test4_chunk27.mp3
Ex

In [9]:
def milliseconds_to_time_format(milliseconds):
    # Calculate hours, minutes, seconds and milliseconds
    hours = milliseconds // (1000 * 60 * 60)
    milliseconds_remaining = milliseconds % (1000 * 60 * 60)
    minutes = milliseconds_remaining // (1000 * 60)
    milliseconds_remaining %= (1000 * 60)
    seconds = milliseconds_remaining // 1000
    milliseconds_remaining %= 1000
    
    # Format the time string
    time_format = f"{hours:02}:{minutes:02}:{seconds:02}:{milliseconds_remaining:03}"
    
    return time_format

time_format = []
for i in combined_timestamps:
    start_format = milliseconds_to_time_format(i[0])
    end_format = milliseconds_to_time_format(i[1])
    time_format.append((start_format, end_format))

print(time_format)



[('00:00:00:320', '00:00:08:750'), ('00:00:08:750', '00:00:15:599'), ('00:00:15:599', '00:00:22:349'), ('00:00:22:349', '00:00:30:000'), ('00:00:30:000', '00:00:39:229'), ('00:00:39:229', '00:00:45:480'), ('00:00:45:480', '00:00:54:990'), ('00:00:54:990', '00:01:02:160'), ('00:01:02:160', '00:01:14:950'), ('00:01:14:950', '00:01:21:799'), ('00:01:21:799', '00:01:29:910'), ('00:01:29:910', '00:01:37:560'), ('00:01:37:560', '00:01:46:190'), ('00:01:46:190', '00:01:53:600'), ('00:01:53:600', '00:02:02:590'), ('00:02:02:590', '00:02:08:239'), ('00:02:08:239', '00:02:17:390'), ('00:02:17:390', '00:02:24:480'), ('00:02:24:480', '00:02:34:110'), ('00:02:34:110', '00:02:40:760'), ('00:02:40:760', '00:02:48:430'), ('00:02:48:430', '00:02:55:159'), ('00:02:55:159', '00:03:02:309'), ('00:03:02:309', '00:03:07:840'), ('00:03:07:840', '00:03:17:630'), ('00:03:17:630', '00:03:23:560'), ('00:03:23:560', '00:03:30:670'), ('00:03:30:670', '00:03:35:760'), ('00:03:35:760', '00:03:43:670'), ('00:03:43:67

VTT SPLITTING

In [12]:
import re
from datetime import timedelta

def parse_vtt_time(time_str):
    """Convert a VTT time string to a timedelta."""
    hours, minutes, seconds,milliseconds = map(float, re.split('[:.]', time_str))
    return timedelta(hours=hours, minutes=minutes, seconds=seconds, milliseconds=milliseconds)

def split_vtt_by_timestamps(vtt_content, splits):
    """Split VTT content based on given start and end timestamps."""
    lines = vtt_content.split('\n')
    split_contents = [[] for _ in splits]
    current_split_index = 0
    start_time = None

    for line in lines:
        if '-->' in line:
            start_time_str, end_time_str = re.findall(r'\d{2}:\d{2}:\d{2}\.\d{3}', line)
            start_time = parse_vtt_time(start_time_str)
            # Check if the current caption falls within the current split range
            if start_time >= splits[current_split_index][1]:
                current_split_index += 1
            if current_split_index >= len(splits):
                break  # All splits are processed
        if start_time is not None and current_split_index < len(splits) and start_time >= splits[current_split_index][0]:
            split_contents[current_split_index].append(line)

    return split_contents

def write_splits_to_files(split_contents):
    """Write each split content to a separate VTT file."""
    for i, content in enumerate(split_contents):
        with open(f'split_{i+1}.vtt', 'w') as f:
            f.write('WEBVTT\nKind: captions\nLanguage: en\n\n')
            f.write('\n'.join(content))

def process_vtt_file(file_path, splits):
    """Read VTT file, split by timestamps, and write to separate files."""
    with open(file_path, 'r') as file:
        vtt_content = file.read()
    split_contents = split_vtt_by_timestamps(vtt_content, splits)
    write_splits_to_files(split_contents)

file_path = 'subtitles/6dKiEY0UOtA.en.vtt'

splits = []
for i in time_format:
    splits.append((parse_vtt_time(i[0]), parse_vtt_time(i[1])))

process_vtt_file(file_path, splits)

In [13]:
def write_splits_to_files(split_contents, output_folder):
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    
    for i, content in enumerate(split_contents):
        file_path = os.path.join(output_folder, f'split_{i+1}.vtt')
        with open(file_path, 'w') as f:
            f.write('WEBVTT\nKind: captions\nLanguage: en\n\n')
            f.write('\n'.join(content))

def process_vtt_file(file_path, splits, output_folder):
    with open(file_path, 'r') as file:
        vtt_content = file.read()
    split_contents = split_vtt_by_timestamps(vtt_content, splits)
    write_splits_to_files(split_contents, output_folder)

import os

file_path = 'subtitles/6dKiEY0UOtA.en.vtt'
output_folder = 'subtitles_splits' 

splits = []
for i in time_format:
    splits.append((parse_vtt_time(i[0]), parse_vtt_time(i[1])))

process_vtt_file(file_path, splits, output_folder)

EXTRACTING THE TEXT FROM THE VTT FILE

In [19]:
import re

def extract_specific_lines(vtt_filename):
    # Regular expression to match lines with text, timestamps, and formatting tags
    pattern = re.compile(r'.*<\d{2}:\d{2}:\d{2}\.\d{3}><c>.*</c>.*')
    
    with open(vtt_filename, 'r', encoding='utf-8') as file:
        for line in file:
            if re.match(pattern, line):
                print(line.strip())


vtt_filename = 'subtitles_splits/split_171.vtt'
extract_specific_lines(vtt_filename)

def extract_text_from_vtt_line(line):
    # Remove all occurrences of timestamp and formatting tags
    text_only = re.sub(r'<\d{2}:\d{2}:\d{2}\.\d{3}>|<c>|</c>', '', line)
    return text_only

# Example usage
vtt_line = "you<00:21:06.960><c> get</c><00:21:07.280><c> but</c><00:21:07.440><c> it's</c><00:21:07.679><c> also</c><00:21:07.919><c> a</c><00:21:08.039><c> great</c><00:21:08.280><c> way</c><00:21:08.400><c> to</c>"
text_only = extract_text_from_vtt_line(vtt_line)
print(text_only)

you<00:21:06.960><c> get</c><00:21:07.280><c> but</c><00:21:07.440><c> it's</c><00:21:07.679><c> also</c><00:21:07.919><c> a</c><00:21:08.039><c> great</c><00:21:08.280><c> way</c><00:21:08.400><c> to</c>
support<00:21:09.000><c> independent</c><00:21:09.600><c> creators</c><00:21:10.400><c> too</c><00:21:11.039><c> thanks</c>
for<00:21:11.480><c> watching</c><00:21:11.840><c> and</c><00:21:12.080><c> maybe</c><00:21:12.360><c> next</c><00:21:12.600><c> time</c><00:21:12.799><c> you'll</c>
be<00:21:13.159><c> watching</c><00:21:13.440><c> early</c><00:21:14.080><c> on</c><00:21:14.559><c> nebula</c>


In [23]:
import re

def extract_specific_lines(vtt_filename):
    # Regular expression to match lines with text, timestamps, and formatting tags
    pattern = re.compile(r'.*<\d{2}:\d{2}:\d{2}\.\d{3}><c>.*</c>.*')
    
    extracted_lines = "" 
    
    with open(vtt_filename, 'r', encoding='utf-8') as file:
        for line in file:
            if re.match(pattern, line):
                extracted_lines += line.strip() + "\n" 
    
    return extracted_lines

vtt_filename = 'subtitles_splits/split_1.vtt'
extracted_text = extract_specific_lines(vtt_filename)

def extract_text_from_vtt_line(line):
    # Remove all occurrences of timestamp and formatting tags
    text_only = re.sub(r'<\d{2}:\d{2}:\d{2}\.\d{3}>|<c>|</c>', '', line)
    return text_only

text_only = extract_text_from_vtt_line(extracted_text)
print(text_only)

Tokyo has a lot of trains and that's an
understatement so you might think that
with 14 Rail and Metro lines and shin
kanen high-speed trains departing every



EXTRACTING LINKS OF YOUTUBE VIDEOS

In [24]:
import subprocess
import json

def get_youtube_urls(channel_url):
    command = ['yt-dlp', '-j', '--flat-playlist', channel_url]
    
    result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    if result.returncode != 0:
        print("Error:", result.stderr)
        return []
    
    video_urls = []
    for line in result.stdout.strip().split('\n'):
        video_data = json.loads(line)
        video_url = f"https://www.youtube.com/watch?v={video_data['id']}"
        video_urls.append(video_url)
    
    return video_urls

channel_url = 'https://www.youtube.com/@NotJustBikes/videos'
video_urls = get_youtube_urls(channel_url)
for url in video_urls:
    print(url)

https://www.youtube.com/watch?v=hTPIs370dPM
https://www.youtube.com/watch?v=6dKiEY0UOtA
https://www.youtube.com/watch?v=JRbnBc-97Ps
https://www.youtube.com/watch?v=g0F_hTGYa0Y
https://www.youtube.com/watch?v=6FwQp0xnFEE
https://www.youtube.com/watch?v=CHZwOAIect4
https://www.youtube.com/watch?v=_yDtLv-7xZ4
https://www.youtube.com/watch?v=6Vil5KC7Bl0
https://www.youtube.com/watch?v=HACaRm2KP6Q
https://www.youtube.com/watch?v=ztpcWUqVpIg
https://www.youtube.com/watch?v=AOc8ASeHYNw
https://www.youtube.com/watch?v=ymcBC7MFRIk
https://www.youtube.com/watch?v=REni8Oi1QJQ
https://www.youtube.com/watch?v=8nZh7A7qTPo
https://www.youtube.com/watch?v=zmp09Fd07oc
https://www.youtube.com/watch?v=kdz6FeQLuHQ
https://www.youtube.com/watch?v=jN7mSXMruEo
https://www.youtube.com/watch?v=EqwasBTzZS8
https://www.youtube.com/watch?v=n94-_yE4IeU
https://www.youtube.com/watch?v=mXLqrMljdfU
https://www.youtube.com/watch?v=VvdQ381K5xg
https://www.youtube.com/watch?v=VpavEMVQpW8
https://www.youtube.com/watch?v=

BATCH PROCESSING FINAL CODE

In [None]:
import subprocess
import json
from pytube import YouTube
from pydub import AudioSegment
import os
import yt_dlp
from pydub import AudioSegment
import re
from datetime import datetime, timedelta

#Obataining the URLS for a Youtube Channel
def get_youtube_urls(channel_url):
    command = ['yt-dlp', '-j', '--flat-playlist', channel_url]
    
    result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    if result.returncode != 0:
        print("Error:", result.stderr)
        return []
    
    video_urls = []
    for line in result.stdout.strip().split('\n'):
        video_data = json.loads(line)
        video_url = f"https://www.youtube.com/watch?v={video_data['id']}"
        video_urls.append(video_url)
    
    return video_urls

channel_url = 'https://www.youtube.com/@NotJustBikes/videos'
video_urls = get_youtube_urls(channel_url)

#Downloading the audio from the Youtube Channel into MP3 Format
def download_audio_from_youtube(video_url, output_path):
    try:
        yt = YouTube(video_url)
        video_stream = yt.streams.filter(only_audio=True).first()
        download_path = video_stream.download(output_path=output_path)

        audio = AudioSegment.from_file(download_path)
        mp3_path = os.path.splitext(download_path)[0] + '.mp3'
        audio.export(mp3_path, format="mp3")

        os.remove(download_path)

        print(f"Downloaded and converted audio saved at: {mp3_path}")
    except Exception as e:
        print(f"An error occurred: {e}")

i = 0 
for video_url in video_urls:
    output_path = f"test{i}.mp3"
    download_audio_from_youtube(video_url, output_path)
    i += 1
count = i

#Obtaining the VTT Subtitle files for the Youtube Videos
def extract_transcript(video_url):
    ydl_opts = {
        'skip_download': True, 
        'writeautomaticsub': True, 
        'subtitleslangs': ['en','en-CA'],  
        'subtitlesformat': 'vtt',  
        'outtmpl': 'subtitles/%(id)s.%(ext)s',  
        'quiet': True, 
    }

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([video_url])

for video_url in video_urls:
    extract_transcript(video_url)

#Getting the timestamp splits, Splitting VTT functions
def parse_vtt_time(time_str):
    """Convert a VTT time string to a timedelta."""
    hours, minutes, seconds,milliseconds = map(float, re.split('[:.]', time_str))
    return timedelta(hours=hours, minutes=minutes, seconds=seconds, milliseconds=milliseconds)

def split_vtt_by_timestamps(vtt_content, splits):
    """Split VTT content based on given start and end timestamps."""
    lines = vtt_content.split('\n')
    split_contents = [[] for _ in splits]
    current_split_index = 0
    start_time = None

    for line in lines:
        if '-->' in line:
            start_time_str, end_time_str = re.findall(r'\d{2}:\d{2}:\d{2}\.\d{3}', line)
            start_time = parse_vtt_time(start_time_str)
            # Check if the current caption falls within the current split range
            if start_time >= splits[current_split_index][1]:
                current_split_index += 1
            if current_split_index >= len(splits):
                break  # All splits are processed
        if start_time is not None and current_split_index < len(splits) and start_time >= splits[current_split_index][0]:
            split_contents[current_split_index].append(line)

    return split_contents

def write_splits_to_files(split_contents, output_folder):
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    
    for i, content in enumerate(split_contents):
        file_path = os.path.join(output_folder, f'split_{i+1}.vtt')
        with open(file_path, 'w') as f:
            f.write('WEBVTT\nKind: captions\nLanguage: en\n\n')
            f.write('\n'.join(content))

def process_vtt_file(file_path, splits, output_folder):
    with open(file_path, 'r') as file:
        vtt_content = file.read()
    split_contents = split_vtt_by_timestamps(vtt_content, splits)
    write_splits_to_files(split_contents, output_folder)

def chunk_audio_files(timestamps, output_folder):
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    for file, chunks in timestamps.items():
        audio = AudioSegment.from_file(file)
        for i, (start, end) in enumerate(chunks):
            chunk = audio[start:end]
            chunk_filename = os.path.join(output_folder, f"{os.path.basename(file).rsplit('.', 1)[0]}_chunk{i}.mp3")
            chunk.export(chunk_filename, format="mp3")
            print(f"Exported {chunk_filename}")

def extract_timestamps_from_vtt(filename):
    timestamp_regex = r'(\d{2}:\d{2}:\d{2}\.\d{3}) --> (\d{2}:\d{2}:\d{2}\.\d{3})'
    
    timestamps = []
    
    with open(filename, 'r', encoding='utf-8') as file:
        content = file.read()
        matches = re.findall(timestamp_regex, content)
        for start, end in matches:
            start_ms = int(timedelta(hours=int(start[0:2]), minutes=int(start[3:5]), seconds=int(start[6:8]), milliseconds=int(start[9:])).total_seconds() * 1000)
            end_ms = int(timedelta(hours=int(end[0:2]), minutes=int(end[3:5]), seconds=int(end[6:8]), milliseconds=int(end[9:])).total_seconds() * 1000)
            timestamps.append((start_ms, end_ms))
    return timestamps

def combine_timestamps(timestamps):
    combined_timestamps = []
    i = 0
    while i < len(timestamps) - 6:
        combined_timestamps.append((timestamps[i][0], timestamps[i + 6][1]))
        i += 7
    while i < len(timestamps):
        combined_timestamps.append(timestamps[i])
        i += 1
    return combined_timestamps

def milliseconds_to_time_format(milliseconds):
    # Calculate hours, minutes, seconds and milliseconds
    hours = milliseconds // (1000 * 60 * 60)
    milliseconds_remaining = milliseconds % (1000 * 60 * 60)
    minutes = milliseconds_remaining // (1000 * 60)
    milliseconds_remaining %= (1000 * 60)
    seconds = milliseconds_remaining // 1000
    milliseconds_remaining %= 1000
    
    # Format the time string
    time_format = f"{hours:02}:{minutes:02}:{seconds:02}:{milliseconds_remaining:03}"
    
    return time_format

#Chunking the audio files and splitting the VTT files
j=0
for filename in os.listdir('subtitles'):
    timestamps = extract_timestamps_from_vtt(os.path.join('subtitles',filename))
    arr = timestamps
    combined_timestamps = combine_timestamps(arr)
    #print(timestamps)
    for i in range(0,count):
        timestamps = {
            f'test{i}.mp3/test{i}.mp3': combined_timestamps}
        output_folder = f'final_outputs/output{i}'
        chunk_audio_files(timestamps, output_folder)
    # Splitting the VTT files
    splits = []
    time_format = []
    for k in combined_timestamps:
        start_format = milliseconds_to_time_format(i[0])
        end_format = milliseconds_to_time_format(i[1])
        time_format.append((start_format, end_format))
    for i in time_format:
        splits.append((parse_vtt_time(i[0]), parse_vtt_time(i[1])))
    process_vtt_file(os.path.join('subtitles',filename), splits,f'subtitles_split{j}')
    j = j+1

#Text Extraction from the VTT files and storing it into a text file
def extract_specific_lines(vtt_filename):
    pattern = re.compile(r'.*<\d{2}:\d{2}:\d{2}\.\d{3}><c>.*</c>.*')
    
    with open(vtt_filename, 'r', encoding='utf-8') as file:
        for line in file:
            if re.match(pattern, line):
                print(line.strip())

def extract_text_from_vtt_line(line):
    text_only = re.sub(r'<\d{2}:\d{2}:\d{2}\.\d{3}>|<c>|</c>', '', line)
    return text_only

matching_directories = []
for item in os.listdir('.'):
    if os.path.isdir(item) and 'subtitles_split' in item:
        matching_directories.append(item)

for filedir in matching_directories:
    for file in matching_directories:
        extracted_text = extract_specific_lines(os.path.join(filedir,file))
        text_only = extract_text_from_vtt_line(extracted_text)
        file_path = f'{filedir}/{file}.txt'
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(text_only)
