https://medium.com/mcd-unison/youtube-data-api-v3-in-python-tutorial-with-examples-e829a25d2ebd#3d47

### TODO: rewrite with progress bars using tqdm

In [1]:
!pip install youtube-transcript-api
!pip install pytube
!pip install langchain
!pip install openai
!pip install ffmpeg-python
!pip install pydub
!pip install tiktoken
!pip install google-api-python-client
!pip install numba
!pip install more-itertools
!pip install webvtt-py
!pip install --upgrade google-api-python-client
!pip install --upgrade google-auth-oauthlib google-auth-httplib2



In [2]:
import os
import json
from collections import OrderedDict
import re
import pandas as pd


import webvtt
from pytube import YouTube
from pydub import AudioSegment
from pathlib import Path
import openai

from langchain.document_loaders import YoutubeLoader
from langchain.llms import OpenAI
from langchain.chains.summarize import load_summarize_chain
from langchain.chains import AnalyzeDocumentChain
import googleapiclient.discovery

In [3]:
# Each API key is stored in its own text file; surrounding whitespace
# (such as a trailing newline) is stripped after reading.
OPENAI_API_KEY = Path('D:\\NLP_projects\\openai_api_key.txt').read_text().strip()
YOUTUBE_API_KEY = Path('D:\\NLP_projects\\youtube_api_key.txt').read_text().strip()

# Publish the OpenAI key through the environment and set the same value
# on the openai module.
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
openai.api_key = os.environ["OPENAI_API_KEY"]

In [4]:


def sanitize_filename(filename):
    """Return `filename` with the characters Windows forbids in file names removed.

    The forbidden characters are: less-than, greater-than, colon, double quote,
    forward slash, backslash, pipe, question mark and asterisk. They are
    deleted outright, not substituted, so the result can be shorter than the
    input.
    """
    forbidden = set('<>:"/\\|?*')
    return ''.join(ch for ch in filename if ch not in forbidden)

# Define a function to convert a Timedelta to a string
def timedelta_to_str(td):
    """Format a timedelta as 'HH:MM:SS.mmm'.

    Parameters:
    td: Any object exposing total_seconds(), e.g. pandas.Timedelta or
        datetime.timedelta.

    Returns:
    str: Zero-padded hours, minutes and seconds with millisecond precision.
         Hours are not wrapped at 24, so long durations give 'HHH:MM:SS.mmm'.

    Note: missing values (NaT) are not supported; the integer conversion
    below fails for them.
    """
    # Round to whole milliseconds BEFORE splitting into fields. Formatting the
    # float seconds directly can round up to "60.000" (e.g. 59.9996 s would be
    # rendered as 00:00:60.000 instead of 00:01:00.000).
    total_ms = int(round(td.total_seconds() * 1000))
    hours, remainder_ms = divmod(total_ms, 3600 * 1000)
    minutes, remainder_ms = divmod(remainder_ms, 60 * 1000)
    seconds, millis = divmod(remainder_ms, 1000)
    return "{:02}:{:02}:{:02}.{:03}".format(hours, minutes, seconds, millis)

def add_10mins_to_timedelta(df, cols, audio_segment, segment_minutes=10):
    """Convert timestamp columns to Timedelta and shift them by the segment offset.

    Caption times are relative to the start of their audio segment, so the
    duration of all preceding segments is added to place them on the timeline
    of the full recording.

    Parameters:
    df (pandas.DataFrame): Frame holding the columns named in `cols`.
        It is modified in place and also returned.
    cols (list of str): Columns to convert and shift.
    audio_segment (int): Zero-based index of the segment the rows belong to.
    segment_minutes (int or float): Length of one segment in minutes.
        Defaults to 10, the segment length used by this notebook.

    Returns:
    pandas.DataFrame: The same `df` object with the columns in `cols`
    replaced by Timedelta values; values that cannot be parsed become NaT.
    """
    # The offset is the same for every column, so compute it once
    offset = pd.to_timedelta(audio_segment * segment_minutes, unit='min')

    for col in cols:
        # Non-convertible values are coerced to NaT rather than raising
        df[col] = pd.to_timedelta(df[col], errors='coerce') + offset

    # Return the modified DataFrame
    return df


def transcribe_audio(audio_file_path):
    """
    Transcribe an audio file with OpenAI Whisper, ten minutes at a time.

    The audio is cut into consecutive 10-minute chunks. Each chunk is exported
    to the scratch file 'exported_mp3_chunks/scratch.mp3' (overwritten on every
    iteration) and sent to the "whisper-1" model, requesting a WebVTT response.
    The response is written to 'transcript.vtt' in the current working
    directory and parsed from there. Caption times are relative to their chunk,
    so 'start' and 'end' are shifted by 10 minutes per preceding chunk.

    Parameters:
    audio_file_path (str or pathlib.Path): The path to the audio file.

    Returns:
    pandas.DataFrame: One row per caption with the columns 'start' and 'end'
    (strings produced by timedelta_to_str) and 'text'. The frame is empty
    when no captions were produced.
    """

    # Define the duration of each chunk in milliseconds (10 minutes)
    ten_minutes = 10 * 60 * 1000
    transcript_columns = ['start', 'end', 'text']

    # Load the audio file
    the_audio = AudioSegment.from_file(audio_file_path)

    # Calculate the duration of the audio file in milliseconds
    the_audio_duration_seconds = the_audio.duration_seconds
    the_audio_duration_milliseconds = the_audio_duration_seconds * 1000

    print('the_audio_duration_seconds: {}'.format(the_audio_duration_seconds))
    print('the_audio_duration_milliseconds {}'.format(the_audio_duration_milliseconds))

    # Define the directory where the chunks will be exported
    export_path = Path("exported_mp3_chunks")
    # Create the directory if it doesn't exist
    export_path.mkdir(exist_ok=True)

    # We don't need to keep the chunks, so every chunk overwrites one scratch file
    export_file_path = export_path / 'scratch.mp3'
    # The name of the file the WebVTT response is saved to before parsing
    filename = "transcript.vtt"

    # Initialize counters
    audio_segment = 0
    this_time_queue = 0

    # Per-chunk frames are collected here and concatenated once after the loop
    chunk_frames = []

    # Loop over the audio file, exporting and transcribing one chunk at a time
    while this_time_queue < the_audio_duration_milliseconds:

        # Extract a chunk from the audio file and export it as an .mp3 file
        this_10_minutes = the_audio[this_time_queue:this_time_queue + ten_minutes]
        this_10_minutes.export(export_file_path, format="mp3")

        print()
        print('this_time_queue {}'.format(this_time_queue))
        print('audio_segment {}'.format(audio_segment))
        print(this_time_queue < the_audio_duration_milliseconds)

        # Get the transcript; the context manager closes the chunk file again
        with open(export_file_path, "rb") as audio_file:
            transcript = openai.Audio.transcribe("whisper-1", audio_file, response_format="vtt")

        # Write the transcript to the file
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(transcript)

        # Read the transcript into a webvtt object
        webvtt_obj = webvtt.read(filename)
        # Convert the list of Caption objects into a list of dictionaries
        data = [{'start': caption.start, 'end': caption.end, 'text': caption.text} for caption in webvtt_obj]

        print('audio segment {}'.format(audio_segment))
        print()

        try:
            # Naming the columns keeps 'start'/'end' present even when the
            # chunk produced no captions at all
            df_this_chunk = pd.DataFrame(data, columns=transcript_columns)
            # Convert to Timedelta and add 10 minutes for each preceding segment
            # (segment 0 is converted but shifted by 0 minutes)
            df_this_chunk = add_10mins_to_timedelta(df_this_chunk, ['start', 'end'], audio_segment)
        except Exception as exc:
            # Best effort: report the problem and carry on with the next chunk
            print('concat failed')
            print(exc)
        else:
            if not df_this_chunk.empty:
                chunk_frames.append(df_this_chunk)
            if audio_segment == 0:
                print('df_transcript created')
            else:
                print('concat happened')

        # Update the counters
        this_time_queue += ten_minutes
        audio_segment += 1

    if chunk_frames:
        df_transcript = pd.concat(chunk_frames, ignore_index=True)
    else:
        # Nothing was transcribed: return an empty frame with the expected columns
        df_transcript = pd.DataFrame({'start': pd.Series(dtype='str'),
                                      'end': pd.Series(dtype='str'),
                                      'text': pd.Series(dtype='str')})

    # Render the Timedelta columns as 'HH:MM:SS.mmm' strings
    df_transcript['start'] = df_transcript['start'].apply(timedelta_to_str)
    df_transcript['end'] = df_transcript['end'].apply(timedelta_to_str)

    return df_transcript

In [5]:
# Read URLs from CSV file; it must provide a 'url' column
df = pd.read_csv('youtube_urls.csv')

# Extract URLs from 'url' column.
# Blank CSV cells are read as NaN, which is truthy, so a plain `if url:` test
# would pass them on to pytube. Drop them, and whitespace-only entries, here.
url_list = [url.strip() for url in df['url'].dropna().astype(str) if url.strip()]

# Define the directory where we have stored the audio files
# We can't download through an api without violating Google's TOS
audio_dir = Path("raw_downloaded_audio")

# Create the directory if it doesn't exist
audio_dir.mkdir(exist_ok=True)

# Process each URL
for url in url_list:
    dict_meta_data = {}
    # Look up the video (the audio itself is expected to be in audio_dir already)
    yt = YouTube(url, use_oauth=True, allow_oauth_cache=True)

    # Extract metadata
    video_data = {
        'watch_url': yt.watch_url,
        'video_id': yt.video_id,
        'title': yt.title,
        'channel_id': yt.channel_id,
        'channel_url': yt.channel_url,
        'author': yt.author,
        'description': yt.description,
        'thumbnail_url': yt.thumbnail_url,
        'keywords': yt.keywords,
        'length': yt.length,
        'audio_filename': yt.title + '.webm',
    }

    # Show which video is being processed
    print(yt.watch_url)
    print()
    dict_meta_data[yt.video_id] = video_data

    # Generate the path to the audio file and remove windows forbidden characters
    audio_file_path = audio_dir / sanitize_filename(video_data['audio_filename'])
    print(audio_file_path)

    # The audio is placed in audio_dir by hand, so it may be missing; skip the
    # video instead of aborting the whole batch.
    if not audio_file_path.is_file():
        print('Audio file not found, skipping: {}'.format(audio_file_path))
        continue

    # Generate the transcript using OpenAI Whisper
    df_transcript = transcribe_audio(audio_file_path)
    # Convert the dataframe to a list of records and add it to the video data
    dict_meta_data[yt.video_id]['transcript'] = df_transcript.to_dict('records', into=OrderedDict)

    # One JSON file per video, keyed by the video id
    the_json_filename = '{}_video_metadata.json'.format(yt.video_id)
    with open(the_json_filename, 'w', encoding='utf-8') as json_file:
        json.dump(dict_meta_data, json_file)

    print('Metadata saved to {}'.format(the_json_filename))

https://youtube.com/watch?v=5OLIJvMYwsc

raw_downloaded_audio\Sam Altman GRILLED by Congress - GATO does a roundtable debrief!.webm
the_audio_duration_seconds: 5247.7
the_audio_duration_milliseconds 5247700.0

this_time_queue 0
audio_segment 0
True
audio segment 0

df_transcript created

this_time_queue 600000
audio_segment 1
True
audio segment 1

concat happened

this_time_queue 1200000
audio_segment 2
True
audio segment 2

concat happened

this_time_queue 1800000
audio_segment 3
True
audio segment 3

concat happened

this_time_queue 2400000
audio_segment 4
True
audio segment 4

concat happened

this_time_queue 3000000
audio_segment 5
True
audio segment 5

concat happened

this_time_queue 3600000
audio_segment 6
True
audio segment 6

concat happened

this_time_queue 4200000
audio_segment 7
True
audio segment 7

concat happened

this_time_queue 4800000
audio_segment 8
True
audio segment 8

concat happened
Metadata saved to 5OLIJvMYwsc_video_metadata.json
https://youtube.com/watch?v=si5