In [33]:
from openai import OpenAI
import subprocess
import os
OPENAI_API_KEY = open('openai-key.txt', 'r').read().strip()
client = OpenAI(api_key=OPENAI_API_KEY)

In [32]:
# Helper to get video title from url
def fetch_video_title(url):
    command = ["yt-dlp", "--get-title", url]
    result = subprocess.run(command, capture_output=True, text=True, check=True)
    return result.stdout.strip()

# Helper to get audio from video url
def download_audio(url, save_dir, output_name):
    command = ["yt-dlp", "--extract-audio", "--audio-format", "mp3", "-o", f"{save_dir}{output_name}.mp3", url]
    subprocess.run(command, check=True)

# Helper to trascribe audio file to txt file using OpenAI Whisper, writes to save_dir
def transcribe_all_audio_files(read_dir, save_dir):
    # Iterate over all MP3 files in the audio directory
    for filename in os.listdir(read_dir):
        if filename.endswith(".mp3"):
            audio_file_path = os.path.join(read_dir, filename)
            with open(audio_file_path, "rb") as audio_file:
                print(f"Transcribing {filename}...")
                # Transcribe the audio file using OpenAI Whisper API
                transcription = client.audio.transcriptions.create(
                    model="whisper-1",
                    file=audio_file
                )
                # Save the transcription to a text file in save_dir
                output_text_file = os.path.join(save_dir, f"{os.path.splitext(filename)[0]}.txt")
                with open(output_text_file, "w") as text_file:
                    text_file.write(transcription.text)

In [30]:
# Directories where audio files are located and where you want the transcription saved 
read_dir = "audio/"
save_dir = "text/"

# List of YouTube URLSs you want to scrape audio from
youtube_urls = [
    "https://www.youtube.com/watch?v=N1Vb2QkeUaw",
    "https://www.youtube.com/watch?v=phoIkpFRDak",
    "https://www.youtube.com/watch?v=soQ2MB0O6bo",
    "https://www.youtube.com/watch?v=wOJaB2YbiNg",
    "https://www.youtube.com/watch?v=k88OIWJENgE",
    "https://www.youtube.com/watch?v=cmbYYM266eY",
    "https://www.youtube.com/watch?v=OUYhOP8xoSI",
    "https://www.youtube.com/watch?v=5xBH8jaLv9E",
    "https://www.youtube.com/watch?v=e9dgmBS4i7w",
    "https://www.youtube.com/watch?v=4A0A5OtlRP8"
]

# Download audio from each of the youtube_urls
for url in youtube_urls:
    title = fetch_video_title(url)
    filename_noext = "_".join(title.split()[:5])
    download_audio(url, save_dir, filename_noext)

In [31]:
# Run transcription for all files in the read_dir directory
transcribe_all_audio_files(read_dir, save_dir)

Transcribing How_to_Print_from_a.mp3...
Transcription for How_to_Print_from_a.mp3 saved as text/How_to_Print_from_a.txt
