# Video to Text & Summarization with OpenAI's Whisper and GPT-3

This notebook implements a sequential data processing pipeline for creating text summaries from video links.

The processing steps are:

1. Download video(s) using _yt-dlp_.
2. Extract audio using _FFmpeg_.
3. Transcribe the audio to text, using OpenAI's _Whisper_.
4. Create a summary of the text using OpenAI's _GPT-3_.

The output of each step is saved to a file in that step's feature directory inside the DATA directory (see the `.env` config file).

In [1]:
import os, sys
from pathlib import Path
from dotenv import load_dotenv

# Make the project's `src` directory (a sibling of the notebook directory)
# importable from this notebook.
cwd = Path(os.getcwd())
src = cwd.parent / 'src'
# sys.path entries must be plain strings: the import system ignores entries of
# any other type, so the Path object has to be converted before it is added.
# The membership check keeps re-runs of this cell from adding duplicates.
if str(src) not in sys.path:
    sys.path.append(str(src))

from src.video2audio.ffmpegaudioextraction import FFmpegAudioExtraction
from src.audio2text.whisperwrapper import WhisperWrapper
from src.text2summary.gpt3wrapper import GPT3Wrapper
from src.util.fs import LocalFSUtil

# Locate the `.env` config file one directory above the current working
# directory and load it; the setup cell below reads its values via os.getenv.
rel_path = '../'
dotenv_path = Path(rel_path + '.env')
if not dotenv_path.exists():
    # FileNotFoundError is still an Exception subclass. The path is formatted
    # into the message so it no longer renders as a tuple of two arguments.
    raise FileNotFoundError(f"Config file not found: {dotenv_path}")
load_dotenv(dotenv_path=dotenv_path)

In [2]:
# Setup
fsutil = LocalFSUtil()


def _resolve_data_dir(env_name):
    """Return the directory configured in the environment variable `env_name`.

    Paths that `fsutil` does not report as absolute are prefixed with
    `rel_path` (the parent directory) and normalised; absolute paths are
    returned unchanged.

    Raises:
        ValueError: if the variable is not set, so a missing .env entry is
            reported by name instead of failing later with an opaque error.
    """
    configured = os.getenv(env_name)
    if configured is None:
        raise ValueError(f"Environment variable '{env_name}' is not set (check the .env config file)")
    if fsutil.is_absolute_path(configured):
        return configured
    return fsutil.normalise_path(rel_path + configured)


DATA_DIR = _resolve_data_dir('DATA_DIR')
DATA_DIR_VIDEO_FEATURES = _resolve_data_dir('DATA_DIR_VIDEO_FEATURES')
DATA_DIR_AUDIO_FEATURES = _resolve_data_dir('DATA_DIR_AUDIO_FEATURES')
DATA_DIR_TEXT_FEATURES = _resolve_data_dir('DATA_DIR_TEXT_FEATURES')
DATA_DIR_TEXT_SUMMARIES = _resolve_data_dir('DATA_DIR_TEXT_SUMMARIES')

# Make sure every directory exists before the pipeline writes into it.
# NOTE(review): ensure_path_exists presumably returns a truthy value only when
# it had to create the directory -- confirm against LocalFSUtil.
for path in [DATA_DIR, DATA_DIR_VIDEO_FEATURES, DATA_DIR_AUDIO_FEATURES, DATA_DIR_TEXT_FEATURES, DATA_DIR_TEXT_SUMMARIES]:
    if fsutil.ensure_path_exists(path):
        print(f"Created '{path}'")

# Model names and the OpenAI credential are read from the environment.
WHISPER_DEFAULT_MODEL = os.getenv('WHISPER_DEFAULT_MODEL')
SPACY_DEFAULT_MODEL = os.getenv('SPACY_DEFAULT_MODEL')
GPT3_DEFAULT_MODEL = os.getenv('GPT3_DEFAULT_MODEL')
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

# Echo the selected models; the API key is not printed.
print("*** Configuration ***")
for label, model_name in (
    ("Whisper model:", WHISPER_DEFAULT_MODEL),
    ("spaCy   model:", SPACY_DEFAULT_MODEL),
    ("GPT-3   model:", GPT3_DEFAULT_MODEL),
):
    print(label, model_name)

# One wrapper object per pipeline stage: audio extraction, transcription,
# summarisation.
video_proxy = FFmpegAudioExtraction()
whisper_proxy = WhisperWrapper(WHISPER_DEFAULT_MODEL)
gpt3_proxy = GPT3Wrapper(GPT3_DEFAULT_MODEL, OPENAI_API_KEY)

*** Configuration ***
Whisper model: base.en
spaCy   model: en_core_web_sm
GPT-3   model: text-davinci-003


As an example, we will transcribe and summarise JFK's famous 1961 speech to Congress on space exploration.

In [7]:
# Links of the videos to run through the pipeline. With an empty list the
# download cell uses the videos already present in the video directory.
video_urls = ["https://www.youtube.com/watch?v=8ygoE2YiHCs"]

In [4]:
# Download videos
# Fills `video_files` with the local paths of the videos to process: either
# freshly downloaded from `video_urls`, or, when no URLs are given, the files
# already present in the video directory.
from util.videodownloader import YouTubeDownloader
downloader = YouTubeDownloader()
video_files = []
if video_urls:
    for video_nr, video_url in enumerate(video_urls, start=1):
        try:
            video_file = downloader.download(video_url, DATA_DIR_VIDEO_FEATURES)
        except Exception as e:
            # Best effort: report the failed link and continue with the rest.
            print(f"Failed to download: {video_url} due to '{str(e)}'")
        else:
            video_files.append(video_file)
            print(f"Downloaded ({video_nr}/{len(video_urls)}): {os.path.basename(video_file)}")
else:
    # Use previously downloaded videos
    video_files = fsutil.list_files(DATA_DIR_VIDEO_FEATURES, ignore_empty=True)
    # filter() returns a lazy one-shot iterator: it has no len() and would be
    # exhausted by the loop below, leaving nothing for the processing cell.
    # Materialise it into a list.
    # NOTE(review): this keeps the files for which delete_file_if_empty returns
    # a truthy value -- confirm that this is the intended predicate.
    video_files = list(filter(fsutil.delete_file_if_empty, video_files))

    for video_nr, video_file in enumerate(video_files, start=1):
        print(f"Found video ({video_nr}/{len(video_files)}): {os.path.basename(video_file)}")


Downloaded (1/1): JFK's Famous Speech to Congress on Space Exploration (1961) ｜ The Kennedy Center [8ygoE2YiHCs].webm


In [6]:
# Process files
# For each video: extract audio, transcribe it, then summarise the transcript.
# A stage is skipped when its output file already exists, so the cell can be
# re-run without repeating finished work.
video_files = list(video_files)  # accept any iterable; len() needs a sequence
total_files = len(video_files)
for file_nr, video_file in enumerate(video_files, start=1):

    # progress indicator
    percent_done = int(100 * file_nr / total_files)
    print(f"Processing file {file_nr}/{total_files} ({percent_done}%) *** {os.path.basename(video_file)} ")

    # prepare filenames: each stage's output path is derived from the path of
    # the stage before it
    audio_file = fsutil.source_to_target(video_file, DATA_DIR_AUDIO_FEATURES, '.mp3')
    text_file = fsutil.source_to_target(audio_file, DATA_DIR_TEXT_FEATURES, '.txt')
    summary_file = fsutil.source_to_target(text_file, DATA_DIR_TEXT_SUMMARIES, '-summary.txt')

    # Clear out empty outputs first so the existence checks below treat those
    # stages as not done yet.
    # NOTE(review): empty files are presumably left by an aborted run -- confirm.
    fsutil.delete_file_if_empty(audio_file)
    fsutil.delete_file_if_empty(text_file)
    fsutil.delete_file_if_empty(summary_file)

    if not fsutil.exists(audio_file):
        print(f"-> Extracting audio from video ({fsutil.get_size_str(video_file)}) ...")
        video_proxy.convert_to_audio(video_file, audio_file)
    else:
        print("-> Audio has already been extracted.")

    if not fsutil.exists(text_file):
        print(f"-> Transcribing audio to text ({fsutil.get_size_str(audio_file)}) ...")
        print(whisper_proxy)
        whisper_proxy.transcribe_to_file(audio_file, text_file, overwrite=True)
    else:
        print("-> Text has already been transcribed.")

    if not fsutil.exists(summary_file):
        print(f"-> Creating summary text ({fsutil.get_size_str(text_file)}) ...")
        # The summariser is given the transcript's path, so the transcript is
        # not loaded into memory here.
        text_summary = gpt3_proxy.summarise_to_file(text_file, summary_file)
    else:
        print("-> Summary has already been created.")

    print()

# Select a video to process
# List packages in dir

Processing file 1/1 (100%) *** JFK's Famous Speech to Congress on Space Exploration (1961) ｜ The Kennedy Center [8ygoE2YiHCs].webm 
-> Audio has already been extracted.
-> Text has already been transcribed.
-> Creating summary text (1 KB) ...

