## Summarize YouTube videos with natural language processing and automatic speech recognition

### Import necessary libraries

In [None]:
import os
import re
import csv
import torch
from typing import List

import whisper
from yt_dlp import YoutubeDL as YDL
from transformers import AutoTokenizer, pipeline

### Set options

In [None]:
# URL of the YouTube video to summarize (fill this in before running the notebook)
YOUTUBE_URL = ""

# Name of the ASR model size handed to whisper.load_model
ASR_MODEL_SIZE = "small.en"

# Upper bound, in tokens, on the length of each bullet point
SUMMARY_LENGTH = 128

# Run on the GPU when CUDA is available, otherwise fall back to the CPU
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Summarization model; alternatives are listed at
# https://huggingface.co/models?pipeline_tag=summarization&sort=downloads
NLP_ARCH = 'facebook/bart-large-cnn'

### Helper functions

In [None]:
def create_safe_filename(unsafe_string):
    """
    Turn an arbitrary string into one that is safer to use as a file name.

    Each character that is neither a word character (letter, digit, underscore)
    nor whitespace is replaced by an underscore, runs of underscores are collapsed
    into one, and leading/trailing underscores are removed. Whitespace is kept.

    Parameters:
        unsafe_string (str): The string to sanitize.

    Returns:
        str: The sanitized string (may be empty).
    """
    underscored = re.sub(r'[^\w\s]', '_', unsafe_string)
    collapsed = re.sub(r'_+', '_', underscored)
    return collapsed.strip('_')

def save_transcript_to_csv(asr_result, file_path):
    """
    Save transcription to a CSV file.

    Only the 'start', 'end', 'text' and 'summary' keys of each entry are written;
    any other keys are ignored and missing ones are written as empty fields.

    Parameters:
        asr_result: An iterable of dicts holding the transcription data to save.
        file_path (str): The file path of the CSV file.
    """
    field_names = ['start', 'end', 'text', 'summary']
    # UTF-8 is set explicitly so transcripts with non-ASCII text are written the
    # same way on every platform; newline="" leaves line endings to the csv module.
    with open(file_path, "w", newline="", encoding="utf-8") as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=field_names, extrasaction="ignore")
        writer.writeheader()
        writer.writerows(asr_result)

def load_csv_to_transcript(file_path):
    """
    Load transcription from a CSV file.

    Parameters:
        file_path (str): The file path of the CSV file.

    Returns:
        list: The transcription data, one dict per CSV row keyed by the header
            names. All values are strings.
    """
    # Same encoding as save_transcript_to_csv; newline="" keeps line breaks that
    # are embedded in quoted fields exactly as they were written.
    with open(file_path, "r", newline="", encoding="utf-8") as csv_file:
        return list(csv.DictReader(csv_file))

def format_time(t):
    """
    Render a duration given in seconds as an HH:MM:SS string.

    The value is converted to float and rounded to the nearest whole second
    before being split into hours, minutes and seconds.

    Parameters:
        t (str): The time in seconds (anything accepted by float()).

    Returns:
        str: The time in HH:MM:SS format.
    """
    total_seconds = round(float(t))
    hours, remainder = divmod(total_seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d}"

def print_timestamped_summaries(summary_file):
    """
    Print every summary stored in a summary file together with its time range.

    For each row the start and end times are printed as "HH:MM:SS - HH:MM:SS",
    followed by the summary text and a blank line.

    Parameters:
        summary_file (str): The file path of the summary file.
    """
    for row in load_csv_to_transcript(summary_file):
        time_range = f"{format_time(row['start'])} - {format_time(row['end'])}"
        print(time_range)
        print(row['summary'])
        print()

### Extract the audio from a given YouTube video

In [None]:
def download_video(video_url: str):
    """
    Download the audio track of an online video as an MP3 file.

    The audio is stored as "audio.mp3" inside a folder named after the sanitized
    yt-dlp file name of the video (without its extension). If that file already
    exists, nothing is downloaded.

    Parameters:
        video_url (str): The URL of the video.

    Returns:
        str: The file path of the audio file.
    """
    # Derive the asset folder name from the file name yt-dlp would use for the video
    with YDL({'quiet': True}) as ydl:
        video_info = ydl.extract_info(video_url, download=False)
        file_name = ydl.prepare_filename(video_info)
    # Strip only the trailing extension. Cutting at the first '.' would truncate
    # titles that contain a dot ("Dr. X ..." -> "Dr") and could make different
    # videos share one folder, and therefore one cached audio file.
    dir_name = create_safe_filename(os.path.splitext(file_name)[0])
    audio_path = f"{dir_name}/audio.mp3"

    # Download audio file if it does not exist
    if not os.path.exists(audio_path):
        ydl_opts = {
            'quiet': True,
            'format': 'bestaudio/best',
            'outtmpl': audio_path,
            'postprocessors': [{
                'key': 'FFmpegExtractAudio',
                'preferredcodec': 'mp3',
                'preferredquality': '128',
            }],
        }
        with YDL(ydl_opts) as ydl:
            ydl.download([video_url])

    return audio_path

In [None]:
# Fetch the audio for the configured video; `audio` is the path returned by
# download_video. YOUTUBE_URL is empty by default and has to be set beforehand.
audio = download_video(YOUTUBE_URL)

### Use an ASR model to transcribe the speech parts of the audio

In [None]:
def transcribe_audio(audio_path: str, asr_model, transcript_path: str = None):
    """
    Transcribe an audio file using the provided ASR model.

    The transcript segments are saved as a CSV file. If the transcript file
    already exists, it is reused and the model is not run.

    Parameters:
        audio_path (str): The file path of the audio file.
        asr_model: The ASR model to use for transcription. Its transcribe() result
            must provide a 'segments' entry.
        transcript_path (str, optional): Where to store the transcript. Defaults to
            "transcript.csv" next to the audio file.

    Returns:
        str: The file path of the transcript.
    """
    if transcript_path is None:
        # os.path.join keeps the path relative when the audio file has no directory
        # part; plain "/" concatenation would point at the filesystem root.
        transcript_path = os.path.join(os.path.dirname(audio_path), "transcript.csv")

    # Save the audio transcript as a csv file at the transcript filepath
    if not os.path.exists(transcript_path):
        # Inference only, so gradient tracking is switched off
        with torch.no_grad():
            asr_result = asr_model.transcribe(audio_path)['segments']
        save_transcript_to_csv(asr_result, transcript_path)

    return transcript_path

In [None]:
# Load the Whisper model of the configured size and move it to the selected device
asr_model = whisper.load_model(ASR_MODEL_SIZE).to(DEVICE)
# Transcribe the downloaded audio; `transcript_path` is the path of the transcript CSV
transcript_path = transcribe_audio(audio, asr_model)

### Generate summary of transcription
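- Tokenization splits text into the sub-word units (tokens) that the summarization model works on.
- The model only accepts a limited number of tokens per input, so a long transcript cannot be summarized in one pass.
- `tokenize_with_timestamps` merges consecutive transcript segments into chunks that stay within that limit and keeps the start and end time of every chunk; `get_transcript_summary` then summarizes each chunk and saves the timestamped summaries to `summary.csv`.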

In [None]:
def tokenize_with_timestamps(asr_result: List[dict]):
    """
    Tokenize transcribed text and group consecutive segments into chunks.

    Chunks are kept below the tokenizer's maximum input length (minus a small
    margin) while the timestamps from the ASR transcription are preserved. A
    single segment that reaches the limit on its own still becomes a chunk of
    its own; it is not split.

    Parameters:
        asr_result (List[dict]): The transcription segments. Each dict should have a 'text' and
            'start' and 'end' keys for the transcription text and start and end times, respectively.

    Returns:
        List[dict]: A list of dictionaries containing tokenized chunks with 'text', 'tokens', 'start',
            and 'end' keys for the chunk text, tokenized form, and start and end times, respectively.
            Empty when asr_result is empty.
    """
    # Initialize NLP tokenizer and maximum length of tokens
    tokenizer = AutoTokenizer.from_pretrained(NLP_ARCH)
    # NOTE(review): the margin of 5 presumably leaves room for special tokens - confirm
    max_tokens = tokenizer.model_max_length - 5

    chunks = []
    current = {}
    for seg in asr_result:
        seg_tokens = tokenizer(seg["text"], add_special_tokens=False)['input_ids']
        # Close the running chunk when this segment would push it to the limit.
        # Only a non-empty chunk is closed, so no empty dict ends up in the result
        # when the very first segment is already that long.
        if current and len(seg_tokens) + len(current['tokens']) >= max_tokens:
            chunks.append(current)
            current = {}
        # A new chunk starts at the start time of its first segment
        if not current:
            current = {'start': seg['start'], 'text': "", 'tokens': []}
        current['text'] += seg['text']
        current['tokens'].extend(seg_tokens)
        # The chunk always ends where its most recent segment ends
        current['end'] = seg['end']

    # Add the final chunk, if any segment was seen at all
    if current:
        chunks.append(current)
    return chunks


def get_transcript_summary(transcript_file: str, summary_lengths: int = 128):
    """
    Generate a summary of a transcript and save it to a file.

    The transcript is grouped into timestamped chunks, each chunk is summarized,
    and the result is written to "summary.csv" next to the transcript file.

    Parameters:
        transcript_file (str): The file path of the transcript.
        summary_lengths (int): The maximum length of each chunk summary (in tokens).

    Returns:
        str: The file path of the summary.
    """
    # Load the transcript from a file
    transcript = load_csv_to_transcript(transcript_file)

    # Tokenize the transcript and add timestamps. Chunks without any text are
    # dropped so that an empty transcript produces an empty summary file instead
    # of a KeyError further down.
    timestamped_sentences = [
        chunk for chunk in tokenize_with_timestamps(transcript) if chunk.get('text')
    ]

    print("[NLP] Generating summary of transcription")

    if timestamped_sentences:
        # Extract the text of every chunk
        sentences = [chunk['text'] for chunk in timestamped_sentences]

        # Initialize the summarization pipeline
        summarizer = pipeline("summarization", model=NLP_ARCH)

        # Generate summaries; the minimum length is capped so that it never
        # exceeds the requested maximum
        summaries = summarizer(
            sentences,
            max_length=summary_lengths,
            min_length=min(20, summary_lengths),
            do_sample=False,
        )

        # Add the summaries to the timestamped chunks
        for chunk, result in zip(timestamped_sentences, summaries):
            chunk['summary'] = result['summary_text']

    # Save the timestamped summaries to a file. os.path.join keeps the path
    # relative when the transcript file has no directory part.
    summary_file = os.path.join(os.path.dirname(transcript_file), "summary.csv")
    save_transcript_to_csv(timestamped_sentences, summary_file)
    print("[NLP] Video summary saved at ", summary_file)

    return summary_file

# Summarize the transcript; `summary` is the path of the summary file that was written
summary = get_transcript_summary(transcript_path, SUMMARY_LENGTH)

### View the summary

In [None]:
# Print each summarized chunk with its start and end time
print_timestamped_summaries(summary)