# Download YouTube Videos as Audio

In [1]:
import subprocess

# Path to the yt-dlp executable; run `which yt-dlp` to find it.
# (The variable keeps its historical "youtube_dl" name.)
youtube_dl_path = "/home/master/.local/bin/yt-dlp"

# Folder the audio files are written to (used as a prefix of the output template).
output_directory = "0_audio/"

# Your list of video URLs
url_list = ["https://www.youtube.com/watch?v=H1cezQRTMq0",
            "https://www.youtube.com/watch?v=I8CST9vXrm4",
            "https://www.youtube.com/watch?v=I28PKy4AIGI",
            "https://www.youtube.com/watch?v=zoE3DqglcgM",
            "https://www.youtube.com/watch?v=bEovhfxJsM4",
            "https://www.youtube.com/watch?v=_5ocRWk9GLc",
            "https://www.youtube.com/watch?v=9lBbqH_1KS4",
            "https://www.youtube.com/watch?v=AcsEAukECRk",
            "https://www.youtube.com/watch?v=4riGbukyZjg"]

for video_url in url_list:
    # Build the command as an argument list and run it WITHOUT a shell, so that
    # characters such as `?`, `&` or quotes in a URL or path are passed through
    # verbatim instead of being interpreted by the shell.
    command = [
        youtube_dl_path,
        "-f", "bestaudio[ext=m4a]",
        "-o", f"{output_directory}%(title)s.%(ext)s",
        video_url,
    ]
    completed = subprocess.run(command)

    # Report failed downloads instead of silently moving on to the next URL.
    if completed.returncode != 0:
        print(f"{video_url} failed to download (exit code {completed.returncode}).")

[youtube] Extracting URL: https://www.youtube.com/watch?v=H1cezQRTMq0
[youtube] H1cezQRTMq0: Downloading webpage
[youtube] H1cezQRTMq0: Downloading ios player API JSON
[youtube] H1cezQRTMq0: Downloading android player API JSON
[youtube] H1cezQRTMq0: Downloading m3u8 information
[info] H1cezQRTMq0: Downloading 1 format(s): 140
[download] Destination: 0_audio/Emeritus Lecture—Valedictory： Who Am I？ (Martin Tompa).m4a
[download] 100% of   56.23MiB in 00:00:07 at 7.39MiB/s     
[FixupM4a] Correcting container of "0_audio/Emeritus Lecture—Valedictory： Who Am I？ (Martin Tompa).m4a"
[youtube] Extracting URL: https://www.youtube.com/watch?v=I8CST9vXrm4
[youtube] I8CST9vXrm4: Downloading webpage
[youtube] I8CST9vXrm4: Downloading ios player API JSON
[youtube] I8CST9vXrm4: Downloading android player API JSON
[youtube] I8CST9vXrm4: Downloading player 42a553e1
[youtube] I8CST9vXrm4: Downloading m3u8 information
[info] I8CST9vXrm4: Downloading 1 format(s): 140
[download] Destination: 0_audio/Allen 

# Convert Audio to Transcripts

In [2]:
import os
import subprocess
import whisper

# Load the Whisper speech-to-text model once and reuse it for every file.
model = whisper.load_model("base")
folder_path = "0_audio/"
transcript_directory = "1_transcripts/"

# Make sure the destination folder exists before writing into it.
os.makedirs(transcript_directory, exist_ok=True)

for filename in os.listdir(folder_path):
    file_path = os.path.join(folder_path, filename)
    try:
        result = model.transcribe(file_path)
    except Exception as error:
        # Skip this file: without `continue` the code below would either hit an
        # undefined `result` or write the PREVIOUS file's transcript under this
        # file's name.
        print(f"{filename} failed to be transcribed: {error}")
        continue

    # Same base name as the audio file, with a .txt extension.
    output_filename = transcript_directory + os.path.splitext(filename)[0] + ".txt"
    with open(output_filename, 'w') as file:
        file.write(result['text'])

# Convert Transcripts to Summaries

In [3]:
import os
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

# Build the summarization pipeline once; it is reused for every transcript.
tokenizer = AutoTokenizer.from_pretrained("pszemraj/led-large-book-summary")
model = AutoModelForSeq2SeqLM.from_pretrained("pszemraj/led-large-book-summary")
classifier = pipeline("summarization", model=model, tokenizer=tokenizer)

# Directory path containing the transcript files
directory_path = "./1_transcripts/"
# Directory the summaries are written to
summary_directory = "./2_summaries/"

# Make sure the destination folder exists before writing into it.
os.makedirs(summary_directory, exist_ok=True)

# Loop over the files in the directory
for filename in os.listdir(directory_path):
    # Only text files are summarized
    if not filename.endswith(".txt"):
        continue

    # Read the whole transcript; the file is closed before the slow model call.
    with open(os.path.join(directory_path, filename), "r") as input_file:
        file_contents = input_file.read()

    try:
        processed_contents = classifier(
            file_contents,
            min_length=1000,
            max_length=2048,
            no_repeat_ngram_size=3,
            encoder_no_repeat_ngram_size=3,
            repetition_penalty=3.5,
            num_beams=4,
        )
    except Exception as error:
        # Skip this file: without `continue` the write below would either hit an
        # undefined `processed_contents` or save the PREVIOUS file's summary
        # under this file's name.
        print(f"{filename} failed to be summarized: {error}")
        continue

    # Write the summary under the same file name in the summaries folder
    output_filename = summary_directory + filename
    with open(output_filename, "w") as output_file:
        output_file.write(processed_contents[0]['summary_text'])

Use `min_length` to [control how concise the model output](https://github.com/pszemraj/textsum/wiki/Inference-&-Parameters) is. If you have a very long video that produces a very long summary, the summary may be too big for the context window of the model that generates the reports.

# Convert Summaries to Reports

In [4]:
from openai import OpenAI
import os

# OpenAI API key: taken from the OPENAI_API_KEY environment variable when it is
# set, so the secret does not have to be saved inside the notebook. The
# placeholder is only a fallback for editing by hand.
key = os.environ.get("OPENAI_API_KEY", "YOUR_KEY_HERE")

client = OpenAI(api_key=key)

# Directory path containing the summary files
directory_path = "./2_summaries/"
# Directory the reports are written to
report_directory = "./3_reports/"

# Make sure the destination folder exists before writing into it.
os.makedirs(report_directory, exist_ok=True)

# Loop over the files in the directory
for filename in os.listdir(directory_path):
    # Only text files are processed. Skipping here keeps the API call below from
    # running for other directory entries with stale (or undefined) contents.
    if not filename.endswith(".txt"):
        continue

    # Read the contents of the summary file
    with open(os.path.join(directory_path, filename), "r") as input_file:
        file_contents = input_file.read()

    completion = client.chat.completions.create(
        model="gpt-3.5-turbo",
        temperature=0,
        messages=[
            {"role": "system", "content": "Pretend you are a student attending lecture. Please give your impressions of the talk and how you benefited from what you learned, in 200 words or less. You MUST include the speaker's name in your report."},
            {"role": "user", "content": f"{file_contents}"}
        ])

    result = completion.choices[0].message.content
    if result is None:
        # Nothing to write; report it rather than failing inside `write`.
        print(f"{filename} produced an empty report.")
        continue

    # Write the report under the same file name in the reports folder
    output_filename = report_directory + filename
    with open(output_filename, "w") as output_file:
        output_file.write(result)