# YouTube Video Transcript Summarizer and the Hugging Face's Automatic Speech Recognition (ASR) model Project

In [44]:

# Install the pytube library to interact with YouTube videos
! pip install pytube -q

# Download YouTube Video's Audio

In [45]:
# Step 1: Download YouTube Video's Audio
from pytube import YouTube

In [46]:
#VIDEO_URL = "https://www.youtube.com/watch?v=hWLf6JFbZoo" #obama

In [48]:
# Define the YouTube video URL to be downloaded
VIDEO_URL = 'https://www.youtube.com/watch?v=h-JVjs9AAmQ' # Example: Batman video

In [6]:
#VIDEO_URL = 'https://youtu.be/qNJRGHk7sN8'

In [49]:
# Download the audio from the YouTube video and save it as an MP4 file
yt = YouTube(VIDEO_URL)

In [8]:
yt.streams \
  .filter(only_audio=True, file_extension='mp4') \
  .first() \
  .download(filename='ytaudio.mp4')

'/content/ytaudio.mp4'

In [50]:
# Step 2: Convert Audio to WAV Format
# Convert the downloaded MP4 audio file to WAV format with a sample rate of 16 kHz

! ffmpeg -i ytaudio.mp4 -acodec pcm_s16le -ar 16000 ytaudio.wav

ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers
  built with gcc 11 (Ubuntu 11.2.0-19ubuntu1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enab

# Step-3: English ASR with HuggingSound

*italicized text*

In [51]:
# Install the huggingsound library for Automatic Speech Recognition (ASR)
!pip install huggingsound -q

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchaudio 2.2.1+cu121 requires torch==2.2.1, but you have torch 1.12.1 which is incompatible.
torchdata 0.7.1 requires torch>=2, but you have torch 1.12.1 which is incompatible.
torchtext 0.17.1 requires torch==2.2.1, but you have torch 1.12.1 which is incompatible.
torchvision 0.17.1+cu121 requires torch==2.2.1, but you have torch 1.12.1 which is incompatible.[0m[31m
[0m

In [17]:
!pip install torch==2.2.1



In [12]:
!pip install torchaudio==2.2.1+cu121 torchdata==0.7.1 torchtext==0.17.1 torchvision==0.17.1+cu121



In [52]:
# Step 4: Load the ASR Model
# Import necessary libraries
from huggingsound import SpeechRecognitionModel

In [53]:
import torch
# Determine whether to use GPU or CPU
device = "cuda" if torch.cuda.is_available() else "cpu"

In [15]:
device

'cpu'

In [54]:
# Load the pre-trained ASR model onto the specified device
model = SpeechRecognitionModel("jonatasgrosman/wav2vec2-large-xlsr-53-english", device = device)

INFO:huggingsound.speech_recognition.model:Loading model...
Some weights of the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encod

# Audio Chunking

In [55]:
# Step 5: Audio Chunking
# Import librosa library for audio processing
import librosa

In [56]:
# Define the input WAV file path
input_file = '/content/ytaudio.wav'

In [57]:
# Print the sample rate of the input audio file
print(librosa.get_samplerate(input_file))
# Stream over 30 seconds chunks rather than load the full file
stream = librosa.stream(
    input_file,
    block_length=30,
    frame_length=16000,
    hop_length=16000
)

16000


In [58]:
# Step 6: Save Audio Chunks as Separate Files
# Import soundfile library for saving audio files

import soundfile as sf

In [59]:
# Iterate over each 30-second audio chunk and save it as a separate WAV file
for i,speech in enumerate(stream):
  sf.write(f'{i}.wav', speech, 16000)

In [24]:
i

8

# Audio Transcription / ASR / Speech to Text

In [60]:
# Construct a list of file paths for each audio chunk
audio_path =[]
for a in range(i+1):
  audio_path.append(f'/content/{a}.wav')

In [26]:
audio_path

['/content/0.wav',
 '/content/1.wav',
 '/content/2.wav',
 '/content/3.wav',
 '/content/4.wav',
 '/content/5.wav',
 '/content/6.wav',
 '/content/7.wav',
 '/content/8.wav']

In [61]:
# Transcribe each audio chunk using the loaded ASR model
transcriptions = model.transcribe(audio_path)

100%|██████████| 9/9 [04:55<00:00, 32.82s/it]


In [62]:
# Concatenate the transcriptions into a single text
full_transcript = ' '

In [63]:
for item in transcriptions:
  full_transcript += ''.join(item['transcription'])

In [64]:
len(full_transcript)

3091

# Text Summarization

In [65]:
# Initialize a text summarization pipeline using the transformers library
from transformers import pipeline

In [66]:
# Initialize the summarization pipeline
summarization = pipeline('summarization')

No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [67]:
# Summarize the full transcript into a concise summary using the pre-trained model
summarized_text = summarization(full_transcript)

In [68]:
# Print the summarized text
summarized_text[0]['summary_text']

" The role of cat woman has been played by litedary access arears from rto kit and shell fifer . Batman is about etwo sides of drauma batman's born of trama andtis film maybes the ridler and tat is kind of the seed from which everything else grew ."

# Text Chunking before Summarization

In [70]:
# Break the full transcript into smaller segments of 1000 characters each
num_iters = int(len(full_transcript) / 1000)
summarized_text = []

for i in range(0, num_iters + 1):
    start = i * 1000
    end = (i + 1) * 1000
    out = summarization(full_transcript[start:end], min_length=5, max_length=20)
    out = out[0]
    out = out['summary_text']
    summarized_text.append(out)

# Print the summarized text for each segment
for text in summarized_text:
    print("Summarized text:")
    print(text)

Summarized text:
 The role of cat woman has been played by litedary access arears from r
Summarized text:
 The key was really trying to trust that mt chose me for a reason realso i
Summarized text:
 "Pinguwin" was truly something spectacular so new onts very interesting do you
Summarized text:
 Le bit that that that made it interesting for me so yet ided lot of people
