In [None]:
!pip install ffmpeg-python
!pip install pyannote.audio
!pip install git+https://github.com/openai/whisper.git
!pip install transformers

In [None]:
from pydrive2.auth import GoogleAuth
from pydrive2.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Google authentication: sign the Colab user in, then build a PyDrive2 client
# that uses the application-default Google credentials.
# The order matters: authenticate first, then fetch the credentials.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# File download from Google Drive.
# `file_id` is the Drive ID of the video to process; replace it with the ID
# of your own file.
file_id = '1J0LWDdCjUzDQVyEHQ6b_j41baKV4exEu'
downloaded = drive.CreateFile({'id': file_id})
# Save the file locally; this filename must match `input_video` in the
# audio-extraction cell.
downloaded.GetContentFile('downloaded_video.mp4')

In [None]:
import ffmpeg
import os

input_video = 'downloaded_video.mp4'
output_audio = 'extracted_audio.wav'

# Extract the audio track of the video into a WAV file.
# `overwrite_output=True` makes ffmpeg run with `-y`, so re-running this cell
# replaces an existing `extracted_audio.wav` instead of failing because the
# output file already exists (no manual `os.remove` needed).
ffmpeg.input(input_video).output(output_audio).run(overwrite_output=True)

In [None]:
# from pyannote.audio import Pipeline
# import torch

# # To run this code, you must first accept the pyannote user agreement on Hugging Face.
# # See: https://huggingface.co/pyannote/speaker-diarization
# # Diarization takes a long time to run, so it is left disabled here.
# pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization-3.1",
#                                     use_auth_token='USE_YOUR_HUGGING_FACE_TOKEN')

# pipeline.to(torch.device("cuda"))

# # Apply pretrained pipeline
# diarization = pipeline("extracted_audio.wav")

# for turn, _, speaker in diarization.itertracks(yield_label=True):
#     print(f"Speaker {speaker} says from {turn.start:.1f}sec to {turn.end:.1f}sec")

In [None]:
import whisper

# The "large" model can be used instead of "medium", at the cost of a longer run time.
model = whisper.load_model(name="medium")
# Transcribe the audio file extracted earlier; the full transcript is under the "text" key.
result = model.transcribe(audio=output_audio)
print(result["text"])

In [None]:
def create_paragraphs(text, max_tokens=1024):
    """Group the sentences of ``text`` into paragraphs of bounded length.

    The text is split on ``'.'``. Each non-empty fragment is stripped and
    given back its trailing period, then the resulting sentences are packed
    greedily, in order, into paragraphs joined by single spaces.

    Args:
        text: The text to split into paragraphs.
        max_tokens: Maximum number of whitespace-separated words allowed in
            one paragraph. This is a plain word count, not a model-tokenizer
            count. A single sentence longer than this limit is kept whole
            as its own paragraph.

    Returns:
        A list of paragraph strings, in original order. The list is empty
        when ``text`` contains no sentences.
    """
    paragraphs = []
    current_sentences = []
    current_words = 0

    for fragment in text.split('.'):
        fragment = fragment.strip()
        if not fragment:
            # Splitting on '.' leaves empty pieces after the final period
            # and inside runs such as "..."; skipping them avoids emitting
            # stray "." sentences.
            continue

        sentence = fragment + '.'
        sentence_words = len(sentence.split())

        # Flush only when there is something to flush, so an oversized first
        # sentence never produces an empty leading paragraph.
        if current_sentences and current_words + sentence_words > max_tokens:
            paragraphs.append(" ".join(current_sentences))
            current_sentences = []
            current_words = 0

        current_sentences.append(sentence)
        # Running total instead of re-splitting the whole paragraph each time.
        current_words += sentence_words

    if current_sentences:
        paragraphs.append(" ".join(current_sentences))

    return paragraphs

In [None]:
from transformers import pipeline

# Summarization settings.
# NOTE: create_paragraphs counts whitespace-separated words, not model tokens,
# so a paragraph within this budget can still exceed the model's input limit.
PARAGRAPH_MAX_WORDS = 512
SUMMARY_MAX_LENGTH = 150
SUMMARY_MIN_LENGTH = 30

summarizer = pipeline("summarization", model='facebook/bart-large-cnn')
paragraphs = create_paragraphs(result['text'], PARAGRAPH_MAX_WORDS)

# Collected (paragraph, summary) pairs, in transcript order.
summaries = []

for i, paragraph in enumerate(paragraphs):
    # Nothing to summarize in an empty paragraph; skip it rather than
    # sending an empty string to the model.
    if not paragraph.strip():
        continue

    # truncation=True clips an over-long input to the model's maximum input
    # length instead of letting the call fail on it.
    summary = summarizer(
        paragraph,
        max_length=SUMMARY_MAX_LENGTH,
        min_length=SUMMARY_MIN_LENGTH,
        do_sample=False,
        truncation=True,
    )[0]['summary_text']
    summaries.append((paragraph, summary))

    print(f'# paragraph {i}')
    print('Summary:', summary)
    print('Body:', paragraph)
    print()