In [None]:
# =====================
# Install Packages
# =====================
!pip install -q git+https://github.com/openai/whisper.git
!pip install -q resemblyzer pydub scikit-learn pandas requests
!sudo apt update && sudo apt install -y ffmpeg

In [None]:
# =====================
# Imports
# =====================
import os
import subprocess
import urllib.request

import numpy as np
import pandas as pd
import requests
import whisper
from google.colab import files, drive
from pydub import AudioSegment
from resemblyzer import VoiceEncoder, preprocess_wav
from resemblyzer.hparams import sampling_rate
from sklearn.cluster import KMeans

# Mount Google Drive at /content/gdrive. The transcript output directory used
# at the end of this notebook lives under this mount point, so the mount must
# succeed before any video is processed.
drive.mount('/content/gdrive')

# =====================
# Helper function for robust downloading
# =====================
def download_video_robust(video_url, video_path, timeout=(10, 60)):
    """Download ``video_url`` to ``video_path`` unless the file already exists.

    The payload is streamed into a temporary ``<video_path>.part`` file and
    only renamed to ``video_path`` once the transfer has completed. An
    interrupted or failed run therefore never leaves a truncated file at
    ``video_path`` that a later run would mistake for a finished download.

    Args:
        video_url: HTTP(S) URL of the video file.
        video_path: Local destination path.
        timeout: Timeout in seconds passed to ``requests.get``; either a
            single number or a ``(connect, read)`` tuple. Without a timeout a
            stalled server would block the notebook indefinitely.

    Returns:
        True if ``video_path`` exists when the function returns (either it
        was already present or the download succeeded), False if the request
        failed.
    """
    if os.path.exists(video_path):
        print("Video already downloaded, skipping.")
        return True

    print("Downloading video...")
    partial_path = f"{video_path}.part"
    try:
        # The context manager releases the pooled connection even when the
        # body is not fully consumed (stream=True keeps it open otherwise).
        with requests.get(video_url, stream=True, timeout=timeout) as response:
            response.raise_for_status()
            with open(partial_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
        # Atomic within one filesystem: the final name appears only once the
        # whole payload has been written.
        os.replace(partial_path, video_path)
    except requests.exceptions.RequestException as e:
        print(f"Error downloading the file: {e}")
        return False
    finally:
        # Runs for every exit path (including KeyboardInterrupt); after a
        # successful rename there is nothing left to remove.
        if os.path.exists(partial_path):
            os.remove(partial_path)

    print("Download complete!")
    return True

# =====================
# Pipeline function to process one video
# =====================
def _match_clusters_to_reference(centroids, reference):
    """Map each cluster in ``centroids`` to the most similar row of ``reference``.

    Pairs are chosen greedily by descending cosine similarity and each
    reference row is used at most once. Returns an integer array ``mapping``
    where ``mapping[k]`` is the reference index assigned to cluster ``k``;
    clusters left unmatched (only possible when there are more clusters than
    reference rows) keep their own index.
    """
    def _unit_rows(matrix):
        norms = np.linalg.norm(matrix, axis=1, keepdims=True)
        return matrix / np.where(norms == 0, 1.0, norms)

    similarity = (
        _unit_rows(np.asarray(centroids, dtype=float))
        @ _unit_rows(np.asarray(reference, dtype=float)).T
    )
    mapping = np.arange(len(centroids))
    for _ in range(min(similarity.shape)):
        row, col = np.unravel_index(np.argmax(similarity), similarity.shape)
        mapping[row] = col
        # Exclude the matched cluster and reference from further pairing.
        similarity[row, :] = -np.inf
        similarity[:, col] = -np.inf
    return mapping


def process_video(video_url, video_id, num_speakers=2,
                  chunk_length_ms=5 * 60 * 1000):
    """Download one video, transcribe it and label each segment with a speaker.

    The audio track is extracted with ffmpeg (mono, 16 kHz), cut into chunks
    of ``chunk_length_ms``, and each chunk is (a) embedded in overlapping
    0.75 s windows that are clustered with KMeans and (b) transcribed with
    the Whisper "small" model. Every transcript segment receives the cluster
    label of the embedding window whose start time is closest to the
    segment's midpoint.

    KMeans numbers its clusters arbitrarily for every fit, so the cluster
    centroids of the first fully clustered chunk are kept as a reference and
    the clusters of later chunks are matched to them; this keeps
    "Speaker 0" / "Speaker 1" referring to the same voices across chunks.

    Args:
        video_url: URL the video is downloaded from.
        video_id: Identifier used in temporary file names and copied into
            every returned segment.
        num_speakers: Number of speaker clusters to look for.
        chunk_length_ms: Chunk length in milliseconds (default 5 minutes).

    Returns:
        A list of dicts with keys ``video_id``, ``video_url``, ``start``,
        ``end`` (seconds from the start of the video), ``speaker`` and
        ``text``. An empty list is returned when the download or the audio
        extraction fails.
    """
    print(f"\nProcessing video {video_id} from {video_url}")

    video_path = f"video_{video_id}.mp4"
    full_audio_path = f"audio_{video_id}.wav"
    diarized_segments = []

    # Download video
    if not download_video_robust(video_url, video_path):
        return []

    # Extract full audio. An argument list (no shell) keeps unusual characters
    # in the URL-derived id from being interpreted by a shell, and check=True
    # surfaces an ffmpeg failure here instead of as a missing WAV file later.
    print("Extracting full audio...")
    ffmpeg_cmd = [
        "ffmpeg", "-i", video_path,
        "-ac", "1", "-ar", "16000", "-vn", "-y", full_audio_path,
    ]
    try:
        subprocess.run(ffmpeg_cmd, check=True)
    except (OSError, subprocess.CalledProcessError) as e:
        print(f"Error extracting audio: {e}")
        if os.path.exists(full_audio_path):
            os.remove(full_audio_path)
        return []

    try:
        # Load and split audio into chunks
        print(f"Splitting audio into {chunk_length_ms / 60000:g}-minute chunks...")
        audio = AudioSegment.from_wav(full_audio_path)

        model = whisper.load_model("small")
        encoder = VoiceEncoder()

        window_size = int(0.75 * sampling_rate)
        hop_size = int(0.375 * sampling_rate)
        reference_centroids = None

        chunk_offsets = range(0, len(audio), chunk_length_ms)
        for chunk_number, offset_ms in enumerate(chunk_offsets, start=1):
            start_time_offset = offset_ms / 1000.0  # Time offset in seconds
            chunk_path = f"chunk_{video_id}_{offset_ms}.wav"
            audio[offset_ms:offset_ms + chunk_length_ms].export(chunk_path, format="wav")
            print(f"Processing chunk {chunk_number} at offset {start_time_offset:.2f}s...")

            try:
                # Load chunk for embeddings.
                # NOTE(review): preprocess_wav may trim silence; if so, window
                # times drift relative to Whisper's timeline — confirm.
                wav = preprocess_wav(chunk_path)
                if len(wav) < window_size:
                    print("Chunk too short for embeddings, skipping.")
                    continue

                window_starts = range(0, len(wav) - window_size + 1, hop_size)
                embeddings = np.vstack([
                    encoder.embed_utterance(wav[s:s + window_size])
                    for s in window_starts
                ])
                timestamps = np.array([s / sampling_rate for s in window_starts])

                # KMeans rejects n_clusters > n_samples, which happens when a
                # short tail chunk yields fewer windows than speakers.
                n_clusters = min(num_speakers, len(embeddings))
                kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(embeddings)
                labels = kmeans.labels_

                if reference_centroids is None:
                    # Only a chunk with the full number of clusters can serve
                    # as the reference for later chunks.
                    if n_clusters == num_speakers:
                        reference_centroids = kmeans.cluster_centers_
                else:
                    mapping = _match_clusters_to_reference(
                        kmeans.cluster_centers_, reference_centroids
                    )
                    labels = mapping[labels]

                # Transcribe the chunk
                result = model.transcribe(chunk_path, word_timestamps=True, verbose=False)

                # Assign speaker labels and build list of dicts
                for seg in result['segments']:
                    mid = (seg['start'] + seg['end']) / 2
                    closest_idx = np.argmin(np.abs(timestamps - mid))
                    diarized_segments.append({
                        "video_id": video_id,
                        "video_url": video_url,
                        "start": start_time_offset + seg['start'],
                        "end": start_time_offset + seg['end'],
                        "speaker": f"Speaker {labels[closest_idx]}",
                        "text": seg['text'].strip()
                    })
            finally:
                # Clean up the chunk file on every path, including errors.
                if os.path.exists(chunk_path):
                    os.remove(chunk_path)
    finally:
        if os.path.exists(full_audio_path):
            os.remove(full_audio_path)

    print("All chunks processed.")
    return diarized_segments

# =====================
# Main execution: process multiple videos and save results
# =====================
video_urls = [
  'https://archive.org/download/tobacco_qno71d00/VTS_01_1_512kb.mp4'
]

# Define the Google Drive directory
output_dir = "/content/gdrive/MyDrive/DiarizedTranscripts-Small"

# Create the directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# Optional cap on how many videos are newly processed in one run; videos whose
# transcript already exists do not count. None disables the cap.
MAX_VIDEOS_PER_RUN = None
processed_count = 0

for url in video_urls:

    print("processing", url)

    if MAX_VIDEOS_PER_RUN is not None and processed_count >= MAX_VIDEOS_PER_RUN:
        print(f"processed {processed_count}, let's take a break")
        break

    # URLs are expected to look like .../download/<item>/<file>; the id is
    # "<item>_<file>".
    url_parts = url.split("/download/")
    video_name = url_parts[1].split("/") if len(url_parts) > 1 else []
    if len(video_name) < 2:
        print(f"Cannot derive a video id from {url}, skipping.")
        continue
    video_id = video_name[0] + '_' + video_name[1]
    file_name = f"video_{video_id}_transcript.txt"
    file_path = os.path.join(output_dir, file_name)

    # Skip if the transcript already exists
    if os.path.exists(file_path):
        print(f"Transcript for video {video_id} already exists, skipping.")
        continue

    processed_count += 1
    video_path = f"video_{video_id}.mp4"
    try:
        # Process the video
        segments = process_video(url, video_id)

        if not segments:
            # process_video returns an empty list when the download or audio
            # extraction failed. Writing an empty transcript would make every
            # later run skip this video, so leave it unwritten for a retry.
            print(f"No segments produced for video {video_id}; transcript not written.")
            continue

        # Write to a temporary name first so that an interrupted write never
        # leaves a partial transcript that later runs would treat as done.
        tmp_path = f"{file_path}.tmp"
        with open(tmp_path, "w", encoding="utf-8") as f:
            f.writelines(
                f"[{seg['start']:.2f}s - {seg['end']:.2f}s] {seg['speaker']}: {seg['text']}\n"
                for seg in segments
            )
        os.replace(tmp_path, file_path)

        print(f"\nTranscript for video {video_id} saved to {file_path}")
    finally:
        # Delete the downloaded video file to free local disk space, even
        # when processing failed part-way through.
        if os.path.exists(video_path):
            os.remove(video_path)
            print(f"Deleted {video_path} from local storage.")

print("\nAll processing complete. Transcripts are available in your Google Drive.")