In [1]:
!pip install whisperx
!pip install openai-whisper
!pip install numpy
!pip install huggingface_hub
!pip install requests
!pip install urllib3



In [2]:
import whisper
import whisperx
import time  # Add time module for timing
import urllib3
import requests
import difflib
import numpy as np

In [3]:
def chunk_audio_with_overlap(audio_array,
                             chunk_size_seconds=30,
                             overlap_seconds=3,
                             sample_rate=16000):
    """Split audio into overlapping chunks."""
    chunk_size = chunk_size_seconds * sample_rate
    overlap_size = overlap_seconds * sample_rate
    step = chunk_size - overlap_size

    chunks = []
    for i in range(0, len(audio_array), step):
        chunk = audio_array[i:i + chunk_size]
        if len(chunk) < chunk_size:
            chunk = np.pad(chunk, (0, chunk_size - len(chunk)))
        chunks.append(chunk)
    return chunks


def remove_overlap_text(prev_text,
                        curr_text,
                        window_words=20,
                        similarity_threshold=0.85):
    """
    Remove overlapping part from the start of curr_text that duplicates the end of prev_text,
    based on longest matching sequence of words.

    Args:
        prev_text (str): Previous chunk's full transcription.
        curr_text (str): Current chunk's full transcription.
        window_words (int): Number of words to consider at the overlap boundary.
        similarity_threshold (float): Minimum similarity to consider a match.

    Returns:
        str: Current chunk's transcription with overlapping start removed.
    """

    prev_words = prev_text.split()
    curr_words = curr_text.split()

    # Get last window_words from prev_text and first window_words from curr_text
    prev_tail = prev_words[-window_words:]
    curr_head = curr_words[:window_words]

    # Find longest matching sequence of words at the boundary
    # We'll try decreasing length sequences from window_words down to 1,
    # stopping at the longest matching sequence with similarity >= threshold.

    for overlap_len in range(window_words, 0, -1):
        prev_sub = prev_tail[-overlap_len:]
        curr_sub = curr_head[:overlap_len]

        # Compute similarity between word sequences (join to string and use difflib)
        prev_sub_str = " ".join(prev_sub)
        curr_sub_str = " ".join(curr_sub)
        ratio = difflib.SequenceMatcher(None, prev_sub_str,
                                        curr_sub_str).ratio()

        if ratio >= similarity_threshold:
            # Remove the overlapping words from the start of curr_text
            # Rejoin words skipping the overlap
            return " ".join(curr_words[overlap_len:]).lstrip()

    # No sufficient overlap found, return original curr_text
    return curr_text


In [4]:
asr_model = "whisperx"  # whisperx, whisper
device = "cuda"  # changed from cuda to cpu
audio_file = "SISC.m4a"
batch_size = 16 # reduce if low on GPU mem
compute_type = "float16"  # changed to float32 for CPU compatibility
chunk_size_seconds = 30  # adjust this value based on your needs
overlap_seconds = 3  # overlap between chunks
language = "en"  # specify language code: en, fr, de, es, it, ja, zh, etc.

In [5]:
# Load model and audio
if asr_model.lower() == "whisper":
    model = whisper.load_model("turbo", device)
    audio = whisper.load_audio(audio_file)
else:
    model = whisperx.load_model("medium", device, compute_type=compute_type)
    audio = whisperx.load_audio(audio_file)

DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for _speechbrain_save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for _speechbrain_load
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for load
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for _save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for _recover
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded c

No language specified, language will be first be detected for each audio file (increases inference time).
>>Performing voice activity detection using Pyannote...
Model was trained with pyannote.audio 0.0.1, yours is 3.3.2. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.6.0+cu124. Bad things might happen unless you revert torch to 1.x.


In [6]:
# Chunk audio
audio_chunks = chunk_audio_with_overlap(audio, chunk_size_seconds,
                                        overlap_seconds)

In [None]:
# Process each chunk and collect transcriptions
prev_chunk_text = ""
all_transcriptions = []
total_processing_time = 0  # Track total processing time
print(f"\nProcessing audio file: {audio_file}")
print(f"Total chunks: {len(audio_chunks)} (with {overlap_seconds}s overlap)\n")
print("=" * 80)

for i, chunk in enumerate(audio_chunks):
    print(f"\nChunk {i+1}/{len(audio_chunks)}")
    print("-" * 40)

    # Start timing this chunk
    chunk_start_time = time.time()

    # Transcribe the chunk
    result = model.transcribe(chunk, language=language, batch_size=batch_size)

    # Keep only segments after chunk_start
    chunk_text = " ".join(segment["text"].strip()
                          for segment in result["segments"])

    # Remove overlap with previous chunk
    if prev_chunk_text:
        chunk_text = remove_overlap_text(prev_chunk_text, chunk_text)

    # Calculate and display time taken for this chunk
    chunk_time = time.time() - chunk_start_time
    total_processing_time += chunk_time
    print(f"\nChunk processing time: {chunk_time:.2f} seconds")
    print(
        f"Average time per chunk so far: {(total_processing_time/(i+1)):.2f} seconds"
    )
    print(f"{chunk_text}\n")

    # Add to all transcriptions
    all_transcriptions.append(chunk_text)
    prev_chunk_text = chunk_text
    print("-" * 40)

# Print final timing statistics
print(f"\nTotal processing time: {total_processing_time:.2f} seconds")
print(
    f"Average time per chunk: {(total_processing_time/len(audio_chunks)):.2f} seconds"
)

# Combine all transcriptions
final_transcription = " ".join(all_transcriptions)

# Save to file
output_file = audio_file.split(".")[0] + "_transcription.txt"
with open(output_file, "w", encoding="utf-8") as f:
    f.write(final_transcription)

print("\n" + "=" * 80)
print(f"\nFull transcription saved to: {output_file}")


Processing audio file: SISC.m4a
Total chunks: 465 (with 3s overlap)


Chunk 1/465
----------------------------------------


It can be re-enabled by calling
   >>> import torch
   >>> torch.backends.cuda.matmul.allow_tf32 = True
   >>> torch.backends.cudnn.allow_tf32 = True
See https://github.com/pyannote/pyannote-audio/issues/1370 for more details.

