# **1. Setup and Install Libraries**

In [None]:
# Install WhisperX and its dependencies straight from the GitHub repository.
# %pip (rather than !pip) installs into the environment of the running kernel;
# -qq keeps the same quiet level as the original two -q flags.
%pip install -qq git+https://github.com/m-bain/whisperx.git

In [None]:
# Upgrade huggingface_hub to the latest release (presumably needed to fetch the
# models that the Hugging Face token in step 3 gives access to - confirm).
# %pip installs into the environment of the running kernel.
%pip install -q -U huggingface_hub

In [None]:
# Install the libcudnn8 system package (presumably required by WhisperX on GPU - confirm).
# -y answers apt's confirmation prompt so the cell cannot stall or abort waiting for input.
!apt-get install -y libcudnn8

# **2. Upload Audio File**

In [None]:
from google.colab import files

# Open the browser file picker; the chosen file(s) are saved to the working directory.
uploaded = files.upload()

# Stop here if nothing was uploaded, instead of failing later in the transcription step.
if not uploaded:
    raise RuntimeError("No file was uploaded - re-run this cell and choose an audio file.")

# Show the exact file name(s) to use in the transcription command of step 3.
print("Uploaded:", ", ".join(uploaded))

Saving Recording.m4a to Recording.m4a


# **3. Run Transcription with WhisperX**

In [None]:
# Before running:
#   - replace "file_name.m4a" with the name of the uploaded audio file
#   - replace hf_xxxxxxx with your own Hugging Face token (do not share the notebook with a real token in it)
# --align_model is given explicitly because WhisperX doesn't have an Indonesian alignment model.
# --output_dir is the folder in which the output files will appear.
# NOTE: keep comments out of the command itself - any text after a trailing "\" breaks the
# line continuation, so the options on the following lines would never reach whisperx.
!whisperx "file_name.m4a" \
  --language id \
  --model large-v2 \
  --chunk_size 6 \
  --diarize \
  --min_speakers 2 \
  --max_speakers 8 \
  --hf_token hf_xxxxxxx \
  --align_model indonesian-nlp/wav2vec2-indonesian-javanese-sundanese \
  --output_dir transcript_output

# **4. Download Output in zip format**

In [None]:
import shutil
from google.colab import files

# Pack the contents of the transcript_output folder into transcript_output_zip.zip.
shutil.make_archive(
    base_name='transcript_output_zip',
    format='zip',
    root_dir='transcript_output',
)

# Send the archive to the browser as a download.
files.download('transcript_output_zip.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# **5. (Optional) Merge Consecutive Segments by Speaker**

In [None]:
import json

# Seconds added to every printed timestamp. Keep 0 when the audio starts at minute 0;
# if the audio was split into several parts, set this to the start time of this part.
OFFSET = 0

# Largest gap (in seconds) between two segments of the same speaker that are still
# merged into one block.
MAX_GAP_SECONDS = 3.0

# Load the JSON file produced by WhisperX. The encoding is explicit so that
# non-ASCII transcript text is decoded the same way on every platform.
with open("transcript_output_file.json", "r", encoding="utf-8") as f:
    segments = json.load(f)["segments"]

# Walk the segments in order and fold each one into the previous block when it has
# the same speaker and starts close enough to where that block ended.
merged = []
for seg in segments:
    # Segments without a speaker label are attributed to "UNKNOWN".
    speaker = seg.get("speaker", "UNKNOWN")
    last = merged[-1] if merged else None

    if (
        last is not None
        and speaker == last["speaker"]
        and abs(seg["start"] - last["end"]) < MAX_GAP_SECONDS
    ):
        # Same speaker, small gap: extend the current block.
        last["end"] = seg["end"]
        last["text"] += " " + seg["text"]
    else:
        # First segment, a different speaker, or too large a gap: start a new block.
        merged.append({
            "speaker": speaker,
            "start": seg["start"],
            "end": seg["end"],
            "text": seg["text"]
        })

# Format a time given in seconds as HH:MM:SS
def format_timestamp(seconds, offset=None):
    """Return ``seconds`` shifted by an offset, formatted as ``HH:MM:SS``.

    Args:
        seconds: Time in seconds (int or float) to format.
        offset: Seconds to add before formatting. When omitted, the
            module-level ``OFFSET`` is used, as in the original behaviour.

    Returns:
        A zero-padded ``HH:MM:SS`` string; the fractional part of the shifted
        time is truncated by ``int()``.
    """
    if offset is None:
        # Looked up at call time so a later change to OFFSET is honoured.
        offset = OFFSET
    total_seconds = int(seconds + offset)
    hours, remainder = divmod(total_seconds, 3600)
    minutes, secs = divmod(remainder, 60)
    return f"{hours:02}:{minutes:02}:{secs:02}"

# Print every merged block as "[start – end] speaker: text", with offset-adjusted timestamps
for block in merged:
    time_range = f"{format_timestamp(block['start'])} – {format_timestamp(block['end'])}"
    print(f"[{time_range}] {block['speaker']}: {block['text']}")