In [None]:
import os

import torch
from pyannote.audio import Pipeline

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# HF_TOKEN is not assigned anywhere in this notebook, so on a fresh kernel the
# original cell failed with NameError. Reuse a value that already exists in the
# kernel namespace, otherwise read it from the HF_TOKEN environment variable
# (never hard-code the token in the notebook).
HF_TOKEN = globals().get("HF_TOKEN") or os.environ.get("HF_TOKEN")

pipeline = Pipeline.from_pretrained(
    "pyannote/voice-activity-detection",
    use_auth_token=HF_TOKEN,
)

# NOTE(review): pyannote.audio is reported to return None instead of raising
# when a gated pipeline cannot be downloaded — confirm for the installed version.
# Failing here gives a clearer error than an AttributeError on `.to(device)`.
if pipeline is None:
    raise RuntimeError(
        "Could not load 'pyannote/voice-activity-detection'. "
        "Set the HF_TOKEN environment variable to a Hugging Face token that "
        "has accepted the model's user conditions."
    )

pipeline = pipeline.to(device)

In [None]:
# Input recording to analyse.
audio_file = "test.wav"       ###########

# Announce the work before starting it: the original cell printed this line
# only after the pipeline call had already returned.
print(f"Processing {audio_file} on {device}")
output = pipeline(audio_file)

print("Voice activity segments:")

# Collect the speech regions reported by the pipeline as a plain list so later
# cells can iterate over them more than once.
speech_segments = list(output.get_timeline().support())

for i, speech in enumerate(speech_segments):
    # active speech between speech.start and speech.end (seconds)
    print(f"Segment {i+1}: Speech from {speech.start:.2f}s to {speech.end:.2f}s (duration: {speech.duration:.2f}s)")

In [None]:
import os
import shutil
from pydub import AudioSegment


def split_audio_by_segments(audio_path, segments, output_dir="output_segments"):
    """
    Split an audio file into one file per speech segment.

    WARNING: if ``output_dir`` already exists it is deleted recursively before
    anything is written, so never point it at a directory that holds other data.

    Parameters:
    -----------
    audio_path: str
        Path to the input audio file
    segments: list
        Speech segments; each item must expose ``start`` and ``end`` in seconds
    output_dir: str
        Directory the segment files are written to (wiped first, see above)

    Output files are named
    ``<name>_segment_<index>_<start>ms-<end>ms<ext>``, where ``name`` and
    ``ext`` are taken from ``audio_path``. Nothing is returned.
    """
    # Start from an empty directory so stale segments from a previous run
    # cannot be mixed with the new ones.
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
    os.makedirs(output_dir, exist_ok=True)

    # Load the whole recording once; slices below are taken from memory.
    audio = AudioSegment.from_file(audio_path)

    # The base name and extension do not depend on the segment, so compute
    # them once instead of on every iteration.
    name, ext = os.path.splitext(os.path.basename(audio_path))

    # The export format is derived from the extension. Lower-case it (format
    # names are conventionally lower case) and fall back to WAV when the input
    # has no extension, which previously passed an empty format name to export().
    export_format = ext.lstrip('.').lower()
    if not export_format:
        export_format, ext = "wav", ".wav"

    for i, segment in enumerate(segments):
        # pydub slices are indexed in milliseconds.
        start_ms = int(segment.start * 1000)
        end_ms = int(segment.end * 1000)

        segment_audio = audio[start_ms:end_ms]

        output_path = os.path.join(
            output_dir,
            f"{name}_segment_{i+1:04d}_{start_ms:08d}ms-{end_ms:08d}ms{ext}"
        )

        segment_audio.export(output_path, format=export_format)
        print(f"Saved segment {i+1} to {output_path}")

In [None]:
# Write one audio file per detected speech segment into the default
# "output_segments" folder (that folder is wiped first if it already exists).
split_audio_by_segments(audio_file, speech_segments)

In [None]:
import librosa
from IPython.display import Audio, display

def play_audio(file_path, sr=None):
    """
    Load an audio file and show an inline player for it in the notebook.

    Parameters:
    -----------
    file_path : str
        Path to the audio file to play
    sr : int, optional
        Sample rate handed to ``librosa.load``. If None, uses the file's
        native sample rate.

    Returns:
    --------
    None
        The player is rendered through ``display``; nothing is returned.

    Example:
    --------
    >>> play_audio('path/to/audio.wav')
    """
    # Decode the samples together with the sample rate actually used.
    samples, sample_rate = librosa.load(file_path, sr=sr)

    # Render the player right away instead of handing the widget back.
    display(Audio(data=samples, rate=sample_rate))

In [None]:
# Listen to the full, unsplit input recording.
play_audio(audio_file)

In [None]:
import os
# Folder written by split_audio_by_segments. The trailing slash matters
# because clip paths are built by plain string concatenation below.
audio_dir = "./output_segments/"

# Sorted so the preview always starts with the lowest-numbered segments.
audio_files = sorted(os.listdir(audio_dir))

# How many clips to preview.
n_clips = 3

for fname in audio_files[:n_clips]:
    play_audio(f"{audio_dir}{fname}")

## **MFA**

In [2]:
import os
import shutil

def combine_nested_audio_folders(
        base_dir,
        source_folders=("audio_part_0", "audio_part_1", "audio_part_2")
    ):
    """
    Merge ``<base_dir>/<part>/audio`` folders into a single ``<base_dir>/audio``.

    Every regular file found directly inside each part's ``audio`` sub-folder
    is moved into the target folder. A file whose name already exists in the
    target is NOT moved (it would otherwise silently replace the earlier file);
    it stays where it is and is counted as a conflict. After a part has been
    processed, its ``audio`` sub-folder and the part folder are removed if they
    are empty.

    Parameters
    ----------
    base_dir : str
        Directory that contains the part folders and receives the ``audio`` folder.
    source_folders : iterable of str, optional
        Names of the part folders to merge, processed in the given order.
        Defaults to the three parts the notebook was written for.
    """
    # 1. Target folder that receives every file.
    target_dir = os.path.join(base_dir, "audio")

    if not os.path.exists(target_dir):
        os.makedirs(target_dir, exist_ok=True)
        print(f"üìÅ Folder target dibuat: {target_dir}")

    total_dipindah = 0
    total_bentrok = 0  # files skipped because the name is already taken in target_dir

    # 2. Walk through the part folders one by one.
    for folder_name in source_folders:
        # The files live one level down, inside the part's 'audio' sub-folder.
        nested_source_dir = os.path.join(base_dir, folder_name, "audio")

        if not os.path.exists(nested_source_dir):
            print(f"‚ö†Ô∏è Melewati {folder_name}/audio: Folder tidak ditemukan.")
            continue

        print(f"üîÑ Memindahkan isi dari folder: {folder_name}/audio...")

        # 3. Move each regular file; sub-directories are left untouched.
        for filename in os.listdir(nested_source_dir):
            source_file = os.path.join(nested_source_dir, filename)
            target_file = os.path.join(target_dir, filename)

            if not os.path.isfile(source_file):
                continue

            # Never overwrite: on POSIX shutil.move would replace an existing
            # file without any warning, losing data when two parts reuse a name.
            if os.path.exists(target_file):
                total_bentrok += 1
                continue

            shutil.move(source_file, target_file)
            total_dipindah += 1

        # 4. Clean up: os.rmdir only succeeds on empty directories, so anything
        #    left behind (conflicts, non-audio files) keeps the folders in place.
        try:
            os.rmdir(nested_source_dir)

            parent_dir = os.path.join(base_dir, folder_name)
            os.rmdir(parent_dir)

            print(f"‚úÖ Selesai: {folder_name} dan isinya telah bersih dihapus.")
        except OSError:
            print(f"‚ö†Ô∏è Perhatian: Gagal menghapus sisa folder di {folder_name}. Mungkin ada file lain selain audio.")

    print("==================================================")
    print(f"üéØ PROSES PENGGABUNGAN BERSARANG SELESAI!")
    # The counter covers every moved file, whatever its extension.
    print(f"Total file .flac dipindahkan: {total_dipindah}")
    if total_bentrok:
        print(f"File dilewati karena nama sudah ada di target: {total_bentrok}")
    print(f"Semua file audio sekarang rapi di dalam: {target_dir}")
    print("==================================================")

if __name__ == "__main__":
    # Directory that contains the audio_part_X folders; the merged 'audio'
    # folder is created inside it as well.
    BASE_DIRECTORY = "../data/raw/" 
    
    combine_nested_audio_folders(BASE_DIRECTORY)

üìÅ Folder target dibuat: ../data/raw/audio
üîÑ Memindahkan isi dari folder: audio_part_0/audio...
‚úÖ Selesai: audio_part_0 dan isinya telah bersih dihapus.
üîÑ Memindahkan isi dari folder: audio_part_1/audio...
‚úÖ Selesai: audio_part_1 dan isinya telah bersih dihapus.
üîÑ Memindahkan isi dari folder: audio_part_2/audio...
‚úÖ Selesai: audio_part_2 dan isinya telah bersih dihapus.
üéØ PROSES PENGGABUNGAN BERSARANG SELESAI!
Total file .flac dipindahkan: 95572
Semua file audio sekarang rapi di dalam: ../data/raw/audio


In [None]:
# import json
# import os
# import re

# def prepare_mfa_data(jsonl_path):
#     """
#     Membaca file JSONL dan membuat file .txt berdampingan dengan file audio 
#     untuk persiapan dataset Montreal Forced Aligner (MFA).
#     """
#     print(f"Membaca dataset dari: {jsonl_path}...")
    
#     sukses = 0
#     gagal = 0

#     with open(jsonl_path, 'r', encoding='utf-8') as file:
#         for line in file:
#             try:
#                 # 1. Parsing baris JSON
#                 data = json.loads(line)
#                 audio_path = data['audio_path']  # Contoh: "audio/U_00003c3ae1c35c6f.flac"
#                 teks_asli = data['orthographic_text']

#                 # 2. Membersihkan teks (MFA benci tanda baca)
#                 # Hanya menyisakan huruf (a-z) dan spasi
#                 teks_bersih = re.sub(r'[^a-zA-Z\s\']', '', teks_asli).lower().strip()

#                 # 3. Menentukan jalur file .txt output
#                 # Mengganti ekstensi .flac menjadi .txt
#                 txt_path = os.path.splitext(audio_path)[0] + ".txt"

#                 # 4. Membuat file .txt berdampingan dengan file audio
#                 with open(txt_path, 'w', encoding='utf-8') as txt_file:
#                     txt_file.write(teks_bersih)
                
#                 sukses += 1

#             except Exception as e:
#                 print(f"Gagal memproses baris: {line[:50]}... Error: {e}")
#                 gagal += 1

#     print("==================================================")
#     print("PROSES SELESAI!")
#     print(f"Berhasil membuat: {sukses} file .txt")
#     if gagal > 0:
#         print(f"Gagal memproses : {gagal} baris")
#     print("==================================================")

# if __name__ == "__main__":
#     # Masukkan nama file JSONL Anda di sini
#     FILE_JSONL = "../data/raw/train_word_transcripts.jsonl"
    
#     # Eksekusi fungsi
#     prepare_mfa_data(FILE_JSONL)

In [10]:
import json
import os
import re
import shutil

def prepare_and_audit_mfa_data(jsonl_path, base_audio_dir):
    """
    Prepare an audio corpus for Montreal Forced Aligner (MFA).

    Phase 1 reads ``jsonl_path`` line by line and, for every record whose audio
    file exists on disk, writes the cleaned transcript to a ``.txt`` file next
    to that audio file. Phase 2 scans the audio folder and moves every
    ``.flac`` file without a ``.txt`` partner into an ``audio_karantina``
    folder beside it.

    Parameters
    ----------
    jsonl_path : str
        JSON Lines file. Each record must provide ``audio_path`` (relative to
        the directory containing the JSONL file) and ``orthographic_text``.
    base_audio_dir : str
        Audio folder to audit, relative to the directory containing the JSONL file.

    Notes
    -----
    Blank lines are ignored. Records that cannot be processed are counted and
    the first few reasons are printed with the final report. Nothing is returned.
    """
    print(f"Membaca dataset dari: {jsonl_path}...")

    # Every relative path in this function is resolved against the JSONL's folder.
    dataset_root = os.path.dirname(jsonl_path)

    sukses = 0
    gagal = 0

    # Keep only a handful of reasons so a systematic failure is visible
    # without flooding the notebook output.
    max_failure_examples = 5
    failure_examples = []

    def record_failure(line_no, reason):
        """Remember why a record failed, up to the example limit."""
        if len(failure_examples) < max_failure_examples:
            failure_examples.append(f"baris {line_no}: {reason}")

    # ==========================================
    # PHASE 1: WRITE TRANSCRIPTS
    # ==========================================
    with open(jsonl_path, 'r', encoding='utf-8') as file:
        for line_no, line in enumerate(file, start=1):
            # A blank line is not a record; do not report it as a failure.
            if not line.strip():
                continue

            try:
                data = json.loads(line)
                # Path as stored in the JSONL, e.g. "audio/U_000.flac"
                audio_path_rel = data['audio_path']
                teks_asli = data['orthographic_text']

                # Keep only letters, whitespace and apostrophes, then lower-case.
                teks_bersih = re.sub(r'[^a-zA-Z\s\']', '', teks_asli).lower().strip()

                full_flac_path = os.path.join(dataset_root, audio_path_rel)
                txt_path = os.path.splitext(full_flac_path)[0] + ".txt"

                # Only create the transcript when its audio file is present.
                if os.path.exists(full_flac_path):
                    with open(txt_path, 'w', encoding='utf-8') as txt_file:
                        txt_file.write(teks_bersih)
                    sukses += 1
                else:
                    gagal += 1
                    record_failure(line_no, f"audio tidak ditemukan: {full_flac_path}")

            except (ValueError, KeyError, TypeError, OSError) as e:
                # Malformed JSON, missing/ill-typed fields or a write error:
                # count the record as failed but keep processing the rest.
                gagal += 1
                record_failure(line_no, f"{type(e).__name__}: {e}")

    # ==========================================
    # PHASE 2: AUDIT & QUARANTINE
    # ==========================================
    print("\nMemulai patroli audit folder audio...")
    folder_audio = os.path.join(dataset_root, base_audio_dir)
    folder_karantina = os.path.join(dataset_root, "audio_karantina")

    flac_yatim = 0

    if os.path.exists(folder_audio):
        for filename in os.listdir(folder_audio):
            if filename.endswith(".flac"):
                jalur_flac = os.path.join(folder_audio, filename)
                jalur_txt = os.path.splitext(jalur_flac)[0] + ".txt"

                # Audio without a transcript is moved out of the corpus folder.
                if not os.path.exists(jalur_txt):
                    # Created lazily so no empty folder appears on a clean run.
                    os.makedirs(folder_karantina, exist_ok=True)

                    shutil.move(jalur_flac, os.path.join(folder_karantina, filename))
                    flac_yatim += 1

    # ==========================================
    # FINAL REPORT
    # ==========================================
    print("==================================================")
    print("üéØ PROSES PERSIAPAN & AUDIT MFA SELESAI!")
    print(f"File .txt berhasil dibuat   : {sukses}")
    print(f"Baris JSON gagal diproses   : {gagal}")
    for example in failure_examples:
        print(f"  - {example}")
    print(f"Audio yatim dikarantina     : {flac_yatim} file (Dipindahkan ke folder 'audio_karantina')")
    print("==================================================")

if __name__ == "__main__":
    # Path to the transcript JSONL file; adjust to your setup
    FILE_JSONL = "../data/raw/train_word_transcripts.jsonl"
    
    # Folder that holds the merged audio (relative to the JSONL file's location)
    BASE_AUDIO_DIR = "audio" 
    
    prepare_and_audit_mfa_data(FILE_JSONL, BASE_AUDIO_DIR)

Membaca dataset dari: ../data/raw/train_word_transcripts.jsonl...

Memulai patroli audit folder audio...
üéØ PROSES PERSIAPAN & AUDIT MFA SELESAI!
File .txt berhasil dibuat   : 95572
Baris JSON gagal diproses   : 0
Audio yatim dikarantina     : 0 file (Dipindahkan ke folder 'audio_karantina')


In [None]:
# !ls ../data/raw/

audio	      noise_part_1		       submission_format_z2HCh3r.jsonl
noise_part_0  submission_format_aqPHQ8m.jsonl  train_word_transcripts.jsonl


In [None]:
# import json
# import os
# import re
# import shutil

# jsonl_path = '../data/raw/train_word_transcripts.jsonl'

# with open(jsonl_path, 'r', encoding='utf-8') as file:
#     for line in file:
#         try:
#             data = json.loads(line)
#             # Path asli dari JSONL, misal: "audio/U_000.flac"
#             audio_path_rel = data['audio_path'] 
#             teks_asli = data['orthographic_text']

#             # Bersihkan teks (mengizinkan huruf, spasi, dan apostrof)
#             teks_bersih = re.sub(r'[^a-zA-Z\s\']', '', teks_asli).lower().strip()

#             # Gabungkan dengan base directory tempat script berjalan
#             full_flac_path = os.path.join(os.path.dirname(jsonl_path), audio_path_rel)
#             txt_path = os.path.splitext(full_flac_path)[0] + ".txt"

#             print(full_flac_path)
#             print(txt_path)

#             break

#             # # Pastikan file flac-nya benar-benar ada sebelum membuat txt
#             # if os.path.exists(full_flac_path):
#             #     with open(txt_path, 'w', encoding='utf-8') as txt_file:
#             #         txt_file.write(teks_bersih)
#             #     sukses += 1
#             # else:
#             #     # Flac tidak ada di hard disk (mungkin korup saat download)
#             #     gagal += 1

#         except Exception as e:
#             gagal += 1

../data/raw/audio/U_00003c3ae1c35c6f.flac
../data/raw/audio/U_00003c3ae1c35c6f.txt


## **Splitting Dataset**

In [1]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split

def split_jsonl_dataset(
        input_file, 
        output_dir, 
        train_ratio=0.8, 
        val_ratio=0.1, 
        random_seed=42
    ):
    """
    Split one JSONL file into three JSONL files: train, validation and test.

    Parameters
    ----------
    input_file : str
        Path of the JSON Lines file to split.
    output_dir : str
        Directory that receives ``train_split.jsonl``, ``val_split.jsonl`` and
        ``test_split.jsonl`` (created if missing).
    train_ratio : float
        Fraction of all rows assigned to the training split.
    val_ratio : float
        Fraction of all rows assigned to the validation split. The test split
        receives the remainder, ``1 - train_ratio - val_ratio``.
    random_seed : int
        Seed used for both shuffles so reruns produce the same split.

    Raises
    ------
    ValueError
        If the ratios do not leave a positive share for each of the three splits.
    """
    # Validate before any output or I/O: train_ratio == 1 would divide by zero
    # below, and a non-positive val/test share only fails later inside sklearn
    # with a message that does not mention these parameters.
    if not (train_ratio > 0 and val_ratio > 0 and train_ratio + val_ratio < 1.0):
        raise ValueError(
            "train_ratio and val_ratio must be positive and sum to less than 1 "
            f"(got train_ratio={train_ratio}, val_ratio={val_ratio})"
        )

    print("==================================================")
    print("üî™ MEMULAI OPERASI PEMBEDAHAN DATASET (DATA SPLIT) üî™")
    print("==================================================")
    
    # 1. Make sure the output folder exists
    os.makedirs(output_dir, exist_ok=True)
    
    # 2. Load the whole dataset into memory
    print(f"[Logistik] Membaca file induk: {input_file}")
    df = pd.read_json(input_file, lines=True)
    total_data = len(df)
    print(f"[Logistik] Total amunisi ditemukan: {total_data} baris.")

    # 3. Stage 1: separate train from (val + test).
    # random_state is fixed so rerunning the cell does not reshuffle rows
    # between splits.
    train_df, temp_df = train_test_split(
        df, 
        train_size=train_ratio, 
        random_state=random_seed
    )

    # 4. Stage 2: divide the remainder into val and test.
    # The test share is expressed relative to the remainder; with the defaults
    # (0.8 / 0.1) the remainder is 0.2 and is cut roughly in half.
    test_ratio_relative = (1.0 - train_ratio - val_ratio) / (1.0 - train_ratio)
    
    val_df, test_df = train_test_split(
        temp_df, 
        test_size=test_ratio_relative, 
        random_state=random_seed
    )

    # 5. Export each split as JSON Lines
    train_path = os.path.join(output_dir, "train_split.jsonl")
    val_path = os.path.join(output_dir, "val_split.jsonl")
    test_path = os.path.join(output_dir, "test_split.jsonl")

    # orient='records', lines=True writes one JSON object per line
    train_df.to_json(train_path, orient='records', lines=True)
    val_df.to_json(val_path, orient='records', lines=True)
    test_df.to_json(test_path, orient='records', lines=True)

    # 6. Final report
    print("\n[Laporan] Operasi Pembedahan Selesai:")
    print(f"‚úÖ TRAIN SET : {len(train_df)} data ({len(train_df)/total_data*100:.1f}%) -> {train_path}")
    print(f"‚úÖ VAL SET   : {len(val_df)} data ({len(val_df)/total_data*100:.1f}%) -> {val_path}")
    print(f"‚úÖ TEST SET  : {len(test_df)} data ({len(test_df)/total_data*100:.1f}%) -> {test_path}")
    print("==================================================")

if __name__ == "__main__":
    # Adjust these paths to where the files live on your machine
    FILE_INPUT_ASLI = "../data/raw/train_word_transcripts.jsonl"
    FOLDER_OUTPUT = "../data/processed"
    
    split_jsonl_dataset(
        input_file=FILE_INPUT_ASLI,
        output_dir=FOLDER_OUTPUT
    )

üî™ MEMULAI OPERASI PEMBEDAHAN DATASET (DATA SPLIT) üî™
[Logistik] Membaca file induk: ../data/raw/train_word_transcripts.jsonl
[Logistik] Total amunisi ditemukan: 95572 baris.

[Laporan] Operasi Pembedahan Selesai:
‚úÖ TRAIN SET : 76457 data (80.0%) -> ../data/processed/train_split.jsonl
‚úÖ VAL SET   : 9557 data (10.0%) -> ../data/processed/val_split.jsonl
‚úÖ TEST SET  : 9558 data (10.0%) -> ../data/processed/test_split.jsonl


In [None]:
import textgrid
import os
import random
from pathlib import Path

# Folder holding the TextGrid files from the earlier MFA alignment run
MFA_OUTPUT_DIR = "../data/processed/mfa_aligned_train"

# 1. Pick one TextGrid file at random
all_files = list(filter(lambda entry: entry.endswith('.TextGrid'), os.listdir(MFA_OUTPUT_DIR)))
random_file = random.choice(all_files)
full_path = os.path.join(MFA_OUTPUT_DIR, random_file)

print(f"üïµÔ∏è Membedah file: {random_file}")
print("="*50)

# 2. Parse it with the textgrid library
tg = textgrid.TextGrid.fromFile(full_path)

# 3. Per the original author's note, a TextGrid usually carries two tiers:
#    tier 0 = 'words' (whole words, the one used here) and
#    tier 1 = 'phones' (individual sounds). The first tier is taken by
#    position, not looked up by name.
words_tier = tg[0]

print(f"Nama Tier: {words_tier.name}")
print("-" * 30)
print(f"{'KATA':<15} | {'MULAI (detik)':<15} | {'SELESAI (detik)':<15}")
print("-" * 50)

# 4. One row per interval: label, start time, end time
for interval in words_tier:
    start, end = interval.minTime, interval.maxTime
    # Only an empty mark is shown as "<DIAM>"; any other label, including a
    # literal "<sil>", is printed unchanged.
    kata = interval.mark or "<DIAM>"

    print(f"{kata:<15} | {start:.4f}          | {end:.4f}")

print("="*50)

üïµÔ∏è Membedah file: U_ad025d5c57312a27_clean.TextGrid
Nama Tier: words
------------------------------
KATA            | MULAI (detik)   | SELESAI (detik)
--------------------------------------------------
<DIAM>          | 0.0000          | 0.0800
right           | 0.0800          | 0.7000
<DIAM>          | 0.7000          | 0.9000
Lihat? MFA sudah memberi tahu kita KAPAN tepatnya setiap kata diucapkan!


In [2]:
import textgrid
import os
import random
from pathlib import Path

# Folder holding the TextGrid files from the earlier MFA alignment run
MFA_OUTPUT_DIR = "../data/processed/mfa_aligned_train"

# 1. Pick one TextGrid file at random
all_files = [entry for entry in os.listdir(MFA_OUTPUT_DIR) if entry.endswith('.TextGrid')]
random_file = random.choice(all_files)
full_path = os.path.join(MFA_OUTPUT_DIR, random_file)

print(f"üïµÔ∏è Membedah file: {random_file}")
print("="*50)

# 2. Parse it with the textgrid library
tg = textgrid.TextGrid.fromFile(full_path)

# 3. Per the original author's note, a TextGrid usually carries two tiers:
#    tier 0 = 'words' (whole words, the one used here) and
#    tier 1 = 'phones' (individual sounds). The first tier is taken by
#    position, not looked up by name.
words_tier = tg[0]

print(f"Nama Tier: {words_tier.name}")
print("-" * 30)
print(f"{'KATA':<15} | {'MULAI (detik)':<15} | {'SELESAI (detik)':<15}")
print("-" * 50)

# 4. One row per interval: label, start time, end time
for interval in words_tier:
    # Only an empty mark is shown as "<DIAM>"; any other label, including a
    # literal "<sil>", is printed unchanged.
    kata = interval.mark or "<DIAM>"
    start, end = interval.minTime, interval.maxTime

    print(f"{kata:<15} | {start:.4f}          | {end:.4f}")

print("="*50)

üïµÔ∏è Membedah file: U_c6c6534d417f290c_clean.TextGrid
Nama Tier: words
------------------------------
KATA            | MULAI (detik)   | SELESAI (detik)
--------------------------------------------------
<DIAM>          | 0.0000          | 0.0400
the             | 0.0400          | 0.1800
one             | 0.1800          | 0.5300
<DIAM>          | 0.5300          | 0.6000
on              | 0.6000          | 0.9600
<DIAM>          | 0.9600          | 1.0100
the             | 1.0100          | 1.2200
right           | 1.2200          | 1.6300
<DIAM>          | 1.6300          | 2.1200
has             | 2.1200          | 2.4800
<DIAM>          | 2.4800          | 2.5500
more            | 2.5500          | 3.0300
<DIAM>          | 3.0300          | 3.6000


In [None]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split

def split_jsonl_dataset(
        input_file, 
        output_dir, 
        train_ratio=0.8, 
        val_ratio=0.1, 
        random_seed=42
    ):
    """
    Split one JSONL file into three JSONL files: train, validation and test.

    Parameters
    ----------
    input_file : str
        Path of the JSON Lines file to split.
    output_dir : str
        Directory that receives ``split_train.jsonl``, ``split_val.jsonl`` and
        ``split_test.jsonl`` (created if missing).
    train_ratio : float
        Fraction of all rows assigned to the training split.
    val_ratio : float
        Fraction of all rows assigned to the validation split. The test split
        receives the remainder, ``1 - train_ratio - val_ratio``.
    random_seed : int
        Seed used for both shuffles so reruns produce the same split.

    Raises
    ------
    ValueError
        If the ratios do not leave a positive share for each of the three splits.
    """
    # Validate before any output or I/O: train_ratio == 1 would divide by zero
    # below, and a non-positive val/test share only fails later inside sklearn
    # with a message that does not mention these parameters.
    if not (train_ratio > 0 and val_ratio > 0 and train_ratio + val_ratio < 1.0):
        raise ValueError(
            "train_ratio and val_ratio must be positive and sum to less than 1 "
            f"(got train_ratio={train_ratio}, val_ratio={val_ratio})"
        )

    print("==================================================")
    print("üî™ MEMULAI OPERASI PEMBEDAHAN DATASET (DATA SPLIT) üî™")
    print("==================================================")
    
    # 1. Make sure the output folder exists
    os.makedirs(output_dir, exist_ok=True)
    
    # 2. Load the whole dataset into memory
    print(f"[Logistik] Membaca file induk: {input_file}")
    df = pd.read_json(input_file, lines=True)
    total_data = len(df)
    print(f"[Logistik] Total amunisi ditemukan: {total_data} baris.")

    # 3. Stage 1: separate train from (val + test).
    # random_state is fixed so rerunning the cell does not reshuffle rows
    # between splits.
    train_df, temp_df = train_test_split(
        df, 
        train_size=train_ratio, 
        random_state=random_seed
    )

    # 4. Stage 2: divide the remainder into val and test.
    # The test share is expressed relative to the remainder; with the defaults
    # (0.8 / 0.1) the remainder is 0.2 and is cut roughly in half.
    test_ratio_relative = (1.0 - train_ratio - val_ratio) / (1.0 - train_ratio)
    
    val_df, test_df = train_test_split(
        temp_df, 
        test_size=test_ratio_relative, 
        random_state=random_seed
    )

    # 5. Export each split as JSON Lines
    train_path = os.path.join(output_dir, "split_train.jsonl")
    val_path = os.path.join(output_dir, "split_val.jsonl")
    test_path = os.path.join(output_dir, "split_test.jsonl")

    # orient='records', lines=True writes one JSON object per line
    train_df.to_json(train_path, orient='records', lines=True)
    val_df.to_json(val_path, orient='records', lines=True)
    test_df.to_json(test_path, orient='records', lines=True)

    # 6. Final report
    print("\n[Laporan] Operasi Pembedahan Selesai:")
    print(f"‚úÖ TRAIN SET : {len(train_df)} data ({len(train_df)/total_data*100:.1f}%) -> {train_path}")
    print(f"‚úÖ VAL SET   : {len(val_df)} data ({len(val_df)/total_data*100:.1f}%) -> {val_path}")
    print(f"‚úÖ TEST SET  : {len(test_df)} data ({len(test_df)/total_data*100:.1f}%) -> {test_path}")
    print("==================================================")

if __name__ == "__main__":
    # Adjust these paths to where the files live on your machine
    FILE_INPUT_ASLI = "../data/raw/train_word_transcripts.jsonl"
    FOLDER_OUTPUT = "../data/processed"
    
    split_jsonl_dataset(
        input_file=FILE_INPUT_ASLI,
        output_dir=FOLDER_OUTPUT
    )

Tentu saja saya masih ingat! Itu adalah "harta karun" dataset End-to-End yang sudah Anda siapkan dan kompres dengan susah payah di laptop Ubuntu Anda sebelumnya.

Berikut adalah isi dari masing-masing kapsul tersebut:

* 📦 **`audio_train.zip`**: Berisi amunisi utama, yaitu puluhan ribu file audio mentah `.flac` (sekitar 72.000 file) hasil potongan VAD yang khusus digunakan untuk bahan **belajar/latihan** (Training) model WavLM.
* 📦 **`audio_val.zip`**: Berisi kumpulan file audio `.flac` (sekitar 9.000 file) yang fungsinya sebagai ujian/tes untuk **memvalidasi** (Validation) seberapa pintar model Anda setelah belajar di setiap epochnya.
* 📦 **`manifests.zip`**: Ini adalah "kunci jawaban"-nya. Di dalamnya terdapat file `train_manifest.jsonl` dan `val_manifest.jsonl` yang menyimpan teks transkrip (kata apa yang diucapkan di audio yang mana) beserta informasi durasinya.

Ketiga file inilah yang diekstrak secara otomatis ke dalam `/content/data/...` di mesin Colab Anda (seperti yang sukses tereksekusi di *screenshot* pertama Anda). Tujuannya agar GPU Colab bisa menelan data langsung dari SSD lokal yang super cepat, bukan dari Google Drive yang rawan *bottleneck*.

Btw, melihat *screenshot* kedua Anda, proses *training* Epoch 1 sepertinya sudah berjalan sangat stabil di sekitar 1% tanpa *error* OOM lagi!

Bagaimana statusnya sekarang? Apakah proses Colab-nya masih berjalan aman, dan apakah masalah *read-only* di laptop Ubuntu Anda (gambar ketiga) sudah berhasil teratasi dengan *restart* atau `remount`?

In [4]:
import os
import random
import IPython.display as ipd

# 1. Folders that hold the noise recordings (adjust when running on Colab or
#    from a different working directory)
noise_dirs = [
    "../data/raw/noise_part_0/audio", 
    "../data/raw/noise_part_1/audio"
]

# 2. Collect every audio file from those folders into one list
all_noise_files = []
for directory in noise_dirs:
    # isdir (not exists) so a stray file with that name is skipped instead of
    # making os.listdir raise.
    if os.path.isdir(directory):
        for filename in os.listdir(directory):
            # Compare the extension case-insensitively so "X.FLAC" is found too.
            if filename.lower().endswith((".wav", ".flac", ".mp3", ".ogg")):
                all_noise_files.append(os.path.join(directory, filename))

# 3. Pick one at random and play it
if not all_noise_files:
    print("‚ùå Tidak ada file audio yang ditemukan. Pastikan path folder sudah benar.")
else:
    random_noise_path = random.choice(all_noise_files)
    print(f"üéµ Memutar file noise acak: {random_noise_path}")
    
    # Call display through the module imported in this cell: the bare name
    # `display` is not imported here, so the original depended on an earlier
    # cell (or on IPython injecting it) having made it available.
    ipd.display(ipd.Audio(random_noise_path))

üéµ Memutar file noise acak: ../data/raw/noise_part_0/audio/06-17-02-172.flac


In [1]:
import os
import glob

def bersihkan_file_txt(directories):
    """
    Delete every ``*.txt`` file located directly inside the given directories.

    Sub-directories are not searched. Directories that do not exist are
    reported and skipped; a file that cannot be removed is reported and the
    run continues with the next one.

    Parameters
    ----------
    directories : iterable of str
        Paths of the folders to clean.
    """
    total_dihapus = 0
    
    for directory in directories:
        if not os.path.exists(directory):
            print(f"‚ö†Ô∏è Folder tidak ditemukan: {directory}")
            continue
            
        # Escape the directory part: without it, characters such as '[' or '*'
        # in the folder name are interpreted as glob wildcards and the search
        # looks in the wrong place (typically matching nothing).
        txt_files = glob.glob(os.path.join(glob.escape(directory), "*.txt"))
        
        count = 0
        for file_path in txt_files:
            try:
                os.remove(file_path)
                count += 1
            except OSError as e:
                # os.remove reports failures as OSError (permissions, file
                # vanished, path is a directory, ...); keep going.
                print(f"Gagal menghapus {file_path}: {e}")
                
        print(f"‚úÖ Berhasil menghapus {count} file .txt dari folder '{directory}'")
        total_dihapus += count
        
    print(f"\nüéâ Pembersihan selesai! Total {total_dihapus} file .txt telah dilenyapkan.")

# Folders to clean (adjust the paths if needed)
target_folders = [
    "../data/interim/corpus_test",
    "../data/interim/corpus_talkbank_train",
    "../data/interim/corpus_talkbank_val",
    "../data/interim/corpus_talkbank_test"
]

# Run the cleanup: permanently deletes every *.txt file directly inside them
bersihkan_file_txt(target_folders)

‚úÖ Berhasil menghapus 9047 file .txt dari folder '../data/interim/corpus_test'
‚úÖ Berhasil menghapus 190316 file .txt dari folder '../data/interim/corpus_talkbank_train'
‚úÖ Berhasil menghapus 23769 file .txt dari folder '../data/interim/corpus_talkbank_val'
‚úÖ Berhasil menghapus 23753 file .txt dari folder '../data/interim/corpus_talkbank_test'

üéâ Pembersihan selesai! Total 246885 file .txt telah dilenyapkan.


In [7]:
import os
import shutil

# Base path, the part folders to merge, and the folder that receives the files
base_dir = "../data/raw"
source_dirs = ["noise_part_0", "noise_part_1"]
target_dir = os.path.join(base_dir, "noise")

# 1. Create the 'noise' target folder if it does not exist yet
os.makedirs(target_dir, exist_ok=True)
print(f"üìÅ Folder target siap: {target_dir}")

total_dipindah = 0
total_dilewati = 0  # files left in place because the name already exists in target_dir

# 2. Move the files
for folder_name in source_dirs:
    # The recordings live in the 'audio' sub-folder of each part
    source_audio_path = os.path.join(base_dir, folder_name, "audio")
    
    if os.path.exists(source_audio_path):
        files = os.listdir(source_audio_path)
        
        for file in files:
            source_file = os.path.join(source_audio_path, file)
            target_file = os.path.join(target_dir, file)
            
            if os.path.isfile(source_file):
                # Never overwrite: on POSIX shutil.move would replace an
                # existing file without warning if two parts reuse a name.
                if os.path.exists(target_file):
                    total_dilewati += 1
                    continue

                shutil.move(source_file, target_file)
                total_dipindah += 1
                
        # 3. Remove the old folders, innermost first. os.rmdir only succeeds on
        #    empty directories, so skipped files keep their folder in place.
        try:
            os.rmdir(source_audio_path)
            os.rmdir(os.path.join(base_dir, folder_name))
            print(f"üóëÔ∏è Folder lama dihapus: {os.path.join(base_dir, folder_name)}")
        except OSError as e:
            print(f"‚ö†Ô∏è Folder {folder_name} tidak bisa dihapus otomatis (mungkin tidak kosong): {e}")
    else:
        print(f"‚ö†Ô∏è Folder tidak ditemukan: {source_audio_path}")

print(f"\nüéâ Penggabungan selesai! Total {total_dipindah} file telah dipindahkan ke '{target_dir}'.")
if total_dilewati:
    print(f"File dilewati karena nama sudah ada di target: {total_dilewati}")

üìÅ Folder target siap: ../data/raw/noise
üóëÔ∏è Folder lama dihapus: ../data/raw/noise_part_0
üóëÔ∏è Folder lama dihapus: ../data/raw/noise_part_1

üéâ Penggabungan selesai! Total 1940 file telah dipindahkan ke '../data/raw/noise'.


In [8]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split

def split_jsonl_dataset(
        input_file, 
        output_dir, 
        file_prefix="ext_", # keeps the output names from clashing with the main dataset's splits
        train_ratio=0.8, 
        val_ratio=0.1, 
        random_seed=42
    ):
    """
    Split one JSONL file into three JSONL files: train, validation and test.

    Parameters
    ----------
    input_file : str
        Path of the JSON Lines file to split.
    output_dir : str
        Directory that receives the three output files (created if missing).
    file_prefix : str
        Text prepended to the output names ``train_split.jsonl``,
        ``val_split.jsonl`` and ``test_split.jsonl``.
    train_ratio : float
        Fraction of all rows assigned to the training split.
    val_ratio : float
        Fraction of all rows assigned to the validation split. The test split
        receives the remainder, ``1 - train_ratio - val_ratio``.
    random_seed : int
        Seed used for both shuffles so reruns produce the same split.

    Raises
    ------
    ValueError
        If the ratios do not leave a positive share for each of the three splits.
    """
    # Validate before any output or I/O: train_ratio == 1 would divide by zero
    # below, and a non-positive val/test share only fails later inside sklearn
    # with a message that does not mention these parameters.
    if not (train_ratio > 0 and val_ratio > 0 and train_ratio + val_ratio < 1.0):
        raise ValueError(
            "train_ratio and val_ratio must be positive and sum to less than 1 "
            f"(got train_ratio={train_ratio}, val_ratio={val_ratio})"
        )

    print("==================================================")
    print("üî™ MEMULAI OPERASI PEMBEDAHAN DATA EKSTERNAL üî™")
    print("==================================================")
    
    # 1. Make sure the output folder exists
    os.makedirs(output_dir, exist_ok=True)
    
    # 2. Load the whole dataset into memory
    print(f"[Logistik] Membaca file induk: {input_file}")
    df = pd.read_json(input_file, lines=True)
    total_data = len(df)
    print(f"[Logistik] Total amunisi ditemukan: {total_data} baris.")

    # 3. Stage 1: separate train from (val + test)
    train_df, temp_df = train_test_split(
        df, 
        train_size=train_ratio, 
        random_state=random_seed
    )

    # 4. Stage 2: divide the remainder into val and test; the test share is
    #    expressed relative to the remainder.
    test_ratio_relative = (1.0 - train_ratio - val_ratio) / (1.0 - train_ratio)
    
    val_df, test_df = train_test_split(
        temp_df, 
        test_size=test_ratio_relative, 
        random_state=random_seed
    )

    # 5. Export each split as JSON Lines, names carrying the prefix
    train_path = os.path.join(output_dir, f"{file_prefix}train_split.jsonl")
    val_path = os.path.join(output_dir, f"{file_prefix}val_split.jsonl")
    test_path = os.path.join(output_dir, f"{file_prefix}test_split.jsonl")

    # orient='records', lines=True writes one JSON object per line
    train_df.to_json(train_path, orient='records', lines=True)
    val_df.to_json(val_path, orient='records', lines=True)
    test_df.to_json(test_path, orient='records', lines=True)

    # 6. Final report
    print("\n[Laporan] Operasi Pembedahan Selesai:")
    print(f"‚úÖ TRAIN SET : {len(train_df)} data ({len(train_df)/total_data*100:.1f}%) -> {train_path}")
    print(f"‚úÖ VAL SET   : {len(val_df)} data ({len(val_df)/total_data*100:.1f}%) -> {val_path}")
    print(f"‚úÖ TEST SET  : {len(test_df)} data ({len(test_df)/total_data*100:.1f}%) -> {test_path}")
    print("==================================================")

if __name__ == "__main__":
    # --- PATHS ADJUSTED FOR THE EXTERNAL (TALKBANK) DATA ---
    # Input transcript file of the external dataset
    FILE_INPUT_ASLI = "../data/external/talkbank_train_word_transcripts.jsonl"
    
    # Outputs go to a 'processed' sub-folder inside external to keep things tidy
    FOLDER_OUTPUT = "../data/external/processed"
    
    split_jsonl_dataset(
        input_file=FILE_INPUT_ASLI,
        output_dir=FOLDER_OUTPUT,
        file_prefix="talkbank_" # prefix that identifies which dataset these splits come from
    )

üî™ MEMULAI OPERASI PEMBEDAHAN DATA EKSTERNAL üî™
[Logistik] Membaca file induk: ../data/external/talkbank_train_word_transcripts.jsonl
[Logistik] Total amunisi ditemukan: 255046 baris.

[Laporan] Operasi Pembedahan Selesai:
‚úÖ TRAIN SET : 204036 data (80.0%) -> ../data/external/processed/talkbank_train_split.jsonl
‚úÖ VAL SET   : 25505 data (10.0%) -> ../data/external/processed/talkbank_val_split.jsonl
‚úÖ TEST SET  : 25505 data (10.0%) -> ../data/external/processed/talkbank_test_split.jsonl
