# Notebook for preprocessing Wikipedia (Indonesia) dataset

### Initilizing phonemizer and tokenizer

In [1]:
import yaml

config_path = "Configs/config.yml" # you can change it to anything else
config = yaml.safe_load(open(config_path))

In [2]:
import sys
sys.path.insert(0, '/workspace/src/PL-BERT-ID')
from phonemize import phonemize, EnIndPhonemizer

In [3]:
global_phonemizer = EnIndPhonemizer(ipa=True, keep_stress=True, sep="")

In [4]:
import os
os.environ['TRUST_REMOTE_CODE'] = 'True'

In [5]:
# from transformers import TransfoXLTokenizer
from transformers import BertTokenizer
# tokenizer = TransfoXLTokenizer.from_pretrained(config['dataset_params']['tokenizer']) # you can use any other tokenizers if you want to
tokenizer = BertTokenizer.from_pretrained("indobenchmark/indobert-base-p2")

tokenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

### Process dataset

Since using load_dataset with the Indonesian Wikipedia (id) resulted in errors (e.g., "Not Found"), we will download and load the dataset manually.

You can download the dataset from this link: https://huggingface.co/datasets/wikimedia/wikipedia/tree/main/20231101.id.

In [6]:
from datasets import load_dataset

# Use a glob pattern to load all Parquet files in the 'wikipedia' folder.
# This pattern will search for all files ending with '.parquet' within the folder.
parquet_folder = "/workspace/src/PL-BERT-ID/wikipedia/*.parquet"

try:
    dataset = load_dataset("parquet", data_files=parquet_folder)
    if isinstance(dataset, dict) or hasattr(dataset, "keys"):
        split_name = "train" if "train" in dataset else list(dataset.keys())[0]
        dataset = dataset[split_name]
    print("Dataset loaded successfully!")
    print(dataset)
except Exception as e:
    print(f"An error occurred while loading the dataset: {e}")

Dataset loaded successfully!
Dataset({
    features: ['id', 'url', 'title', 'text'],
    num_rows: 665622
})


In [7]:
root_directory = "./wiki_phoneme" # set up root directory for multiprocessor processing

In [8]:
# Update the process_shard function with better error handling

import os
import time

num_shards = 5000

def process_shard(i):
    directory = root_directory + "/shard_" + str(i)
    if os.path.exists(directory):
        print("Shard %d already exists!" % i)
        return f"Shard {i} already exists"
    
    start_time = time.time()
    print('Processing shard %d ...' % i)
    
    try:
        shard = dataset.shard(num_shards=num_shards, index=i)
        print(f'Shard {i}: {len(shard)} examples to process')
        
        # Process in smaller batches to avoid memory issues
        processed_dataset = shard.map(
            lambda t: phonemize(t['text'], global_phonemizer, tokenizer), 
            remove_columns=['text'],
            batch_size=100,  # Process in smaller batches
            desc=f"Phonemizing shard {i}"
        )
        
        if not os.path.exists(directory):
            os.makedirs(directory)
        processed_dataset.save_to_disk(directory)
        
        elapsed = time.time() - start_time
        print(f'Shard {i} completed in {elapsed:.2f} seconds')
        return f"Shard {i} completed successfully"
        
    except Exception as e:
        print(f'Shard {i} failed: {str(e)}')
        return Exception(f"Shard {i} failed: {str(e)}")

In [9]:
from pebble import ProcessPool
from concurrent.futures import TimeoutError

#### Note: You will need to run the following cell multiple times to process all shards because some will fail. Depending on how fast you process each shard, you will need to change the timeout to a longer value to make more shards processed before being killed.


In [10]:
# Update the processing cell with better timeout and worker settings

import os
from pebble import ProcessPool
from concurrent.futures import TimeoutError

# Reduce workers to avoid resource contention and increase timeout
max_workers = 24  # Reduced from 20
timeout_seconds = 1800  # Increased to 30 minutes
failed_shards = []

print(f"Processing {num_shards} shards with {max_workers} workers...")
print(f"Timeout set to {timeout_seconds} seconds ({timeout_seconds//60} minutes)")

with ProcessPool(max_workers=max_workers) as pool:
    future = pool.map(process_shard, range(num_shards), timeout=timeout_seconds)
    
    try:
        for i, result in enumerate(future.result()):
            if isinstance(result, Exception):
                print(f"Shard {i} failed: {result}")
                failed_shards.append(i)
            else:
                if i % 10 == 0:  # Progress indicator
                    print(f"Completed {i+1}/{num_shards} shards")
    except Exception as e:
        print(f"Processing error: {e}")

print(f"Processing completed. Failed shards: {len(failed_shards)}")
if failed_shards:
    print(f"Failed shard indices: {failed_shards}")

Processing 5000 shards with 24 workers...
Timeout set to 1800 seconds (30 minutes)
Shard 1 already exists!

Shard 2 already exists!
Shard 3 already exists!Shard 4 already exists!Shard 5 already exists!
Shard 6 already exists!Shard 7 already exists!
Shard 9 already exists!
Shard 8 already exists!Shard 12 already exists!Shard 10 already exists!
Shard 11 already exists!




Shard 13 already exists!
Shard 14 already exists!
Shard 16 already exists!Shard 15 already exists!Shard 24 already exists!Shard 0 already exists!Shard 26 already exists!Shard 18 already exists!Shard 31 already exists!Shard 29 already exists!
Shard 23 already exists!Shard 17 already exists!
Shard 20 already exists!



Shard 27 already exists!Shard 25 already exists!Shard 30 already exists!Shard 28 already exists!

Shard 37 already exists!Shard 35 already exists!Shard 19 already exists!Shard 32 already exists!Shard 34 already exists!Shard 36 already exists!


Shard 33 already exists!
Shard 21 already exists!Shard 39 alre

In [11]:
# Add this cell before processing to check existing shards

def check_existing_shards():
    """Check which shards already exist"""
    existing_shards = []
    pending_shards = []
    
    for i in range(num_shards):
        directory = root_directory + "/shard_" + str(i)
        if os.path.exists(directory):
            existing_shards.append(i)
        else:
            pending_shards.append(i)
    
    print(f"Existing shards: {len(existing_shards)}/{num_shards}")
    print(f"Pending shards: {len(pending_shards)}")
    
    return pending_shards

# Check existing shards first
pending_shards = check_existing_shards()

if not pending_shards:
    print("All shards already processed!")
else:
    print(f"Processing {len(pending_shards)} pending shards...")

Existing shards: 5000/5000
Pending shards: 0
All shards already processed!


In [12]:
# Process only pending shards

if pending_shards:
    max_workers = 6  # Conservative number of workers
    timeout_seconds = 2400  # 40 minutes timeout
    failed_shards = []
    
    print(f"Processing {len(pending_shards)} pending shards...")
    
    with ProcessPool(max_workers=max_workers) as pool:
        future = pool.map(process_shard, pending_shards, timeout=timeout_seconds)
        
        try:
            for i, result in enumerate(future.result()):
                if isinstance(result, Exception):
                    print(f"Shard {pending_shards[i]} failed: {result}")
                    failed_shards.append(pending_shards[i])
                else:
                    if i % 5 == 0:  # Progress indicator every 5 shards
                        print(f"Progress: {i+1}/{len(pending_shards)} shards completed")
        except Exception as e:
            print(f"Processing error: {e}")
    
    print(f"Processing completed. Failed shards: {len(failed_shards)}")
else:
    print("No pending shards to process!")

No pending shards to process!


### Collect all shards to form the processed dataset

In [13]:
from datasets import load_from_disk, concatenate_datasets

output = [dI for dI in os.listdir(root_directory) if os.path.isdir(os.path.join(root_directory,dI))]
datasets = []
for o in output:
    directory = root_directory + "/" + o
    try:
        shard = load_from_disk(directory)
        datasets.append(shard)
        print("%s loaded" % o)
    except:
        continue

shard_0 loaded
shard_1 loaded
shard_10 loaded
shard_100 loaded
shard_1000 loaded
shard_10000 loaded
shard_10001 loaded
shard_10002 loaded
shard_10003 loaded
shard_10004 loaded
shard_10005 loaded
shard_10006 loaded
shard_10007 loaded
shard_10008 loaded
shard_10009 loaded
shard_1001 loaded
shard_10010 loaded
shard_10011 loaded
shard_10012 loaded
shard_10013 loaded
shard_10014 loaded
shard_10015 loaded
shard_10016 loaded
shard_10017 loaded
shard_10018 loaded
shard_10019 loaded
shard_1002 loaded
shard_10020 loaded
shard_10021 loaded
shard_10022 loaded
shard_10023 loaded
shard_10024 loaded
shard_10025 loaded
shard_10026 loaded
shard_10027 loaded
shard_10028 loaded
shard_10029 loaded
shard_1003 loaded
shard_10030 loaded
shard_10031 loaded
shard_10032 loaded
shard_10033 loaded
shard_10034 loaded
shard_10035 loaded
shard_10036 loaded
shard_10037 loaded
shard_10038 loaded
shard_10039 loaded
shard_1004 loaded
shard_10040 loaded
shard_10041 loaded
shard_10042 loaded
shard_10043 loaded
shard_10044

In [14]:
dataset = concatenate_datasets(datasets)
dataset.save_to_disk(config['data_folder'])
print('Dataset saved to %s' % config['data_folder'])

Saving the dataset (13/13 shards): 100%|██████████| 1304338/1304338 [00:58<00:00, 22238.20 examples/s]

Dataset saved to wikipedia_20231101.id.processed





In [15]:
# check the dataset size
dataset

Dataset({
    features: ['id', 'url', 'title', 'input_ids', 'phonemes'],
    num_rows: 1304338
})

### Remove unneccessary tokens from the pre-trained tokenizer
The pre-trained tokenizer contains a lot of tokens that are not used in our dataset, so we need to remove these tokens. We also want to predict the word in lower cases because cases do not matter that much for TTS. Pruning the tokenizer is much faster than training a new tokenizer from scratch. 

In [16]:
from simple_loader import FilePathDataset, build_dataloader

file_data = FilePathDataset(dataset)
loader = build_dataloader(file_data, num_workers=32, batch_size=128)

In [17]:
special_token = config['dataset_params']['word_separator']

In [18]:
# get all unique tokens in the entire dataset

from tqdm import tqdm

unique_index = [special_token]
for _, batch in enumerate(tqdm(loader)):
    unique_index.extend(batch)
    unique_index = list(set(unique_index))

  0%|          | 0/10190 [00:00<?, ?it/s]

100%|██████████| 10190/10190 [01:46<00:00, 95.32it/s] 


In [19]:
# get each token's lower case

lower_tokens = []
for t in tqdm(unique_index):
    word = tokenizer.decode([t])
    if word.lower() != word:
        t = tokenizer.encode([word.lower()])[0]
        lower_tokens.append(t)
    else:
        lower_tokens.append(t)

100%|██████████| 197348/197348 [00:02<00:00, 66427.27it/s]


In [20]:
lower_tokens = (list(set(lower_tokens)))

In [21]:
# redo the mapping for lower number of tokens

token_maps = {}
for t in tqdm(unique_index):
    word = tokenizer.decode([t])
    word = word.lower()
    new_t = tokenizer.encode([word.lower()])[0]
    token_maps[t] = {'word': word, 'token': lower_tokens.index(new_t)}

100%|██████████| 197348/197348 [00:21<00:00, 9332.29it/s] 


In [22]:
import pickle
with open(config['dataset_params']['token_maps'], 'wb') as handle:
    pickle.dump(token_maps, handle)
print('Token mapper saved to %s' % config['dataset_params']['token_maps'])

Token mapper saved to token_maps.pkl


### Test the dataset with dataloader


In [23]:
from dataloader import build_dataloader

train_loader = build_dataloader(dataset, batch_size=32, num_workers=0, dataset_config=config['dataset_params'])

177


In [24]:
_, (words, labels, phonemes, input_lengths, masked_indices) = next(enumerate(train_loader))

In [25]:
# Inspect shard content - lihat isi phoneme dari shard tertentu

from datasets import load_from_disk

def inspect_shard(shard_number, num_samples=5):
    """
    Melihat isi phoneme dari shard tertentu
    
    Args:
        shard_number: nomor shard yang ingin dilihat (0-99)
        num_samples: jumlah contoh yang ingin ditampilkan
    """
    shard_directory = f"{root_directory}/shard_{shard_number}"
    
    if not os.path.exists(shard_directory):
        print(f"Shard {shard_number} tidak ada di {shard_directory}")
        return
    
    try:
        # Load shard
        shard_data = load_from_disk(shard_directory)
        print(f"Shard {shard_number} info:")
        print(f"- Total samples: {len(shard_data)}")
        print(f"- Columns: {shard_data.column_names}")
        print("="*60)
        
        # Tampilkan beberapa contoh
        for i in range(min(num_samples, len(shard_data))):
            sample = shard_data[i]
            print(f"\nSample {i+1}:")
            print(f"Input IDs: {sample['input_ids'][:10]}{'...' if len(sample['input_ids']) > 10 else ''}")
            print(f"Phonemes: {sample['phonemes'][:10]}{'...' if len(sample['phonemes']) > 10 else ''}")
            
            # Decode beberapa token untuk melihat isi sebenarnya
            if len(sample['input_ids']) > 0:
                decoded_words = [tokenizer.decode([token_id]) for token_id in sample['input_ids'][:5]]
                print(f"Decoded words (first 5): {decoded_words}")
            
            print(f"Phonemes (first 5): {sample['phonemes'][:5]}")
            print("-"*40)
            
    except Exception as e:
        print(f"Error loading shard {shard_number}: {e}")

# Contoh penggunaan: lihat isi shard 0
inspect_shard(0, num_samples=3)

Shard 0 info:
- Total samples: 6657
- Columns: ['id', 'url', 'title', 'input_ids', 'phonemes']

Sample 1:
Input IDs: [24, 24, 2, 24, 24, 82734, 24, 3159, 22, 24]...
Phonemes: ['ˈasam', 'dˌɛoksˌiribˌonuklˈɛat', ',', 'lˈɛbih', 'dˈikənal', 'dˈɛŋan', 'siŋkˈatan', 'dnˈa', '(', 'bahˈasa']...
Decoded words (first 5): ['<unk>', '<unk>', ',', '<unk>', '<unk>']
Phonemes (first 5): ['ˈasam', 'dˌɛoksˌiribˌonuklˈɛat', ',', 'lˈɛbih', 'dˈikənal']
----------------------------------------

Sample 2:
Input IDs: [5305, 35337, 8445, 1260, 40509, 39, 21, 24, 24, 24]...
Phonemes: ['muhˈammad', 'ˈanwar', 'ˈɛl', '-', 'sˈadat', ';', ')', 'adˈalah', 'səˈɔraŋ', 'pˌolitˈikus']...
Decoded words (first 5): ['Muhammad', 'Anwar', 'el', '-', 'Sadat']
Phonemes (first 5): ['muhˈammad', 'ˈanwar', 'ˈɛl', '-', 'sˈadat']
----------------------------------------

Sample 3:
Input IDs: [56142, 27291, 24, 24, 90205, 5021, 24, 47071, 24, 24]...
Phonemes: ['dˈatuk', 'ˈazhar', 'mˈansɔr', 'adˈalah', 'ˈɔraŋ', 'mˌalajsˈia', 'pərtˈama

In [29]:
import pandas as pd
from tqdm import tqdm

def create_excel_from_dataset(dataset_sample, filename="wikipedia_processed.xlsx"):
    """
    Membuat file Excel dari sampel dataset yang telah diproses.
    
    Args:
        dataset_sample: Sampel dari dataset yang akan diproses.
        filename: Nama file Excel yang akan disimpan.
    """
    processed_data = []
    
    print(f"Processing {len(dataset_sample)} samples to create Excel file...")
    
    for sample in tqdm(dataset_sample):
        input_ids = sample['input_ids']
        phonemes = sample['phonemes']
        
        # Decode input_ids kembali menjadi teks
        # Setiap ID di-decode secara terpisah dan digabungkan dengan spasi
        decoded_text = ' '.join([tokenizer.decode([token_id]) for token_id in input_ids])
        
        # Gabungkan daftar phonemes menjadi satu string
        phonemes_str = ' '.join(phonemes)
        
        processed_data.append({
            'text': decoded_text,
            'phonemes': phonemes_str,
            'input_ids': str(input_ids)  # Simpan sebagai string agar mudah dibaca di Excel
        })
        
    # Buat DataFrame dari data yang diproses
    df = pd.DataFrame(processed_data)
    
    # Simpan DataFrame ke file Excel
    df.to_excel(filename, index=False)
    print(f"\nDataset berhasil disimpan ke {filename}")

# Ambil sebagian kecil dari dataset untuk dibuatkan Excel (misalnya 1000 baris pertama)
num_samples_for_excel = 1000

# TAMBAHKAN KODE INI UNTUK MENJALANKAN FUNGSI:
if len(dataset) > 0:
    # Ambil sampel dari dataset
    dataset_sample = dataset.select(range(min(num_samples_for_excel, len(dataset))))
    
    # Jalankan fungsi untuk membuat Excel
    create_excel_from_dataset(dataset_sample, "wikipedia_processed.xlsx")
else:
    print("Dataset kosong atau belum dimuat!")

Processing 1000 samples to create Excel file...


100%|██████████| 1000/1000 [00:13<00:00, 75.43it/s]



Dataset berhasil disimpan ke wikipedia_processed.xlsx
