# Notebook for preprocessing Wikipedia (Indonesia) dataset

### Initilizing phonemizer and tokenizer

In [1]:
import yaml

config_path = "Configs/config.yml" # you can change it to anything else
config = yaml.safe_load(open(config_path))

In [2]:
import sys
sys.path.insert(0, '/workspace/src/PL-BERT-ID')
from phonemize import phonemize, EnIndPhonemizer

In [3]:
global_phonemizer = EnIndPhonemizer(ipa=True, keep_stress=True, sep="")

In [4]:
import os
os.environ['TRUST_REMOTE_CODE'] = 'True'

In [5]:
from transformers import TransfoXLTokenizer
tokenizer = TransfoXLTokenizer.from_pretrained(config['dataset_params']['tokenizer']) # you can use any other tokenizers if you want to

  from .autonotebook import tqdm as notebook_tqdm
`TransfoXL` was deprecated due to security issues linked to `pickle.load` in `TransfoXLTokenizer`. See more details on this model's documentation page: `https://github.com/huggingface/transformers/blob/main/docs/source/en/model_doc/transfo-xl.md`.


### Process dataset

Since using load_dataset with the Indonesian Wikipedia (id) resulted in errors (e.g., "Not Found"), we will download and load the dataset manually.

You can download the dataset from this link: https://huggingface.co/datasets/wikimedia/wikipedia/tree/main/20231101.id.

In [6]:
from datasets import load_dataset

# Use a glob pattern to load all Parquet files in the 'wikipedia' folder.
# This pattern will search for all files ending with '.parquet' within the folder.
parquet_folder = "/workspace/src/PL-BERT-ID/wikipedia/*.parquet"

try:
    dataset = load_dataset("parquet", data_files=parquet_folder)
    if isinstance(dataset, dict) or hasattr(dataset, "keys"):
        split_name = "train" if "train" in dataset else list(dataset.keys())[0]
        dataset = dataset[split_name]
    print("Dataset loaded successfully!")
    print(dataset)
except Exception as e:
    print(f"An error occurred while loading the dataset: {e}")

Dataset loaded successfully!
Dataset({
    features: ['id', 'url', 'title', 'text'],
    num_rows: 665622
})


In [7]:
root_directory = "./wiki_phoneme" # set up root directory for multiprocessor processing

In [8]:
# Update the process_shard function with better error handling

import os
import time

num_shards = 100

def process_shard(i):
    directory = root_directory + "/shard_" + str(i)
    if os.path.exists(directory):
        print("Shard %d already exists!" % i)
        return f"Shard {i} already exists"
    
    start_time = time.time()
    print('Processing shard %d ...' % i)
    
    try:
        shard = dataset.shard(num_shards=num_shards, index=i)
        print(f'Shard {i}: {len(shard)} examples to process')
        
        # Process in smaller batches to avoid memory issues
        processed_dataset = shard.map(
            lambda t: phonemize(t['text'], global_phonemizer, tokenizer), 
            remove_columns=['text'],
            batch_size=100,  # Process in smaller batches
            desc=f"Phonemizing shard {i}"
        )
        
        if not os.path.exists(directory):
            os.makedirs(directory)
        processed_dataset.save_to_disk(directory)
        
        elapsed = time.time() - start_time
        print(f'Shard {i} completed in {elapsed:.2f} seconds')
        return f"Shard {i} completed successfully"
        
    except Exception as e:
        print(f'Shard {i} failed: {str(e)}')
        return Exception(f"Shard {i} failed: {str(e)}")

In [9]:
from pebble import ProcessPool
from concurrent.futures import TimeoutError

#### Note: You will need to run the following cell multiple times to process all shards because some will fail. Depending on how fast you process each shard, you will need to change the timeout to a longer value to make more shards processed before being killed.


In [10]:
# Update the processing cell with better timeout and worker settings

import os
from pebble import ProcessPool
from concurrent.futures import TimeoutError

# Reduce workers to avoid resource contention and increase timeout
max_workers = 8  # Reduced from 20
timeout_seconds = 1800  # Increased to 30 minutes
failed_shards = []

print(f"Processing {num_shards} shards with {max_workers} workers...")
print(f"Timeout set to {timeout_seconds} seconds ({timeout_seconds//60} minutes)")

with ProcessPool(max_workers=max_workers) as pool:
    future = pool.map(process_shard, range(num_shards), timeout=timeout_seconds)
    
    try:
        for i, result in enumerate(future.result()):
            if isinstance(result, Exception):
                print(f"Shard {i} failed: {result}")
                failed_shards.append(i)
            else:
                if i % 10 == 0:  # Progress indicator
                    print(f"Completed {i+1}/{num_shards} shards")
    except Exception as e:
        print(f"Processing error: {e}")

print(f"Processing completed. Failed shards: {len(failed_shards)}")
if failed_shards:
    print(f"Failed shard indices: {failed_shards}")

Processing 100 shards with 8 workers...
Timeout set to 1800 seconds (30 minutes)
Processing shard 0 ...Processing shard 1 ...Processing shard 2 ...Processing shard 3 ...



Processing shard 4 ...Processing shard 5 ...

Processing shard 7 ...Processing shard 6 ...

Shard 0: 6657 examples to processShard 1: 6657 examples to process
Shard 2: 6657 examples to processShard 3: 6657 examples to process
Shard 4: 6657 examples to process

Shard 5: 6657 examples to process

Shard 6: 6657 examples to processShard 7: 6657 examples to process



Phonemizing shard 5: 100%|██████████| 6657/6657 [10:13<00:00, 10.86 examples/s]  
Saving the dataset (1/1 shards): 100%|██████████| 6657/6657 [00:00<00:00, 48519.10 examples/s]
Phonemizing shard 7:  48%|████▊     | 3178/6657 [10:13<01:57, 29.52 examples/s]

Shard 5 completed in 613.96 seconds
Processing shard 8 ...
Shard 8: 6657 examples to process


Phonemizing shard 7: 100%|██████████| 6657/6657 [13:39<00:00,  8.12 examples/s]s]
Saving the dataset (1/1 shards): 100%|██████████| 6657/6657 [00:00<00:00, 35236.56 examples/s]
Phonemizing shard 0:  21%|██        | 1404/6657 [13:40<57:10,  1.53 examples/s]  

Shard 7 completed in 820.88 seconds
Processing shard 9 ...
Shard 9: 6657 examples to process


Phonemizing shard 4: 100%|██████████| 6657/6657 [14:46<00:00,  7.51 examples/s]  
Saving the dataset (1/1 shards): 100%|██████████| 6657/6657 [00:00<00:00, 32117.76 examples/s]
Phonemizing shard 8:  45%|████▍     | 2992/6657 [04:32<12:08,  5.03 examples/s]

Shard 4 completed in 887.73 seconds


Phonemizing shard 2:  62%|██████▏   | 4116/6657 [14:46<03:26, 12.32 examples/s]

Processing shard 10 ...
Shard 10: 6657 examples to process


Phonemizing shard 6: 100%|██████████| 6657/6657 [17:00<00:00,  6.52 examples/s]  
Saving the dataset (1/1 shards): 100%|██████████| 6657/6657 [00:00<00:00, 25324.05 examples/s]
Phonemizing shard 8:  61%|██████    | 4064/6657 [06:46<04:42,  9.19 examples/s]

Shard 6 completed in 1021.60 seconds
Processing shard 11 ...
Shard 11: 6657 examples to process


Phonemizing shard 2: 100%|██████████| 6657/6657 [20:11<00:00,  5.50 examples/s]] 
Saving the dataset (1/1 shards): 100%|██████████| 6657/6657 [00:00<00:00, 19311.75 examples/s]


Shard 2 completed in 1212.50 seconds
Processing shard 12 ...
Shard 12: 6657 examples to process


Phonemizing shard 3: 100%|██████████| 6657/6657 [21:25<00:00,  5.18 examples/s]] 
Saving the dataset (1/1 shards): 100%|██████████| 6657/6657 [00:00<00:00, 20383.26 examples/s]


Shard 3 completed in 1287.12 seconds
Processing shard 13 ...
Shard 13: 6657 examples to process


Phonemizing shard 8: 100%|██████████| 6657/6657 [12:59<00:00,  8.54 examples/s]] 
Saving the dataset (1/1 shards): 100%|██████████| 6657/6657 [00:00<00:00, 26381.51 examples/s]


Shard 8 completed in 780.18 seconds
Processing shard 14 ...
Shard 14: 6657 examples to process


Phonemizing shard 10: 100%|██████████| 6657/6657 [10:27<00:00, 10.62 examples/s]  
Saving the dataset (1/1 shards): 100%|██████████| 6657/6657 [00:00<00:00, 36278.72 examples/s]


Shard 10 completed in 627.94 seconds
Processing shard 15 ...
Shard 15: 6657 examples to process


Phonemizing shard 1: 100%|██████████| 6657/6657 [28:23<00:00,  3.91 examples/s]]  
Saving the dataset (1/1 shards): 100%|██████████| 6657/6657 [00:00<00:00, 13907.30 examples/s]


Shard 1 completed in 1704.51 seconds
Processing shard 16 ...
Shard 16: 6657 examples to process


Phonemizing shard 16:  14%|█▍        | 921/6657 [01:34<08:31, 11.22 examples/s]] 

Processing error: [Errno Task timeout] 1800


Phonemizing shard 11:  86%|████████▋ | 5750/6657 [12:57<06:00,  2.52 examples/s]

Processing shard 17 ...
Shard 17: 6657 examples to process


Phonemizing shard 11: 100%|██████████| 6657/6657 [14:29<00:00,  7.66 examples/s] 
Saving the dataset (1/1 shards): 100%|██████████| 6657/6657 [00:00<00:00, 33141.34 examples/s]


Shard 11 completed in 870.14 seconds
Processing shard 18 ...
Shard 18: 6657 examples to process


Phonemizing shard 9: 100%|██████████| 6657/6657 [18:18<00:00,  6.06 examples/s]] 
Saving the dataset (1/1 shards): 100%|██████████| 6657/6657 [00:00<00:00, 21258.07 examples/s]


Shard 9 completed in 1099.76 seconds
Processing shard 19 ...

Phonemizing shard 14:  70%|███████   | 4673/6657 [08:45<02:31, 13.07 examples/s]


Shard 19: 6657 examples to process


Phonemizing shard 13: 100%|██████████| 6657/6657 [11:08<00:00,  9.96 examples/s] 
Saving the dataset (1/1 shards): 100%|██████████| 6657/6657 [00:00<00:00, 37694.29 examples/s]


Shard 13 completed in 669.49 seconds

Phonemizing shard 18:  12%|█▏        | 766/6657 [01:04<06:51, 14.33 examples/s]


Shard 20 already exists!
Processing shard 21 ...
Shard 21: 6657 examples to process


Phonemizing shard 12: 100%|██████████| 6657/6657 [13:23<00:00,  8.28 examples/s] 
Saving the dataset (1/1 shards): 100%|██████████| 6657/6657 [00:00<00:00, 29191.49 examples/s]
Phonemizing shard 16:  45%|████▍     | 2995/6657 [05:11<11:01,  5.53 examples/s]

Shard 12 completed in 804.76 seconds
Processing shard 22 ...
Shard 22: 6656 examples to process


Phonemizing shard 15: 100%|██████████| 6657/6657 [09:07<00:00, 12.15 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 6657/6657 [00:00<00:00, 37010.77 examples/s]


Shard 15 completed in 548.76 seconds
Processing shard 23 ...
Shard 23: 6656 examples to process


Phonemizing shard 14: 100%|██████████| 6657/6657 [12:21<00:00,  8.97 examples/s] 
Saving the dataset (1/1 shards): 100%|██████████| 6657/6657 [00:00<00:00, 35403.34 examples/s]


Shard 14 completed in 742.91 seconds
Processing shard 24 ...
Shard 24: 6656 examples to process


Phonemizing shard 21: 100%|██████████| 6657/6657 [04:08<00:00, 26.74 examples/s]  
Saving the dataset (1/1 shards): 100%|██████████| 6657/6657 [00:00<00:00, 76526.98 examples/s]


Shard 21 completed in 249.79 seconds
Processing shard 25 ...
Shard 25: 6656 examples to process


Phonemizing shard 22: 100%|██████████| 6656/6656 [06:07<00:00, 18.12 examples/s] 
Saving the dataset (1/1 shards): 100%|██████████| 6656/6656 [00:00<00:00, 50842.18 examples/s]


Shard 22 completed in 368.16 seconds
Processing shard 26 ...
Shard 26: 6656 examples to process


Phonemizing shard 16: 100%|██████████| 6657/6657 [12:11<00:00,  9.10 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 6657/6657 [00:00<00:00, 29003.67 examples/s]


Shard 16 completed in 732.34 seconds
Processing shard 27 ...
Shard 27: 6656 examples to process


Phonemizing shard 19: 100%|██████████| 6657/6657 [09:33<00:00, 11.60 examples/s]  
Saving the dataset (1/1 shards): 100%|██████████| 6657/6657 [00:00<00:00, 39353.69 examples/s]


Shard 19 completed in 574.84 seconds
Processing shard 28 ...
Shard 28: 6656 examples to process

Phonemizing shard 18:  99%|█████████▉| 6614/6657 [10:02<00:06,  6.48 examples/s]




Phonemizing shard 18: 100%|██████████| 6657/6657 [10:07<00:00, 10.95 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 6657/6657 [00:00<00:00, 38757.23 examples/s]


Shard 18 completed in 608.93 seconds
Processing shard 29 ...
Shard 29: 6656 examples to process


Phonemizing shard 23: 100%|██████████| 6656/6656 [08:35<00:00, 12.91 examples/s] 
Saving the dataset (1/1 shards): 100%|██████████| 6656/6656 [00:00<00:00, 38001.16 examples/s]


Shard 23 completed in 516.42 seconds
Processing shard 30 ...
Shard 30: 6656 examples to process


Phonemizing shard 17: 100%|██████████| 6657/6657 [13:11<00:00,  8.41 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 6657/6657 [00:00<00:00, 36268.59 examples/s]


Shard 17 completed in 792.51 seconds
Processing shard 31 ...
Shard 31: 6656 examples to process


Phonemizing shard 24: 100%|██████████| 6656/6656 [09:40<00:00, 11.46 examples/s]  
Saving the dataset (1/1 shards): 100%|██████████| 6656/6656 [00:00<00:00, 39070.13 examples/s]


Shard 24 completed in 581.89 seconds
Shard 32 already exists!
Shard 33 already exists!
Shard 34 already exists!
Shard 35 already exists!
Shard 36 already exists!
Shard 37 already exists!
Shard 38 already exists!
Shard 39 already exists!
Shard 40 already exists!
Shard 41 already exists!
Shard 42 already exists!
Shard 43 already exists!
Shard 44 already exists!
Shard 45 already exists!
Shard 46 already exists!
Processing shard 47 ...


Phonemizing shard 31:  19%|█▉        | 1258/6656 [02:05<06:30, 13.83 examples/s]

Shard 47: 6656 examples to process


Phonemizing shard 25: 100%|██████████| 6656/6656 [11:28<00:00,  9.67 examples/s] 
Saving the dataset (1/1 shards): 100%|██████████| 6656/6656 [00:00<00:00, 31197.21 examples/s]


Shard 25 completed in 689.22 seconds
Processing shard 48 ...
Shard 48: 6656 examples to process


Phonemizing shard 27: 100%|██████████| 6656/6656 [08:51<00:00, 12.53 examples/s] 
Saving the dataset (1/1 shards): 100%|██████████| 6656/6656 [00:00<00:00, 37159.58 examples/s]


Shard 27 completed in 532.04 seconds
Processing shard 49 ...
Shard 49: 6656 examples to process


Phonemizing shard 47: 100%|██████████| 6656/6656 [04:56<00:00, 22.45 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 6656/6656 [00:00<00:00, 59701.95 examples/s]


Shard 47 completed in 297.34 seconds


Phonemizing shard 30:  55%|█████▍    | 3657/6656 [07:14<05:04,  9.84 examples/s]

Processing shard 50 ...
Shard 50: 6656 examples to process


Phonemizing shard 26: 100%|██████████| 6656/6656 [10:33<00:00, 10.51 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 6656/6656 [00:00<00:00, 32962.26 examples/s]


Shard 26 completed in 634.26 seconds


Phonemizing shard 30:  55%|█████▌    | 3685/6656 [07:18<05:14,  9.45 examples/s]

Processing shard 51 ...
Shard 51: 6656 examples to process


Phonemizing shard 29: 100%|██████████| 6656/6656 [09:58<00:00, 11.12 examples/s] 
Saving the dataset (1/1 shards): 100%|██████████| 6656/6656 [00:00<00:00, 32154.07 examples/s]


Shard 29 completed in 599.68 seconds
Processing shard 52 ...
Shard 52: 6656 examples to process


Phonemizing shard 28: 100%|██████████| 6656/6656 [10:52<00:00, 10.20 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 6656/6656 [00:00<00:00, 33858.06 examples/s]


Shard 28 completed in 653.55 seconds
Processing shard 53 ...
Shard 53: 6656 examples to process


Phonemizing shard 31: 100%|██████████| 6656/6656 [11:01<00:00, 10.06 examples/s] 
Saving the dataset (1/1 shards): 100%|██████████| 6656/6656 [00:00<00:00, 29184.63 examples/s]


Shard 31 completed in 662.34 seconds
Processing shard 54 ...
Shard 54: 6656 examples to process


Phonemizing shard 30: 100%|██████████| 6656/6656 [12:28<00:00,  8.90 examples/s] 
Saving the dataset (1/1 shards): 100%|██████████| 6656/6656 [00:00<00:00, 26473.23 examples/s]


Shard 30 completed in 749.18 seconds
Processing shard 55 ...
Shard 55: 6656 examples to process


Phonemizing shard 50: 100%|██████████| 6656/6656 [06:48<00:00, 16.31 examples/s] 
Saving the dataset (1/1 shards): 100%|██████████| 6656/6656 [00:00<00:00, 54811.03 examples/s]


Shard 50 completed in 409.02 seconds
Processing shard 56 ...
Shard 56: 6656 examples to process


Phonemizing shard 48: 100%|██████████| 6656/6656 [09:44<00:00, 11.38 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 6656/6656 [00:00<00:00, 35385.64 examples/s]


Shard 48 completed in 585.62 seconds
Processing shard 57 ...
Shard 57: 6656 examples to process


Phonemizing shard 49: 100%|██████████| 6656/6656 [09:55<00:00, 11.18 examples/s] 
Saving the dataset (1/1 shards): 100%|██████████| 6656/6656 [00:00<00:00, 35358.88 examples/s]


Shard 49 completed in 596.53 seconds
Processing shard 58 ...
Shard 58: 6656 examples to process


Phonemizing shard 51: 100%|██████████| 6656/6656 [10:44<00:00, 10.32 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 6656/6656 [00:00<00:00, 34275.37 examples/s]


Shard 51 completed in 645.90 seconds
Processing shard 59 ...
Shard 59: 6656 examples to process


Phonemizing shard 52: 100%|██████████| 6656/6656 [13:25<00:00,  8.26 examples/s]  
Saving the dataset (1/1 shards): 100%|██████████| 6656/6656 [00:00<00:00, 28793.03 examples/s]


Shard 52 completed in 806.76 seconds


Phonemizing shard 54:  75%|███████▌  | 5007/6656 [10:51<02:38, 10.41 examples/s]

Processing shard 60 ...
Shard 60: 6656 examples to process


Phonemizing shard 59: 100%|██████████| 6656/6656 [05:15<00:00, 21.11 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 6656/6656 [00:00<00:00, 60933.36 examples/s]


Shard 59 completed in 316.13 seconds
Processing shard 61 ...
Shard 61: 6656 examples to process


Phonemizing shard 53: 100%|██████████| 6656/6656 [14:00<00:00,  7.92 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 6656/6656 [00:00<00:00, 29645.12 examples/s]


Shard 53 completed in 841.38 seconds
Processing shard 62 ...
Shard 62: 6656 examples to process


Phonemizing shard 55: 100%|██████████| 6656/6656 [12:17<00:00,  9.02 examples/s] 
Saving the dataset (1/1 shards): 100%|██████████| 6656/6656 [00:00<00:00, 29628.10 examples/s]


Shard 55 completed in 738.59 seconds
Processing shard 63 ...
Shard 63: 6656 examples to process


Phonemizing shard 54: 100%|██████████| 6656/6656 [14:30<00:00,  7.65 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 6656/6656 [00:00<00:00, 28936.25 examples/s]


Shard 54 completed in 871.23 seconds
Processing shard 64 ...
Shard 64: 6656 examples to process


Phonemizing shard 56: 100%|██████████| 6656/6656 [11:52<00:00,  9.34 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 6656/6656 [00:00<00:00, 34867.41 examples/s]


Shard 56 completed in 713.78 seconds
Processing shard 65 ...
Shard 65: 6656 examples to process


Phonemizing shard 57: 100%|██████████| 6656/6656 [13:13<00:00,  8.39 examples/s]  
Saving the dataset (1/1 shards): 100%|██████████| 6656/6656 [00:00<00:00, 27490.25 examples/s]


Shard 57 completed in 794.65 seconds
Processing shard 66 ...
Shard 66: 6656 examples to process


Phonemizing shard 58: 100%|██████████| 6656/6656 [12:23<00:00,  8.96 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 6656/6656 [00:00<00:00, 30756.11 examples/s]
Phonemizing shard 64:  33%|███▎      | 2218/6656 [03:02<04:27, 16.58 examples/s]

Shard 58 completed in 744.09 seconds
Processing shard 67 ...
Shard 67: 6656 examples to process


Phonemizing shard 63: 100%|██████████| 6656/6656 [07:41<00:00, 14.43 examples/s] 
Saving the dataset (1/1 shards): 100%|██████████| 6656/6656 [00:00<00:00, 43408.40 examples/s]
Phonemizing shard 60:  82%|████████▏ | 5483/6656 [10:22<02:22,  8.24 examples/s]

Shard 63 completed in 462.15 seconds
Processing shard 68 ...
Shard 68: 6656 examples to process


Phonemizing shard 64: 100%|██████████| 6656/6656 [08:15<00:00, 13.45 examples/s] 
Saving the dataset (1/1 shards): 100%|██████████| 6656/6656 [00:00<00:00, 44882.02 examples/s]


Shard 64 completed in 496.00 seconds
Processing shard 69 ...
Shard 69: 6656 examples to process


Phonemizing shard 62: 100%|██████████| 6656/6656 [10:31<00:00, 10.53 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 6656/6656 [00:00<00:00, 31468.15 examples/s]


Shard 62 completed in 632.80 seconds
Shard 70 already exists!
Shard 71 already exists!
Processing shard 72 ...
Shard 72: 6656 examples to process


Phonemizing shard 60: 100%|██████████| 6656/6656 [12:15<00:00,  9.06 examples/s] 
Saving the dataset (1/1 shards): 100%|██████████| 6656/6656 [00:00<00:00, 28909.64 examples/s]


Shard 60 completed in 736.04 seconds
Shard 73 already exists!
Processing shard 74 ...
Shard 74: 6656 examples to process


Phonemizing shard 65: 100%|██████████| 6656/6656 [08:33<00:00, 12.96 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 6656/6656 [00:00<00:00, 45939.26 examples/s]


Shard 65 completed in 514.49 seconds
Processing shard 75 ...
Shard 75: 6656 examples to process


Phonemizing shard 67: 100%|██████████| 6656/6656 [07:15<00:00, 15.28 examples/s]  
Saving the dataset (1/1 shards): 100%|██████████| 6656/6656 [00:00<00:00, 47100.85 examples/s]


Shard 67 completed in 436.46 seconds
Processing shard 76 ...
Shard 76: 6656 examples to process


Phonemizing shard 61: 100%|██████████| 6656/6656 [12:51<00:00,  8.63 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 6656/6656 [00:00<00:00, 27716.45 examples/s]
Phonemizing shard 66:  73%|███████▎  | 4855/6656 [07:57<02:46, 10.79 examples/s]

Shard 61 completed in 772.39 seconds
Processing shard 77 ...
Shard 77: 6656 examples to process


Phonemizing shard 69: 100%|██████████| 6656/6656 [03:58<00:00, 27.94 examples/s]  
Saving the dataset (1/1 shards): 100%|██████████| 6656/6656 [00:00<00:00, 76658.57 examples/s]


Shard 69 completed in 239.08 seconds
Processing shard 78 ...
Shard 78: 6656 examples to process


Phonemizing shard 74: 100%|██████████| 6656/6656 [03:38<00:00, 30.53 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 6656/6656 [00:00<00:00, 84367.75 examples/s]


Shard 74 completed in 218.88 seconds
Processing shard 79 ...
Shard 79: 6656 examples to process


Phonemizing shard 66: 100%|██████████| 6656/6656 [10:00<00:00, 11.08 examples/s] 
Saving the dataset (1/1 shards): 100%|██████████| 6656/6656 [00:00<00:00, 35385.46 examples/s]


Shard 66 completed in 601.52 seconds


Phonemizing shard 78:   3%|▎         | 174/6656 [00:15<06:21, 16.99 examples/s]

Processing shard 80 ...
Shard 80: 6656 examples to process


Phonemizing shard 72: 100%|██████████| 6656/6656 [06:59<00:00, 15.87 examples/s]  
Saving the dataset (1/1 shards): 100%|██████████| 6656/6656 [00:00<00:00, 54199.27 examples/s]


Shard 72 completed in 420.45 seconds
Processing shard 81 ...


Phonemizing shard 76:  54%|█████▍    | 3589/6656 [04:56<04:52, 10.48 examples/s]

Shard 81: 6656 examples to process


Phonemizing shard 75: 100%|██████████| 6656/6656 [08:26<00:00, 13.13 examples/s]  
Saving the dataset (1/1 shards): 100%|██████████| 6656/6656 [00:00<00:00, 41397.70 examples/s]


Shard 75 completed in 507.92 seconds
Processing shard 82 ...
Shard 82: 6656 examples to process


Phonemizing shard 68: 100%|██████████| 6656/6656 [10:46<00:00, 10.29 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 6656/6656 [00:00<00:00, 35990.59 examples/s]


Shard 68 completed in 647.75 seconds
Processing shard 83 ...
Shard 83: 6656 examples to process


Phonemizing shard 77: 100%|██████████| 6656/6656 [07:56<00:00, 13.97 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 6656/6656 [00:00<00:00, 40166.68 examples/s]


Shard 77 completed in 477.26 seconds
Processing shard 84 ...

Phonemizing shard 82:  16%|█▌        | 1077/6656 [01:09<07:49, 11.88 examples/s]


Shard 84: 6656 examples to process


Phonemizing shard 76: 100%|██████████| 6656/6656 [09:19<00:00, 11.89 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 6656/6656 [00:00<00:00, 36299.31 examples/s]


Shard 76 completed in 560.79 seconds
Processing shard 85 ...


Phonemizing shard 80:  87%|████████▋ | 5809/6656 [07:08<00:18, 45.22 examples/s]

Shard 85: 6656 examples to process


Phonemizing shard 80: 100%|██████████| 6656/6656 [08:03<00:00, 13.76 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 6656/6656 [00:00<00:00, 46694.19 examples/s]


Shard 80 completed in 484.55 seconds
Shard 86 already exists!
Processing shard 87 ...


Phonemizing shard 85:  16%|█▌        | 1047/6656 [00:54<01:37, 57.80 examples/s]

Shard 87: 6656 examples to process


Phonemizing shard 79: 100%|██████████| 6656/6656 [09:31<00:00, 11.64 examples/s]  
Saving the dataset (1/1 shards): 100%|██████████| 6656/6656 [00:00<00:00, 40273.70 examples/s]


Shard 79 completed in 572.82 seconds
Processing shard 88 ...
Shard 88: 6656 examples to process


Phonemizing shard 78: 100%|██████████| 6656/6656 [09:38<00:00, 11.50 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 6656/6656 [00:00<00:00, 37177.20 examples/s]
Phonemizing shard 88:   1%|          | 49/6656 [00:05<27:45,  3.97 examples/s]

Shard 78 completed in 579.58 seconds
Processing shard 89 ...
Shard 89: 6656 examples to process


Phonemizing shard 81: 100%|██████████| 6656/6656 [07:09<00:00, 15.48 examples/s] 
Saving the dataset (1/1 shards): 100%|██████████| 6656/6656 [00:00<00:00, 45247.11 examples/s]


Shard 81 completed in 430.89 seconds
Processing shard 90 ...

Phonemizing shard 82:  62%|██████▏   | 4144/6656 [05:12<04:00, 10.44 examples/s]


Shard 90: 6656 examples to process


Phonemizing shard 82: 100%|██████████| 6656/6656 [07:58<00:00, 13.92 examples/s] 
Saving the dataset (1/1 shards): 100%|██████████| 6656/6656 [00:00<00:00, 46184.28 examples/s]
Phonemizing shard 88:  36%|███▌      | 2374/6656 [03:25<04:23, 16.24 examples/s]

Shard 82 completed in 479.19 seconds
Processing shard 91 ...
Shard 91: 6656 examples to process


Phonemizing shard 84: 100%|██████████| 6656/6656 [06:55<00:00, 16.03 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 6656/6656 [00:00<00:00, 46729.20 examples/s]


Shard 84 completed in 416.18 seconds
Processing shard 92 ...
Shard 92: 6656 examples to process


Phonemizing shard 83: 100%|██████████| 6656/6656 [07:50<00:00, 14.16 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 6656/6656 [00:00<00:00, 43904.29 examples/s]


Shard 83 completed in 471.15 seconds
Processing shard 93 ...
Shard 93: 6656 examples to process


Phonemizing shard 85: 100%|██████████| 6656/6656 [08:10<00:00, 13.57 examples/s] 
Saving the dataset (1/1 shards): 100%|██████████| 6656/6656 [00:00<00:00, 48463.05 examples/s]


Shard 85 completed in 491.46 seconds
Processing shard 94 ...
Shard 94: 6656 examples to process


Phonemizing shard 87: 100%|██████████| 6656/6656 [07:56<00:00, 13.96 examples/s]  
Saving the dataset (1/1 shards): 100%|██████████| 6656/6656 [00:00<00:00, 43732.14 examples/s]


Shard 87 completed in 477.85 seconds
Processing shard 95 ...
Shard 95: 6656 examples to process


Phonemizing shard 89: 100%|██████████| 6656/6656 [08:48<00:00, 12.60 examples/s] 
Saving the dataset (1/1 shards): 100%|██████████| 6656/6656 [00:00<00:00, 39263.94 examples/s]


Shard 89 completed in 529.19 seconds
Processing shard 96 ...
Shard 96: 6656 examples to process

Phonemizing shard 88: 100%|█████████▉| 6653/6656 [08:54<00:00, 12.10 examples/s]




Phonemizing shard 88: 100%|██████████| 6656/6656 [08:54<00:00, 12.45 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 6656/6656 [00:00<00:00, 45335.65 examples/s]


Shard 88 completed in 535.63 seconds

Phonemizing shard 92:  58%|█████▊    | 3849/6656 [05:22<07:58,  5.87 examples/s]


Processing shard 97 ...
Shard 97: 6656 examples to process


Phonemizing shard 90: 100%|██████████| 6656/6656 [08:50<00:00, 12.54 examples/s]  
Saving the dataset (1/1 shards): 100%|██████████| 6656/6656 [00:00<00:00, 37501.36 examples/s]
Phonemizing shard 91:  64%|██████▎   | 4233/6656 [06:04<03:31, 11.45 examples/s]

Shard 90 completed in 531.70 seconds
Processing shard 98 ...
Shard 98: 6656 examples to process


Phonemizing shard 91: 100%|██████████| 6656/6656 [09:19<00:00, 11.90 examples/s]  
Saving the dataset (1/1 shards): 100%|██████████| 6656/6656 [00:00<00:00, 36335.83 examples/s]


Shard 91 completed in 560.06 seconds
Processing shard 99 ...
Shard 99: 6656 examples to process


Phonemizing shard 92: 100%|██████████| 6656/6656 [10:25<00:00, 10.64 examples/s]  
Saving the dataset (1/1 shards): 100%|██████████| 6656/6656 [00:00<00:00, 31391.19 examples/s]


Shard 92 completed in 626.57 seconds


Phonemizing shard 93: 100%|██████████| 6656/6656 [10:40<00:00, 10.39 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 6656/6656 [00:00<00:00, 31741.07 examples/s]


Shard 93 completed in 641.64 seconds


Phonemizing shard 94: 100%|██████████| 6656/6656 [09:47<00:00, 11.32 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 6656/6656 [00:00<00:00, 36484.84 examples/s]
Phonemizing shard 99:  52%|█████▏    | 3483/6656 [03:05<01:58, 26.78 examples/s]

Shard 94 completed in 588.88 seconds


Phonemizing shard 95: 100%|██████████| 6656/6656 [09:37<00:00, 11.53 examples/s] 
Saving the dataset (1/1 shards): 100%|██████████| 6656/6656 [00:00<00:00, 47470.87 examples/s]


Shard 95 completed in 578.06 seconds


Phonemizing shard 97: 100%|██████████| 6656/6656 [08:04<00:00, 13.73 examples/s] 
Saving the dataset (1/1 shards): 100%|██████████| 6656/6656 [00:00<00:00, 53132.57 examples/s]


Shard 97 completed in 485.76 seconds


Phonemizing shard 98: 100%|██████████| 6656/6656 [07:43<00:00, 14.35 examples/s] 
Saving the dataset (1/1 shards): 100%|██████████| 6656/6656 [00:00<00:00, 28243.07 examples/s]


Shard 98 completed in 464.74 seconds


Phonemizing shard 96: 100%|██████████| 6656/6656 [08:57<00:00, 12.38 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 6656/6656 [00:00<00:00, 72810.49 examples/s]


Shard 96 completed in 538.35 seconds


Phonemizing shard 99: 100%|██████████| 6656/6656 [06:18<00:00, 17.56 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 6656/6656 [00:00<00:00, 77864.02 examples/s]


Shard 99 completed in 379.85 seconds
Processing completed. Failed shards: 0


In [11]:
# Add this cell before processing to check existing shards

def check_existing_shards():
    """Check which shards already exist"""
    existing_shards = []
    pending_shards = []
    
    for i in range(num_shards):
        directory = root_directory + "/shard_" + str(i)
        if os.path.exists(directory):
            existing_shards.append(i)
        else:
            pending_shards.append(i)
    
    print(f"Existing shards: {len(existing_shards)}/{num_shards}")
    print(f"Pending shards: {len(pending_shards)}")
    
    return pending_shards

# Check existing shards first
pending_shards = check_existing_shards()

if not pending_shards:
    print("All shards already processed!")
else:
    print(f"Processing {len(pending_shards)} pending shards...")

Existing shards: 99/100
Pending shards: 1
Processing 1 pending shards...


In [12]:
# Process only pending shards

if pending_shards:
    max_workers = 6  # Conservative number of workers
    timeout_seconds = 2400  # 40 minutes timeout
    failed_shards = []
    
    print(f"Processing {len(pending_shards)} pending shards...")
    
    with ProcessPool(max_workers=max_workers) as pool:
        future = pool.map(process_shard, pending_shards, timeout=timeout_seconds)
        
        try:
            for i, result in enumerate(future.result()):
                if isinstance(result, Exception):
                    print(f"Shard {pending_shards[i]} failed: {result}")
                    failed_shards.append(pending_shards[i])
                else:
                    if i % 5 == 0:  # Progress indicator every 5 shards
                        print(f"Progress: {i+1}/{len(pending_shards)} shards completed")
        except Exception as e:
            print(f"Processing error: {e}")
    
    print(f"Processing completed. Failed shards: {len(failed_shards)}")
else:
    print("No pending shards to process!")

Processing 1 pending shards...
Processing shard 0 ...
Shard 0: 6657 examples to process


Phonemizing shard 0: 100%|██████████| 6657/6657 [33:03<00:00,  3.36 examples/s]  
Saving the dataset (1/1 shards): 100%|██████████| 6657/6657 [00:00<00:00, 11291.57 examples/s]


Shard 0 completed in 1984.56 seconds
Progress: 1/1 shards completed
Processing completed. Failed shards: 0


### Collect all shards to form the processed dataset

In [13]:
from datasets import load_from_disk, concatenate_datasets

output = [dI for dI in os.listdir(root_directory) if os.path.isdir(os.path.join(root_directory,dI))]
datasets = []
for o in output:
    directory = root_directory + "/" + o
    try:
        shard = load_from_disk(directory)
        datasets.append(shard)
        print("%s loaded" % o)
    except:
        continue

shard_0 loaded
shard_1 loaded
shard_10 loaded
shard_11 loaded
shard_12 loaded
shard_13 loaded
shard_14 loaded
shard_15 loaded
shard_16 loaded
shard_17 loaded
shard_18 loaded
shard_19 loaded
shard_2 loaded
shard_20 loaded
shard_21 loaded
shard_22 loaded
shard_23 loaded
shard_24 loaded
shard_25 loaded
shard_26 loaded
shard_27 loaded
shard_28 loaded
shard_29 loaded
shard_3 loaded
shard_30 loaded
shard_31 loaded
shard_32 loaded
shard_33 loaded
shard_34 loaded
shard_35 loaded
shard_36 loaded
shard_37 loaded
shard_38 loaded
shard_39 loaded
shard_4 loaded
shard_40 loaded
shard_41 loaded
shard_42 loaded
shard_43 loaded
shard_44 loaded
shard_45 loaded
shard_46 loaded
shard_47 loaded
shard_48 loaded
shard_49 loaded
shard_5 loaded
shard_50 loaded
shard_51 loaded
shard_52 loaded
shard_53 loaded
shard_54 loaded
shard_55 loaded
shard_56 loaded
shard_57 loaded
shard_58 loaded
shard_59 loaded
shard_6 loaded
shard_60 loaded
shard_61 loaded
shard_62 loaded
shard_63 loaded
shard_64 loaded
shard_65 loaded

In [14]:
dataset = concatenate_datasets(datasets)
dataset.save_to_disk(config['data_folder'])
print('Dataset saved to %s' % config['data_folder'])

Saving the dataset (7/7 shards): 100%|██████████| 665622/665622 [00:29<00:00, 22564.21 examples/s]

Dataset saved to wikipedia_20231101.id.processed





In [15]:
# check the dataset size
dataset

Dataset({
    features: ['id', 'url', 'title', 'input_ids', 'phonemes'],
    num_rows: 665622
})

### Remove unneccessary tokens from the pre-trained tokenizer
The pre-trained tokenizer contains a lot of tokens that are not used in our dataset, so we need to remove these tokens. We also want to predict the word in lower cases because cases do not matter that much for TTS. Pruning the tokenizer is much faster than training a new tokenizer from scratch. 

In [16]:
from simple_loader import FilePathDataset, build_dataloader

file_data = FilePathDataset(dataset)
loader = build_dataloader(file_data, num_workers=32, batch_size=128)

In [17]:
special_token = config['dataset_params']['word_separator']

In [18]:
# get all unique tokens in the entire dataset

from tqdm import tqdm

unique_index = [special_token]
for _, batch in enumerate(tqdm(loader)):
    unique_index.extend(batch)
    unique_index = list(set(unique_index))

100%|██████████| 5200/5200 [00:50<00:00, 102.45it/s]


In [19]:
# get each token's lower case

lower_tokens = []
for t in tqdm(unique_index):
    word = tokenizer.decode([t])
    if word.lower() != word:
        t = tokenizer.encode([word.lower()])[0]
        lower_tokens.append(t)
    else:
        lower_tokens.append(t)

100%|██████████| 188335/188335 [00:02<00:00, 69927.92it/s]


In [20]:
lower_tokens = (list(set(lower_tokens)))

In [21]:
# redo the mapping for lower number of tokens

token_maps = {}
for t in tqdm(unique_index):
    word = tokenizer.decode([t])
    word = word.lower()
    new_t = tokenizer.encode([word.lower()])[0]
    token_maps[t] = {'word': word, 'token': lower_tokens.index(new_t)}

100%|██████████| 188335/188335 [00:19<00:00, 9614.14it/s] 


In [22]:
import pickle
with open(config['dataset_params']['token_maps'], 'wb') as handle:
    pickle.dump(token_maps, handle)
print('Token mapper saved to %s' % config['dataset_params']['token_maps'])

Token mapper saved to token_maps.pkl


### Test the dataset with dataloader


In [23]:
from dataloader import build_dataloader

train_loader = build_dataloader(dataset, batch_size=32, num_workers=0, dataset_config=config['dataset_params'])

177


In [24]:
_, (words, labels, phonemes, input_lengths, masked_indices) = next(enumerate(train_loader))

In [27]:
# Inspect shard content - lihat isi phoneme dari shard tertentu

from datasets import load_from_disk

def inspect_shard(shard_number, num_samples=5):
    """
    Melihat isi phoneme dari shard tertentu
    
    Args:
        shard_number: nomor shard yang ingin dilihat (0-99)
        num_samples: jumlah contoh yang ingin ditampilkan
    """
    shard_directory = f"{root_directory}/shard_{shard_number}"
    
    if not os.path.exists(shard_directory):
        print(f"Shard {shard_number} tidak ada di {shard_directory}")
        return
    
    try:
        # Load shard
        shard_data = load_from_disk(shard_directory)
        print(f"Shard {shard_number} info:")
        print(f"- Total samples: {len(shard_data)}")
        print(f"- Columns: {shard_data.column_names}")
        print("="*60)
        
        # Tampilkan beberapa contoh
        for i in range(min(num_samples, len(shard_data))):
            sample = shard_data[i]
            print(f"\nSample {i+1}:")
            print(f"Input IDs: {sample['input_ids'][:10]}{'...' if len(sample['input_ids']) > 10 else ''}")
            print(f"Phonemes: {sample['phonemes'][:10]}{'...' if len(sample['phonemes']) > 10 else ''}")
            
            # Decode beberapa token untuk melihat isi sebenarnya
            if len(sample['input_ids']) > 0:
                decoded_words = [tokenizer.decode([token_id]) for token_id in sample['input_ids'][:5]]
                print(f"Decoded words (first 5): {decoded_words}")
            
            print(f"Phonemes (first 5): {sample['phonemes'][:5]}")
            print("-"*40)
            
    except Exception as e:
        print(f"Error loading shard {shard_number}: {e}")

# Contoh penggunaan: lihat isi shard 0
inspect_shard(0, num_samples=3)

Shard 0 info:
- Total samples: 6657
- Columns: ['id', 'url', 'title', 'input_ids', 'phonemes']

Sample 1:
Input IDs: [24, 24, 2, 24, 24, 82734, 24, 3159, 22, 24]...
Phonemes: ['ˈasam', 'dˌɛoksˌiribˌonuklˈɛat', ',', 'lˈɛbih', 'dˈikənal', 'dˈɛŋan', 'siŋkˈatan', 'dnˈa', '(', 'bahˈasa']...
Decoded words (first 5): ['<unk>', '<unk>', ',', '<unk>', '<unk>']
Phonemes (first 5): ['ˈasam', 'dˌɛoksˌiribˌonuklˈɛat', ',', 'lˈɛbih', 'dˈikənal']
----------------------------------------

Sample 2:
Input IDs: [5305, 35337, 8445, 1260, 40509, 39, 21, 24, 24, 24]...
Phonemes: ['muhˈammad', 'ˈanwar', 'ˈɛl', '-', 'sˈadat', ';', ')', 'adˈalah', 'səˈɔraŋ', 'pˌolitˈikus']...
Decoded words (first 5): ['Muhammad', 'Anwar', 'el', '-', 'Sadat']
Phonemes (first 5): ['muhˈammad', 'ˈanwar', 'ˈɛl', '-', 'sˈadat']
----------------------------------------

Sample 3:
Input IDs: [56142, 27291, 24, 24, 90205, 5021, 24, 47071, 24, 24]...
Phonemes: ['dˈatuk', 'ˈazhar', 'mˈansɔr', 'adˈalah', 'ˈɔraŋ', 'mˌalajsˈia', 'pərtˈama