# Notebook for preprocessing Wikipedia (Indonesia) dataset

In [1]:
!rm -rf wiki_phoneme
!rm -rf wikipedia_20231101.id.processed

### Initilizing phonemizer and tokenizer

In [2]:
import yaml

config_path = "Configs/config.yml" # you can change it to anything else
config = yaml.safe_load(open(config_path))

In [3]:
import sys
sys.path.insert(0, '/workspace/src/PL-BERT-ID')
from phonemize import phonemize, EnIndPhonemizer

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
global_phonemizer = EnIndPhonemizer(ipa=True, keep_stress=True, sep="")

In [5]:
import os
os.environ['TRUST_REMOTE_CODE'] = 'True'

In [6]:
from transformers import GPT2Tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('flax-community/gpt2-small-indonesian')

### Process dataset

Since using load_dataset with the Indonesian Wikipedia (id) resulted in errors (e.g., "Not Found"), we will download and load the dataset manually.

You can download the dataset from this link: https://huggingface.co/datasets/wikimedia/wikipedia/tree/main/20231101.id.

In [7]:
from datasets import load_dataset

# Use a glob pattern to load all Parquet files in the 'wikipedia' folder.
# This pattern will search for all files ending with '.parquet' within the folder.
parquet_folder = "/workspace/src/PL-BERT-ID/wikipedia/*.parquet"

try:
    dataset = load_dataset("parquet", data_files=parquet_folder)
    if isinstance(dataset, dict) or hasattr(dataset, "keys"):
        split_name = "train" if "train" in dataset else list(dataset.keys())[0]
        dataset = dataset[split_name]
    print("Dataset loaded successfully!")
    print(dataset)
except Exception as e:
    print(f"An error occurred while loading the dataset: {e}")

Dataset loaded successfully!
Dataset({
    features: ['id', 'url', 'title', 'text'],
    num_rows: 665622
})


In [8]:
root_directory = "./wiki_phoneme" # set up root directory for multiprocessor processing

In [9]:
# Update the process_shard function with better error handling

import os
import time

num_shards = 100

def process_shard(i):
    directory = root_directory + "/shard_" + str(i)
    if os.path.exists(directory):
        print("Shard %d already exists!" % i)
        return f"Shard {i} already exists"
    
    start_time = time.time()
    print('Processing shard %d ...' % i)
    
    try:
        shard = dataset.shard(num_shards=num_shards, index=i)
        print(f'Shard {i}: {len(shard)} examples to process')
        
        # Process in smaller batches to avoid memory issues
        processed_dataset = shard.map(
            lambda t: phonemize(t['text'], global_phonemizer, tokenizer), 
            remove_columns=['text'],
            batch_size=100,  # Process in smaller batches
            desc=f"Phonemizing shard {i}"
        )
        
        if not os.path.exists(directory):
            os.makedirs(directory)
        processed_dataset.save_to_disk(directory)
        
        elapsed = time.time() - start_time
        print(f'Shard {i} completed in {elapsed:.2f} seconds')
        return f"Shard {i} completed successfully"
        
    except Exception as e:
        print(f'Shard {i} failed: {str(e)}')
        return Exception(f"Shard {i} failed: {str(e)}")

In [10]:
from pebble import ProcessPool
from concurrent.futures import TimeoutError

#### Note: You will need to run the following cell multiple times to process all shards because some will fail. Depending on how fast you process each shard, you will need to change the timeout to a longer value to make more shards processed before being killed.


In [11]:
# Update the processing cell with better timeout and worker settings

import os
from pebble import ProcessPool
from concurrent.futures import TimeoutError

# Reduce workers to avoid resource contention and increase timeout
max_workers = 24  # Reduced from 20
timeout_seconds = 2400  # Increased to 30 minutes
failed_shards = []

print(f"Processing {num_shards} shards with {max_workers} workers...")
print(f"Timeout set to {timeout_seconds} seconds ({timeout_seconds//60} minutes)")

with ProcessPool(max_workers=max_workers) as pool:
    future = pool.map(process_shard, range(num_shards), timeout=timeout_seconds)
    
    try:
        for i, result in enumerate(future.result()):
            if isinstance(result, Exception):
                print(f"Shard {i} failed: {result}")
                failed_shards.append(i)
            else:
                if i % 10 == 0:  # Progress indicator
                    print(f"Completed {i+1}/{num_shards} shards")
    except Exception as e:
        print(f"Processing error: {e}")

print(f"Processing completed. Failed shards: {len(failed_shards)}")
if failed_shards:
    print(f"Failed shard indices: {failed_shards}")

Processing 100 shards with 24 workers...
Timeout set to 2400 seconds (40 minutes)
Processing shard 0 ...Processing shard 1 ...Processing shard 2 ...
Processing shard 3 ...Processing shard 4 ...Processing shard 5 ...



Processing shard 6 ...Processing shard 8 ...Processing shard 7 ...
Processing shard 9 ...
Processing shard 10 ...


Processing shard 11 ...Processing shard 12 ...Shard 0: 6657 examples to process
Shard 2: 6657 examples to processProcessing shard 16 ...Shard 3: 6657 examples to processShard 1: 6657 examples to processShard 7: 6657 examples to processProcessing shard 15 ...Shard 5: 6657 examples to process

Shard 4: 6657 examples to processProcessing shard 14 ...
Processing shard 13 ...Processing shard 17 ...Shard 8: 6657 examples to processProcessing shard 18 ...Processing shard 19 ...Processing shard 20 ...
Shard 9: 6657 examples to processProcessing shard 21 ...


Shard 6: 6657 examples to processProcessing shard 22 ...




Processing shard 23 ...


Shard 10: 6657 examp

Phonemizing shard 5:  23% 1562/6657 [39:58<55:21,  1.53 examples/s]s]] 

Processing error: [Errno Task timeout] 2400


Phonemizing shard 17:  13% 876/6657 [39:59<58:36,  1.64 examples/s]  ]

Processing shard 24 ...

Processing shard 25 ...Processing shard 26 ...
Shard 24: 6656 examples to processShard 25: 6656 examples to process

Processing shard 27 ...Shard 26: 6656 examples to process

Shard 27: 6656 examples to processProcessing shard 28 ...

Processing shard 29 ...Shard 28: 6656 examples to process

Processing shard 30 ...Shard 29: 6656 examples to process

Processing shard 31 ...Shard 30: 6656 examples to process

Shard 31: 6656 examples to processProcessing shard 32 ...

Processing shard 33 ...Shard 32: 6656 examples to process

Processing shard 34 ...
Shard 34: 6656 examples to processShard 33: 6656 examples to process

Processing shard 35 ...
Processing shard 36 ...Shard 35: 6656 examples to process

Shard 36: 6656 examples to processProcessing shard 37 ...
Shard 37: 6656 examples to process
Processing shard 38 ...

Processing shard 39 ...Shard 38: 6656 examples to process

Processing shard 40 ...
Processing shard 41 ...Shard 40: 6656 examples to processShard 39: 

Phonemizing shard 44: 100% 6656/6656 [08:23<00:00, 13.22 examples/s]s]]
Saving the dataset (1/1 shards): 100% 6656/6656 [00:00<00:00, 17307.40 examples/s]
Phonemizing shard 43:  96% 6394/6656 [08:26<00:23, 10.94 examples/s]

Shard 44 completed in 510.24 seconds

Phonemizing shard 41:  50% 3314/6656 [08:27<03:15, 17.09 examples/s]




Phonemizing shard 36:  77% 5109/6656 [08:26<01:53, 13.67 examples/s]

Processing shard 48 ...


Phonemizing shard 39:  47% 3134/6656 [08:26<03:08, 18.70 examples/s]

Shard 48: 6656 examples to process

Phonemizing shard 34:  50% 3341/6656 [08:26<05:09, 10.73 examples/s]




Phonemizing shard 45: 100% 6656/6656 [08:30<00:00, 13.05 examples/s]] 
Saving the dataset (1/1 shards): 100% 6656/6656 [00:00<00:00, 13330.60 examples/s]
Phonemizing shard 41:  51% 3382/6656 [08:31<02:37, 20.75 examples/s]

Shard 45 completed in 514.80 seconds

Phonemizing shard 46:  96% 6385/6656 [08:30<00:19, 13.65 examples/s]


Processing shard 49 ...

Phonemizing shard 39:  48% 3227/6656 [08:30<02:37, 21.72 examples/s]




Phonemizing shard 43:  97% 6457/6656 [08:30<00:15, 12.78 examples/s]

Shard 49: 6656 examples to process

Phonemizing shard 36:  78% 5165/6656 [08:31<02:14, 11.12 examples/s]




Phonemizing shard 46: 100% 6656/6656 [08:46<00:00, 12.64 examples/s]]] 
Saving the dataset (1/1 shards): 100% 6656/6656 [00:00<00:00, 17167.66 examples/s]
Phonemizing shard 36:  81% 5394/6656 [08:48<01:37, 12.98 examples/s]

Shard 46 completed in 531.58 seconds
Processing shard 50 ...
Shard 50: 6656 examples to process


Phonemizing shard 43: 100% 6656/6656 [08:55<00:00, 12.43 examples/s]  
Saving the dataset (1/1 shards): 100% 6656/6656 [00:00<00:00, 21085.55 examples/s]
Phonemizing shard 33:  53% 3558/6656 [08:56<06:03,  8.52 examples/s]

Shard 43 completed in 539.86 seconds
Processing shard 51 ...

Phonemizing shard 36:  83% 5509/6656 [08:56<01:26, 13.28 examples/s]




Phonemizing shard 41:  57% 3813/6656 [08:56<01:49, 25.96 examples/s]

Shard 51: 6656 examples to process


Phonemizing shard 38: 100% 6656/6656 [09:26<00:00, 11.74 examples/s]s] 
Saving the dataset (1/1 shards): 100% 6656/6656 [00:00<00:00, 14022.92 examples/s]
Phonemizing shard 40:  51% 3364/6656 [09:28<02:11, 25.04 examples/s]

Shard 38 completed in 571.89 seconds


Phonemizing shard 36:  90% 5958/6656 [09:28<00:35, 19.43 examples/s]

Processing shard 52 ...
Shard 52: 6656 examples to process


Phonemizing shard 41: 100% 6656/6656 [10:08<00:00, 10.94 examples/s]s]]
Saving the dataset (1/1 shards): 100% 6656/6656 [00:00<00:00, 14765.26 examples/s]


Shard 41 completed in 611.72 seconds

Phonemizing shard 40:  64% 4277/6656 [10:08<01:48, 21.92 examples/s]


Processing shard 53 ...
Shard 53: 6656 examples to process


Phonemizing shard 36: 100% 6656/6656 [10:15<00:00, 10.82 examples/s]]] 
Saving the dataset (1/1 shards): 100% 6656/6656 [00:00<00:00, 17128.06 examples/s]
Phonemizing shard 33:  66% 4365/6656 [10:15<05:15,  7.26 examples/s]

Shard 36 completed in 619.78 seconds
Processing shard 54 ...
Shard 54: 6656 examples to process


Phonemizing shard 40: 100% 6656/6656 [11:45<00:00,  9.43 examples/s]]] 
Saving the dataset (1/1 shards): 100% 6656/6656 [00:00<00:00, 14807.59 examples/s]
Phonemizing shard 33:  78% 5220/6656 [11:46<02:21, 10.14 examples/s]

Shard 40 completed in 709.95 seconds
Processing shard 55 ...
Shard 55: 6656 examples to process


Phonemizing shard 39: 100% 6656/6656 [13:48<00:00,  8.04 examples/s]]]]
Saving the dataset (1/1 shards): 100% 6656/6656 [00:00<00:00, 12567.66 examples/s]
Phonemizing shard 34:  59% 3925/6656 [13:49<02:59, 15.20 examples/s]

Shard 39 completed in 832.86 seconds


Phonemizing shard 33:  93% 6199/6656 [13:48<01:21,  5.59 examples/s]

Processing shard 56 ...
Shard 56: 6656 examples to process


Phonemizing shard 37: 100% 6656/6656 [15:31<00:00,  7.14 examples/s]s] 
Saving the dataset (1/1 shards): 100% 6656/6656 [00:00<00:00, 14365.41 examples/s]


Shard 37 completed in 936.04 seconds
Processing shard 57 ...
Shard 57: 6656 examples to process


Phonemizing shard 34: 100% 6656/6656 [16:40<00:00,  6.66 examples/s]]  
Saving the dataset (1/1 shards): 100% 6656/6656 [00:00<00:00, 15233.05 examples/s]


Shard 34 completed in 1004.56 seconds
Processing shard 58 ...


Phonemizing shard 51:   1% 57/6656 [07:42<9:46:39,  5.33s/ examples] 

Shard 58: 6656 examples to process


Phonemizing shard 33: 100% 6656/6656 [18:32<00:00,  5.98 examples/s]]] 
Saving the dataset (1/1 shards): 100% 6656/6656 [00:00<00:00, 14259.67 examples/s]
Phonemizing shard 24:   4% 261/6656 [18:32<42:55,  2.48 examples/s]

Shard 33 completed in 1117.50 seconds
Processing shard 59 ...
Shard 59: 6656 examples to process


Phonemizing shard 35: 100% 6656/6656 [21:27<00:00,  5.17 examples/s]]] 
Saving the dataset (1/1 shards): 100% 6656/6656 [00:00<00:00, 12595.66 examples/s]


Shard 35 completed in 1292.86 seconds
Processing shard 60 ...
Shard 60: 6656 examples to process


Phonemizing shard 42: 100% 6656/6656 [26:16<00:00,  4.22 examples/s]]] 
Saving the dataset (1/1 shards): 100% 6656/6656 [00:00<00:00, 9457.28 examples/s] 


Shard 42 completed in 1580.21 seconds

Phonemizing shard 53:   3% 204/6656 [16:06<12:17:40,  6.86s/ examples]


Processing shard 61 ...
Shard 61: 6656 examples to process


Phonemizing shard 32: 100% 6656/6656 [33:57<00:00,  3.27 examples/s]]  
Saving the dataset (1/1 shards): 100% 6656/6656 [00:00<00:00, 10053.68 examples/s]


Shard 32 completed in 2041.04 seconds


Phonemizing shard 47:  74% 4903/6656 [33:56<1:54:18,  3.91s/ examples]

Processing shard 62 ...
Shard 62: 6656 examples to process


Phonemizing shard 60:   3% 176/6656 [18:25<7:44:47,  4.30s/ examples]  

Processing shard 63 ...
Processing shard 64 ...


Phonemizing shard 30:  10% 676/6656 [39:55<4:59:09,  3.00s/ examples]

Shard 64: 6656 examples to process
Shard 63: 6656 examples to process


Phonemizing shard 55:   6% 375/6656 [28:09<6:00:30,  3.44s/ examples]]

Processing shard 65 ...
Processing shard 66 ...Shard 65: 6656 examples to process

Phonemizing shard 50:  54% 3611/6656 [31:07<1:22:15,  1.62s/ examples]



Processing shard 67 ...
Shard 66: 6656 examples to processProcessing shard 68 ...



Phonemizing shard 50:  54% 3612/6656 [31:07<59:23,  1.17s/ examples]  

Shard 67: 6656 examples to processProcessing shard 69 ...Shard 68: 6656 examples to process


Processing shard 70 ...Shard 69: 6656 examples to process

Shard 70: 6656 examples to process


Phonemizing shard 50:  54% 3613/6656 [31:08<46:33,  1.09 examples/s]]

Processing shard 71 ...


Phonemizing shard 52:   9% 579/6656 [30:28<4:03:45,  2.41s/ examples]

Shard 71: 6656 examples to process


Phonemizing shard 55:   8% 514/6656 [36:38<3:05:20,  1.81s/ examples]  

Processing shard 72 ...
Shard 72: 6656 examples to process


Phonemizing shard 51:  11% 761/6656 [39:33<3:41:41,  2.26s/ examples] 

Processing shard 73 ...


Phonemizing shard 70:  36% 2429/6656 [08:31<3:32:26,  3.02s/ examples]

Shard 73: 6656 examples to process


Phonemizing shard 62:   3% 228/6656 [14:49<1:44:42,  1.02 examples/s]]

Processing shard 74 ...
Shard 74: 6656 examples to process

Phonemizing shard 73:   1% 90/6656 [00:15<13:37,  8.04 examples/s]




Phonemizing shard 72:   4% 234/6656 [00:27<09:40, 11.07 examples/s]s] 

Processing shard 75 ...
Shard 75: 6656 examples to process


Phonemizing shard 53:   9% 618/6656 [39:18<2:11:00,  1.30s/ examples] 

Processing shard 76 ...
Shard 76: 6656 examples to process


Phonemizing shard 69:   6% 372/6656 [10:07<3:02:00,  1.74s/ examples] 

Processing shard 77 ...


Phonemizing shard 72:  16% 1039/6656 [01:39<06:42, 13.95 examples/s]

Shard 77: 6656 examples to process


Phonemizing shard 73:  12% 832/6656 [01:42<08:48, 11.02 examples/s]]] 

Processing shard 78 ...


Phonemizing shard 73:  13% 834/6656 [01:42<08:39, 11.22 examples/s]

Shard 78: 6656 examples to process

Phonemizing shard 58:  10% 676/6656 [33:33<4:51:38,  2.93s/ examples]




Phonemizing shard 61:   7% 459/6656 [25:27<4:14:56,  2.47s/ examples]]

Processing shard 79 ...
Shard 79: 6656 examples to process


Phonemizing shard 70:  38% 2554/6656 [13:48<29:27,  2.32 examples/s]s] 

Processing shard 80 ...
Shard 80: 6656 examples to process


Phonemizing shard 59:  25% 1632/6656 [36:56<1:01:27,  1.36 examples/s] 

Processing shard 81 ...
Shard 81: 6656 examples to process

Phonemizing shard 61:   8% 552/6656 [29:13<4:43:51,  2.79s/ examples]




Phonemizing shard 74:  48% 3172/6656 [07:50<00:54, 63.72 examples/s]s]

Processing shard 82 ...
Shard 82: 6656 examples to process


Phonemizing shard 66:   5% 327/6656 [18:33<2:57:29,  1.68s/ examples]] 

Processing shard 83 ...
Shard 83: 6656 examples to process


Phonemizing shard 73:  33% 2208/6656 [12:55<06:54, 10.73 examples/s]]] 

Processing shard 84 ...
Shard 84: 6656 examples to process


Phonemizing shard 71:  78% 5193/6656 [26:15<00:24, 59.91 examples/s]]] 

Processing shard 85 ...
Shard 85: 6656 examples to process


Phonemizing shard 83:   5% 345/6656 [15:21<5:03:36,  2.89s/ examples]]es]

Processing shard 86 ...
Shard 86: 6656 examples to process


Phonemizing shard 73: 100% 6656/6656 [25:42<00:00,  4.32 examples/s]]]
Saving the dataset (1/1 shards): 100% 6656/6656 [00:00<00:00, 24623.63 examples/s]


Shard 73 completed in 1544.58 seconds
Processing shard 87 ...


Phonemizing shard 69:  18% 1197/6656 [34:15<5:09:59,  3.41s/ examples]

Shard 87: 6656 examples to process


Phonemizing shard 71: 100% 6656/6656 [34:35<00:00,  3.21 examples/s]]]
Saving the dataset (1/1 shards): 100% 6656/6656 [00:00<00:00, 18341.16 examples/s]


Shard 71 completed in 2078.49 seconds
Processing shard 88 ...
Shard 88: 6656 examples to process


Phonemizing shard 88:   2% 123/6656 [05:17<1:48:16,  1.01 examples/s]  

Processing shard 89 ...


Phonemizing shard 81:  11% 765/6656 [24:22<1:44:55,  1.07s/ examples]

Processing shard 90 ...
Shard 89: 6656 examples to process
Shard 90: 6656 examples to process


Phonemizing shard 68:  12% 800/6656 [39:57<3:49:47,  2.35s/ examples]]

Processing shard 91 ...
Processing shard 92 ...
Shard 91: 6656 examples to processShard 92: 6656 examples to process



Phonemizing shard 75:  10% 691/6656 [31:00<7:20:03,  4.43s/ examples] 

Processing shard 93 ...

Phonemizing shard 86:   1% 88/6656 [05:59<4:34:17,  2.51s/ examples]




Phonemizing shard 89:   0% 0/6656 [00:00<?, ? examples/s]

Shard 93: 6656 examples to processProcessing shard 94 ...

Processing shard 95 ...

Phonemizing shard 90:   0% 0/6656 [00:00<?, ? examples/s]




Phonemizing shard 81:  12% 767/6656 [24:24<1:40:36,  1.03s/ examples]

Shard 95: 6656 examples to process

Phonemizing shard 75:  10% 692/6656 [31:00<5:29:19,  3.31s/ examples]

Shard 94: 6656 examples to processProcessing shard 96 ...




Phonemizing shard 82:  11% 724/6656 [23:16<4:28:04,  2.71s/ examples]

Shard 96: 6656 examples to process


Phonemizing shard 87:  16% 1053/6656 [14:08<4:36:38,  2.96s/ examples] s]

Processing shard 97 ...

Phonemizing shard 78:  13% 885/6656 [38:08<1:02:26,  1.54 examples/s]]


Shard 97: 6656 examples to process


Phonemizing shard 76:  15% 966/6656 [39:18<1:43:03,  1.09s/ examples]]

Processing shard 98 ...
Shard 98: 6656 examples to process


Phonemizing shard 77:  19% 1283/6656 [38:46<5:43:58,  3.84s/ examples]

Processing shard 99 ...
Shard 99: 6656 examples to process


Phonemizing shard 99:  43% 2881/6656 [39:55<40:31,  1.55 examples/s]   

Processing completed. Failed shards: 0


In [12]:
# Add this cell before processing to check existing shards

def check_existing_shards():
    """Check which shards already exist"""
    existing_shards = []
    pending_shards = []
    
    for i in range(num_shards):
        directory = root_directory + "/shard_" + str(i)
        if os.path.exists(directory):
            existing_shards.append(i)
        else:
            pending_shards.append(i)
    
    print(f"Existing shards: {len(existing_shards)}/{num_shards}")
    print(f"Pending shards: {len(pending_shards)}")
    
    return pending_shards

# Check existing shards first
pending_shards = check_existing_shards()

if not pending_shards:
    print("All shards already processed!")
else:
    print(f"Processing {len(pending_shards)} pending shards...")

Existing shards: 17/100
Pending shards: 83
Processing 83 pending shards...


In [None]:
# Process only pending shards

if pending_shards:
    max_workers = 24  # Conservative number of workers
    timeout_seconds = 3000  # 40 minutes timeout
    failed_shards = []
    
    print(f"Processing {len(pending_shards)} pending shards...")
    
    with ProcessPool(max_workers=max_workers) as pool:
        future = pool.map(process_shard, pending_shards, timeout=timeout_seconds)
        
        try:
            for i, result in enumerate(future.result()):
                if isinstance(result, Exception):
                    print(f"Shard {pending_shards[i]} failed: {result}")
                    failed_shards.append(pending_shards[i])
                else:
                    if i % 5 == 0:  # Progress indicator every 5 shards
                        print(f"Progress: {i+1}/{len(pending_shards)} shards completed")
        except Exception as e:
            print(f"Processing error: {e}")
    
    print(f"Processing completed. Failed shards: {len(failed_shards)}")
else:
    print("No pending shards to process!")

Processing 83 pending shards...
Processing shard 2 ...Processing shard 0 ...Processing shard 1 ...Processing shard 3 ...

Processing shard 4 ...

Processing shard 5 ...Processing shard 6 ...Processing shard 8 ...
Processing shard 7 ...
Processing shard 9 ...Processing shard 10 ...

Processing shard 11 ...


Processing shard 14 ...Shard 0: 6657 examples to processShard 3: 6657 examples to processProcessing shard 13 ...Shard 5: 6657 examples to processProcessing shard 15 ...Shard 1: 6657 examples to processShard 6: 6657 examples to processShard 7: 6657 examples to processShard 2: 6657 examples to process
Processing shard 16 ...
Shard 4: 6657 examples to processShard 8: 6657 examples to process

Processing shard 12 ...

Processing shard 20 ...

Processing shard 17 ...Processing shard 19 ...

Processing shard 22 ...


Shard 10: 6657 examples to process
Shard 9: 6657 examples to processProcessing shard 18 ...Processing shard 21 ...




Shard 14: 6657 examples to processShard 15: 6657 exampl

Phonemizing shard 10:   9% 612/6657 [49:56<5:24:37,  3.22s/ examples]]]

Processing error: [Errno Task timeout] 3000


Phonemizing shard 23:  16% 1084/6656 [49:57<3:18:50,  2.14s/ examples]

Processing shard 24 ...
Processing shard 25 ...Shard 24: 6656 examples to process

Processing shard 26 ...Shard 25: 6656 examples to process

Shard 26: 6656 examples to processProcessing shard 27 ...

Shard 27: 6656 examples to processProcessing shard 28 ...

Processing shard 29 ...Shard 28: 6656 examples to process

Processing shard 30 ...
Shard 29: 6656 examples to process
Processing shard 31 ...Shard 30: 6656 examples to process

Processing shard 47 ...Shard 31: 6656 examples to process



Phonemizing shard 21:  52% 3487/6657 [49:56<25:48,  2.05 examples/s]

Shard 47: 6656 examples to processProcessing shard 48 ...

Processing shard 49 ...Shard 48: 6656 examples to process
Processing shard 50 ...

Processing shard 51 ...
Shard 50: 6656 examples to process
Shard 51: 6656 examples to process
Shard 49: 6656 examples to processProcessing shard 52 ...Processing shard 53 ...


Processing shard 54 ...
Shard 52: 6656 examples to processShard 53: 6656 examples to process
Shard 54: 6656 examples to process



Phonemizing shard 20:  49% 3250/6657 [49:59<04:39, 12.21 examples/s]s]

Processing shard 55 ...Processing shard 56 ...



Phonemizing shard 25:   0% 0/6656 [00:00<?, ? examples/s]

Shard 55: 6656 examples to processProcessing shard 57 ...

Processing shard 58 ...
Shard 56: 6656 examples to processShard 58: 6656 examples to processProcessing shard 59 ...Shard 57: 6656 examples to process


Phonemizing shard 24:   0% 0/6656 [00:00<?, ? examples/s]






Phonemizing shard 28:   0% 0/6656 [00:00<?, ? examples/s]

Shard 59: 6656 examples to process

Phonemizing shard 26:   0% 0/6656 [00:00<?, ? examples/s]


Processing shard 60 ...

Phonemizing shard 31:   0% 0/6656 [00:00<?, ? examples/s]

Processing shard 61 ...

Shard 60: 6656 examples to processShard 61: 6656 examples to processProcessing shard 62 ...


Shard 62: 6656 examples to process


Phonemizing shard 59:  43% 2879/6656 [49:53<2:26:07,  2.32s/ examples] 

Processing shard 63 ...
Processing shard 64 ...
Shard 63: 6656 examples to processShard 64: 6656 examples to process
Processing shard 65 ...

Shard 65: 6656 examples to process


Phonemizing shard 64:   0% 0/6656 [00:00<?, ? examples/s]examples/s]]]

Processing shard 66 ...
Processing shard 67 ...

Phonemizing shard 65:   0% 0/6656 [00:00<?, ? examples/s]

Shard 66: 6656 examples to process

Phonemizing shard 63:   0% 0/6656 [00:00<?, ? examples/s]



Processing shard 68 ...Shard 67: 6656 examples to process
Shard 68: 6656 examples to processProcessing shard 69 ...


Processing shard 70 ...Shard 69: 6656 examples to process

Processing shard 72 ...Shard 70: 6656 examples to process

Processing shard 74 ...
Processing shard 75 ...
Shard 72: 6656 examples to processProcessing shard 76 ...Shard 74: 6656 examples to process


Phonemizing shard 57:  14% 955/6656 [49:56<5:39:43,  3.58s/ examples]


Shard 75: 6656 examples to process
Processing shard 78 ...Processing shard 77 ...


Processing shard 79 ...Shard 76: 6656 examples to process

Processing shard 80 ...Shard 79: 6656 examples to processShard 78: 6656 examples to process

Shard 77: 6656 examples to process

Phonemizing shard 55:  12% 803/6656 [49:56<8:16:28,  5.09s/ examples] 



Shard 80: 6656 examples to process


Phonemizing shard 61:  13% 840/6656 [49:57<5:56:45,  3.68s/ examples]]

Processing shard 81 ...Processing shard 82 ...

Shard 82: 6656 examples to processProcessing shard 83 ...Processing shard 84 ...
Shard 81: 6656 examples to process


Processing shard 86 ...Shard 84: 6656 examples to processShard 83: 6656 examples to processProcessing shard 85 ...


Processing shard 87 ...
Shard 86: 6656 examples to process

Processing shard 88 ...
Shard 85: 6656 examples to process
Shard 87: 6656 examples to process
Shard 88: 6656 examples to process


Phonemizing shard 78:   0% 0/6656 [00:00<?, ? examples/s]

### Collect all shards to form the processed dataset

In [None]:
from datasets import load_from_disk, concatenate_datasets

output = [dI for dI in os.listdir(root_directory) if os.path.isdir(os.path.join(root_directory,dI))]
datasets = []
for o in output:
    directory = root_directory + "/" + o
    try:
        shard = load_from_disk(directory)
        datasets.append(shard)
        print("%s loaded" % o)
    except:
        continue

In [None]:
dataset = concatenate_datasets(datasets)
dataset.save_to_disk(config['data_folder'])
print('Dataset saved to %s' % config['data_folder'])

In [None]:
# check the dataset size
dataset

### Remove unneccessary tokens from the pre-trained tokenizer
The pre-trained tokenizer contains a lot of tokens that are not used in our dataset, so we need to remove these tokens. We also want to predict the word in lower cases because cases do not matter that much for TTS. Pruning the tokenizer is much faster than training a new tokenizer from scratch. 

In [None]:
from simple_loader import FilePathDataset, build_dataloader

file_data = FilePathDataset(dataset)
loader = build_dataloader(file_data, num_workers=32, batch_size=128)

In [None]:
special_token = config['dataset_params']['word_separator']

In [None]:
# get all unique tokens in the entire dataset

from tqdm import tqdm

unique_index = [special_token]
for _, batch in enumerate(tqdm(loader)):
    unique_index.extend(batch)
    unique_index = list(set(unique_index))

In [None]:
# get each token's lower case

lower_tokens = []
for t in tqdm(unique_index):
    word = tokenizer.decode([t])
    if word.lower() != word:
        t = tokenizer.encode([word.lower()])[0]
        lower_tokens.append(t)
    else:
        lower_tokens.append(t)

In [None]:
lower_tokens = (list(set(lower_tokens)))

In [None]:
# redo the mapping for lower number of tokens

token_maps = {}
for t in tqdm(unique_index):
    word = tokenizer.decode([t])
    word = word.lower()
    new_t = tokenizer.encode([word.lower()])[0]
    token_maps[t] = {'word': word, 'token': lower_tokens.index(new_t)}

In [None]:
import pickle
with open(config['dataset_params']['token_maps'], 'wb') as handle:
    pickle.dump(token_maps, handle)
print('Token mapper saved to %s' % config['dataset_params']['token_maps'])

### Test the dataset with dataloader


In [None]:
from dataloader import build_dataloader

train_loader = build_dataloader(dataset, batch_size=32, num_workers=0, dataset_config=config['dataset_params'])

In [None]:
_, (words, labels, phonemes, input_lengths, masked_indices) = next(enumerate(train_loader))

In [None]:
# Inspect shard content - lihat isi phoneme dari shard tertentu

from datasets import load_from_disk

def inspect_shard(shard_number, num_samples=5):
    """
    Melihat isi phoneme dari shard tertentu
    
    Args:
        shard_number: nomor shard yang ingin dilihat (0-99)
        num_samples: jumlah contoh yang ingin ditampilkan
    """
    shard_directory = f"{root_directory}/shard_{shard_number}"
    
    if not os.path.exists(shard_directory):
        print(f"Shard {shard_number} tidak ada di {shard_directory}")
        return
    
    try:
        # Load shard
        shard_data = load_from_disk(shard_directory)
        print(f"Shard {shard_number} info:")
        print(f"- Total samples: {len(shard_data)}")
        print(f"- Columns: {shard_data.column_names}")
        print("="*60)
        
        # Tampilkan beberapa contoh
        for i in range(min(num_samples, len(shard_data))):
            sample = shard_data[i]
            print(f"\nSample {i+1}:")
            print(f"Input IDs: {sample['input_ids'][:10]}{'...' if len(sample['input_ids']) > 10 else ''}")
            print(f"Phonemes: {sample['phonemes'][:10]}{'...' if len(sample['phonemes']) > 10 else ''}")
            
            # Decode beberapa token untuk melihat isi sebenarnya
            if len(sample['input_ids']) > 0:
                decoded_words = [tokenizer.decode([token_id]) for token_id in sample['input_ids'][:5]]
                print(f"Decoded words (first 5): {decoded_words}")
            
            print(f"Phonemes (first 5): {sample['phonemes'][:5]}")
            print("-"*40)
            
    except Exception as e:
        print(f"Error loading shard {shard_number}: {e}")

# Contoh penggunaan: lihat isi shard 0
inspect_shard(0, num_samples=3)

In [None]:
import pandas as pd
from tqdm import tqdm

def create_excel_from_dataset(dataset_sample, filename="wikipedia_processed.xlsx"):
    """
    Membuat file Excel dari sampel dataset yang telah diproses.
    
    Args:
        dataset_sample: Sampel dari dataset yang akan diproses.
        filename: Nama file Excel yang akan disimpan.
    """
    processed_data = []
    
    print(f"Processing {len(dataset_sample)} samples to create Excel file...")
    
    for sample in tqdm(dataset_sample):
        input_ids = sample['input_ids']
        phonemes = sample['phonemes']
        
        # Decode input_ids kembali menjadi teks
        # Setiap ID di-decode secara terpisah dan digabungkan dengan spasi
        decoded_text = ' '.join([tokenizer.decode([token_id]) for token_id in input_ids])
        
        # Gabungkan daftar phonemes menjadi satu string
        phonemes_str = ' '.join(phonemes)
        
        processed_data.append({
            'text': decoded_text,
            'phonemes': phonemes_str,
            'input_ids': str(input_ids)  # Simpan sebagai string agar mudah dibaca di Excel
        })
        
    # Buat DataFrame dari data yang diproses
    df = pd.DataFrame(processed_data)
    
    # Simpan DataFrame ke file Excel
    df.to_excel(filename, index=False)
    print(f"\nDataset berhasil disimpan ke {filename}")

# Ambil sebagian kecil dari dataset untuk dibuatkan Excel (misalnya 1000 baris pertama)
num_samples_for_excel = 1000

# TAMBAHKAN KODE INI UNTUK MENJALANKAN FUNGSI:
if len(dataset) > 0:
    # Ambil sampel dari dataset
    dataset_sample = dataset.select(range(min(num_samples_for_excel, len(dataset))))
    
    # Jalankan fungsi untuk membuat Excel
    create_excel_from_dataset(dataset_sample, "wikipedia_processed.xlsx")
else:
    print("Dataset kosong atau belum dimuat!")