# Notebook for preprocessing Wikipedia (Indonesia) dataset

### Initilizing phonemizer and tokenizer

In [1]:
import yaml

config_path = "Configs/config.yml" # you can change it to anything else
config = yaml.safe_load(open(config_path))

In [2]:
from phonemize import phonemize

In [3]:
# filepath: /workspace/PL-BERT/preprocess.ipynb
# Tambahkan kode phonemizer kustom Anda di sini
import subprocess
import re
from functools import lru_cache
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
import pandas as pd
from lingua import Language, LanguageDetectorBuilder
import warnings
from tqdm import tqdm 

warnings.filterwarnings("ignore", message="Trying to detect language from a single word.")

languages = [Language.ENGLISH, Language.INDONESIAN]
detector = LanguageDetectorBuilder.from_languages(*languages).build()

@lru_cache(maxsize=100_000)
def detect_lang(word: str) -> str:
    result = detector.detect_language_of(word)
    if result is None:
        return "id"
    return "en" if result == Language.ENGLISH else "id"

@lru_cache(maxsize=100_000)
def phonemize_word(word: str, ipa: bool, keep_stress: bool, sep: str) -> str:
    lang = detect_lang(word)
    lang_map = {"id": "id", "en": "en-us"}
    voice = lang_map.get(lang, "id")
    cmd = ["espeak-ng", "-v", voice, "-q", f"--sep={sep}", word]
    if ipa:
        cmd.insert(3, "--ipa")
    else:
        cmd.insert(3, "-x")
    try:
        result = subprocess.run(cmd, capture_output=True)
        phonemes = result.stdout.decode("utf-8", errors="ignore").strip()
        phonemes = phonemes.replace("\ufeff", "")
        if not keep_stress:
            phonemes = re.sub(r"[ˈˌ]", "", phonemes)
        return phonemes
    except (subprocess.TimeoutExpired, Exception):
        return word

class EnIndPhonemizer:
    def __init__(self, ipa=True, keep_stress=False, sep="", max_workers=None):
        self.ipa = ipa
        self.keep_stress = keep_stress
        self.sep = sep
        self.max_workers = max_workers or 4

    def phonemize(self, text: str) -> str:
        if not text:
            return ""
        words = text.strip().split()
        phonemized_words = [
            phonemize_word(w, self.ipa, self.keep_stress, self.sep) for w in words
        ]
        return " ".join(phonemized_words)

    def process_in_parallel(self, texts: list[str]) -> list[str]:
        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            return list(
                tqdm(
                    executor.map(self.phonemize, texts),
                    total=len(texts),
                    desc="Phonemizing Sentences"
                )
            )

In [4]:
import phonemizer
global_phonemizer = EnIndPhonemizer(ipa=True, keep_stress=True, sep="|")

In [5]:
import os
os.environ['TRUST_REMOTE_CODE'] = 'True'

In [6]:
from transformers import TransfoXLTokenizer
tokenizer = TransfoXLTokenizer.from_pretrained(config['dataset_params']['tokenizer']) # you can use any other tokenizers if you want to

  from .autonotebook import tqdm as notebook_tqdm
`TransfoXL` was deprecated due to security issues linked to `pickle.load` in `TransfoXLTokenizer`. See more details on this model's documentation page: `https://github.com/huggingface/transformers/blob/main/docs/source/en/model_doc/transfo-xl.md`.


### Process dataset

Since using load_dataset with the Indonesian Wikipedia (id) resulted in errors (e.g., "Not Found"), we will download and load the dataset manually.

You can download the dataset from this link: https://huggingface.co/datasets/wikimedia/wikipedia/tree/main/20231101.id.

In [7]:
from datasets import load_dataset

# Use a glob pattern to load all Parquet files in the 'wikipedia' folder.
# This pattern will search for all files ending with '.parquet' within the folder.
parquet_folder = "/workspace/src/PL-BERT-ID/wikipedia/*.parquet"

try:
    dataset = load_dataset("parquet", data_files=parquet_folder)
    if isinstance(dataset, dict) or hasattr(dataset, "keys"):
        split_name = "train" if "train" in dataset else list(dataset.keys())[0]
        dataset = dataset[split_name]
    print("Dataset loaded successfully!")
    print(dataset)
except Exception as e:
    print(f"An error occurred while loading the dataset: {e}")

Dataset loaded successfully!
Dataset({
    features: ['id', 'url', 'title', 'text'],
    num_rows: 665622
})


In [8]:
root_directory = "./wiki_phoneme" # set up root directory for multiprocessor processing

In [9]:
import os
num_shards =100

def process_shard(i):
    directory = root_directory + "/shard_" + str(i)
    if os.path.exists(directory):
        print("Shard %d already exists!" % i)
        return
    print('Processing shard %d ...' % i)
    shard = dataset.shard(num_shards=num_shards, index=i)
    processed_dataset = shard.map(lambda t: {
        'phonemes': global_phonemizer.phonemize(t['text']),
        'input_ids': tokenizer.encode(t['text'])  # Tambahkan tokenization jika diperlukan
    }, remove_columns=['text'])
    if not os.path.exists(directory):
        os.makedirs(directory)
    processed_dataset.save_to_disk(directory)

In [10]:
from pebble import ProcessPool
from concurrent.futures import TimeoutError

#### Note: You will need to run the following cell multiple times to process all shards because some will fail. Depending on how fast you process each shard, you will need to change the timeout to a longer value to make more shards processed before being killed.


In [12]:
import os
from pebble import ProcessPool
from concurrent.futures import TimeoutError

max_workers = 20
failed_shards = []
with ProcessPool(max_workers=max_workers) as pool:
    future = pool.map(process_shard, range(num_shards), timeout=600)
    for i, result in enumerate(future.result()):
        if isinstance(result, Exception):
            print(f"Shard {i} failed: {result}")
            failed_shards.append(i)

Processing shard 9 ...Processing shard 3 ...Processing shard 1 ...Processing shard 0 ...Processing shard 2 ...

Shard 11 already exists!Shard 8 already exists!
Shard 4 already exists!
Shard 6 already exists!Shard 7 already exists!Shard 10 already exists!Shard 5 already exists!
Shard 14 already exists!Shard 13 already exists!Shard 12 already exists!

Shard 15 already exists!
Shard 16 already exists!





Shard 17 already exists!


Shard 18 already exists!Shard 20 already exists!Shard 21 already exists!
Shard 22 already exists!Shard 23 already exists!Shard 19 already exists!
Shard 24 already exists!
Shard 25 already exists!
Shard 26 already exists!





Shard 29 already exists!Shard 28 already exists!Shard 27 already exists!Shard 30 already exists!Shard 31 already exists!



Shard 32 already exists!

Shard 34 already exists!Shard 33 already exists!



Shard 35 already exists!



Shard 37 already exists!Shard 36 already exists!
Shard 40 already exists!Shard 41 already exists!Shard 38 already exists!Shard 39 already exists!Shard 42 already exists!




Shard 43 already exists!


Shard 44 already exists!



Shard 46 already exists!



Shard 45 already exists!Shard 47 already exists!





Shard 48 already exists!



Shard 49 already exists!Shard 51 already exists!Shard 50 already exists!Shard 52 already exists!Shard 55 already exists!Shard 53 already exists!

Shard 54 already exists!
Shard 56 already exists!



Shard 58 already exists!Shard 57 already exists!Shard 59 already exists!




Shard 60 already exists!Shard 61 already exists!Shard 62 already exists!Shard 64 already exists!Shard 63 already exists!Shard 65 already exists!Shard 66 already exists!Shard 71 already exists!







Shard 70 already exists!Shard 67 already exists!Shard 69 already exists!Shard 68 already exists!Shard 73 already exists!Shard 75 already exists!Shard 74 already exists!Shard 72 already exists!

Saving the dataset (0/1 shards):   0%|          | 0/6657 [00:00<?, ? examples/s]






Shard 77 already exists!

Shard 76 already exists!Shard 82 already exists!
Shard 80 already exists!Shard 78 already exists!Shard 81 already exists!Shard 79 already exists!






Saving the dataset (0/1 shards):   0%|          | 0/6657 [00:00<?, ? examples/s]



Shard 83 already exists!Shard 84 already exists!

Shard 90 already exists!Shard 87 already exists!Shard 85 already exists!Shard 89 already exists!Shard 86 already exists!Shard 88 already exists!Shard 91 already exists!


Saving the dataset (0/1 shards):   0%|          | 0/6657 [00:00<?, ? examples/s]







Shard 94 already exists!Shard 92 already exists!Shard 96 already exists!Shard 97 already exists!Shard 95 already exists!Shard 93 already exists!Shard 99 already exists!Shard 98 already exists!

Saving the dataset (0/1 shards):   0%|          | 0/6657 [00:00<?, ? examples/s]











Saving the dataset (1/1 shards): 100%|██████████| 6657/6657 [00:00<00:00, 38615.40 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 6657/6657 [00:00<00:00, 38395.35 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 6657/6657 [00:00<00:00, 38670.91 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 6657/6657 [00:00<00:00, 38238.08 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 6657/6657 [00:00<00:00, 38071.96 examples/s]


In [13]:
if failed_shards:
    print("Retrying failed shards...")
    with ProcessPool(max_workers=max_workers) as pool:
        pool.map(process_shard, failed_shards, timeout=600)

### Collect all shards to form the processed dataset

In [14]:
from datasets import load_from_disk, concatenate_datasets

output = [dI for dI in os.listdir(root_directory) if os.path.isdir(os.path.join(root_directory,dI))]
datasets = []
for o in output:
    directory = root_directory + "/" + o
    try:
        shard = load_from_disk(directory)
        datasets.append(shard)
        print("%s loaded" % o)
    except:
        continue

shard_0 loaded
shard_1 loaded
shard_10 loaded
shard_11 loaded
shard_12 loaded
shard_13 loaded
shard_14 loaded
shard_15 loaded
shard_16 loaded
shard_17 loaded
shard_18 loaded
shard_19 loaded
shard_2 loaded
shard_20 loaded
shard_21 loaded
shard_22 loaded
shard_23 loaded
shard_24 loaded
shard_25 loaded
shard_26 loaded
shard_27 loaded
shard_28 loaded
shard_29 loaded
shard_3 loaded
shard_30 loaded
shard_31 loaded
shard_32 loaded
shard_33 loaded
shard_34 loaded
shard_35 loaded
shard_36 loaded
shard_37 loaded
shard_38 loaded
shard_39 loaded
shard_4 loaded
shard_40 loaded
shard_41 loaded
shard_42 loaded
shard_43 loaded
shard_44 loaded
shard_45 loaded
shard_46 loaded
shard_47 loaded
shard_48 loaded
shard_49 loaded
shard_5 loaded
shard_50 loaded
shard_51 loaded
shard_52 loaded
shard_53 loaded
shard_54 loaded
shard_55 loaded
shard_56 loaded
shard_57 loaded
shard_58 loaded
shard_59 loaded
shard_6 loaded
shard_60 loaded
shard_61 loaded
shard_62 loaded
shard_63 loaded
shard_64 loaded
shard_65 loaded

In [15]:
dataset = concatenate_datasets(datasets)
dataset.save_to_disk(config['data_folder'])
print('Dataset saved to %s' % config['data_folder'])

Saving the dataset (3/3 shards): 100%|██████████| 665637/665637 [00:11<00:00, 58972.94 examples/s]

Dataset saved to wikipedia_20220301.en.processed





In [16]:
# check the dataset size
dataset

Dataset({
    features: ['id', 'url', 'title', 'phonemes', 'input_ids'],
    num_rows: 665637
})

### Remove unneccessary tokens from the pre-trained tokenizer
The pre-trained tokenizer contains a lot of tokens that are not used in our dataset, so we need to remove these tokens. We also want to predict the word in lower cases because cases do not matter that much for TTS. Pruning the tokenizer is much faster than training a new tokenizer from scratch. 

In [17]:
from simple_loader import FilePathDataset, build_dataloader

file_data = FilePathDataset(dataset)
loader = build_dataloader(file_data, num_workers=32, batch_size=128)

In [18]:
special_token = config['dataset_params']['word_separator']

In [19]:
# get all unique tokens in the entire dataset

from tqdm import tqdm

unique_index = [special_token]
for _, batch in enumerate(tqdm(loader)):
    unique_index.extend(batch)
    unique_index = list(set(unique_index))

100%|██████████| 5200/5200 [00:38<00:00, 133.38it/s]


In [20]:
# get each token's lower case

lower_tokens = []
for t in tqdm(unique_index):
    word = tokenizer.decode([t])
    if word.lower() != word:
        t = tokenizer.encode([word.lower()])[0]
        lower_tokens.append(t)
    else:
        lower_tokens.append(t)

100%|██████████| 172557/172557 [00:02<00:00, 69194.34it/s]


In [21]:
lower_tokens = (list(set(lower_tokens)))

In [22]:
# redo the mapping for lower number of tokens

token_maps = {}
for t in tqdm(unique_index):
    word = tokenizer.decode([t])
    word = word.lower()
    new_t = tokenizer.encode([word.lower()])[0]
    token_maps[t] = {'word': word, 'token': lower_tokens.index(new_t)}

100%|██████████| 172557/172557 [00:15<00:00, 11251.27it/s]


In [23]:
import pickle
with open(config['dataset_params']['token_maps'], 'wb') as handle:
    pickle.dump(token_maps, handle)
print('Token mapper saved to %s' % config['dataset_params']['token_maps'])

Token mapper saved to token_maps.pkl


### Test the dataset with dataloader


In [24]:
from dataloader import build_dataloader

train_loader = build_dataloader(dataset, batch_size=32, num_workers=0, dataset_config=config['dataset_params'])

177


In [25]:
_, (words, labels, phonemes, input_lengths, masked_indices) = next(enumerate(train_loader))

Bad pipe message: %s [b'"Chromium";v="140", "Not=A?Brand";v="24", "Microsoft Edge']
Bad pipe message: %s [b'v="140"\r\nsec-ch-ua-mobile: ?0\r\nse', b'ch-ua-platform: "Windows"\r\nUpgrade-Insecure-Requests: 1\r\nUser-Agent: Mozilla/5.0 (Windows NT 10.0;', b'in64; x64) AppleWebKit/537.36 (', b'TML, like Gecko) Chrome/140.0.0.0 Safari/537.36 Edg/140.0.0.0\r\nAccept: tex']
Bad pipe message: %s [b'html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exch', b'ge;v=b3;q=0.7\r\nSec-Fetch-Site: none\r\nSec-Fetch-Mode: navigate\r\nSec-Fetch-User: ?1\r\nSec-Fetch-Des']
Bad pipe message: %s [b'ol: max-age=0\r\nsec-ch-ua: "Chromium";v="140", "Not=A?Brand";v="24", "Microsoft Edge";v="140"\r\nsec-ch-ua-mobile: ?0\r']
Bad pipe message: %s [b'ec-ch-ua-', b'atform: "Windows"\r\nUpgrade-Insecure-Requests: 1\r\nUser-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWeb']
Bad pipe message: %s [b't/537.36 (KHTML, like Gecko) Chrome/140.0.0.0 Safa