In [2]:
import datasets
from datasets import load_from_disk
from datasets import concatenate_datasets

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import tiktoken

encoding = tiktoken.encoding_for_model("gpt-4o-mini")

def num_tokens_from_string(string: str, encoder) -> int:
    """Count the tokens that `encoder` produces for `string`.

    Args:
        string: Text to tokenize.
        encoder: Object exposing ``encode(text)`` that returns a sized
            sequence of tokens.

    Returns:
        The length of ``encoder.encode(string)``.
    """
    return len(encoder.encode(string))

def count_tokens_in_dataset(dataset, field_name, num_tokens_from_string, encoder):
    """Add up the token counts of one text field across a whole dataset.

    Args:
        dataset: Iterable of rows; each row is indexed with `field_name`.
        field_name: Key of the text column to measure.
        num_tokens_from_string: Callable ``(text, encoder) -> int`` used to
            count the tokens of a single row.
        encoder: Tokenizer object forwarded unchanged to the counting callable.

    Returns:
        The sum of the per-row token counts (0 for an empty dataset).
    """
    return sum(
        num_tokens_from_string(row[field_name], encoder)
        for row in dataset
    )

## Dataset Work

In [20]:
cbn_wiki_id = load_from_disk("dataset/paralel_id_cbn_16k")
cbn_wiki_jv = load_from_disk("dataset/paralel_jv_cbn_3k")
cbn_wiki_su = load_from_disk("dataset/paralel_su_cbn_3k")

In [21]:
id_wiki_id = load_from_disk("synthetic/id_titles/id_wiki-id/translated")
id_wiki_jv = load_from_disk("synthetic/id_titles/id_wiki-jv/translated")
id_wiki_su = load_from_disk("synthetic/id_titles/id_wiki-su/translated")

In [22]:
from datasets import concatenate_datasets

id_cbn = concatenate_datasets([cbn_wiki_id, id_wiki_id])
jv_cbn = concatenate_datasets([cbn_wiki_jv, id_wiki_jv])
su_cbn = concatenate_datasets([cbn_wiki_su, id_wiki_su])

In [25]:
len(su_cbn)

28551

In [26]:
id_cbn.save_to_disk("dataset/paralel_id_cbn_127k")
jv_cbn.save_to_disk("dataset/paralel_jv_cbn_24k")
su_cbn.save_to_disk("dataset/paralel_su_cbn_28k")

Saving the dataset (1/1 shards): 100%|██████████| 127648/127648 [00:01<00:00, 78528.19 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 23849/23849 [00:00<00:00, 121734.93 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 28551/28551 [00:00<00:00, 198725.47 examples/s]


In [27]:
paralel_bali_dict = load_from_disk("dataset/paralel_dataset_from_bali_dict")  

In [29]:
paralel_bali_dict = paralel_bali_dict.remove_columns(["prompt"])

In [31]:
paralel_60k = load_from_disk("dataset/paralel_dataset_60k")

In [33]:
def convert_dataset(data):
    """Copy the row's `custom_id` value into an `id` field.

    The row is modified in place and the same object is returned, which is
    the shape expected from a non-batched `Dataset.map` callback.
    """
    custom_id = data["custom_id"]
    data["id"] = custom_id
    return data

In [35]:
paralel_60k = paralel_60k.map(convert_dataset, remove_columns=["custom_id", "text", "prompt_text"])

Map: 100%|██████████| 59257/59257 [00:08<00:00, 7163.49 examples/s] 


In [39]:
paralel_300k = load_from_disk("dataset/paralel_dataset_300k")

In [40]:
paralel_300k

Dataset({
    features: ['id', 'cirebonese', 'indonesian', 'balinese'],
    num_rows: 296203
})

In [41]:
paralel_bali_dict

Dataset({
    features: ['id', 'balinese', 'indonesian', 'cirebonese'],
    num_rows: 22248
})

In [42]:
paralel_60k

Dataset({
    features: ['indonesian', 'cirebonese', 'balinese', 'id'],
    num_rows: 59257
})

In [100]:
348695 + 43818

392513

In [95]:
paralel_377k = load_from_disk("dataset/paralel_3_lang/paralel_dataset_377k_dedup")

In [96]:
paralel_44k = load_from_disk("dataset/paralel_3_lang/paralel_dataset_44k")

In [97]:
paralel_377k

Dataset({
    features: ['id', 'cirebonese', 'indonesian', 'balinese'],
    num_rows: 348695
})

In [99]:
paralel_44k

Dataset({
    features: ['id', 'indonesian', 'cirebonese', 'balinese'],
    num_rows: 43818
})

In [101]:
from datasets import concatenate_datasets

paralel_400k = concatenate_datasets([paralel_377k, paralel_44k])

In [102]:
paralel_400k.save_to_disk("dataset/paralel_3_lang/paralel_dataset_400k")

Saving the dataset (4/4 shards): 100%|██████████| 392513/392513 [00:27<00:00, 14353.79 examples/s]


In [116]:
paralel_400k[456]

{'id': '826b756d-1984-4a9b-9e44-603ef375821c',
 'cirebonese': 'Benteng Tiworo punika salah satunggal badan peninggalan sajarah saking Karaton Tiworo. Badan lan abha saking kubu punika taksih bisa tag saksikan hinga babak yasa. Benteng Tiworo kapanggihan ing Kelurahan Waumere, anatap Tiworo Kepulauan, kabupaten Muna Barat, Sulawesi Tenggara.\n\nBenteng Tiworo kadamel saking watu-watu alit lan ageng ingkang asusun rapi, lan ngadeg kuwat ing pusér Kota anatap Tiworo Kepulauan. Dhuwuré bervariasi, antawis telu ngantos papat meter. Kubu punika akělit amba kirang langkung loro hektar. Lokasi punika pinakangga saking imamat Karaton Tiworo. Hinga sapunika lokasi Benteng Tiworopun taksih disaklarkan, sanadyan badan aslinipun sampun ngalami owah-owahan.\n\nSajarah\nMiturut sajarah, Benteng Tiworo dibangun ing abad XVI dening Raja Muna ingkang La Ode Asmana. Pembuatan kubu punika saking bahan watu ingkang dikangkat masyarakat kalawan cara serendeng sepanjang 150 kilometer. Watu-watu ingkang dipun

In [52]:
id_cbn = load_from_disk("dataset/paralel_2_lang/paralel_id_cbn_127k_dedup")

In [53]:
existing_id_text = set(paralel_377k["indonesian"])

In [54]:
def filter_id_cbn(data):
    """Keep a row only when its `text` is not a member of `existing_id_text`.

    `existing_id_text` is the notebook-level collection built from the
    `indonesian` column of `paralel_377k`.
    """
    already_present = data["text"] in existing_id_text
    return not already_present

In [55]:
filtered_id_cbn = id_cbn.filter(filter_id_cbn)

Filter: 100%|██████████| 92504/92504 [00:13<00:00, 6913.87 examples/s]


In [57]:
filtered_id_cbn.save_to_disk("dataset/paralel_2_lang/paralel_id_cbn_127k_filtered")

Saving the dataset (1/1 shards): 100%|██████████| 44743/44743 [00:00<00:00, 53675.53 examples/s]


In [78]:
from datasets import load_dataset

id_mmlu = load_dataset("indolem/IndoMMLU")

In [79]:
bali_mmlu = id_mmlu["test"].filter(lambda x: x["subject"] == "Balinese")

In [80]:
bali_mmlu[0]

{'subject': 'Balinese',
 'group': 'Local languages and cultures',
 'level': 'SD',
 'class': '2',
 'question': 'Dugas Galungane titiang mabakti nganggon udeng baru. Udeng basa Bali alusne...',
 'options': "['A. destar', 'B. kampuh', 'C. wastra']",
 'answer': 'A',
 'is_for_fewshot': '0'}

In [81]:
import ast
import uuid

def transform_data(item):
    """Reshape one IndoMMLU row into the multiple-choice record layout used here.

    Args:
        item: Row with string fields `options` (the repr of a Python list
            such as "['A. destar', 'B. kampuh']"), `question`, `answer`
            and `class`.

    Returns:
        A dict with keys `context` (always empty), `question`, `choices`
        (parallel `label` / `text` lists, labels lower-cased), `answer`
        (lower-cased), `category` (always an empty list), `grade`
        (`class` converted to int) and `question_id` (a fresh random UUID4
        string).

    Raises:
        IndexError: if an option does not contain a '.' separator.
        ValueError: if `class` is not an integer string or `options` cannot
            be parsed as a Python literal.
    """
    option_labels = []
    option_texts = []

    # `options` arrives as a string, so turn it back into a real list first.
    for option in ast.literal_eval(item['options']):
        # Only the first dot separates the label from the text; later dots
        # belong to the option text itself.
        label_and_text = option.split('.', 1)
        option_labels.append(label_and_text[0].strip().lower())
        option_texts.append(label_and_text[1].strip())

    return {
        "context": "",
        "question": item['question'],
        "choices": {
            "label": option_labels,
            "text": option_texts
        },
        # Lower-cased so it can be compared directly with the labels above.
        "answer": item['answer'].lower(),
        "category": [],
        "grade": int(item['class']),
        "question_id": str(uuid.uuid4())
    }

In [82]:
# Reshape every row with transform_data and drop the listed source columns.
# ('class' used to be listed twice; each column only needs to be named once.)
bali_mmlu = bali_mmlu.map(transform_data, remove_columns=['options', 'class', 'subject', 'group', 'level', 'is_for_fewshot'])

Map: 100%|██████████| 471/471 [00:00<00:00, 7187.59 examples/s]


In [83]:
bali_mmlu[0]

{'question': 'Dugas Galungane titiang mabakti nganggon udeng baru. Udeng basa Bali alusne...',
 'answer': 'a',
 'context': '',
 'choices': {'label': ['a', 'b', 'c'], 'text': ['destar', 'kampuh', 'wastra']},
 'category': [],
 'grade': 2,
 'question_id': 'e22caaf5-7596-4d76-a04e-61637f60a8c1'}

In [84]:
bali_mmlu.save_to_disk("dataset/bali_mmlu")

Saving the dataset (1/1 shards): 100%|██████████| 471/471 [00:00<00:00, 60177.81 examples/s]


In [85]:
paralel_377k = load_from_disk("dataset/paralel_3_lang/paralel_dataset_377k_dedup")

In [86]:
paralel_377k

Dataset({
    features: ['id', 'cirebonese', 'indonesian', 'balinese'],
    num_rows: 348695
})

In [587]:
paralel_400k = load_from_disk("dataset/paralel_3_lang/combined_paralel_dataset_705k_dedup_clean")

In [588]:
print(count_tokens_in_dataset(paralel_400k, "indonesian", num_tokens_from_string, encoding))

217599839


In [589]:
print(count_tokens_in_dataset(paralel_400k, "cirebonese", num_tokens_from_string, encoding))

238299598


In [590]:
print(count_tokens_in_dataset(paralel_400k, "balinese", num_tokens_from_string, encoding))

239976914


In [266]:
import random

random.choice(paralel_400k)

{'text': 'Joel Robles Blázquez (lahir 17 Juni 1990), sing dikenal kanthi jeneng Joel, ya iku pemain bal-balan profésional Spanyol sing main dadi kiper kanggo klub Liga Utama Inggris Everton F.C. Robles miwiti karir profésional ing Atlético Madrid, wiwitané ing tim C ing taun 2008, sadurungé maju ing tim B ing taun sabanjuré, banjur diunggahaké dadi pemain senior. Dhèwèké dipindhah menyang Rayo Vallecano ing Januari 2012, lan setaun sangarepé nggabung karo klub Liga Utama Wigan Athletic, ing endi dhèwèké dadi bagéyan saka pamenang FA Cup Final 2013 sadurungé gabung karo Everton.\n\nRobles main ing pertandhingan internasional kanggo Spanyol ing tingkat U-16, U-17, U-21 lan U-23, lan dadi bagéan saka skuad sing menang ing Kejuaraan Eropah U-21 2013.\n\nKarir klub\n\nAtlético Madrid\nLair ing Getafe, Madrid, Robles kalebu asil produk pelatihan remaja ing Kutha Getafe. Dhèwèké gabung karo Atlético Madrid ing taun 2005. Dhèwèké ngrampungaké mangsa profésional kapisané ing mangsa (2009-10), d

In [248]:
random.choice(paralel_300k)

{'id': 'a1c3096e-943c-4787-9870-18017e6a701b',
 'cirebonese': 'Ari 2002 XG4 mangrupa hiji astéroid. Ieu asteroid téh bagéan tina astéroid Amor, anu nganjrek deukeut jeung marcapada. Ékséntrisitas orbit ieu astéroid kacatet gedéna 0.480, sedengkeun magnitudo mutlakna 18.2. Ari nu cios référénsina mah nyaéta MPO 237836.\n\nBebentukan\nKawas sakumna astéroid, ieu astéroid kabentuk tina nébula panonpoé primordial minangka beubeulahan planétisimal, objék di nébula marcapada ngora nu teu cukup badag pikeun robah jadi planét.\n\nRujukan\n\nTutumbu kaluar\nDaptar astéroid Amor - The sejagat Astronomical Union Minor Planet Center.\n\n338347\n338347',
 'indonesian': 'Ari 2002 XG4 mangrupa hiji astéroid. Ieu asteroid téh bagéan tina astéroid Amor, anu nganjrek deukeut jeung marcapada. Ékséntrisitas orbit ieu astéroid kacatet gedéna 0.480, sedengkeun magnitudo mutlakna 18.2. Ari nu jadi référénsina mah nyaéta MPO\xa0237836.\n\nBebentukan\nKawas sakumna astéroid, ieu astéroid kabentuk tina nébula p

In [75]:
pred = model.predict("halo halo bandung ibu kota priangan", 3)

In [246]:
import random

model.predict(random.choice(paralel_705k)["balinese"].split("\n")[-1])

(('__label__ind_Latn',), array([0.99998975], dtype=float32))

In [55]:
import datasets

# 1. Load the dataset
paralel_705k = datasets.load_from_disk("dataset/paralel_3_lang/combined_paralel_dataset_705k_dedup")

In [248]:
clean_paralel_705k = paralel_705k.filter(filter_all_text, num_proc=8)

Filter (num_proc=8): 100%|██████████| 557859/557859 [01:21<00:00, 6808.73 examples/s]


In [250]:
clean_paralel_705k

Dataset({
    features: ['id', 'cirebonese', 'indonesian', 'balinese'],
    num_rows: 491113
})

In [2]:
clean_paralel_705k = load_from_disk("dataset/paralel_3_lang/combined_paralel_dataset_705k_dedup_clean")

In [392]:
def filter_by_word_length(example):
    """Tell whether both translations of a row are of medium length.

    Returns True only when the `cirebonese` text and the `balinese` text each
    contain between 20 and 50 whitespace-separated words (inclusive).
    """
    word_counts = [
        len(example[column].split())
        for column in ('cirebonese', 'balinese')
    ]
    return all(20 <= count <= 50 for count in word_counts)

In [4]:
# Filter the dataset
filtered_dataset = clean_paralel_705k.filter(filter_by_word_length)

Filter: 100%|██████████| 491113/491113 [01:05<00:00, 7551.89 examples/s] 


In [68]:
import random

random.choice(filtered_dataset)

{'id': '60304c80-13e3-4a79-88bf-722ba2650aaf',
 'cirebonese': 'HAT-P-1b punika sěkěrět planet jaba surya ingkang terletak sukat 520,92 taun dilah saking Bumi. Planet niki akělit pada taun 2006 kalawan ngginakaken metode transit. HAT-P-1b akělit massa amun 0,525 massa Jupiter.\n\nReferensi \n\nPlanet jaba surya',
 'indonesian': 'HAT-P-1b adalah sebuah planet luar surya yang terletak sekitar 520,92 tahun cahaya dari Bumi. Planet ini ditemukan pada tahun 2006 dengan menggunakan metode  transit. HAT-P-1b memiliki massa sebesar 0,525 massa Jupiter.\n\nReferensi \n \n\nPlanet luar surya',
 'balinese': 'HAT-P-1b punika abulih planet jaba surya sane terletak sawatara 520,92 tahun dipta saking Bumi. Planet niki kangen ri tahun 2006 ajak makebah metode transit. HAT-P-1b madue massa amun 0,525 massa Jupiter.\n\nReferensi \n\nPlanet jaba surya'}

## Create Annotation Data

In [316]:
bali_paralel = load_from_disk("dataset/paralel_3_lang/paralel_dataset_22k")

In [393]:
bali_med = bali_paralel.filter(filter_by_word_length)

Filter: 100%|██████████| 22248/22248 [00:01<00:00, 17114.46 examples/s]


In [445]:
random.choice(bali_med)

{'id': '816a70ef-a61a-4b83-ad48-ee4f50b77804',
 'balinese': 'Ketut Santos Fernandez uli cerik jenek di Lampung. Uling cerik biasa tuturina ane serem-serem teken reramane. Gumi Bali ento tenget, kerana sabilang wai krama Baline jemet mabanten.',
 'indonesian': 'Ketut Santos Fernandez sejak kanak-kanak tinggal di Lampung. Semenjak usia dini selalu diceritakan hal yang mistis oleh orangtuanya. Pulau Bali disebut angker sebab setiap hari warga Bali rajin sembahyang dan menghaturkan banten.',
 'prompt': 'Translate the given Indonesian text in the <id_text> tag below into Cirebonese with the help of some word-to-word translation provided below. For one word, there can be multiple translations, and you need to choose the right one based on the context. The translations are as follows:\n- setiap hari: sabên, saban dina, amban dina, unggal dina, saban\n- sejak: atêwêk\n- tinggal: tilar, tinggal\n- di: teng, ning, dipun, di, ada\n- semenjak: jég, sajég\n- usia: umur, ayusa, ayusya\n- selalu: kad

In [319]:
filter_short = load_from_disk("dataset/paralel_3_lang/filtered_paralel_dataset_90k_short")

In [494]:
# Lower-case substrings that disqualify a row when they occur anywhere in its
# Indonesian text (presumably Wikipedia section headings and place-type words —
# confirm against how the list was chosen).
banned_words = ["referensi", "pranala luar", "lihat juga", "lihat pula", "rujukan", "catatan kaki", "pranala", "kota", "kecamatan", "kelurahan", "kabupaten", "negara", "provinsi", "desa", "adalah", "merujuk"]

def filter_id_text(data):
    """Return True when the row's Indonesian text contains no banned word.

    Intended as a `Dataset.filter` predicate, so it returns an explicit bool
    (previously it returned the row itself or None and relied on truthiness).

    Matching is a case-insensitive *substring* test: "kota" also rejects a
    text containing "kotak".

    Args:
        data: Row with a string field `indonesian`.

    Returns:
        False if any entry of `banned_words` occurs in the lower-cased text,
        True otherwise.
    """
    # Lower-case once instead of once per banned word.
    indonesian_lower = data["indonesian"].lower()
    return not any(word in indonesian_lower for word in banned_words)

In [495]:
clean_short = filter_short.filter(filter_id_text)

Filter: 100%|██████████| 93748/93748 [00:04<00:00, 20814.66 examples/s]


In [497]:
clean_short

Dataset({
    features: ['id', 'cirebonese', 'indonesian', 'balinese'],
    num_rows: 1714
})

In [513]:
random.choice(clean_short)

{'id': 'e4e2ec19-1ae4-4eae-bd49-490dd01368fc',
 'cirebonese': 'Jujuluk Natakusuma, sěkěrět jujuluk kepangeranan ingkang gage atêwêk adêg Amangkurat II ning Kesultanan Mataram, Jawa. Natakusuma, nama asli Paku Alam I.',
 'indonesian': 'Gelar Natakusuma, sebuah gelar kepangeranan yang dimulai sejak pemerintahan Amangkurat II di Kesultanan Mataram, Jawa.\n Natakusuma, nama asli Paku Alam I.',
 'balinese': 'Desak Natakusuma, abulih desak kepangeranan sane ngametuang sasukate pemerintahan Amangkurat II ring Kesultanan Mataram, Jawa. Natakusuma, aran asli Paku Alam I.'}

In [515]:
clean_short.save_to_disk("dataset/paralel_3_lang/filtered_paralel_dataset_1k_short_nowiki")

Saving the dataset (1/1 shards): 100%|██████████| 1714/1714 [00:00<00:00, 108252.33 examples/s]


In [516]:
import json

def load_dictionary(file_path):
    """Read the UTF-8 encoded JSON file at `file_path` and return its decoded content."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        raw_json = handle.read()
    return json.loads(raw_json)

In [517]:
bali_indo_dict = load_dictionary("dict/bali_idn.json")
indo_bali_dict = load_dictionary("dict/idn_bali.json")
cbn_indo_dict = load_dictionary("dict/cbn_idn.json")
indo_cbn_dict = load_dictionary("dict/idn_cbn.json")

In [None]:
import re
import random

def extract_symbols(word):
    """Return the runs of symbol characters at the start and end of `word`.

    A "symbol" is any character that is not a word character, apostrophe,
    backtick or ê/Ê. Either run may be empty. For a token made only of
    symbols, both runs are the whole token.

    Returns:
        A ``(prefix, suffix)`` tuple of strings.
    """
    leading = re.match(r'^[^\w\'`êÊ]*', word)
    trailing = re.search(r'[^\w\'`êÊ]*$', word)
    return leading.group(0), trailing.group(0)

def clean_word(word):
    """Strip every symbol from `word`, wherever it occurs.

    Word characters, apostrophes, backticks and ê/Ê are kept; everything
    else (including hyphens inside the word) is removed.
    """
    symbol_pattern = r'[^\w\'`êÊ]'
    return re.sub(symbol_pattern, '', word)

def correct_and_analyze_text(text, to_idn_dict, from_idn_dict):
    """Dictionary-check each word of `text` and substitute translations where possible.

    The text is split on whitespace (whitespace runs are preserved verbatim in
    the output). Every word is stripped of symbols with `clean_word` and then
    classified:

    * valid     - the cleaned word is empty, contains any character that is
                  not lowercase, or is a key of `to_idn_dict`; kept as is.
    * corrected - not in `to_idn_dict` but a key of `from_idn_dict` with a
                  non-empty translation list; replaced by the single
                  translation, or by a random one when several exist, with
                  the word's leading/trailing symbols re-attached.
    * invalid   - found in neither dictionary; kept as is.

    Args:
        text: Text to analyse.
        to_idn_dict: Container of words considered valid (membership test only).
        from_idn_dict: Mapping from a word to a sequence of replacement words.

    Returns:
        A dict with keys `corrected_text` (str), `valid_percentage`,
        `invalid_percentage`, `corrected_percentage` (floats in 0-100, all 0.0
        when the text contains no words) and `corrections`
        (original token -> replacement token).
    """
    corrected_parts = []
    total_words = 0
    valid_count = 0
    corrected_count = 0
    correction_dict = {}

    # The capture group makes re.split return the separators as well.
    for token in re.split(r'(\s+)', text):
        # re.split yields '' at the edges when the text starts or ends with
        # whitespace (and for an empty text). Those are not words: counting
        # them would inflate the "valid" share.
        if not token:
            continue

        # Whitespace runs are passed through untouched.
        if token.isspace():
            corrected_parts.append(token)
            continue

        total_words += 1
        cleaned_word = clean_word(token)

        # Only fully lowercase words are looked up; anything else is accepted.
        # NOTE(review): digits and apostrophes are not "lowercase" for
        # str.islower, so tokens such as "ke-2" are accepted without a lookup.
        if not cleaned_word or not all(ch.islower() for ch in cleaned_word):
            corrected_parts.append(token)
            valid_count += 1
            continue

        if cleaned_word in to_idn_dict:
            corrected_parts.append(token)
            valid_count += 1
            continue

        translations = from_idn_dict.get(cleaned_word, None)
        if not translations:
            # Unknown in both dictionaries: counted as invalid below.
            corrected_parts.append(token)
            continue

        prefix, suffix = extract_symbols(token)
        replacement = translations[0] if len(translations) == 1 else random.choice(translations)
        # Re-attach the surrounding symbols to the translation.
        replacement = prefix + replacement + suffix
        corrected_parts.append(replacement)
        corrected_count += 1
        correction_dict[token] = replacement

    invalid_count = total_words - valid_count - corrected_count

    def _percentage(count):
        # A text without any word has no meaningful ratio; report 0.0.
        return (count / total_words) * 100 if total_words else 0.0

    return {
        'corrected_text': ''.join(corrected_parts),
        'valid_percentage': _percentage(valid_count),
        'invalid_percentage': _percentage(invalid_count),
        'corrected_percentage': _percentage(corrected_count),
        'corrections': correction_dict
    }

In [519]:
def correct_and_analyze_text_ban_dataset(data):
    """Run the dictionary check on a row's Balinese text and flatten the result.

    Uses the notebook-level `bali_indo_dict` / `indo_bali_dict` dictionaries.

    Returns:
        A dict with the stripped `balinese` and `indonesian` texts, the
        corrected text as `balinese_corrected`, the three percentage metrics,
        and `corrections` as a list of (original, replacement) pairs.
    """
    analysis = correct_and_analyze_text(data['balinese'], bali_indo_dict, indo_bali_dict)

    row = {
        'balinese': data['balinese'].strip(),
        'indonesian': data['indonesian'].strip(),
        'balinese_corrected': analysis['corrected_text'],
    }
    for metric in ('valid_percentage', 'invalid_percentage', 'corrected_percentage'):
        row[metric] = analysis[metric]
    # A list of pairs instead of a mapping keyed by arbitrary tokens.
    row['corrections'] = list(analysis['corrections'].items())
    return row

def correct_and_analyze_text_cbn_dataset(data):
    """Run the dictionary check on a row's Cirebonese text and flatten the result.

    Uses the notebook-level `cbn_indo_dict` / `indo_cbn_dict` dictionaries.

    Returns:
        A dict with the stripped `cirebonese` and `indonesian` texts, the
        corrected text as `cirebonese_corrected`, the three percentage
        metrics, and `corrections` as a list of (original, replacement) pairs.
    """
    analysis = correct_and_analyze_text(data['cirebonese'], cbn_indo_dict, indo_cbn_dict)

    row = {
        'cirebonese': data['cirebonese'].strip(),
        'indonesian': data['indonesian'].strip(),
        'cirebonese_corrected': analysis['corrected_text'],
    }
    for metric in ('valid_percentage', 'invalid_percentage', 'corrected_percentage'):
        row[metric] = analysis[metric]
    # A list of pairs instead of a mapping keyed by arbitrary tokens.
    row['corrections'] = list(analysis['corrections'].items())
    return row

In [520]:
sample = clean_short.shuffle(seed=42).select(range(600))

In [521]:
sample_1 = sample.select(range(300))
sample_2 = sample.select(range(300, 600))

In [522]:
ban_sample = sample_2.map(correct_and_analyze_text_ban_dataset, remove_columns=['cirebonese'])
cbn_sample = sample_1.map(correct_and_analyze_text_cbn_dataset, remove_columns=['balinese'])

Map: 100%|██████████| 300/300 [00:00<00:00, 7103.81 examples/s]
Map: 100%|██████████| 300/300 [00:00<00:00, 6936.02 examples/s]


In [571]:
random.choice(ban_sample)

{'id': 'f90d59f1-cfe1-4297-a65f-0c6f2c2d7256',
 'indonesian': 'Pintu air (pelayaran), perangkat untuk mengatur kedalaman pada alur pelayaran.\n Pintu air (floodgate), perangkat untuk mengontrol aliran air di sungai, kanal, atau waduk.\n Sluis, perangkat untuk mengontrol tinggi aliran air di sungai atau kanal.',
 'balinese': 'Apes yeh (pelayaran), prajuru buat ngatur kedalaman ri celocoh pelayaran. Apes yeh (gembok banjir), prajuru buat ngontrol kecoran yeh ring tukad, kanal, utawi waduk. Sluis, prajuru buat ngontrol ganggas kecoran yeh ring tukad utawi kanal.',
 'balinese_corrected': 'Apes yeh (pelayaran), prajuru buat ngatur kedalaman ri celocoh pelayaran. Apes yeh (gembok blabar), prajuru buat ngontrol kecoran yeh ring tukad, kanal, utawi waduk. Sluis, prajuru buat ngontrol ganggas kecoran yeh ring tukad utawi kanal.',
 'valid_percentage': 77.14285714285715,
 'invalid_percentage': 20.0,
 'corrected_percentage': 2.857142857142857,
 'corrections': [['banjir),', 'blabar),']]}

In [572]:
random.choice(cbn_sample)

{'id': '6dcb330f-fde0-4513-a748-6956c63757e9',
 'cirebonese': 'Orde Anyar bisa ngacu ing pirang-pirang bab ing ngandhap: \n\n Orde Anyar, têngêr dum mangsa imamat Presiden Nusantara ke-2 Soeharto. \n Orde Anyar, order politik ning Jerman Nazi.',
 'indonesian': 'Orde Baru dapat mengacu pada beberapa hal berikut:\n\n Orde Baru, sebutan bagi masa pemerintahan Presiden Indonesia ke-2 Soeharto.\n Orde Baru, tatanan politik di Jerman Nazi.',
 'cirebonese_corrected': 'Orde Anyar bisa ngacu ing pirang-pirang bab ing ngandhap: Orde Anyar, têngêr dum mangsa imamat Presiden Nusantara ke-2 Soeharto. Orde Anyar, order politik ning Jerman Nazi.',
 'valid_percentage': 88.46153846153845,
 'invalid_percentage': 11.538461538461538,
 'corrected_percentage': 0.0,
 'corrections': []}

In [573]:
def dataset_to_csv(dataset, output_file):
    """Write `dataset` to a UTF-8 CSV file and print a short confirmation.

    Args:
        dataset: Object exposing ``to_pandas()`` that returns a DataFrame.
        output_file: Destination path of the CSV file (written without the
            DataFrame index).

    If a `corrections` column is present, each cell (an iterable of
    (original, corrected) pairs) is rendered as text with one
    "original -> corrected" entry per line.
    """
    frame = dataset.to_pandas()

    if 'corrections' in frame.columns:
        frame['corrections'] = frame['corrections'].apply(
            lambda pairs: '\n'.join(f"{orig} -> {corr}" for orig, corr in pairs)
        )

    frame.to_csv(output_file, index=False, encoding='utf-8')

    print(f"Dataset saved to {output_file}")
    print(f"Total rows: {len(frame)}")

In [574]:
dataset_to_csv(ban_sample, "dataset/annotation/ban_sample_300/ban.csv")
dataset_to_csv(cbn_sample, "dataset/annotation/cbn_sample_300/cbn.csv")

Dataset saved to dataset/annotation/ban_sample_300/ban.csv
Total rows: 300
Dataset saved to dataset/annotation/cbn_sample_300/cbn.csv
Total rows: 300


In [575]:
ban_sample.save_to_disk("dataset/annotation/ban_sample_300")
cbn_sample.save_to_disk("dataset/annotation/cbn_sample_300")

Saving the dataset (1/1 shards): 100%|██████████| 300/300 [00:00<00:00, 66572.73 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 300/300 [00:00<00:00, 74441.89 examples/s]


## Gathering Previous High-Quality Balinese Data

In [4]:
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [5]:
import datasets
from datasets import load_dataset

dset = datasets.load_dataset("cis-lmu/udhr-lid", trust_remote_code=True)

README.md:   0%|          | 0.00/6.45k [00:00<?, ?B/s]

udhr-lid.csv:   0%|          | 0.00/7.07M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/27757 [00:00<?, ? examples/s]

In [10]:
bali_udhr = dset['test'].filter(lambda x: x['iso639-3'] == 'ban')

Filter:   0%|          | 0/27757 [00:00<?, ? examples/s]

In [14]:
bali_udhr.save_to_disk("dataset/bali_hq/bali_udhr")

Saving the dataset (0/1 shards):   0%|          | 0/60 [00:00<?, ? examples/s]

In [15]:
bali_sib200 = load_dataset("Davlan/sib200", "ban_Latn")

README.md:   0%|          | 0.00/47.9k [00:00<?, ?B/s]

train.tsv:   0%|          | 0.00/114k [00:00<?, ?B/s]

dev.tsv:   0%|          | 0.00/15.0k [00:00<?, ?B/s]

test.tsv:   0%|          | 0.00/33.1k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [17]:
bali_sib200["train"]

Dataset({
    features: ['index_id', 'category', 'text'],
    num_rows: 701
})

In [18]:
bali_sib200["train"].save_to_disk("dataset/bali_hq/bali_sib200")

Saving the dataset (0/1 shards):   0%|          | 0/701 [00:00<?, ? examples/s]

In [19]:
bali_glot500 = dataset = load_dataset('cis-lmu/Glot500', 'ban_Latn', split='train')

README.md:   0%|          | 0.00/48.8k [00:00<?, ?B/s]

data-00000-of-00001.arrow:   0%|          | 0.00/8.10M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [21]:
bali_glot500.save_to_disk("dataset/bali_hq/bali_glot500") 

Saving the dataset (0/1 shards):   0%|          | 0/48958 [00:00<?, ? examples/s]

In [23]:
bali_madlad = load_dataset("allenai/madlad-400", "ban", trust_remote_code=True)

ban_clean_0000.jsonl.gz:   0%|          | 0.00/1.19M [00:00<?, ?B/s]

ban_noisy_0000.jsonl.gz:   0%|          | 0.00/12.6M [00:00<?, ?B/s]

Generating clean split: 0 examples [00:00, ? examples/s]

Generating noisy split: 0 examples [00:00, ? examples/s]

In [26]:
bali_madlad["clean"].save_to_disk("dataset/bali_hq/bali_madlad")

Saving the dataset (0/1 shards):   0%|          | 0/637 [00:00<?, ? examples/s]

In [30]:
bali_nllb = load_dataset("acul3/KoPI-NLLB", "ban_Latn-neardup", trust_remote_code=True)

Repo card metadata block was not found. Setting CardData to empty.


ban_Latn.json.zst:   0%|          | 0.00/23.0M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [31]:
bali_nllb["train"].save_to_disk("dataset/bali_hq/bali_nllb")

Saving the dataset (0/1 shards):   0%|          | 0/244545 [00:00<?, ? examples/s]

In [40]:
import pandas as pd
from datasets import Dataset

# Read the tab-separated id-ban file, using its first column as the index
df = pd.read_csv("dataset/bali_hq/id-ban.tsv", index_col=0, sep="\t")

# Drop every row that has a missing value in ANY column (not only 'Balinese'),
# then force the Balinese column to plain Python strings
df = df.dropna()
df['Balinese'] = df['Balinese'].astype(str)
# Keep only the Balinese column, exposed under the key 'text'
balinese_data = {
    'text': df['Balinese'].tolist()
}

# Convert to HuggingFace dataset
balinese_dataset = Dataset.from_dict(balinese_data)

# Save to disk
balinese_dataset.save_to_disk("dataset/bali_hq/bali_indonmt")

Saving the dataset (0/1 shards):   0%|          | 0/5165 [00:00<?, ? examples/s]

In [36]:
balinese_dataset

Dataset({
    features: ['text'],
    num_rows: 20611
})

In [42]:
from datasets import load_from_disk

bali_indonmt = load_from_disk("dataset/bali_hq/bali_indonmt")
bali_nusax = load_from_disk("dataset/bali_hq/bali_nusax")
bali_wiki = load_from_disk("dataset/bali_hq/bali_wiki")

In [62]:
def convert_dataset(data):
    """Expose the row's `sentence` value under the key `text`.

    The row is modified in place and the same object is returned.

    NOTE: an earlier cell of this notebook defines a different
    `convert_dataset` (custom_id -> id); running this cell replaces it.
    """
    sentence = data["sentence"]
    data["text"] = sentence
    return data

In [63]:
bali_udhr = bali_udhr.map(convert_dataset, remove_columns=["sentence", "id", "iso639-3", "iso15924", "language"])

Map:   0%|          | 0/60 [00:00<?, ? examples/s]

In [64]:
bali_udhr

Dataset({
    features: ['text'],
    num_rows: 60
})

In [65]:
bali_wiki

Dataset({
    features: ['text'],
    num_rows: 20611
})

In [67]:
from datasets import concatenate_datasets

all_bali_hq = concatenate_datasets([bali_udhr, bali_sib200, bali_glot500, bali_madlad, bali_nllb, bali_indonmt, bali_nusax, bali_wiki])

In [70]:
all_bali_hq = all_bali_hq.remove_columns(['index_id', 'category', 'dataset', 'script', 'lang_script', 'url', 'score', 'source'])

In [72]:
all_bali_hq.save_to_disk("dataset/bali_hq/all_bali_hq")

Saving the dataset (0/1 shards):   0%|          | 0/280246 [00:00<?, ? examples/s]

In [6]:
def filter_bali_text(data):
    """Return True when one of the first two lines of the text is labelled Balinese.

    Intended as a `Dataset.filter` predicate, so it returns an explicit bool
    (previously it returned the row itself or None and relied on truthiness).

    The top-1 label of the notebook-level `model` is compared with
    "__label__ban_Latn" for the first line and, if there is one, the second
    line of `data["text"]`. The second line is only classified when the first
    one is not Balinese.

    Args:
        data: Row with a string field `text`.

    Returns:
        True if either inspected line is predicted as "__label__ban_Latn",
        False otherwise.
    """
    # Lines are classified one at a time (presumably because the language-ID
    # model does not accept newlines in its input — confirm).
    leading_lines = data["text"].split("\n")[:2]
    return any(
        model.predict(line)[0][0] == "__label__ban_Latn"
        for line in leading_lines
    )

In [None]:
all_bali_hq_dedup = load_from_disk("dataset/bali_hq/all_bali_hq_dedup")
all_bali_hq_clean = all_bali_hq_dedup.filter(filter_bali_text, num_proc=8)

In [10]:
count_tokens_in_dataset(all_bali_hq_clean, "text", num_tokens_from_string, encoding)

11075340

In [11]:
all_bali_hq_clean.save_to_disk("dataset/bali_hq/all_bali_hq_clean")

Saving the dataset (1/1 shards): 100%|██████████| 201404/201404 [00:00<00:00, 207678.31 examples/s]


In [14]:
from datasets import load_dataset

# or for a specific language
bali_ift = load_dataset("akoksal/muri-it-language-split", "ban")

Generating train split: 100%|██████████| 6026/6026 [00:00<00:00, 26282.14 examples/s]
Generating validation split: 100%|██████████| 335/335 [00:00<00:00, 78690.18 examples/s]
Generating test split: 100%|██████████| 335/335 [00:00<00:00, 77974.02 examples/s]


In [17]:
bali_ift =  bali_ift["train"]
bali_ift.save_to_disk("dataset/bali_ift/bali_muri")

Saving the dataset (1/1 shards): 100%|██████████| 6026/6026 [00:00<00:00, 603449.43 examples/s]


In [2]:
bali_clean = load_from_disk("dataset/bali_hq/all_bali_hq_clean")

## English High-Quality Data

In [12]:
en_wiki = load_from_disk("dataset/en_hq/en_wiki")

In [13]:
count_tokens_in_dataset(en_wiki, "text", num_tokens_from_string, encoding)

5615855

## Cirebonese High-Quality Data

In [1]:
from pathlib import Path
import datasets
import os

def is_valid_content(text):
    """Decide whether a document is worth keeping.

    A document is rejected when, ignoring surrounding whitespace, it starts
    with "The image", or when it has 10 or fewer whitespace-separated tokens.
    """
    starts_with_image_note = text.strip().startswith("The image")
    if starts_with_image_note:
        return False
    return len(text.split()) > 10

def gather_corpus(base_path=None):
    """Collect every valid `.txt` document below the corpus directory into a Dataset.

    Args:
        base_path: Root directory to scan. Defaults to
            ``~/Cirebonese/corpus/``; a leading ``~`` is expanded.

    Returns:
        A `datasets.Dataset` with two columns: `text` (file content with
        surrounding whitespace stripped) and `title` (file name without its
        `.txt` extension). Only files accepted by `is_valid_content` are
        included. Rows follow a sorted directory walk, so the order is
        reproducible across runs.

    Files that cannot be read are reported on stdout and skipped.
    """
    if base_path is None:
        base_path = "~/Cirebonese/corpus/"
    base_path = os.path.expanduser(base_path)

    texts = []
    titles = []

    for root, dirs, files in os.walk(base_path):
        # os.walk order depends on the filesystem; sorting `dirs` in place
        # (and `files` below) makes the traversal deterministic.
        dirs.sort()

        # Skip the dict subdirectory. Only the part of the path below
        # base_path is tested, so a "dict" substring in a parent directory
        # name cannot silently exclude the whole corpus.
        if "dict" in os.path.relpath(root, base_path):
            continue

        for file in sorted(files):
            if not file.endswith(".txt"):
                continue

            file_path = Path(root) / file

            try:
                with open(file_path, "r", encoding="utf-8") as f:
                    content = f.read().strip()

                if is_valid_content(content):
                    texts.append(content)
                    # Remove only the trailing extension, not every ".txt"
                    # occurring inside the file name.
                    titles.append(file[:-len(".txt")])
            except Exception as e:
                # Best effort: report the unreadable file and carry on.
                print(f"Error reading {file}: {str(e)}")

    # Convert to HuggingFace dataset
    return datasets.Dataset.from_dict({
        "text": texts,
        "title": titles
    })

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Create the dataset
dataset = gather_corpus()

# Print some statistics
print(f"Total valid documents: {len(dataset)}")
print("\nFirst few examples:")
print(dataset[:2])

# Save the dataset (optional)
dataset.save_to_disk("cirebonese_corpus")

Total valid documents: 2121

First few examples:
{'text': ['Tugas Kelompok Bahasa Cirebon\n\n          “Drama Bahasa Cirebon”\n\n\n\nNama Anggota :\n \uf076 Alifia Tasya Rizkiani (03)\n \uf076 Farhan Primanata (15)\n \uf076 Faydita Laila Qodrish (16)\n \uf076 Riyan Dwi Julianto (40)\n\n\n\n\n                    Kelas : 9C\n\n\n\n               SMPN 2 CIREBON\n\x0c                   “Sederekan”\nAdegan 1\nSuasana rame sesampune ngedamel soal kang kari dinten niki.\nUAS sampun pragat. Lare-lare seneng saged ngerjanang soale\nangsal sae. Ta,mpi kecuali Rais sareng rencang-rencange.\n\x0cKabehane seneng sesampune lepas teng buku sing nyibukaken\nkiyambeke akhir pekan seniki. Sesampune bel muni kiyambek\nkesusu melajeng teng kelas.\nSelma : Sae’e kepripun sesampune selese tugas-tugas akhir\nsekolah niki?\nNurul : Kula sareng rencang-rencang perean mawon nggih?\nPripun sareng serencang sejene?\nErina : Bade perean teng pundi, rul? Kula milet mawon.\nNurul : Kepripun, sampeyan boten bosen ten

Saving the dataset (1/1 shards): 100%|██████████| 2121/2121 [00:00<00:00, 137602.18 examples/s]


## Language Filter

In [6]:
import fasttext
import numpy as np

# load the model
model = fasttext.load_model("./models/glotlid/model.bin")

class CustomLID:
    """fastText language identifier restricted to a subset of the model's labels.

    Args:
        model_path: path passed to ``fasttext.load_model``.
        languages: list/tuple of label strings to keep (e.g. ``"__label__ind_Latn"``).
            Any other value (default ``-1``) keeps every label of the model.
        mode: ``'after'`` applies the softmax over all labels and then keeps the
            requested ones; any other value (default ``'before'``) keeps only the
            requested rows of the output matrix and applies the softmax over those.

    Attributes:
        language_indices: row indices of the kept labels in the output matrix.
        labels: kept label strings, in the same order as ``language_indices``.
        predict: bound method selected by ``mode``; returns
            ``(tuple_of_labels, array_of_probabilities)``.

    Raises:
        ValueError: if ``languages`` is a list/tuple and none of its entries is a
            label of the model (previously this surfaced later as an obscure
            "zero-size array" error on the first prediction).
    """

    def __init__(self, model_path, languages = -1, mode='before'):
        self.model = fasttext.load_model(model_path)
        self.output_matrix = self.model.get_output_matrix()
        all_labels = list(self.model.get_labels())

        # compute language_indices (deterministic, request order, de-duplicated)
        self.language_indices = self._resolve_language_indices(all_labels, languages)

        # limit labels to language_indices
        self.labels = [all_labels[i] for i in self.language_indices]

        # predict
        self.predict = self.predict_limit_after_softmax if mode=='after' else self.predict_limit_before_softmax

    @staticmethod
    def _resolve_language_indices(all_labels, languages):
        """Map the requested labels to their positions in ``all_labels``.

        Unknown labels are ignored; duplicates are dropped while keeping the order
        of first appearance, so the result does not depend on set/hash ordering.
        """
        if not isinstance(languages, (list, tuple)):
            return list(range(len(all_labels)))

        first_position = {}
        for position, label in enumerate(all_labels):
            first_position.setdefault(label, position)

        indices = [first_position[label] for label in dict.fromkeys(languages) if label in first_position]
        if not indices:
            raise ValueError(
                f"None of the requested languages {list(languages)} are labels of the model."
            )
        return indices

    @staticmethod
    def _softmax(scores):
        """Numerically stable softmax of a 1-D score vector."""
        exponentials = np.exp(scores - np.max(scores))
        return exponentials / np.sum(exponentials)

    def _top_k(self, probabilities, k):
        """Return the ``k`` most probable kept labels and their probabilities (expects k >= 1)."""
        top_k_indices = np.argsort(probabilities)[-k:][::-1]
        top_k_labels = [self.labels[i] for i in top_k_indices]
        return tuple(top_k_labels), probabilities[top_k_indices]

    def predict_limit_before_softmax(self, text, k=1):
        """Score only the kept labels, then normalise over them."""
        sentence_vector = self.model.get_sentence_vector(text)
        result_vector = np.dot(self.output_matrix[self.language_indices, :], sentence_vector)
        return self._top_k(self._softmax(result_vector), k)

    def predict_limit_after_softmax(self, text, k=1):
        """Normalise over every label of the model, then keep the requested ones.

        The returned probabilities therefore do not necessarily sum to 1.
        """
        sentence_vector = self.model.get_sentence_vector(text)
        result_vector = np.dot(self.output_matrix, sentence_vector)
        softmax_result = self._softmax(result_vector)[self.language_indices]
        return self._top_k(softmax_result, k)

# Labels the identifier is restricted to. They must match entries of the model's label
# list exactly (check model.labels); CustomLID silently ignores labels it does not know.
limited_languages = ['__label__ind_Latn', '__label__sun_Latn', '__label__jav_Latn', '__label__ban_Latn']

# Module-level identifier used by every filter_* function below. With mode='before'
# the softmax is computed over the four labels above only.
model = CustomLID("./models/glotlid/model.bin", languages = limited_languages , mode='before')

In [7]:
def filter_non_id_text(data):
    """Keep a record unless its leading "indonesian" lines are all predicted Indonesian.

    Only the first two newline-separated lines of ``data["indonesian"]`` are
    classified with the module-level ``model``. Returns ``data`` when at least one
    of them is not labelled ``__label__ind_Latn``, otherwise ``None``.
    """
    leading_lines = data["indonesian"].split("\n")[:2]
    predicted = [model.predict(line)[0][0] for line in leading_lines]
    looks_indonesian = all(label == "__label__ind_Latn" for label in predicted)
    return None if looks_indonesian else data

def filter_id_text(data):
    """Keep a record only if its leading "indonesian" lines are all predicted Indonesian.

    Only the first two newline-separated lines of ``data["indonesian"]`` are
    classified with the module-level ``model``. Returns ``data`` when every inspected
    line is labelled ``__label__ind_Latn``, otherwise ``None``.

    NOTE: a later cell defines another function with this same name, which replaces
    this binding once that cell has run.
    """
    head = data["indonesian"].split("\n")[:2]
    top_labels = [model.predict(line)[0][0] for line in head]
    if all(label == "__label__ind_Latn" for label in top_labels):
        return data
    return None

def filter_id_text_2(data):
    """Same rule as the two-line Indonesian check, applied to the "text" column.

    Classifies the first two newline-separated lines of ``data["text"]`` with the
    module-level ``model`` and returns ``data`` only when each inspected line is
    labelled ``__label__ind_Latn``; otherwise returns ``None``.
    """
    verdicts = []
    for line in data["text"].split("\n")[:2]:
        verdicts.append(model.predict(line)[0][0] == "__label__ind_Latn")
    return data if all(verdicts) else None

def filter_all_text(data):
    """Keep a record whose Cirebonese and Balinese columns both pass language ID.

    For each of ``data["cirebonese"]`` and ``data["balinese"]`` the first two
    newline-separated lines are classified with the module-level ``model``:
      * Cirebonese passes when either line is labelled ``__label__jav_Latn``.
      * Balinese passes when either line is labelled ``__label__ban_Latn``.

    Returns ``data`` when both columns pass, otherwise ``None``.

    Fix: the Balinese check used to compare against ``"__label__ban"``, a label the
    restricted model never emits (its labels carry the ``_Latn`` script suffix), so
    every record was rejected.
    """
    def _any_leading_line_is(text, expected_label):
        # Only the first two lines are inspected.
        labels = [model.predict(line)[0][0] for line in text.split("\n")[:2]]
        return any(label == expected_label for label in labels)

    cbn_verdict = _any_leading_line_is(data["cirebonese"], "__label__jav_Latn")
    ban_verdict = _any_leading_line_is(data["balinese"], "__label__ban_Latn")

    if cbn_verdict and ban_verdict:
        return data
    return None

def filter_bali_text(data):
    """Keep a record whose "text" column looks Balinese.

    The first two newline-separated lines of ``data["text"]`` are classified with
    the module-level ``model``; the record is kept (``data`` is returned) when either
    line is labelled ``__label__ban_Latn``, otherwise ``None`` is returned.

    Fix: the comparison used to target ``"__label__ban"``, which is not one of the
    labels the restricted model can return (they carry the ``_Latn`` suffix), so the
    function rejected every record.
    """
    ban_sentences = data["text"].split("\n")

    # Only the first two lines are inspected.
    ban_preds = [model.predict(line)[0][0] for line in ban_sentences[:2]]
    ban_verdict = any(pred == "__label__ban_Latn" for pred in ban_preds)

    if ban_verdict:
        return data
    return None

def filter_cbn_text(data):
    """Keep a record whose "text" column starts with Javanese- or Indonesian-looking lines.

    The first two newline-separated lines of ``data["text"]`` are classified with
    the module-level ``model``. Returns ``data`` when at least one of them is
    labelled ``__label__jav_Latn`` or ``__label__ind_Latn``, otherwise ``None``.
    """
    accepted = ("__label__jav_Latn", "__label__ind_Latn")
    leading = data["text"].split("\n")[:2]
    predictions = [model.predict(line)[0][0] for line in leading]
    for label in predictions:
        if label == accepted[0] or label == accepted[1]:
            return data
    return None

In [6]:
all_cbn_hq_dedup = load_from_disk("dataset/cbn_hq/all_cbn_hq_dedup")
all_cbn_hq_clean = all_cbn_hq_dedup.filter(filter_cbn_text, num_proc=8)

Filter (num_proc=8): 100%|██████████| 2117/2117 [00:00<00:00, 5793.59 examples/s]


In [8]:
count_tokens_in_dataset(all_cbn_hq_clean, "text", num_tokens_from_string, encoding)

1246039

In [9]:
all_cbn_hq_clean.save_to_disk("dataset/cbn_hq/all_cbn_hq_clean")

Saving the dataset (1/1 shards): 100%|██████████| 2105/2105 [00:00<00:00, 61783.67 examples/s]


In [31]:
import random

random.choice(all_cbn_hq_clean)

{'text': '"Maapkan aku guru, kau pilih kasih kenapa ajian tapak cecak itu kau wariskan pada musuh, bukan pada ku."\n\n"Ah, masalah itu rupanya, Wuluh..., itu sudah suratan takdir sang hyang widhi."\n\n"Aku tidak peduli, yang jelas kami datang menginginkan jiwamu."\n\n"Kau benar-benar murid murtad, Niluh Seroja tak pantas jadi anakmu."\n\n"Jangan bawa-bawa anak durhaka itu, karena dialah pemuda murid Sunan Jati Purba itu mewarisi ajian tapak cecak."\n\n"Murid murtad nan licik aku menyesal jadi gurumu."\n\n"Tua bangka terima ajalmu...!"\n\nKembali pemuda tegap berikat kepala merah lentingkan badannya ke udara dan dari sepuluh jari nya membersit racun mematikan warangan temiang geni, Resi Maruta mandra kebut lengan jubahnya, serangkum angin membuyarka serpihan debu mematikan tersebut.\n\n"Siapa kau Kisanak, apa hubunganmu dengan Resi sesat Mahendra Thabita?"\n\nPemuda tegap yang dipanggil Jungjungan oleh Wuluh Balang cuma menyeringai.\n\n"Dengar orang tua, agar kematianmu tak penasaran ak

In [32]:
len(all_cbn_hq_clean)

2105

In [33]:
all_cbn_hq_clean.save_to_disk("dataset/cpt/cbn_hq_2k")

Saving the dataset (1/1 shards): 100%|██████████| 2105/2105 [00:00<00:00, 16499.80 examples/s]


## Testing Tokenizer Chat Template

In [3]:
from transformers import AutoTokenizer
from datasets import load_from_disk

# 1. Load the tokenizer from a local checkpoint directory.
# NOTE(review): trust_remote_code=True lets the loader execute Python code shipped
# with the checkpoint; acceptable only if this local checkpoint is trusted.
tokenizer = AutoTokenizer.from_pretrained(
    "./models/Bali/instruct/BaliQwen-3B-base_HQ-instruct_en",
    trust_remote_code=True
)

# 2. Load the instruction dataset from disk.
# This rebinds `dataset`, a name an earlier cell used for the gathered corpus.
dataset = load_from_disk("./dataset/bali_ift/bali_muri")

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [6]:
dataset

Dataset({
    features: ['input', 'output', 'dataset_name', 'subdataset_name', 'language', 'split', 'language_name'],
    num_rows: 6026
})

In [None]:
def format_chat(example, tokenizer):
    """Render one input/output pair as a chat transcript string.

    Builds a two-turn conversation (user = ``example["input"]``, assistant =
    ``example["output"]``), renders it with ``tokenizer.apply_chat_template``
    (no tokenisation, no generation prompt) and stores the result under
    ``example["text"]``.

    Returns:
        The same ``example`` mapping, modified in place.
    """
    user_turn = {"role": "user", "content": example["input"]}
    assistant_turn = {"role": "assistant", "content": example["output"]}

    # Render as plain text; the transcript ends after the assistant turn.
    rendered = tokenizer.apply_chat_template(
        [user_turn, assistant_turn],
        tokenize=False,
        add_generation_prompt=False,
    )

    example["text"] = rendered
    return example

# Render every row with format_chat. All original columns are dropped, so the
# resulting dataset only carries the new "text" column. Runs in 8 worker processes.
formatted_dataset = dataset.map(
    lambda x: format_chat(x, tokenizer),
    remove_columns=dataset.column_names,
    num_proc=8
)

# Show the first rendered transcript as a visual check of the chat template.
print("Example of formatted data:")
print(formatted_dataset[0]["text"])

Map (num_proc=8): 100%|██████████| 6026/6026 [00:00<00:00, 24405.81 examples/s]

Example of formatted data:
<|im_start|>system
You are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>
<|im_start|>user
Sesuratan puniki nyihnayang samian rumah sakit ring Provinsi Sulawesi Utara, Indonesia.<|im_end|>
<|im_start|>assistant
Puniki suratan indik lis rumah sakit ring propinsi Sulawesi Utara, Indonésia sané kawagi manut wewidangan kabupatén miwah kota.

Pustaka

Pranala jaba 
  Sistem Informasi Rumah Sakit (SIRS) Kementerian Kesehatan RI 
  Perhimpunan Rumah Sakit Seluruh Indonesia (PERSI) 

Rumah sakit ring Sulawesi Utara
Sulawesi Utara
Sulawesi Utara<|im_end|>






In [9]:
formatted_dataset[0]["text"]

'<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n<|im_start|>user\nSesuratan puniki nyihnayang samian rumah sakit ring Provinsi Sulawesi Utara, Indonesia.<|im_end|>\n<|im_start|>assistant\nPuniki suratan indik lis rumah sakit ring propinsi Sulawesi Utara, Indonésia sané kawagi manut wewidangan kabupatén miwah kota.\n\nPustaka\n\nPranala jaba \n  Sistem Informasi Rumah Sakit (SIRS) Kementerian Kesehatan RI \n  Perhimpunan Rumah Sakit Seluruh Indonesia (PERSI) \n\nRumah sakit ring Sulawesi Utara\nSulawesi Utara\nSulawesi Utara<|im_end|>\n'

In [11]:
# Check tokenizer encode-decode on formatted dataset
def check_tokenization(example, max_length=8192):
    """Print an encode/decode round trip of ``example["text"]`` and return its token count.

    The text is encoded with the module-level ``tokenizer`` (truncated to
    ``max_length`` tokens) and decoded again with special tokens kept, so the two
    printed versions can be compared by eye. No automatic comparison is made.

    Returns:
        Number of token ids produced by the encoder.
    """
    divider = "-" * 50
    source_text = example["text"]

    # Round trip: text -> ids -> text
    token_ids = tokenizer.encode(source_text, max_length=max_length, truncation=True)
    round_trip = tokenizer.decode(token_ids, skip_special_tokens=False)
    n_tokens = len(token_ids)

    print("Original text:", divider, source_text, sep="\n")
    print("\nDecoded text:", divider, round_trip, sep="\n")
    print("\nLength of tokens:", n_tokens)
    print(divider)
    return n_tokens

# Check first example
token_length = check_tokenization(formatted_dataset[0])

Original text:
--------------------------------------------------
<|im_start|>system
You are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>
<|im_start|>user
Sesuratan puniki nyihnayang samian rumah sakit ring Provinsi Sulawesi Utara, Indonesia.<|im_end|>
<|im_start|>assistant
Puniki suratan indik lis rumah sakit ring propinsi Sulawesi Utara, Indonésia sané kawagi manut wewidangan kabupatén miwah kota.

Pustaka

Pranala jaba 
  Sistem Informasi Rumah Sakit (SIRS) Kementerian Kesehatan RI 
  Perhimpunan Rumah Sakit Seluruh Indonesia (PERSI) 

Rumah sakit ring Sulawesi Utara
Sulawesi Utara
Sulawesi Utara<|im_end|>


Decoded text:
--------------------------------------------------
<|im_start|>system
You are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>
<|im_start|>user
Sesuratan puniki nyihnayang samian rumah sakit ring Provinsi Sulawesi Utara, Indonesia.<|im_end|>
<|im_start|>assistant
Puniki suratan indik lis rumah sakit ring propinsi Sula

In [16]:
formatted_dataset.save_to_disk("./dataset/ift/bali_ift_6k")

Saving the dataset (1/1 shards): 100%|██████████| 6026/6026 [00:00<00:00, 204066.62 examples/s]


## Filter Data Based on Annotation

In [4]:
all_dataset = load_from_disk("dataset/paralel_3_lang/combined_paralel_dataset_705k_dedup_clean")

In [5]:
all_dataset

Dataset({
    features: ['id', 'cirebonese', 'indonesian', 'balinese'],
    num_rows: 491113
})

In [11]:
def filter_id_text(data):
    """Keep a record when a strict majority of its "indonesian" lines is Indonesian.

    Blank lines of ``data["indonesian"]`` are ignored. Every remaining line is
    classified with the module-level ``model``; the record is kept (``data`` is
    returned) when more than half of the lines are labelled ``__label__ind_Latn``.
    Records without any non-blank line are dropped (``None``).
    """
    lines = []
    for candidate in data["indonesian"].split("\n"):
        if candidate.strip():
            lines.append(candidate)
    if not lines:
        return None

    indonesian_votes = 0
    for line in lines:
        if model.predict(line)[0][0] == "__label__ind_Latn":
            indonesian_votes += 1

    # For a single line the majority rule reduces to "that line is Indonesian"
    # (1 > 0.5, 0 > 0.5 is false), so no separate branch is needed.
    if indonesian_votes > (len(lines) / 2):
        return data
    return None

In [12]:
filtered_id_1 = all_dataset.filter(filter_id_text, num_proc=8)

Filter (num_proc=8): 100%|██████████| 491113/491113 [01:34<00:00, 5190.18 examples/s]



In [13]:
filtered_id_1

Dataset({
    features: ['id', 'cirebonese', 'indonesian', 'balinese'],
    num_rows: 490291
})

In [15]:
filtered_id_1.save_to_disk("dataset/paralel_3_lang/combined_paralel_dataset_705k_dedup_clean_filtered-id")

Saving the dataset (6/6 shards): 100%|██████████| 490291/490291 [00:05<00:00, 81859.61 examples/s]



In [17]:
# Split filtered_id_1 into two datasets
# - cirebonese: drop the 'balinese' column
# - balinese: drop the 'cirebonese' column

cirebonese_only = filtered_id_1.remove_columns([c for c in ["balinese"] if c in filtered_id_1.column_names])
balinese_only = filtered_id_1.remove_columns([c for c in ["cirebonese"] if c in filtered_id_1.column_names])

print("cirebonese_only columns:", cirebonese_only.column_names)
print("balinese_only columns:", balinese_only.column_names)

cirebonese_only columns: ['id', 'cirebonese', 'indonesian']
balinese_only columns: ['id', 'indonesian', 'balinese']


### Balinese

In [21]:
# Compute Balinese valid percentage metric using dictionary-based matching
# Re-uses the approach from annotated_data_analysis: token match against Balinese dictionary

import json, re
import numpy as np

# Load Balinese word set from dictionary keys
# The JSON is expected to be a mapping where keys are Balinese tokens
with open("dict/bali_idn.json", "r", encoding="utf-8") as f:
    _bali_dict_data = json.load(f)
BALINESE_WORD_SET = set(k.lower() for k in _bali_dict_data.keys())

# Optional: load KBBI wordlist if you want to treat non-KBBI tokens as valid too
# from annotated_data_analysis, we can skip for now to keep it fast
KBBI_WORDLIST = None  # or set to a Python set of Indonesian tokens to use the alternate rule

_word_regex = re.compile(r"\b\w+\b", flags=re.UNICODE)

def calculate_valid_percentage(text: str, dictionary_words: set, kbbi_wordlist: set | None = None) -> float:
    if text is None:
        return 0.0
    text = str(text)
    if not text.strip():
        return 0.0
    words = _word_regex.findall(text.lower())
    total_words = len(words)
    if total_words == 0:
        return 0.0

    valid_words = 0
    for w in words:
        if w in dictionary_words:
            valid_words += 1
        elif kbbi_wordlist is not None:
            # Treat as valid if not in KBBI and not in dictionary
            if w not in kbbi_wordlist:
                valid_words += 1

    return (valid_words / total_words) * 100.0

# Batched helper (picklable and efficient per batch)

def add_valid_percentage_batch(batch):
    """Batched mapper: score every "balinese" text of the batch.

    Returns a dict with a single "valid_percentage" list, computed with
    ``calculate_valid_percentage`` against ``BALINESE_WORD_SET`` and ``KBBI_WORDLIST``.
    A batch without a "balinese" key yields an empty list.
    """
    scores = []
    for text in batch.get("balinese", []):
        scores.append(calculate_valid_percentage(text, BALINESE_WORD_SET, KBBI_WORDLIST))
    return {"valid_percentage": scores}

# Quick sample-based mean (fast smoke check) on the first rows only, capped at 5000.
sample_size = min(5000, len(balinese_only))
sample = balinese_only.select(range(sample_size))
sample_metrics = sample.map(add_valid_percentage_batch, batched=True)
mean_valid_pct_sample = float(np.mean(sample_metrics["valid_percentage"]))
print(f"Balinese valid percentage (mean on first {sample_size}): {mean_valid_pct_sample:.2f}%")

# Full pass: attach the "valid_percentage" column to every row. This code is live
# (not commented out); the recorded run below took about 40 s for 490,291 rows.
balinese_with_metrics = balinese_only.map(add_valid_percentage_batch, batched=True)
mean_valid_pct = float(np.mean(balinese_with_metrics["valid_percentage"]))
print(f"Balinese valid percentage (mean, full): {mean_valid_pct:.2f}%")

Balinese valid percentage (mean on first 5000): 48.83%


Map: 100%|██████████| 490291/490291 [00:40<00:00, 12195.79 examples/s]



Balinese valid percentage (mean, full): 48.26%


In [28]:
# Split Balinese data by valid_percentage threshold
# good: valid_percentage >= THRESH_VALID_BAL
# bad:  valid_percentage <  THRESH_VALID_BAL
#
# The threshold is 50 (the saved dataset is named "..._valid-pct_50"); the comments
# and printed labels previously claimed 70 while the code compared against 50.
THRESH_VALID_BAL = 50.0

def is_good(example):
    """True when the row's "valid_percentage" is at least THRESH_VALID_BAL.

    A missing value counts as 0.0; a value that cannot be converted to float makes
    the row fail the check instead of raising.
    """
    try:
        return float(example.get("valid_percentage", 0.0)) >= THRESH_VALID_BAL
    except Exception:
        return False

def is_bad(example):
    """True when the row's "valid_percentage" is below THRESH_VALID_BAL.

    A missing value counts as 0.0; a value that cannot be converted to float makes
    the row fail the check (so such a row lands in neither split).
    """
    try:
        return float(example.get("valid_percentage", 0.0)) < THRESH_VALID_BAL
    except Exception:
        return False

balinese_good = balinese_with_metrics.filter(is_good)
balinese_bad = balinese_with_metrics.filter(is_bad)

print("Counts:")
print(f"  good (>={THRESH_VALID_BAL:g}):", len(balinese_good))
print(f"  bad  (<{THRESH_VALID_BAL:g}):", len(balinese_bad))

Filter: 100%|██████████| 490291/490291 [00:51<00:00, 9542.99 examples/s] 
Filter: 100%|██████████| 490291/490291 [00:51<00:00, 9542.99 examples/s] 
Filter: 100%|██████████| 490291/490291 [00:55<00:00, 8912.75 examples/s] 

Counts:
  good (>=70): 226950
  bad  (<70): 263341





In [29]:
count_tokens_in_dataset(balinese_good, "balinese", num_tokens_from_string, encoding)

72045815

In [27]:
count_tokens_in_dataset(balinese_with_metrics, "balinese", num_tokens_from_string, encoding)

239628839

In [30]:
balinese_good.save_to_disk("dataset/paralel_3_lang/balinese_annotation-filter_valid-pct_50")

Saving the dataset (2/2 shards): 100%|██████████| 226950/226950 [00:02<00:00, 82845.94 examples/s] 



In [45]:
# Add BT valid match percentage for Balinese and split by threshold (mirrors Cirebonese in cell 147)
import numpy as np

# Use existing Balinese->Indonesian dictionary map prepared earlier
# BAL_TO_ID_MAP should already be available from prior cells
assert 'BAL_TO_ID_MAP' in globals(), "BAL_TO_ID_MAP not found. Run the dictionary load cell first."
assert 'balinese_only' in globals(), "balinese_only dataset not found. Run the dataset split cell first."

# Helper to add bt_valid_match_percentage to a batch
# Relies on bt_valid_match_pct(balinese_text, indonesian_text, dict_map) defined earlier

def add_bt_metric_batch_bal(batch):
    """Batched mapper: back-translation match score for each Balinese/Indonesian pair.

    Pairs the batch's "balinese" and "indonesian" lists position by position and
    scores each pair with ``bt_valid_match_pct`` using ``BAL_TO_ID_MAP``. Missing
    columns are treated as empty lists.
    """
    pairs = zip(batch.get('balinese', []), batch.get('indonesian', []))
    return {
        "bt_valid_match_percentage": [
            bt_valid_match_pct(bal_text, id_text, BAL_TO_ID_MAP) for bal_text, id_text in pairs
        ]
    }

# Compute metric across the full Balinese dataset (single-process for stability)
base_ds = balinese_only
balinese_with_bt = base_ds.map(
    add_bt_metric_batch_bal,
    batched=True,
    batch_size=1000,
    num_proc=None,
    desc="Computing BT valid match % for Balinese"
)

# Quick summary
mean_bt_bal = float(np.mean(balinese_with_bt["bt_valid_match_percentage"])) if len(balinese_with_bt) > 0 else 0.0
print(f"Balinese BT valid match percentage (mean): {mean_bt_bal:.2f}%")

# Partition the Balinese rows around a back-translation score threshold
# (same good/bad convention as the Cirebonese split).
THRESH_BT_BAL = 85.0

def bal_is_good_bt(example):
    """True when the row's BT match score (0.0 if missing) reaches THRESH_BT_BAL."""
    score = float(example.get("bt_valid_match_percentage", 0.0))
    return score >= THRESH_BT_BAL

def bal_is_bad_bt(example):
    """True when the row's BT match score (0.0 if missing) is below THRESH_BT_BAL."""
    score = float(example.get("bt_valid_match_percentage", 0.0))
    return score < THRESH_BT_BAL

# Single-process filtering, first the keepers and then the rejects.
bal_good_bt, bal_bad_bt = (
    balinese_with_bt.filter(predicate, num_proc=1)
    for predicate in (bal_is_good_bt, bal_is_bad_bt)
)

print(f"Balinese with BT good (>= {THRESH_BT_BAL}): {len(bal_good_bt):,}")
print(f"Balinese with BT bad  (< {THRESH_BT_BAL}): {len(bal_bad_bt):,}")

Balinese BT valid match percentage (mean): 83.94%
Balinese with BT good (>= 85.0): 241,408
Balinese with BT bad  (< 85.0): 248,883


In [46]:
count_tokens_in_dataset(bal_good_bt, "balinese", num_tokens_from_string, encoding)

92759958

In [52]:
bal_good_bt.save_to_disk("dataset/paralel_3_lang/balinese_annotation-filter_bt-valid-pct_85")

Saving the dataset (2/2 shards): 100%|██████████| 241408/241408 [00:04<00:00, 48328.60 examples/s]


### Cirebonese

In [33]:
# Backtranslation valid match metric for Cirebonese
# Reuses logic from annotated_data_analysis: for each Cirebonese token, check if any Indonesian translation
# (from dictionary) appears in the reference Indonesian tokens.

import json, re
import numpy as np

# Load a source-language -> Indonesian dictionary map (lowercased keys and values)
def load_dict_map(dictionary_path: str):
    """Read a JSON dictionary file into ``{lowercased key: [lowercased translations]}``.

    Values are normalised as follows: a string becomes a one-element list, a list
    keeps only its string items, anything else becomes an empty list.

    Best effort: on any error (missing file, invalid JSON, unexpected structure)
    the error is printed and an empty dict is returned.
    """
    def _as_translation_list(value):
        if isinstance(value, str):
            return [value.lower()]
        if isinstance(value, list):
            return [str(entry).lower() for entry in value if isinstance(entry, str)]
        return []

    try:
        with open(dictionary_path, "r", encoding="utf-8") as handle:
            loaded = json.load(handle)
        return {
            str(word).lower(): _as_translation_list(meaning)
            for word, meaning in loaded.items()
        }
    except Exception as e:
        print(f"Error loading dictionary: {e}")
        return {}

CBN_TO_ID_MAP = load_dict_map("dict/cbn_idn.json")
# Shared word tokenizer: returns the list of word-character runs in a string.
_tokenize = re.compile(r"\b\w+\b", flags=re.UNICODE).findall

# Compute bt_valid_match_percentage for one pair of texts
def bt_valid_match_pct(cirebonese_text: str, indonesian_text: str, dict_map: dict) -> float:
    """Percentage (0-100) of source tokens whose translation occurs in the reference.

    Both texts are lowercased and tokenised. A source token counts as matched when
    any of its translations in ``dict_map`` is among the reference tokens; a token
    with no (or empty) dictionary entry is compared to the reference directly.

    Returns 0.0 when either text is ``None`` or blank, or when the source text
    yields no token.
    """
    if cirebonese_text is None or indonesian_text is None:
        return 0.0

    source_text, reference_text = str(cirebonese_text), str(indonesian_text)
    if not (source_text.strip() and reference_text.strip()):
        return 0.0

    source_tokens = _tokenize(source_text.lower())
    if not source_tokens:
        return 0.0
    reference_vocab = set(_tokenize(reference_text.lower()))

    def _is_matched(token):
        # Fall back to the token itself when the dictionary offers no translation.
        candidates = dict_map.get(token, []) or [token]
        return any(candidate in reference_vocab for candidate in candidates)

    matched = sum(1 for token in source_tokens if _is_matched(token))
    return (matched / len(source_tokens)) * 100.0

# Batched mapper to add metric

def add_bt_metric_batch(batch):
    """Batched mapper: back-translation match score for each Cirebonese/Indonesian pair.

    Pairs the batch's "cirebonese" and "indonesian" lists position by position and
    scores each pair with ``bt_valid_match_pct`` using ``CBN_TO_ID_MAP``. Missing
    columns are treated as empty lists.
    """
    scores = []
    for source_text, reference_text in zip(batch.get("cirebonese", []), batch.get("indonesian", [])):
        scores.append(bt_valid_match_pct(source_text, reference_text, CBN_TO_ID_MAP))
    return {"bt_valid_match_percentage": scores}

# Pick the dataset to attach the metric to: cirebonese_with_metrics when it already
# exists in the kernel, otherwise cirebonese_only.
# NOTE(review): cirebonese_with_metrics is created by a later cell, so which branch
# runs depends on execution order; on a fresh top-to-bottom run the fallback is used.
try:
    base_ds = cirebonese_with_metrics
except NameError:
    base_ds = cirebonese_only

# Quick sample mean to verify (first rows only, capped at 5000)
sample_n = min(5000, len(base_ds))
sample_ds = base_ds.select(range(sample_n))
sample_bt = sample_ds.map(add_bt_metric_batch, batched=True)
mean_bt_sample = float(np.mean(sample_bt["bt_valid_match_percentage"]))
print(f"BT valid match percentage (mean on first {sample_n}): {mean_bt_sample:.2f}%")

# Full pass: attach "bt_valid_match_percentage" to every row. This code is live
# (not commented out) and rebinds base_ds to the mapped dataset.
base_ds = base_ds.map(add_bt_metric_batch, batched=True)
print(f"BT valid match percentage (mean, full): {float(np.mean(base_ds['bt_valid_match_percentage'])):.2f}%")

# Expose dataset with metric (base_ds itself is reused as a scratch name by other cells)
cirebonese_with_bt = base_ds

Map: 100%|██████████| 5000/5000 [00:01<00:00, 4852.71 examples/s]



BT valid match percentage (mean on first 5000): 81.72%


Map: 100%|██████████| 490291/490291 [01:38<00:00, 4966.77 examples/s] 

BT valid match percentage (mean, full): 81.62%





In [50]:
# Split Cirebonese data by backtranslation valid match percentage threshold
# good: bt_valid_match_percentage >= 80
# bad:  bt_valid_match_percentage < 80

THRESH_BT_CBN = 80


def cbn_is_good_bt(example):
    """True when the row's BT match score (0.0 if missing) reaches THRESH_BT_CBN.

    Any error while reading or comparing the score makes the row fail the check.
    """
    try:
        score = float(example.get("bt_valid_match_percentage", 0.0))
        verdict = score >= THRESH_BT_CBN
    except Exception:
        verdict = False
    return verdict


def cbn_is_bad_bt(example):
    """True when the row's BT match score (0.0 if missing) is below THRESH_BT_CBN.

    Any error while reading or comparing the score makes the row fail the check.
    """
    try:
        score = float(example.get("bt_valid_match_percentage", 0.0))
        verdict = score < THRESH_BT_CBN
    except Exception:
        verdict = False
    return verdict

# cirebonese_with_bt must already carry the 'bt_valid_match_percentage' column
cbn_good_bt = cirebonese_with_bt.filter(cbn_is_good_bt)
cbn_bad_bt = cirebonese_with_bt.filter(cbn_is_bad_bt)

print("Cirebonese BT split counts:")
print("  good (>=80):", len(cbn_good_bt))
print("  bad  (<80):", len(cbn_bad_bt))

Cirebonese BT split counts:
  good (>=80): 299071
  bad  (<80): 191220


In [48]:
count_tokens_in_dataset(cbn_good_bt, "cirebonese", num_tokens_from_string, encoding)

77022579

In [51]:
cbn_good_bt.save_to_disk("dataset/paralel_3_lang/cirebonese_annotation-filter_bt-valid-pct_80")

Saving the dataset (3/3 shards): 100%|██████████| 299071/299071 [00:05<00:00, 59766.06 examples/s]


In [53]:
print("[CIREBONESE VALID%] Computing valid percentage and filtering (threshold=50)")
# Add valid percentage metric to Cirebonese and filter with threshold 50 (like Balinese in cell 140)
import re
import numpy as np

# Ensure required inputs are available
assert 'cirebonese_only' in globals(), "cirebonese_only dataset not found. Run the earlier split cell first."
assert 'CBN_TO_ID_MAP' in globals(), "CBN_TO_ID_MAP not found. Load the Cirebonese dictionary map first."

# Build Cirebonese word set from dictionary keys (analogous to Balinese)
CBN_WORD_SET = set(CBN_TO_ID_MAP.keys())

# Use existing calculate_valid_percentage if present; otherwise define it
def _calc_valid_pct_impl(text, dictionary_words, kbbi_wordlist=None):
    """Fallback valid-percentage metric, used when calculate_valid_percentage is not defined.

    Implements the same rule as calculate_valid_percentage so that the result does
    not depend on which of the two functions happens to be available:
      * a token found in ``dictionary_words`` is valid;
      * if ``kbbi_wordlist`` is given, a token found in neither collection is valid too.

    (The previous fallback instead excluded dictionary words that were also in
    ``kbbi_wordlist`` and never counted out-of-dictionary tokens, so the two
    implementations disagreed whenever a KBBI word list was supplied.)

    Returns:
        Percentage (0-100) of valid tokens; 0.0 for ``None`` or text without tokens.
    """
    if text is None:
        return 0.0
    tokens = re.findall(r"\b\w+\b", str(text).lower())
    if not tokens:
        return 0.0
    valid = 0
    for tok in tokens:
        if tok in dictionary_words:
            valid += 1
        elif kbbi_wordlist is not None and tok not in kbbi_wordlist:
            # Unknown to both the dictionary and KBBI: treated as valid.
            valid += 1
    return (valid / len(tokens)) * 100.0

# Wrapper that prefers existing function if defined
try:
    calculate_valid_percentage  # type: ignore
    _calc = calculate_valid_percentage  # reuse from Balinese metric
except NameError:
    _calc = _calc_valid_pct_impl

# Batch mapper for Cirebonese

def add_valid_percentage_batch_cbn(batch):
    """Batched mapper: score every "cirebonese" text of the batch.

    Returns a dict with a single "valid_percentage" list, computed with ``_calc``
    against ``CBN_WORD_SET`` and ``KBBI_WORDLIST``. A batch without a "cirebonese"
    key yields an empty list.
    """
    scores = []
    for text in batch.get('cirebonese', []):
        scores.append(_calc(text, CBN_WORD_SET, KBBI_WORDLIST))
    return {"valid_percentage": scores}

# Compute across the full dataset (single-process for notebook stability)
base_ds = cirebonese_only
cirebonese_with_metrics = base_ds.map(
    add_valid_percentage_batch_cbn,
    batched=True,
    batch_size=1000,
    num_proc=None,
    desc="Computing valid% for Cirebonese"
)

# Summary
mean_valid_pct_cbn = float(np.mean(cirebonese_with_metrics["valid_percentage"])) if len(cirebonese_with_metrics) > 0 else 0.0
print(f"Cirebonese valid percentage (mean): {mean_valid_pct_cbn:.2f}%")

# Partition the Cirebonese rows around the valid-percentage threshold (50)
THRESH_VALID_CBN = 50.0

def cbn_is_good_valid(example):
    """True when the row's "valid_percentage" (0.0 if missing) reaches THRESH_VALID_CBN."""
    score = float(example.get("valid_percentage", 0.0))
    return score >= THRESH_VALID_CBN

def cbn_is_bad_valid(example):
    """True when the row's "valid_percentage" (0.0 if missing) is below THRESH_VALID_CBN."""
    score = float(example.get("valid_percentage", 0.0))
    return score < THRESH_VALID_CBN

# Single-process filtering, first the keepers and then the rejects.
cbn_good_valid, cbn_bad_valid = (
    cirebonese_with_metrics.filter(predicate, num_proc=1)
    for predicate in (cbn_is_good_valid, cbn_is_bad_valid)
)

print(f"Cirebonese valid% good (>= {THRESH_VALID_CBN}): {len(cbn_good_valid):,}")
print(f"Cirebonese valid% bad  (< {THRESH_VALID_CBN}): {len(cbn_bad_valid):,}")

[CIREBONESE VALID%] Computing valid percentage and filtering (threshold=50)


Computing valid% for Cirebonese: 100%|██████████| 490291/490291 [00:47<00:00, 10284.39 examples/s]


Cirebonese valid percentage (mean): 46.31%


Filter: 100%|██████████| 490291/490291 [00:46<00:00, 10645.12 examples/s]
Filter: 100%|██████████| 490291/490291 [00:46<00:00, 10645.12 examples/s]
Filter: 100%|██████████| 490291/490291 [00:58<00:00, 8340.29 examples/s] 

Cirebonese valid% good (>= 50.0): 187,433
Cirebonese valid% bad  (< 50.0): 302,858





In [54]:
cbn_good_valid.save_to_disk("dataset/paralel_3_lang/cirebonese_annotation-filter_valid-pct_50")

Saving the dataset (2/2 shards): 100%|██████████| 187433/187433 [00:03<00:00, 52308.59 examples/s]



In [55]:
print("[SAVE DATASETS] Pruning columns to 'text' and saving under dataset/cpt ...")
import os

# Ensure required datasets exist
required_vars = [
    ('cbn_good_valid', 'cirebonese', 'cbn_good_valid'),
    ('cbn_good_bt', 'cirebonese', 'cbn_good_bt'),
    ('balinese_good', 'balinese', 'balinese_good'),
    ('bal_good_bt', 'balinese', 'bal_good_bt'),
]

save_root = 'dataset/cpt'
os.makedirs(save_root, exist_ok=True)

from datasets import Dataset

def process_and_save(ds, lang_col, out_name):
    """Reduce ``ds`` to a single 'text' column and save it under ``save_root/out_name``.

    Args:
        ds: dataset that must contain the column ``lang_col``.
        lang_col: name of the column to keep; it is renamed to 'text'.
        out_name: sub-directory of the module-level ``save_root`` to write to.

    Returns:
        The pruned, renamed dataset that was written to disk.

    Raises:
        AssertionError: if ``lang_col`` is not a column of ``ds``.
    """
    assert lang_col in ds.column_names, f"Column '{lang_col}' not found in dataset '{out_name}'. Available: {ds.column_names}"

    # Drop everything except the language column, then give it the canonical name.
    extras = [name for name in ds.column_names if name != lang_col]
    ds_text = ds
    if extras:
        ds_text = ds_text.remove_columns(extras)
    if lang_col != 'text':
        ds_text = ds_text.rename_column(lang_col, 'text')

    out_dir = os.path.join(save_root, out_name)
    ds_text.save_to_disk(out_dir)
    print(f"Saved {out_name}: {len(ds_text):,} rows -> {out_dir}")
    return ds_text

# Process all requested datasets
_name_to_obj = globals()
for var_name, lang_col, out_name in required_vars:
    assert var_name in _name_to_obj, f"Dataset variable '{var_name}' is not defined."
    ds_obj = _name_to_obj[var_name]
    process_and_save(ds_obj, lang_col, out_name)

[SAVE DATASETS] Pruning columns to 'text' and saving under dataset/cpt ...


Saving the dataset (1/1 shards): 100%|██████████| 187433/187433 [00:01<00:00, 109327.29 examples/s]


Saved cbn_good_valid: 187,433 rows -> dataset/cpt/cbn_good_valid


Saving the dataset (2/2 shards): 100%|██████████| 299071/299071 [00:04<00:00, 69436.39 examples/s]


Saved cbn_good_bt: 299,071 rows -> dataset/cpt/cbn_good_bt


Saving the dataset (1/1 shards): 100%|██████████| 226950/226950 [00:03<00:00, 62165.33 examples/s]


Saved balinese_good: 226,950 rows -> dataset/cpt/balinese_good


Saving the dataset (1/1 shards): 100%|██████████| 241408/241408 [00:03<00:00, 76124.04 examples/s]


Saved bal_good_bt: 241,408 rows -> dataset/cpt/bal_good_bt


In [60]:
print("[VALIDATION SPLIT] Creating validation sets from HQ datasets (Balinese, Cirebonese)")
import os
from datasets import load_from_disk

# Config: choose minimum validation sizes suitable for small LLM (causal LM) from scratch
# Rationale: 1k-5k examples often suffice for stable validation; here we use 5k for large Bali HQ, 500 for smaller CBN HQ
VAL_SIZE_BALI = 5_000
VAL_SIZE_CBN = 500
SAVE_DIR = "dataset/cpt"
os.makedirs(SAVE_DIR, exist_ok=True)

# Helper to ensure a single 'text' column exists

def normalize_to_text_column(ds):
    """Return `ds` reduced to a single column named 'text'.

    Resolution order:
      1. An existing 'text' column wins; every other column is dropped.
      2. Otherwise the first of 'balinese', 'cirebonese', 'indonesian' that is
         present is kept (all other columns dropped) and renamed to 'text'.
      3. Otherwise, a dataset with exactly one column has that column renamed.

    Args:
        ds: object exposing `column_names`, `remove_columns` and `rename_column`.

    Returns:
        `ds` itself when it already consists of only a 'text' column, otherwise
        the object returned by the remove/rename calls.

    Raises:
        ValueError: if none of the rules above identifies a text column.
    """
    names = ds.column_names

    # Rule 1: an existing 'text' column is authoritative; drop everything else
    # so no labels or extra fields leak into the result.
    if "text" in names:
        extras = [name for name in names if name != "text"]
        if not extras:
            return ds
        return ds.remove_columns(extras)

    # Rule 2: fall back to the first known language column, in priority order.
    known = next(
        (name for name in ("balinese", "cirebonese", "indonesian") if name in names),
        None,
    )
    if known is not None:
        if len(names) > 1:
            single = ds.remove_columns([name for name in names if name != known])
        else:
            single = ds
        return single.rename_column(known, "text")

    # Rule 3: a lone column of any other name is taken to be the text.
    if len(names) == 1:
        return ds.rename_column(names[0], "text")

    raise ValueError(f"Cannot determine text column from columns: {names}")

# Load HQ datasets from SAVE_DIR
bali_hq_path = os.path.join(SAVE_DIR, "bali_hq_200k")
cbn_hq_path = os.path.join(SAVE_DIR, "cbn_hq_2k")

bali_hq = load_from_disk(bali_hq_path)
cbn_hq = load_from_disk(cbn_hq_path)

# Normalize to 'text' and prune other columns
bali_hq_text = normalize_to_text_column(bali_hq)
cbn_hq_text = normalize_to_text_column(cbn_hq)

# Decide actual validation sizes based on availability: never request more rows than exist
n_bali_val = min(VAL_SIZE_BALI, len(bali_hq_text))
n_cbn_val = min(VAL_SIZE_CBN, len(cbn_hq_text))
print(f"Balinese HQ size: {len(bali_hq_text):,} -> val: {n_bali_val:,}")
print(f"Cirebonese HQ size: {len(cbn_hq_text):,} -> val: {n_cbn_val:,}")

# Shuffle with a fixed seed (so the selection is reproducible) and take the head for validation.
# Note: the selected rows are NOT removed from bali_hq / cbn_hq in this cell.
bali_val = bali_hq_text.shuffle(seed=42).select(range(n_bali_val))
cbn_val = cbn_hq_text.shuffle(seed=42).select(range(n_cbn_val))

# Save validation sets; the directory name records the actual split size
bali_val_dir = os.path.join(SAVE_DIR, f"bali_valid_hq_{n_bali_val}")
cbn_val_dir = os.path.join(SAVE_DIR, f"cbn_valid_hq_{n_cbn_val}")

# Overwrite behavior: allow saving anew by removing existing dirs if needed
import shutil  # imported once here instead of inside the loop body

for p in [bali_val_dir, cbn_val_dir]:
    if os.path.isdir(p):
        # Clean existing to avoid Arrow dataset save errors
        shutil.rmtree(p)

bali_val.save_to_disk(bali_val_dir)
cbn_val.save_to_disk(cbn_val_dir)

print(f"Saved Balinese validation -> {bali_val_dir}")
print(f"Saved Cirebonese validation -> {cbn_val_dir}")

[VALIDATION SPLIT] Creating validation sets from HQ datasets (Balinese, Cirebonese)
Balinese HQ size: 201,404 -> val: 5,000
Cirebonese HQ size: 2,105 -> val: 500


Saving the dataset (0/1 shards):   0%|          | 0/5000 [00:00<?, ? examples/s]

Saving the dataset (1/1 shards): 100%|██████████| 5000/5000 [00:00<00:00, 86217.75 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 500/500 [00:00<00:00, 47302.40 examples/s]

Saved Balinese validation -> dataset/cpt/bali_valid_hq_5000
Saved Cirebonese validation -> dataset/cpt/cbn_valid_hq_500





In [61]:
# Total token count over the 'text' column of the Balinese validation split,
# measured with the tiktoken encoder created at the top of the notebook.
count_tokens_in_dataset(bali_val, "text", num_tokens_from_string, encoding)

268637

In [62]:
# Total token count over the 'text' column of the Cirebonese validation split,
# measured with the tiktoken encoder created at the top of the notebook.
count_tokens_in_dataset(cbn_val, "text", num_tokens_from_string, encoding)

361418

In [63]:
# Remove validation examples from HQ datasets and save cleaned HQ splits
from datasets import Dataset

# Helper to normalize to a single 'text' column
def _to_text_only(ds: Dataset) -> Dataset:
    cols = ds.column_names
    if 'text' in cols:
        return ds.remove_columns([c for c in cols if c != 'text']) if len(cols) > 1 else ds
    for c in ('balinese', 'cirebonese', 'indonesian'):
        if c in cols:
            tmp = ds.remove_columns([x for x in cols if x != c]) if len(cols) > 1 else ds
            return tmp.rename_column(c, 'text') if c != 'text' else tmp
    # Fallback: single column dataset
    if len(cols) == 1 and cols[0] != 'text':
        return ds.rename_column(cols[0], 'text')
    return ds

# Ensure text columns
_bali_hq_txt = _to_text_only(bali_hq)
_bali_val_txt = _to_text_only(bali_val)
_cbn_hq_txt = _to_text_only(cbn_hq)
_cbn_val_txt = _to_text_only(cbn_val)

# Build fast lookup sets of validation texts (stripped, None skipped)
_bali_val_set = {s.strip() for s in _bali_val_txt['text'] if s is not None}
_cbn_val_set = {s.strip() for s in _cbn_val_txt['text'] if s is not None}

# Filter HQ to exclude any row that appears in validation by exact (stripped) text match.
# Rows whose text is None are dropped as well. Because matching is by text, every HQ row
# sharing a text with a validation row is removed, not just the sampled one.
# batched=True hands the predicate a dict of column lists and expects one bool per row;
# this yields the same rows as a per-example predicate with far fewer Python calls.
bali_hq_no_val = _bali_hq_txt.filter(
    lambda batch: [t is not None and t.strip() not in _bali_val_set for t in batch['text']],
    batched=True,
)
cbn_hq_no_val = _cbn_hq_txt.filter(
    lambda batch: [t is not None and t.strip() not in _cbn_val_set for t in batch['text']],
    batched=True,
)

# Save cleaned HQ datasets
bali_hq_no_val_dir = 'dataset/cpt/bali_hq_no_val'
cbn_hq_no_val_dir = 'dataset/cpt/cbn_hq_no_val'

bali_hq_no_val.save_to_disk(bali_hq_no_val_dir)
cbn_hq_no_val.save_to_disk(cbn_hq_no_val_dir)

# Report before/after sizes so the split can be checked by eye
print({
    'bali_hq_total': len(_bali_hq_txt),
    'bali_val_total': len(_bali_val_txt),
    'bali_hq_no_val': len(bali_hq_no_val),
    'bali_saved_to': bali_hq_no_val_dir,
})
print({
    'cbn_hq_total': len(_cbn_hq_txt),
    'cbn_val_total': len(_cbn_val_txt),
    'cbn_hq_no_val': len(cbn_hq_no_val),
    'cbn_saved_to': cbn_hq_no_val_dir,
})

Filter: 100%|██████████| 201404/201404 [00:31<00:00, 6420.90 examples/s]
Filter: 100%|██████████| 2105/2105 [00:00<00:00, 7036.11 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 196404/196404 [00:00<00:00, 198753.85 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1605/1605 [00:00<00:00, 37809.23 examples/s]


{'bali_hq_total': 201404, 'bali_val_total': 5000, 'bali_hq_no_val': 196404, 'bali_saved_to': 'dataset/cpt/bali_hq_no_val'}
{'cbn_hq_total': 2105, 'cbn_val_total': 500, 'cbn_hq_no_val': 1605, 'cbn_saved_to': 'dataset/cpt/cbn_hq_no_val'}
