In [23]:
import os
import json
import glob
import pickle

from dotenv import load_dotenv

load_dotenv()

True

In [24]:
from openai import OpenAI

client = OpenAI()

def llm_call(model, messages, **call_args):
    """Send one chat-completion request through the module-level ``client``.

    Args:
        model: Value forwarded as the ``model`` argument.
        messages: Value forwarded as the ``messages`` argument.
        **call_args: Any extra keyword arguments, forwarded unchanged.

    Returns:
        Whatever ``client.chat.completions.create`` returns.
    """
    request_kwargs = {"model": model, "messages": messages, **call_args}
    return client.chat.completions.create(**request_kwargs)

def create_batch_req_object(req_id, model, messages, response_format, temperature=0.0, max_tokens=None):
    """Build one request entry for a Batch API input file (chat completions).

    Args:
        req_id: Stored as ``custom_id`` so the response can be matched back
            to the request.
        model: Model name placed in the request body.
        messages: Chat messages placed in the request body.
        response_format: Value placed under ``response_format`` in the body.
        temperature: Sampling temperature placed in the body (default 0.0).
        max_tokens: Optional completion-token cap. When ``None`` (default) the
            key is omitted, so the body is identical to what this function
            produced before the parameter existed.

    Returns:
        dict with keys ``custom_id``, ``method``, ``url`` and ``body``.
    """
    body = {
        "model": model,
        "messages": messages,
        "temperature": temperature,
        "response_format": response_format,
    }
    # Callers in this notebook compute a max_tokens value; only send it when
    # it is explicitly passed so existing call sites keep their exact payload.
    if max_tokens is not None:
        body["max_tokens"] = max_tokens

    return {
        "custom_id": req_id,
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": body,
    }

def llm_batch_api(batch_filepath, purpose="", desc="", completion_window="24h"):
    """Upload a batch input file and create a batch job for it.

    Args:
        batch_filepath: Path of the JSONL request file to upload.
        purpose: Forwarded to ``client.files.create``.
            NOTE(review): every call in this notebook passes ``"batch"``; the
            empty default is presumably not a usable value - confirm.
        desc: Stored in the batch metadata under ``"description"``.
        completion_window: Forwarded to ``client.batches.create``.

    Returns:
        The object returned by ``client.batches.create``.
    """
    # Use a context manager so the upload handle is closed as soon as the
    # upload call returns, even if it raises.
    with open(batch_filepath, "rb") as batch_file:
        batch_input_file = client.files.create(
            file=batch_file,
            purpose=purpose,
        )

    return client.batches.create(
        input_file_id=batch_input_file.id,
        endpoint="/v1/chat/completions",
        completion_window=completion_window,
        metadata={"description": desc},
    )

def _retrieve_batch_and_output(batch_id):
    """Fetch the current state of a batch and, when available, its output file.

    Prints the batch status. The output content is downloaded only when the
    status is ``"completed"`` and the batch carries an ``output_file_id``.

    Returns:
        ``(updated_batch, output_file)``; ``output_file`` is ``None`` when
        nothing was downloaded.
    """
    updated_batch = client.batches.retrieve(batch_id)
    print(f"Status of batch {updated_batch.id} is {updated_batch.status}")
    # Guard on output_file_id as well as status: asking for the content of a
    # missing file id would raise instead of reporting "no output".
    # NOTE(review): per the Batch object reference the field is optional
    # (e.g. when no request succeeded) - confirm.
    if updated_batch.status == "completed" and updated_batch.output_file_id:
        return updated_batch, client.files.content(updated_batch.output_file_id)
    return updated_batch, None


def llm_batch_check_retrieve(batch_info):
    """Check a batch given a batch object exposing ``.id``.

    Returns:
        ``(updated_batch, output_file_or_None)``.
    """
    return _retrieve_batch_and_output(batch_info.id)


def llm_batch_check_retrieve_dict(batch_info):
    """Check a batch given a dict (e.g. reloaded from JSON) with an ``"id"`` key.

    Returns:
        ``(updated_batch, output_file_or_None)``.
    """
    return _retrieve_batch_and_output(batch_info["id"])

In [25]:
import tiktoken

encoding = tiktoken.encoding_for_model("gpt-4o-mini")

def num_tokens_from_string(string: str, encoder) -> int:
    """Count the tokens that ``encoder.encode`` produces for ``string``."""
    token_ids = encoder.encode(string)
    return len(token_ids)

def count_tokens_in_dataset(dataset, num_tokens_from_string, encoder):
    """Total the token counts of the ``'text'`` field across ``dataset``.

    Args:
        dataset: Iterable of items indexable by ``'text'``.
        num_tokens_from_string: Callable ``(text, encoder) -> count``.
        encoder: Passed through to ``num_tokens_from_string``.

    Returns:
        Sum of the per-item counts (0 for an empty dataset).
    """
    return sum(num_tokens_from_string(row['text'], encoder) for row in dataset)

In [26]:
def write_jsonl(data_string, output_file):
    """
    Re-serialise a JSONL string into a file, one JSON document per line.

    Blank lines are dropped; every remaining line is parsed and dumped again
    with ``json.dumps`` defaults, so the file holds normalised JSON.

    Args:
        data_string (str): JSONL text, lines separated by '\\n'
        output_file (str): Destination path (overwritten)
    """
    stripped_lines = (chunk.strip() for chunk in data_string.split('\n'))

    with open(output_file, 'w', encoding='utf-8') as sink:
        for raw in stripped_lines:
            if not raw:
                continue  # whitespace-only line
            record = json.loads(raw)
            sink.write(f"{json.dumps(record)}\n")

def read_jsonl(input_file):
    """
    Load a JSONL file into a list, skipping whitespace-only lines.

    Args:
        input_file (str): Path of the JSONL file to read

    Returns:
        list: One parsed JSON value per non-blank line, in file order
    """
    with open(input_file, 'r', encoding='utf-8') as source:
        return [json.loads(raw) for raw in source if raw.strip()]

# Function to save the batch object using pickle
def save_batch_to_pickle(batch_obj, output_file="batch_data.pkl"):
    """Pickle ``batch_obj`` into ``output_file`` (overwriting it)."""
    with open(output_file, 'wb') as sink:
        pickle.dump(batch_obj, sink)

# Function to load the batch object from a pickle file
def load_batch_from_pickle(input_file="batch_data.pkl"):
    """Return the object unpickled from ``input_file``.

    Unpickling can execute arbitrary code, so only use this on files
    this notebook (or another trusted source) produced.
    """
    with open(input_file, 'rb') as source:
        restored = pickle.load(source)
    return restored

def create_batches(dataset, batch_size=1000):
    """Split ``dataset`` into consecutive slices of at most ``batch_size`` rows.

    Each slice is obtained with ``dataset.select(range(start, stop))``; the
    last slice may be shorter.

    Returns:
        list of the selected sub-datasets, in order.
    """
    total_rows = len(dataset)
    batches = []
    for start in range(0, total_rows, batch_size):
        stop = min(start + batch_size, total_rows)
        batches.append(dataset.select(range(start, stop)))
    return batches

def load_dictionary(file_path):
    """Parse the UTF-8 JSON file at ``file_path`` and return its content."""
    with open(file_path, 'r', encoding='utf-8') as source:
        raw_json = source.read()
    return json.loads(raw_json)

## Translate Parallel 60K (Cirebonese)

In [18]:
def split_text_into_ngrams(text, n):
    """Return every run of ``n`` consecutive whitespace-separated words.

    Args:
        text: String to split with ``str.split()``.
        n: Window size.

    Returns:
        list of tuples, one per window, in left-to-right order; empty when
        the text has fewer than ``n`` words.
    """
    tokens = text.split()
    window_count = len(tokens) - n + 1
    return [tuple(tokens[start:start + n]) for start in range(window_count)]

def get_dict_translation(text, dictionary):
    """List dictionary translations for the 1- to 3-word phrases found in ``text``.

    Trigrams are looked up first, then bigrams, then unigrams. Each phrase
    present in ``dictionary`` contributes one line of the form
    ``"- <phrase>: <t1>, <t2>, ...\\n"``. Matching is exact on the
    whitespace-split words, and a phrase that occurs several times in the
    text yields one line per occurrence.

    Args:
        text: Source text.
        dictionary: Mapping from phrase to an iterable of translation strings.

    Returns:
        The concatenated lines (empty string when nothing matches).
    """
    hint_lines = []
    for size in (3, 2, 1):
        for gram in split_text_into_ngrams(text, size):
            phrase = ' '.join(gram)
            if phrase in dictionary:
                hint_lines.append(f"- {phrase}: {', '.join(dictionary[phrase])}\n")
    return "".join(hint_lines)

In [20]:
txt = "Suku Cirebon adalah kelompok etnis berketurunan jawa cirebonan (rumpun jawa banyumasan) yang tersebar di sekitar wilayah Kabupaten Cirebon dan Kota Cirebon"

dictionary = load_dictionary("dict/idn_cbn.json")
print(get_dict_translation(txt, dictionary))

- di sekitar: leker, bangkung, bêngkok
- kelompok: sêsangga, gugus, gélémék, blok
- jawa: gula, jawi
- jawa: gula, jawi
- yang: ingkang, kang, sing
- tersebar: wiwir, amprak, angjrah, amiwir, ngamprak, jrah, nyêbar, makabalasah, lumra, anjrah, balasah, jêrah
- di: teng, ning, dipun, di, ada
- sekitar: sukat, longgar, lwa, aléba, lukat, cocor, amba, curuk, léba
- wilayah: wanua, distrik, ulayat, palêmahan
- dan: lan, dan



In [None]:
from tqdm import tqdm
import datasets

pt_data = datasets.load_from_disk("dataset/id_hq_data_dedup")

# Load the dictionary
dictionary = load_dictionary("dict/idn_cbn.json")

def get_prompt_text(data):
    """Replace ``data["text"]`` with an Indonesian-to-Cirebonese translation prompt.

    The prompt embeds the dictionary hints produced by
    ``get_dict_translation`` for the lower-cased text (looked up in the
    module-level ``dictionary``), followed by the original text wrapped in
    ``<id_text>`` tags.

    Args:
        data: Mapping with a ``"text"`` entry; it is modified in place.

    Returns:
        The same ``data`` object, with ``"text"`` now holding the prompt.
    """
    # Uses get_dict_translation, the helper defined in this notebook; the
    # previously referenced get_cbn_dict_translation is not defined anywhere.
    prompt_text = f"""Translate the given Indonesian text in the <id_text> tag below into Cirebonese with the help of some word-to-word translation provided below. For one word, there can be multiple translations, and you need to choose the right one based on the context. The translations are as follows:
{get_dict_translation(data["text"].lower(), dictionary)}
<id_text>
{data["text"]}
</id_text>

Return only the translated text in JSON format with key "translated_text"."""
    data["text"] = prompt_text
    return data

In [23]:
pt_data = pt_data.map(get_prompt_text)

In [24]:
pt_data.save_to_disk("id_hq_data_prompt")

Saving the dataset (9/9 shards): 100%|██████████| 651856/651856 [00:25<00:00, 26003.35 examples/s] 


In [11]:
pt_tokens = count_tokens_in_dataset(pt_data, num_tokens_from_string, encoding)

In [13]:
pt_tokens

1418349015

In [26]:
pt_data = pt_data.shuffle(seed=42)

In [27]:
pt_data_60k = pt_data.select(range(60000))

In [28]:
pt_tokens_60k = count_tokens_in_dataset(pt_data_60k, num_tokens_from_string, encoding)

In [29]:
pt_tokens_60k

133108166

In [30]:
133108166 / 4 / 1000000 * 0.3

9.98311245

In [31]:
pt_data_60k.save_to_disk("dataset/id_hq_data_prompt_60k")

Saving the dataset (1/1 shards): 100%|██████████| 60000/60000 [00:00<00:00, 137178.96 examples/s]


In [32]:
def batch_gen_translate_w_dict(prompt):
    """Return the fixed request settings for a dictionary-assisted translation.

    Args:
        prompt: Text used as the user message.

    Returns:
        Tuple ``(messages, model, temperature, max_tokens, response_format)``
        where ``messages`` is a system message followed by the user prompt.
    """
    system_message = {
        "role": "system",
        "content": "Always answer in a valid JSON format and provide only the JSON answer without anything else.",
    }
    user_message = {"role": "user", "content": prompt}

    return (
        [system_message, user_message],
        "gpt-4o-mini-2024-07-18",
        1,
        4096,
        {"type": "json_object"},
    )

In [56]:
import uuid

batch_req_objects = []
def create_batch_and_update_data(data):
    """Tag ``data`` with a fresh request id and queue its batch request.

    A ``req-<uuid4>`` id is stored under ``data["req_id"]``; the matching
    request object, built from ``data["text"]``, is appended to the
    module-level ``batch_req_objects`` list.

    Returns:
        The same ``data`` object with ``"req_id"`` set.
    """
    request_id = "req-" + str(uuid.uuid4())
    data["req_id"] = request_id

    # The max_tokens value returned by the helper is not forwarded to the request.
    messages, model, temperature, _max_tokens, response_format = batch_gen_translate_w_dict(data["text"])
    batch_req_objects.append(
        create_batch_req_object(
            req_id=request_id,
            model=model,
            messages=messages,
            response_format=response_format,
            temperature=temperature,
        )
    )
    return data

In [31]:
def save_batches_to_jsonl(batch_req_objects, batch_size, base_filename):
    """Write the request objects to numbered JSONL files of ``batch_size`` each.

    Files are named ``{base_filename}_batch_{k}.jsonl`` with ``k`` starting
    at 1; the last file may hold fewer requests. Writing is delegated to
    ``write_jsonl``.
    """
    chunk_starts = range(0, len(batch_req_objects), batch_size)
    for batch_number, start in enumerate(chunk_starts, start=1):
        chunk = batch_req_objects[start:start + batch_size]
        serialized = "\n".join(json.dumps(request) for request in chunk)
        write_jsonl(serialized, f"{base_filename}_batch_{batch_number}.jsonl")

In [58]:
# Build one batch request object per example (collected in batch_req_objects) and tag each example with its req_id; the JSONL files are written in the next cell
pt_data_60k = pt_data_60k.map(create_batch_and_update_data)

Map: 100%|██████████| 60000/60000 [00:03<00:00, 19475.29 examples/s]


In [59]:
save_batches_to_jsonl(batch_req_objects, 20000, "dataset/translate_batch/cbn_translate")

In [60]:
# Run the llm_batch_api function with all .jsonl files and gather batch_info
batch_files = [f for f in os.listdir("dataset/translate_batch/") if f.startswith("cbn_translate_batch_") and f.endswith(".jsonl")]
all_batch_info = []
for batch_file in batch_files:
    batch_number = batch_file.split('_')[-1].split('.')[0]
    desc = f"Batch {batch_number} of requests to generate translation from indonesian to Cirebonese with help of cirebonese word-by-word translation"
    batch_info = llm_batch_api(f"dataset/translate_batch/{batch_file}", purpose="batch", desc=desc, completion_window="24h")
    all_batch_info.append(batch_info)

In [61]:
all_batch_info_dict = [b.to_dict() for b in all_batch_info]
# Save all batch_info to a file
with open("dataset/translate_batch/cbn_translate_batch_info.json", "w") as f:
    json.dump(all_batch_info_dict, f, indent=4)

In [70]:
for batch_info_dict in all_batch_info_dict:
    updated_batch, output_file = llm_batch_check_retrieve_dict(batch_info_dict)
    print(updated_batch)
    if output_file:
        write_jsonl(output_file.text, f"translate_batch_output_{batch_info_dict['id']}.jsonl")

Status of batch batch_67b3ee9ae58881909226449f6d3f8202 is completed
Batch(id='batch_67b3ee9ae58881909226449f6d3f8202', completion_window='24h', created_at=1739845274, endpoint='/v1/chat/completions', input_file_id='file-Jo9oi5WKePKiKp3WrsLboS', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1739857274, error_file_id='file-XFbnGk3njZvCqoWqesPjCB', errors=None, expired_at=None, expires_at=1739931674, failed_at=None, finalizing_at=1739852857, in_progress_at=1739845280, metadata={'description': 'Batch 1 of requests to generate translation from indonesian to Cirebonese with help of cirebonese word-by-word translation'}, output_file_id='file-PxGzyXCYYvpu8Bn9dgsiW4', request_counts=BatchRequestCounts(completed=19991, failed=9, total=20000))
Status of batch batch_67b3eeac452c8190b9440e54c51c2ce4 is completed
Batch(id='batch_67b3eeac452c8190b9440e54c51c2ce4', completion_window='24h', created_at=1739845292, endpoint='/v1/chat/completions', input_file_id='

In [270]:
pt_data_60k = datasets.load_from_disk("dataset/id_hq_data_prompt_60k")

In [276]:
def extract_id_text(data):
    """Store the text found between the ``<id_text>`` tags of ``data["text"]``.

    Takes what follows the last ``"<id_text>\\n"`` and precedes the first
    ``"\\n</id_text>"`` after it, and saves it under ``data["indonesian"]``.
    A missing tag leaves that side of the text untrimmed.

    Returns:
        The same ``data`` object.
    """
    after_open_tag = data["text"].rpartition("<id_text>\n")[2]
    data["indonesian"] = after_open_tag.partition("\n</id_text>")[0]
    return data

In [277]:
pt_data_60k = pt_data_60k.map(extract_id_text)

Map: 100%|██████████| 60000/60000 [00:07<00:00, 8100.70 examples/s] 


In [279]:
pt_data_60k[0]

{'text': 'Translate the given Indonesian text in the <id_text> tag below into Cirebonese with the help of some word-to-word translation provided below. For one word, there can be multiple translations, and you need to choose the right one based on the context. The translations are as follows:\n- di: teng, ning, dipun, di, ada\n- kota: kota\n- pada: têng, ing, ajéro, ning jêro\n- tahun: naun, taun\n- mengelola: anggarap, garap\n- dan: lan, dan\n- daerah: wanua, distrik, ulayat, palêmahan\n- kota: kota\n\n<id_text>\nDistrik Hubin () berada di kota Sanmenxia, Henan, Tiongkok.\n\nPada tahun 2012, Distrik Hubin mengelola 8 subdistrik dan 3 daerah administrasi kota praja.\n\nReferensi \n\nTiongkok\n</id_text>\n\nReturn only the translated text in JSON format with key "translated_text".',
 'indonesian': 'Distrik Hubin () berada di kota Sanmenxia, Henan, Tiongkok.\n\nPada tahun 2012, Distrik Hubin mengelola 8 subdistrik dan 3 daerah administrasi kota praja.\n\nReferensi \n\nTiongkok'}

In [283]:
import jsonlines

# Map every prompt sent in the three Cirebonese request files to the UUID
# part of its custom_id (the text after "req-").
input_batch = {}
request_files = (
    "dataset/translate_batch/cbn_translate/cbn_translate_batch_1.jsonl",
    "dataset/translate_batch/cbn_translate/cbn_translate_batch_2.jsonl",
    "dataset/translate_batch/cbn_translate/cbn_translate_batch_3.jsonl",
)
for request_file in request_files:
    with jsonlines.open(request_file) as reader:
        for obj in reader:
            custom_id = obj['custom_id'].split("req-")[-1]
            # The user prompt is the last message of the request body.
            prompt = obj['body']['messages'][-1]['content']
            input_batch[prompt] = custom_id

In [285]:
def add_custom_id_to_data(data):
    """Attach the request id recorded in ``input_batch`` for this prompt.

    Looks up ``data["text"]`` in the module-level ``input_batch`` mapping and
    stores the result under ``data["custom_id"]``; a prompt that is not in
    the mapping raises ``KeyError``.

    Returns:
        The same ``data`` object.
    """
    prompt = data["text"]
    data["custom_id"] = input_batch[prompt]
    return data

In [286]:
pt_data_60k = pt_data_60k.map(add_custom_id_to_data)

Map: 100%|██████████| 60000/60000 [00:02<00:00, 20238.13 examples/s]


In [298]:
import jsonlines

# Collect the translated text of every usable response, keyed by the UUID
# part of its custom_id. Responses that cannot be used (unparseable JSON,
# missing fields, unexpected payload type) have their id printed.
output_batch = {}
output_files = (
    "dataset/translate_batch/cbn_translate/translate_batch_output_batch_67b3ee9ae58881909226449f6d3f8202.jsonl",
    "dataset/translate_batch/cbn_translate/translate_batch_output_batch_67b3eeac452c8190b9440e54c51c2ce4.jsonl",
    "dataset/translate_batch/cbn_translate/translate_batch_output_batch_67b3eebce45881908d7e2bf61d0720ad.jsonl",
)
for output_path in output_files:
    with jsonlines.open(output_path) as reader:
        for obj in reader:
            custom_id = obj['custom_id'].split("req-")[-1]
            try:
                translation = json.loads(obj['response']['body']['choices'][0]['message']['content'])
                translated_text = translation['translated_text']
                if isinstance(translated_text, str):
                    output_batch[custom_id] = translated_text
                elif isinstance(translated_text, list):
                    # The model sometimes returns the text as a list of pieces.
                    output_batch[custom_id] = "".join(translated_text)
                else:
                    # Neither str nor list: report it instead of dropping it silently.
                    print(custom_id)
            except Exception:
                print(custom_id)
                continue

d615fa42-9c4a-4c27-9cae-21b161a07d4d
f816ad49-7ed3-4500-8dc4-1b1693ee1594
4e2f3090-4720-404f-b647-e29b747bcac5
12865bbc-4e70-48b3-bf36-01e92ded2992
2b23dc56-fa18-4029-92d6-e81f8aa10643
940aba19-65ae-4b16-807f-75261efe04c9
cbaf92b7-4bed-4c32-a5fa-2d9185a3bad0
8d0bedd2-7153-449e-859f-fc53008b8cf0
1eec2434-8343-4c02-8380-a9f442b319d8
c108ba52-7ccf-4206-a0df-4964dfe49158
d3da2e9e-4253-4a5e-84c3-dd8973f2d8de
63a2960a-e6db-4006-925b-b80f4d32d0f6
23e5ee30-d648-476a-a0e9-82535b9d3618
2b3bc33c-4585-40aa-aacd-805f5f9350a8
0be62978-4a9c-43c0-8e25-7b4d639019dc
202863ce-58c1-467a-9e04-6bf1ffc798f5
733b803a-541a-4195-9740-a505f375ca2c
9063d0b7-da47-48b2-a4c4-42f33429d49b
c2e4e0f1-7df0-4fb0-af85-fc1a4a6ac1c5
49cf3fe5-e6f5-4267-931b-eec4a1454f66
95f3a00d-c8c4-4be9-9838-9f17a4bd1bf6
c43f90aa-fadd-4076-bc60-cc03dc19be0f
453c512f-6637-489b-816d-fdadb8b4321f
4cfff412-f71b-447b-9295-40df0d8a2b2f
ace17345-380f-461d-8944-8fd6804ef89a
5a258af4-b7f3-49bd-9949-c938639fcc4a
a5ea723b-eb10-4838-acb2-c4a2773eb32a
6

In [301]:
def add_cirebonese_translation(data):
    """Attach the Cirebonese translation recorded for ``data["custom_id"]``.

    Returns:
        ``data`` with a ``"cirebonese"`` entry when the id is present in the
        module-level ``output_batch``; ``None`` otherwise.
    """
    # NOTE(review): 60000 rows are mapped but 59859 are saved afterwards, so
    # rows for which this returns None appear to be dropped by Dataset.map in
    # the library version used - confirm; Dataset.filter is the documented
    # way to drop rows.
    request_id = data["custom_id"]
    if request_id not in output_batch:
        return None
    data["cirebonese"] = output_batch[request_id]
    return data

In [302]:
pt_data_60k_new = pt_data_60k.map(add_cirebonese_translation)

Map: 100%|██████████| 60000/60000 [00:05<00:00, 10797.62 examples/s]


In [305]:
pt_data_60k_new.save_to_disk("dataset/paralel_dataset_60k")

Saving the dataset (2/2 shards): 100%|██████████| 59859/59859 [00:00<00:00, 147628.89 examples/s]


## Translate Parallel 44K (Bali)

In [10]:
import datasets

paralel_id_cbn_44k = datasets.load_from_disk("dataset/paralel_2_lang/paralel_id_cbn_127k_filtered")

  from .autonotebook import tqdm as notebook_tqdm


In [11]:
def fix_data_features(data):
    """Project a row onto ``id`` / ``indonesian`` / ``cirebonese``.

    ``"text"`` becomes ``"indonesian"`` and ``"translated_text"`` becomes
    ``"cirebonese"``; ``"id"`` is copied as is.
    """
    source_to_target = {
        "id": "id",
        "text": "indonesian",
        "translated_text": "cirebonese",
    }
    return {target: data[source] for source, target in source_to_target.items()}

In [12]:
paralel_id_cbn_44k = paralel_id_cbn_44k.map(fix_data_features, remove_columns=["text", "translated_text", "title", "url"])

In [13]:
paralel_id_cbn_44k

Dataset({
    features: ['id', 'indonesian', 'cirebonese'],
    num_rows: 44743
})

In [446]:
indo_bali_dict = load_dictionary("dict/idn_bali.json")

def get_translate_bali_prompt_text(data):
    """Add an Indonesian-to-Balinese translation prompt under ``data["prompt_text"]``.

    The prompt contains the dictionary hints that ``get_dict_translation``
    finds for the lower-cased ``data["indonesian"]`` in the module-level
    ``indo_bali_dict``, then the original text wrapped in ``<id_text>`` tags.

    Returns:
        The same ``data`` object.
    """
    source_text = data["indonesian"]
    word_hints = get_dict_translation(source_text.lower(), indo_bali_dict)

    data["prompt_text"] = f"""Translate the given Indonesian text in the <id_text> tag below into Balinese with the help of some word-to-word translation provided below. For one word, there can be multiple translations, and you need to choose the right one based on the context. Not all word need to be translated such as named entities, therefore you need to properly choose which word need to be translated and which one is the right translation based on context. The translations are as follows:
{word_hints}
<id_text>
{source_text}
</id_text>

Return only the translated text in JSON format with key "translated_text"."""

    return data

In [447]:
paralel_44k_prompt_bali = paralel_id_cbn_44k.map(get_translate_bali_prompt_text)

Map: 100%|██████████| 44743/44743 [00:22<00:00, 1998.25 examples/s]


In [448]:
model = "gpt-4o-mini-2024-07-18"
temperature = 0
response_format = {"type": "json_object"}

batch_req_objects = []
for data in paralel_44k_prompt_bali:
  messages = [
      {"role": "user", "content": data["prompt_text"]}
  ]
  batch_req_object = create_batch_req_object(req_id=data["id"], model=model, messages=messages, response_format=response_format, temperature=temperature)
  
  batch_req_objects.append(batch_req_object)

In [449]:
save_batches_to_jsonl(batch_req_objects, 10000, "dataset/translate_batch/bali_translate_paralel_44k/bali_translate")

In [450]:
# Run the llm_batch_api function with all .jsonl files and gather batch_info
batch_files = [f for f in os.listdir("dataset/translate_batch/bali_translate_paralel_44k") if f.startswith("bali_translate") and f.endswith(".jsonl")]
all_batch_info = []
for batch_file in batch_files:
    batch_number = batch_file.split('_')[-1].split('.')[0]
    desc = f"Batch {batch_number} of requests to generate Balinese translation from Indonesian text (44k sample) with help of dictionary"
    batch_info = llm_batch_api(f"dataset/translate_batch/bali_translate_paralel_44k/{batch_file}", purpose="batch", desc=desc, completion_window="24h")
    all_batch_info.append(batch_info)

In [451]:
all_batch_info_dict = [b.to_dict() for b in all_batch_info]
# Save all batch_info to a file
with open("dataset/translate_batch/bali_translate_paralel_44k/bali_translate_batch_info.json", "w") as f:
    json.dump(all_batch_info_dict, f, indent=4)

In [5]:
with open("dataset/translate_batch/bali_translate_paralel_44k/bali_translate_batch_info.json", 'r') as f:
    all_batch_info_dict = json.load(f)

for idx, batch_info_dict in enumerate(all_batch_info_dict):
    num_batch = batch_info_dict['metadata']['description'].split(" ")[1]
    updated_batch, output_file = llm_batch_check_retrieve_dict(batch_info_dict)
    print("Num batch:", num_batch, "->", updated_batch)
    if output_file:
        write_jsonl(output_file.text, f"dataset/translate_batch/bali_translate_paralel_44k/bali_translate_output_{idx}.jsonl")

Status of batch batch_67c7d852377c8190bf58e48de6ea4760 is completed
Num batch: 3 -> Batch(id='batch_67c7d852377c8190bf58e48de6ea4760', completion_window='24h', created_at=1741150290, endpoint='/v1/chat/completions', input_file_id='file-HcxLi1j8JTCDNUQFFNo6yh', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1741165270, error_file_id=None, errors=None, expired_at=None, expires_at=1741236690, failed_at=None, finalizing_at=1741164273, in_progress_at=1741150297, metadata={'description': 'Batch 3 of requests to generate Balinese translation from Indonesian text (44k sample) with help of dictionary'}, output_file_id='file-3FFRwLqZqUsVymNnjhJ3zP', request_counts=BatchRequestCounts(completed=10000, failed=0, total=10000))
Status of batch batch_67c7d85feca881908fc2a95bd8856eb3 is completed
Num batch: 4 -> Batch(id='batch_67c7d85feca881908fc2a95bd8856eb3', completion_window='24h', created_at=1741150304, endpoint='/v1/chat/completions', input_file_id='file-

In [7]:
import jsonlines
# Create a mapping of translations from JSONL
output_files = [f for f in os.listdir("dataset/translate_batch/bali_translate_paralel_44k") if f.startswith("bali_translate_output") and f.endswith(".jsonl")]

failed_id = []
translations = {}
for batch_file in output_files:
    with jsonlines.open(f"dataset/translate_batch/bali_translate_paralel_44k/{batch_file}") as reader:
        for obj in reader:
            custom_id = obj['custom_id']
            try:
                translation = json.loads(obj['response']['body']['choices'][0]['message']['content'])
                if obj['response']['body']['choices'][0]['finish_reason'] != "length":
                  if isinstance(translation['translated_text'], str):
                    translations[custom_id] = translation['translated_text']
                  elif isinstance(translation['translated_text'], list):
                    translations[custom_id] = "".join(translation['translated_text'])
            except Exception as e:
               failed_id.append(custom_id)
               print(custom_id)
               print(batch_file)
               continue

def add_bali_translation_to_dataset(data):
    """Attach the Balinese translation recorded for ``data["id"]``.

    Looks the id up in the module-level ``translations`` mapping.

    Returns:
        ``data`` with a ``"balinese"`` entry when a translation exists;
        ``None`` otherwise.
    """
    # No try/except here: the former bare ``except`` re-indexed
    # ``translations`` inside its handler (which can raise again) and reported
    # "Translation not found" on a path where the translation had been found.
    # Unexpected errors now surface with their real type and traceback.
    example_id = data["id"]
    if example_id not in translations:
        return None
    data["balinese"] = translations[example_id]
    return data

3415167
bali_translate_output_4.jsonl
3441063
bali_translate_output_4.jsonl
3443302
bali_translate_output_4.jsonl
3587953
bali_translate_output_4.jsonl
3603829
bali_translate_output_4.jsonl
3605668
bali_translate_output_4.jsonl
3620345
bali_translate_output_4.jsonl
3623529
bali_translate_output_4.jsonl
3623771
bali_translate_output_4.jsonl
3630645
bali_translate_output_4.jsonl
3635720
bali_translate_output_4.jsonl
3646846
bali_translate_output_4.jsonl
3647257
bali_translate_output_4.jsonl
3652727
bali_translate_output_4.jsonl
3681440
bali_translate_output_4.jsonl
3684608
bali_translate_output_4.jsonl
3693902
bali_translate_output_4.jsonl
3706284
bali_translate_output_4.jsonl
3706969
bali_translate_output_4.jsonl
3718182
bali_translate_output_4.jsonl
3722817
bali_translate_output_4.jsonl
3824577
bali_translate_output_4.jsonl
3833417
bali_translate_output_4.jsonl
3841360
bali_translate_output_4.jsonl
3848849
bali_translate_output_4.jsonl
3864218
bali_translate_output_4.jsonl
3874202
bali

In [8]:
len(translations.keys())

43818

In [431]:
paralel_300k_prompt_bali[260606]

{'id': 'cf9fbfb7-32aa-4e3b-a06a-c40fc875fe2d',
 'cirebonese': 'Carex kashmirensis punika spesies tumbuhan kados suket ingkang kalebet ke famili Cyperaceae. Spesies punika ugi minangka pihak saking ordo Poales. Spesies Carex kashmirensis dewek minangka pihak saking genus Carex. Nama ilmiah saking spesies punika pertama kalih diterbitkan dening C.B.Clarke.',
 'indonesian': 'Carex kashmirensis adalah spesies tumbuhan seperti rumput yang tergolong ke dalam famili Cyperaceae. Spesies ini juga merupakan bagian dari ordo Poales. Spesies Carex kashmirensis sendiri merupakan bagian dari genus Carex. Nama ilmiah dari spesies ini pertama kali diterbitkan oleh C.B.Clarke.\n\nReferensi \n\nCarex',
 'prompt_text': 'Translate the given Indonesian text in the <id_text> tag below into Balinese with the help of some word-to-word translation provided below. For one word, there can be multiple translations, and you need to choose the right one based on the context. Not all word need to be translated such 

In [14]:
paralel_44k = paralel_id_cbn_44k.map(add_bali_translation_to_dataset)

Map: 100%|██████████| 44743/44743 [00:19<00:00, 2299.89 examples/s] 


In [15]:
paralel_44k

Dataset({
    features: ['id', 'indonesian', 'cirebonese', 'balinese'],
    num_rows: 43818
})

In [16]:
paralel_44k.save_to_disk("dataset/paralel_3_lang/paralel_dataset_44k")

Saving the dataset (1/1 shards): 100%|██████████| 43818/43818 [00:09<00:00, 4659.31 examples/s]


## Fill Missing Indonesian Translations in the Balinese Dictionary

In [76]:
bali_dict = load_dictionary("dict/transformed_bali_dict.json")

In [81]:
from datasets import Dataset, DatasetDict
import json
import uuid

def create_translation_dataset(input_dict):
    """Build a dataset of words that have an English but no Indonesian translation.

    Args:
        input_dict: Mapping from Balinese word to a mapping holding
            ``"translation_english"`` and ``"translation_indonesian"``
            sequences.

    Returns:
        ``Dataset`` with columns ``id`` (a fresh uuid4 string per row),
        ``bali_word``, ``translation_english`` and ``translation_indonesian``
        (always an empty list).
    """
    ids, bali_words, english_lists, indonesian_lists = [], [], [], []

    for bali_word, values in input_dict.items():
        english_count = len(values["translation_english"])
        indonesian_count = len(values["translation_indonesian"])

        # Keep only entries that have English glosses but no Indonesian ones.
        if english_count == 0 or indonesian_count != 0:
            continue

        ids.append(str(uuid.uuid4()))
        bali_words.append(bali_word)
        english_lists.append(values['translation_english'])
        indonesian_lists.append([])

    return Dataset.from_dict({
        'id': ids,
        'bali_word': bali_words,
        'translation_english': english_lists,
        'translation_indonesian': indonesian_lists,
    })

In [82]:
# Create the dataset
translation_dataset = create_translation_dataset(bali_dict)

In [83]:
translation_dataset

Dataset({
    features: ['id', 'bali_word', 'translation_english', 'translation_indonesian'],
    num_rows: 4545
})

In [87]:
def get_id_translation_prompt_text(data):
    """Store a Balinese-to-Indonesian word translation prompt under ``data["prompt"]``.

    The prompt quotes ``data["bali_word"]`` and the string form of
    ``data["translation_english"]`` and asks for a JSON answer with key
    ``"translation_indonesian"``.

    Returns:
        The same ``data`` object.
    """
    bali_word = data["bali_word"]
    english_gloss = data["translation_english"]

    data["prompt"] = f"""Translate the given Balinese word into Indonesian, with the help of its English translation:
- Balinese word: {bali_word}
- English translation: {english_gloss}

The result must be a list of string, which is the Indonesian translation from the Balinese word, and since it is a list it can be more than one translation. Return only the Indonesian translation in JSON format with key "translation_indonesian"."""
    return data

In [88]:
translation_dataset = translation_dataset.map(get_id_translation_prompt_text)

Map: 100%|██████████| 4545/4545 [00:00<00:00, 7166.05 examples/s]


In [89]:
translation_dataset[0]

{'id': '5338f62e-0155-4b45-9928-c1b496dbf153',
 'bali_word': 'abah-abah',
 'translation_english': ['tabiat', 'bakat'],
 'translation_indonesian': [],
 'prompt': 'Translate the given Balinese word into Indonesian, with the help of its English translation:\n- Balinese word: abah-abah\n- English translation: [\'tabiat\', \'bakat\']\n\nThe result must be a list of string, which is the Indonesian translation from the Balinese word, and since it is a list it can be more than one translation. Return only the Indonesian translation in JSON format with key "translation_indonesian".'}

In [107]:
model = "gpt-4o-2024-08-06"
temperature = 0
max_tokens = 512
response_format = {"type": "json_object"}

batch_req_objects = []
for data in translation_dataset:
  messages = [
      {"role": "user", "content": data["prompt"]}
  ]
  batch_req_object = create_batch_req_object(req_id=data["id"], model=model, messages=messages, response_format=response_format, temperature=temperature)
  
  batch_req_objects.append(batch_req_object)

In [110]:
save_batches_to_jsonl(batch_req_objects, 20000, "dataset/translate_batch/bali_dict")

In [111]:
# Run the llm_batch_api function with all .jsonl files and gather batch_info
batch_files = [f for f in os.listdir("dataset/translate_batch/") if f.startswith("bali_dict_batch") and f.endswith(".jsonl")]
all_batch_info = []
for batch_file in batch_files:
    batch_number = batch_file.split('_')[-1].split('.')[0]
    desc = f"Batch {batch_number} of requests to generate Indonesian translation from Balinese word with help of English translation"
    batch_info = llm_batch_api(f"dataset/translate_batch/{batch_file}", purpose="batch", desc=desc, completion_window="24h")
    all_batch_info.append(batch_info)

In [112]:
all_batch_info_dict = [b.to_dict() for b in all_batch_info]
# Save all batch_info to a file
with open("dataset/translate_batch/bali_dict_batch_info.json", "w") as f:
    json.dump(all_batch_info_dict, f, indent=4)

In [117]:
for batch_info_dict in all_batch_info_dict:
    updated_batch, output_file = llm_batch_check_retrieve_dict(batch_info_dict)
    print(updated_batch)
    if output_file:
        write_jsonl(output_file.text, f"bali_dict_batch_output.jsonl")

Status of batch batch_67b57facc8248190897fdd3631a9cdee is completed
Batch(id='batch_67b57facc8248190897fdd3631a9cdee', completion_window='24h', created_at=1739947948, endpoint='/v1/chat/completions', input_file_id='file-QWHGpJgFVv8Dob8eMy6EN3', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1739950437, error_file_id=None, errors=None, expired_at=None, expires_at=1740034348, failed_at=None, finalizing_at=1739949787, in_progress_at=1739947951, metadata={'description': 'Batch 1 of requests to generate Indonesian translation from Balinese word with help of English translation'}, output_file_id='file-4yRQdw1BtjLQMqf15nqL2t', request_counts=BatchRequestCounts(completed=4545, failed=0, total=4545))


In [114]:
len(bali_dict.keys())

20082

In [119]:
translation_dataset[0]

{'id': '5338f62e-0155-4b45-9928-c1b496dbf153',
 'bali_word': 'abah-abah',
 'translation_english': ['tabiat', 'bakat'],
 'translation_indonesian': [],
 'prompt': 'Translate the given Balinese word into Indonesian, with the help of its English translation:\n- Balinese word: abah-abah\n- English translation: [\'tabiat\', \'bakat\']\n\nThe result must be a list of string, which is the Indonesian translation from the Balinese word, and since it is a list it can be more than one translation. Return only the Indonesian translation in JSON format with key "translation_indonesian".'}

In [122]:
import jsonlines

def update_translations_from_jsonl(dataset, jsonl_path):
    """Fill ``translation_indonesian`` from a batch-output JSONL file.

    Args:
        dataset: Object exposing ``.map(fn)``; every example must carry an
            ``id`` that matches the ``custom_id`` of its batch request.
        jsonl_path: Path to the batch output file (one JSON object per line).

    Returns:
        The result of ``dataset.map``: examples whose id has a usable response
        get ``translation_indonesian`` replaced, all others are unchanged.
    """
    # Create a mapping of custom_id to translations from JSONL
    translations = {}
    with jsonlines.open(jsonl_path) as reader:
        for obj in reader:
            custom_id = obj['custom_id']
            # Extract translation_indonesian from the response
            try:
                content = obj['response']['body']['choices'][0]['message']['content']
                translation = json.loads(content)
                translations[custom_id] = translation['translation_indonesian']
            except (KeyError, IndexError, TypeError, json.JSONDecodeError):
                # Best effort: skip rows with a missing/null response, an empty
                # ``choices`` list, null message content, invalid JSON or a
                # payload without the expected key instead of aborting the run.
                continue

    # Update dataset with translations
    def update_translation(example):
        if example['id'] in translations:
            example['translation_indonesian'] = translations[example['id']]
        return example

    # Apply updates to dataset
    updated_dataset = dataset.map(update_translation)

    return updated_dataset

In [123]:
translation_dataset = update_translations_from_jsonl(translation_dataset, "dataset/translate_batch/bali_dict/bali_dict_batch_output.jsonl")

Map: 100%|██████████| 4545/4545 [00:00<00:00, 21266.66 examples/s]


In [128]:
def update_dict_translations(bali_dict, dataset):
    """Copy Indonesian translations from dataset rows into the dictionary.

    Rows whose ``bali_word`` is not a key of ``bali_dict`` are ignored. The
    dictionary is modified in place.

    Args:
        bali_dict: Mapping of Balinese word -> entry dict.
        dataset: Iterable of rows with ``bali_word`` and
            ``translation_indonesian`` fields.

    Returns:
        Tuple ``(bali_dict, count)`` where ``bali_dict`` is the same object that
        was passed in and ``count`` is the number of rows that were applied.
    """
    applied = 0

    for record in dataset:
        key = record['bali_word']

        # Rows for words unknown to the dictionary are skipped.
        if key not in bali_dict:
            continue

        bali_dict[key]['translation_indonesian'] = record['translation_indonesian']
        applied += 1

    return bali_dict, applied

In [129]:
# Update the dictionary with new translations
updated_dict, num_updated = update_dict_translations(bali_dict, translation_dataset)

In [131]:
# Persist the merged dictionary as UTF-8, keeping non-ASCII characters unescaped.
# NOTE(review): this file name ends in "_" while a later cell loads
# 'dict/transformed_bali_dict.json' — confirm which file is intended.
with open('dict/transformed_bali_dict_.json', 'w', encoding='utf-8') as f:
    json.dump(updated_dict, f, ensure_ascii=False, indent=2)

In [132]:
from collections import defaultdict

def create_translation_dicts(bali_dict):
    """Derive forward and reverse word lookups from the merged dictionary.

    Args:
        bali_dict: Mapping of Balinese word -> entry. Entries that are not
            dicts, or that have no (or an empty) ``translation_indonesian``
            value, are skipped.

    Returns:
        Tuple ``(bali_indo, indo_bali)``: the first maps a Balinese word to its
        list of Indonesian translations, the second maps each Indonesian word
        to the list of Balinese words that translate to it.
    """
    forward = {}
    reverse = {}

    for bali_word, values in bali_dict.items():
        if not isinstance(values, dict):
            continue

        # A missing key and an empty value are both treated as "no translation".
        indo_words = values.get('translation_indonesian')
        if not indo_words:
            continue

        forward[bali_word] = indo_words

        for indo_word in indo_words:
            reverse.setdefault(indo_word, []).append(bali_word)

    return forward, reverse

In [133]:
# Create both dictionaries
# (bali_dict was updated in place by update_dict_translations, which returns the
# same object it was given, so it already contains the merged translations.)
bali_indo_dict, indo_bali_dict = create_translation_dicts(bali_dict)

# Save Balinese-Indonesian dictionary
with open('dict/bali_idn.json', 'w', encoding='utf-8') as f:
    json.dump(bali_indo_dict, f, ensure_ascii=False, indent=2)

# Save Indonesian-Balinese dictionary
with open('dict/idn_bali.json', 'w', encoding='utf-8') as f:
    json.dump(indo_bali_dict, f, ensure_ascii=False, indent=2)


## Translate Sentence Examples for the Balinese Dictionary

In [152]:
import datasets

def create_sent_translation_dataset(input_dict):
    """Collect sentence examples that have exactly one side of the pair.

    For each dict entry with a ``sentence_examples`` list, an example is kept
    only when one of its ``Balinese`` / ``Indonesian`` texts is present and the
    other is the placeholder ``'-'`` (a missing key counts as ``'-'``).

    Args:
        input_dict: Mapping of Balinese word -> entry.

    Returns:
        The object produced by ``Dataset.from_dict`` with columns ``id`` (a
        fresh UUID4 string), ``bali_word``, ``example_idx``, ``balinese_text``
        and ``indonesian_text``.
    """
    columns = ('id', 'bali_word', 'example_idx', 'balinese_text', 'indonesian_text')
    data = {column: [] for column in columns}

    for bali_word, values in input_dict.items():
        if isinstance(values, dict) and 'sentence_examples' in values:
            for example_idx, example in enumerate(values['sentence_examples']):
                bali_text = example.get('Balinese', '-')
                indo_text = example.get('Indonesian', '-')

                # Keep the example only when exactly one side is filled in.
                if (bali_text != '-') != (indo_text != '-'):
                    row = (str(uuid.uuid4()), bali_word, example_idx, bali_text, indo_text)
                    for column, value in zip(columns, row):
                        data[column].append(value)

    # Create HF Dataset
    return Dataset.from_dict(data)

In [157]:
sent_translation_dataset = create_sent_translation_dataset(bali_dict)

In [158]:
sent_translation_dataset[86]

{'id': 'd7b8db96-e304-47a2-8d7f-d326b85873ca',
 'bali_word': 'anteg',
 'example_idx': 0,
 'balinese_text': 'Anteg jani Yan Galung tusing ada teka',
 'indonesian_text': '-'}

In [159]:
bali_indo_dict = load_dictionary("dict/bali_idn.json")
indo_bali_dict = load_dictionary("dict/idn_bali.json")

def get_translate_sentence_prompt_text(data):
    """Attach a translation prompt for whichever side of the pair is missing.

    Exactly one of ``data["balinese_text"]`` / ``data["indonesian_text"]`` must
    be the placeholder ``"-"``; the prompt asks for a translation of the other
    one and includes the word-level hints returned by ``get_dict_translation``
    for the lower-cased source text.

    Args:
        data: Example with ``balinese_text`` and ``indonesian_text`` fields.

    Returns:
        The same ``data`` object with ``prompt_text`` set.

    Raises:
        ValueError: If both texts are present or both are ``"-"``. Previously
            this case failed with an ``UnboundLocalError`` on ``prompt_text``.
    """
    missing_bali = data["balinese_text"] == "-"
    missing_indo = data["indonesian_text"] == "-"

    if missing_bali and not missing_indo:
        prompt_text = f"""Translate the given Indonesian text in the <id_text> tag below into Balinese with the help of some word-to-word translation provided below. For one word, there can be multiple translations, and you need to choose the right one based on the context. The translations are as follows:
{get_dict_translation(data["indonesian_text"].lower(), indo_bali_dict)}
<id_text>
{data["indonesian_text"]}
</id_text>

Return only the translated text in JSON format with key "translated_text"."""
    elif missing_indo and not missing_bali:
        prompt_text = f"""Translate the given Balinese text in the <bali_text> tag below into Indonesian with the help of some word-to-word translation provided below. For one word, there can be multiple translations, and you need to choose the right one based on the context. The translations are as follows:
{get_dict_translation(data["balinese_text"].lower(), bali_indo_dict)}
<bali_text>
{data["balinese_text"]}
</bali_text>

Return only the translated text in JSON format with key "translated_text"."""
    else:
        raise ValueError(
            "Expected exactly one of 'balinese_text' / 'indonesian_text' to be '-', "
            f"got balinese_text={data['balinese_text']!r}, indonesian_text={data['indonesian_text']!r}"
        )

    data["prompt_text"] = prompt_text

    return data

In [160]:
sent_translation_dataset = sent_translation_dataset.map(get_translate_sentence_prompt_text)

Map: 100%|██████████| 6421/6421 [00:00<00:00, 19170.91 examples/s]


In [163]:
# Request settings for the sentence-translation batch.
model = "gpt-4o-2024-08-06"
temperature = 0
# NOTE(review): max_tokens is assigned but never passed to
# create_batch_req_object, whose request body only carries model, messages,
# temperature and response_format — so no token cap is sent with the requests.
max_tokens = 512
response_format = {"type": "json_object"}

# One chat-completion request per dataset row; the row id becomes the request's
# custom_id so responses can be matched back to rows.
batch_req_objects = []
for data in sent_translation_dataset:
  messages = [
      {"role": "user", "content": data["prompt_text"]}
  ]
  batch_req_object = create_batch_req_object(req_id=data["id"], model=model, messages=messages, response_format=response_format, temperature=temperature)
  
  batch_req_objects.append(batch_req_object)

In [None]:
save_batches_to_jsonl(batch_req_objects, 20000, "dataset/translate_batch/bali_sent/bali_sent")

In [168]:
# Run the llm_batch_api function with all .jsonl files and gather batch_info
# Only files named "bali_sent_batch*.jsonl" are submitted.
batch_files = [f for f in os.listdir("dataset/translate_batch/bali_sent") if f.startswith("bali_sent_batch") and f.endswith(".jsonl")]
all_batch_info = []
for batch_file in batch_files:
    # The batch number is the text between the last "_" and the first following "." in the file name.
    batch_number = batch_file.split('_')[-1].split('.')[0]
    desc = f"Batch {batch_number} of requests to generate Indonesian/Balinese translation from Indonesian/Balinese text with help of dictionary"
    # Uploads the file and creates the batch job; each call submits a new job.
    batch_info = llm_batch_api(f"dataset/translate_batch/bali_sent/{batch_file}", purpose="batch", desc=desc, completion_window="24h")
    all_batch_info.append(batch_info)

In [169]:
# Convert every submitted batch object to a plain dict so it can be serialised as JSON.
all_batch_info_dict = [b.to_dict() for b in all_batch_info]
# Save all batch_info to a file
with open("dataset/translate_batch/bali_sent/bali_sent_batch_info.json", "w") as f:
    json.dump(all_batch_info_dict, f, indent=4)

In [174]:
# Poll each submitted batch; the output file is only returned once the batch
# status is "completed".
for batch_info_dict in all_batch_info_dict:
    updated_batch, output_file = llm_batch_check_retrieve_dict(batch_info_dict)
    print(updated_batch)
    if output_file:
        # NOTE(review): all batches share this one output path; with more than
        # one batch the results may overwrite each other depending on how
        # write_jsonl opens the file — confirm.
        write_jsonl(output_file.text, f"dataset/translate_batch/bali_sent/bali_sent_batch_output.jsonl")

Status of batch batch_67b5cfdf27d48190aea00909f1e03bbf is completed
Batch(id='batch_67b5cfdf27d48190aea00909f1e03bbf', completion_window='24h', created_at=1739968479, endpoint='/v1/chat/completions', input_file_id='file-JeYcdKJB7Duhy3QyGUuh1r', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1739970673, error_file_id=None, errors=None, expired_at=None, expires_at=1740054879, failed_at=None, finalizing_at=1739970051, in_progress_at=1739968480, metadata={'description': 'Batch 1 of requests to generate Indonesian/Balinese translation from Indonesian/Balinese text with help of dictionary'}, output_file_id='file-Kd16BJtJwxbmn4B8NLQdty', request_counts=BatchRequestCounts(completed=6421, failed=0, total=6421))


In [176]:
sent_translation_dataset

Dataset({
    features: ['id', 'bali_word', 'example_idx', 'balinese_text', 'indonesian_text', 'prompt_text'],
    num_rows: 6421
})

In [186]:
import json
import jsonlines

def update_sentence_examples(bali_dict, sent_dataset, jsonl_path):
    """Write translated sentences back into the dictionary's sentence examples.

    Args:
        bali_dict: Mapping of Balinese word -> entry dict holding a
            ``sentence_examples`` list. Modified in place.
        sent_dataset: Iterable of rows with ``id``, ``bali_word`` and
            ``example_idx``; ``id`` must match the batch request ``custom_id``.
        jsonl_path: Path to the batch output file (one JSON object per line).

    Returns:
        The same ``bali_dict`` object, with the missing side (``'-'`` or absent)
        of each matched example replaced by the translated text.
    """
    # Create a mapping of translations from JSONL
    translations = {}
    with jsonlines.open(jsonl_path) as reader:
        for obj in reader:
            custom_id = obj['custom_id']
            try:
                content = obj['response']['body']['choices'][0]['message']['content']
                translations[custom_id] = json.loads(content)['translated_text']
            except (KeyError, IndexError, TypeError, json.JSONDecodeError):
                # Best effort: skip rows with a missing/null response, an empty
                # ``choices`` list, null message content, invalid JSON or a
                # payload without the expected key.
                continue

    # Update dictionary with translations
    for data in sent_dataset:
        custom_id = data['id']
        if custom_id not in translations:
            continue

        entry = bali_dict.get(data['bali_word'])
        if not isinstance(entry, dict) or 'sentence_examples' not in entry:
            continue

        examples = entry['sentence_examples']
        example_idx = data['example_idx']
        if example_idx >= len(examples):
            continue

        example = examples[example_idx]

        # Update the empty translation. A missing key is treated as '-', the
        # same way create_sent_translation_dataset reads these fields.
        if example.get('Balinese', '-') == '-':
            example['Balinese'] = translations[custom_id]
        elif example.get('Indonesian', '-') == '-':
            example['Indonesian'] = translations[custom_id]

    return bali_dict

In [187]:
# Load the dictionary. The encoding is explicit so the read matches the UTF-8
# write below regardless of the platform's default encoding (the file is saved
# with ensure_ascii=False, so it can contain raw non-ASCII characters).
with open('dict/transformed_bali_dict.json', 'r', encoding='utf-8') as f:
    bali_dict = json.load(f)

# Update the dictionary with translations (bali_dict is modified in place and
# returned, so updated_dict and bali_dict are the same object).
updated_dict = update_sentence_examples(bali_dict, sent_translation_dataset, 'dataset/translate_batch/bali_sent/bali_sent_batch_output.jsonl')

# Save the updated dictionary. This overwrites the file that was just loaded.
with open('dict/transformed_bali_dict.json', 'w', encoding='utf-8') as f:
    json.dump(updated_dict, f, ensure_ascii=False, indent=2)

In [214]:
def extract_sentence_examples(dict_data):
    """Build a parallel Balinese/Indonesian dataset from sentence examples.

    An example is kept only when both its ``Balinese`` and ``Indonesian``
    values are present and different from the placeholder ``'-'``. Examples
    whose values are not both strings are printed, and are still included.

    Args:
        dict_data: Mapping of word -> entry that may hold ``sentence_examples``.

    Returns:
        The object produced by ``Dataset.from_dict`` with columns ``id`` (a
        fresh UUID4 string), ``balinese`` and ``indonesian``.
    """
    ids, balinese_texts, indonesian_texts = [], [], []

    for entry in dict_data.values():
        if 'sentence_examples' not in entry:
            continue

        for example in entry['sentence_examples']:
            # Drop pairs where either side is missing or is the "-" placeholder.
            bali = example.get('Balinese', '-')
            if bali == '-':
                continue
            indo = example.get('Indonesian', '-')
            if indo == '-':
                continue

            # Diagnostic only: the pair is appended regardless.
            if not (isinstance(bali, str) and isinstance(indo, str)):
                print(example)

            ids.append(str(uuid.uuid4()))
            balinese_texts.append(bali)
            indonesian_texts.append(indo)

    # Create HuggingFace dataset
    return Dataset.from_dict({
        'id': ids,
        'balinese': balinese_texts,
        'indonesian': indonesian_texts,
    })

In [215]:
updated_dict = load_dictionary("dict/transformed_bali_dict.json")
paralel_dataset = extract_sentence_examples(updated_dict)

In [216]:
paralel_dataset

Dataset({
    features: ['id', 'balinese', 'indonesian'],
    num_rows: 22248
})

In [217]:
paralel_dataset.save_to_disk("dataset/paralel_dataset")

Saving the dataset (1/1 shards): 100%|██████████| 22248/22248 [00:00<00:00, 682795.10 examples/s]


## Translate Parallel Data (from the Balinese Dictionary) into Cirebonese (CBN)

In [218]:
# Load the dictionary
idn_cbn = load_dictionary("dict/idn_cbn.json")

def get_translate_cbn_prompt_text(data):
    """Attach an Indonesian -> Cirebonese translation prompt to an example.

    The prompt embeds the word-level hints that ``get_dict_translation``
    returns for the lower-cased Indonesian text, followed by the original text.

    Args:
        data: Example with an ``indonesian`` field.

    Returns:
        The same ``data`` object with the prompt stored under ``prompt``.
    """
    word_hints = get_dict_translation(data["indonesian"].lower(), idn_cbn)
    source_text = data["indonesian"]

    data["prompt"] = f"""Translate the given Indonesian text in the <id_text> tag below into Cirebonese with the help of some word-to-word translation provided below. For one word, there can be multiple translations, and you need to choose the right one based on the context. The translations are as follows:
{word_hints}
<id_text>
{source_text}
</id_text>

Return only the translated text in JSON format with key "translated_text"."""

    return data

In [219]:
paralel_dataset = paralel_dataset.map(get_translate_cbn_prompt_text)

Map: 100%|██████████| 22248/22248 [00:01<00:00, 19035.66 examples/s]


In [220]:
paralel_dataset[7845]

{'id': 'dbcda0ea-9510-48fb-9736-5198a88d255a',
 'balinese': 'mula keto',
 'indonesian': 'memang begitu',
 'prompt': 'Translate the given Indonesian text in the <id_text> tag below into Cirebonese with the help of some word-to-word translation provided below. For one word, there can be multiple translations, and you need to choose the right one based on the context. The translations are as follows:\n- memang: si, ugi, ugah, uga, juga\n- begitu: koten, begitu, mengkonon, mekoten, mudu, konon\n\n<id_text>\nmemang begitu\n</id_text>\n\nReturn only the translated text in JSON format with key "translated_text".'}

In [221]:
# Request settings for the Cirebonese translation batch.
model = "gpt-4o-mini-2024-07-18"
temperature = 0
# NOTE(review): max_tokens is assigned but never passed to
# create_batch_req_object, whose request body only carries model, messages,
# temperature and response_format.
max_tokens = 4096
response_format = {"type": "json_object"}

# One chat-completion request per dataset row; the row id becomes the request's
# custom_id so responses can be matched back to rows.
batch_req_objects = []
for data in paralel_dataset:
  messages = [
      {"role": "user", "content": data["prompt"]}
  ]
  batch_req_object = create_batch_req_object(req_id=data["id"], model=model, messages=messages, response_format=response_format, temperature=temperature)
  
  batch_req_objects.append(batch_req_object)

In [227]:
save_batches_to_jsonl(batch_req_objects, 11500, "dataset/translate_batch/cbn_translate_paralel_bali_dict/cbn_translate_paralel_bali_dict")

In [228]:
# Run the llm_batch_api function with all batch input .jsonl files and gather batch_info
# The prefix includes "_batch" (as in the other submission cells) so that the
# "cbn_translate_paralel_bali_dict_output_<n>.jsonl" result files, which are
# written to this same directory, are never re-submitted as batch inputs when
# this cell is run again.
batch_files = [f for f in os.listdir("dataset/translate_batch/cbn_translate_paralel_bali_dict") if f.startswith("cbn_translate_paralel_bali_dict_batch") and f.endswith(".jsonl")]
all_batch_info = []
for batch_file in batch_files:
    # The batch number is the text between the last "_" and the first following "." in the file name.
    batch_number = batch_file.split('_')[-1].split('.')[0]
    desc = f"Batch {batch_number} of requests to generate Cirebonese translation from Indonesian text (from Balinese corpus) with help of dictionary"
    # Uploads the file and creates the batch job; each call submits a new job.
    batch_info = llm_batch_api(f"dataset/translate_batch/cbn_translate_paralel_bali_dict/{batch_file}", purpose="batch", desc=desc, completion_window="24h")
    all_batch_info.append(batch_info)

In [229]:
# Convert every submitted batch object to a plain dict so it can be serialised as JSON.
all_batch_info_dict = [b.to_dict() for b in all_batch_info]
# Save all batch_info to a file
with open("dataset/translate_batch/cbn_translate_paralel_bali_dict/cbn_translate_paralel_bali_dict.json", "w") as f:
    json.dump(all_batch_info_dict, f, indent=4)

In [231]:
# Poll each submitted batch and write every completed one to its own file.
# idx is the position in all_batch_info_dict (submission order), not the batch
# number embedded in the input file name.
for idx, batch_info_dict in enumerate(all_batch_info_dict):
    updated_batch, output_file = llm_batch_check_retrieve_dict(batch_info_dict)
    print(updated_batch)
    if output_file:
        write_jsonl(output_file.text, f"dataset/translate_batch/cbn_translate_paralel_bali_dict/cbn_translate_paralel_bali_dict_output_{idx}.jsonl")

Status of batch batch_67b6ee04bf2c8190b97d33352be5e6e4 is completed
Batch(id='batch_67b6ee04bf2c8190b97d33352be5e6e4', completion_window='24h', created_at=1740041732, endpoint='/v1/chat/completions', input_file_id='file-88YiYVEhwwovWtPBxRZfvv', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1740052727, error_file_id=None, errors=None, expired_at=None, expires_at=1740128132, failed_at=None, finalizing_at=1740050984, in_progress_at=1740041736, metadata={'description': 'Batch 1 of requests to generate Cirebonese translation from Indonesian text (from Balinese corpus) with help of dictionary'}, output_file_id='file-S9MzKYyZrDz7qKCf1PUjf7', request_counts=BatchRequestCounts(completed=11500, failed=0, total=11500))
Status of batch batch_67b6ee0717488190bc55e5e760fe27de is completed
Batch(id='batch_67b6ee0717488190bc55e5e760fe27de', completion_window='24h', created_at=1740041735, endpoint='/v1/chat/completions', input_file_id='file-NnLWvVJgcofEfYr7RSUz

In [253]:
def _load_translated_texts(jsonl_path):
    """Read one batch-output JSONL file into ``{custom_id: translated_text}``.

    A ``translated_text`` that is a list of strings is concatenated without a
    separator; values of any other non-string type are left out of the result.

    Raises:
        ValueError: If a row's message content is not valid JSON or lacks the
            expected keys. The message names the offending ``custom_id``.
    """
    loaded = {}
    with jsonlines.open(jsonl_path) as reader:
        for obj in reader:
            custom_id = obj['custom_id']
            try:
                translation = json.loads(obj['response']['body']['choices'][0]['message']['content'])
                translated_text = translation['translated_text']
            except (KeyError, json.JSONDecodeError) as err:
                # Report the id of the bad row. The previous handler printed
                # translation['translated_text'], which is exactly the lookup
                # that can fail here (or a stale value from the previous row).
                raise ValueError(f"Unusable response for ID {custom_id} in {jsonl_path}") from err

            if isinstance(translated_text, str):
                loaded[custom_id] = translated_text
            elif isinstance(translated_text, list):
                loaded[custom_id] = "".join(translated_text)
    return loaded


# Create a mapping of translations from JSONL (both output files, same parsing rules)
translations = {}
for output_path in (
    "dataset/translate_batch/cbn_translate_paralel_bali_dict/cbn_translate_paralel_bali_dict_output_0.jsonl",
    "dataset/translate_batch/cbn_translate_paralel_bali_dict/cbn_translate_paralel_bali_dict_output_1.jsonl",
):
    translations.update(_load_translated_texts(output_path))


def add_cbn_translation_to_dataset(data):
    """Attach the Cirebonese translation for ``data["id"]`` under ``cirebonese``.

    Raises:
        ValueError: If no translation was loaded for this id.
    """
    if data["id"] in translations:
        data["cirebonese"] = translations[data["id"]]
    else:
        raise ValueError(f"Translation not found for ID: {data['id']}")
    return data

In [254]:
len(translations.keys())

22248

In [255]:
paralel_dataset_new = paralel_dataset.map(add_cbn_translation_to_dataset)

Map: 100%|██████████| 22248/22248 [00:00<00:00, 31959.80 examples/s]


In [260]:
paralel_dataset_new[5786]

{'id': '19ae585c-e2bc-4f12-8310-b892c8f37a7a',
 'balinese': 'A: Ne madan bok jojong.\nB: Mih, tonden taen kene “KAMEHAME” cai…\nBok/Hair/Rambut\nWimba/Eyebrow/Alis\nKuping/Ear/Telinga\nPanyingakan/Eye/Mata\nCunguh/Nose/Hidung\nPala/Shoulder/Pundak\nBibih/Lip/ Bibir\nBaong/Neck/ Leher\nTangkah/Chest/Dada\nLima/Hand/ Tangan\nBasang/Belly/Perut\nJriji/Finger/Jemari\nPaa/Thigh/Paha\nEntud/Knee/Lutut\n\nBatis/Leg/Kaki',
 'indonesian': 'A: Ini namanya rambut kaku.\n\n\nB: Wah, belum pernah kena “KAMEHAME” kau…',
 'prompt': 'Translate the given Indonesian text in the <id_text> tag below into Cirebonese with the help of some word-to-word translation provided below. For one word, there can be multiple translations, and you need to choose the right one based on the context. The translations are as follows:\n- belum pernah: béli ésok\n- ini: enya, kien, niku, mangga, puniki, ikih, niki, nyah, iki\n- namanya: namanya, arane, jenenge\n- rambut: rema, rambut\n- belum: durung, dereng\n- pernah: sokan

In [261]:
def clean_paralel_dataset(example):
    """Strip the trailing vocabulary list from the one affected Balinese text.

    When the ``balinese`` field contains the marker "Bok/Hair/Rambut", the
    field is replaced by the two-line dialogue alone; every other example is
    returned untouched. The example is modified in place and returned.
    """
    if "Bok/Hair/Rambut" not in example['balinese']:
        return example

    dialogue_only = "A: Ne madan bok jojong.\nB: Mih, tonden taen kene “KAMEHAME” cai…"
    example['balinese'] = dialogue_only
    return example

In [262]:
paralel_dataset_new = paralel_dataset_new.map(clean_paralel_dataset)

Map: 100%|██████████| 22248/22248 [00:00<00:00, 29250.29 examples/s]


In [263]:
paralel_dataset_new[5786]

{'id': '19ae585c-e2bc-4f12-8310-b892c8f37a7a',
 'balinese': 'A: Ne madan bok jojong.\nB: Mih, tonden taen kene “KAMEHAME” cai…',
 'indonesian': 'A: Ini namanya rambut kaku.\n\n\nB: Wah, belum pernah kena “KAMEHAME” kau…',
 'prompt': 'Translate the given Indonesian text in the <id_text> tag below into Cirebonese with the help of some word-to-word translation provided below. For one word, there can be multiple translations, and you need to choose the right one based on the context. The translations are as follows:\n- belum pernah: béli ésok\n- ini: enya, kien, niku, mangga, puniki, ikih, niki, nyah, iki\n- namanya: namanya, arane, jenenge\n- rambut: rema, rambut\n- belum: durung, dereng\n- pernah: sokan, sokat, nate, pernah\n- kena: tanajahat, kèna, kênoh, kênop\n\n<id_text>\nA: Ini namanya rambut kaku.\n\n\nB: Wah, belum pernah kena “KAMEHAME” kau…\n</id_text>\n\nReturn only the translated text in JSON format with key "translated_text".',
 'cirebonese': 'A: Niku arane rema kaku.\n\nB: W

In [264]:
# Dataset.save_to_disk writes the dataset and returns None, so its result must
# not be assigned back: that would replace the in-memory dataset with None.
paralel_dataset_new.save_to_disk("dataset/paralel_dataset_from_bali_dict")

Saving the dataset (1/1 shards): 100%|██████████| 22248/22248 [00:00<00:00, 544217.95 examples/s]


## Translate 300K Parallel Data to CBN & BAN

In [363]:
pt_data_60k_new_bali_updated

Dataset({
    features: ['text', 'indonesian', 'custom_id', 'cirebonese', 'prompt_text', 'balinese'],
    num_rows: 59257
})

In [364]:
all_data_dedup = datasets.load_from_disk("dataset/id_hq_data_dedup")

In [None]:
# Build the lookup once. Reading the column inside the function materialises it
# and scans it linearly for every single row.
pt_60k_indonesian_texts = set(pt_data_60k_new_bali_updated["indonesian"])


def filter_dedup_data(data, token_limit=7500):
    """Return ``data`` if it is new and short enough, otherwise ``None``.

    Args:
        data: Example with a ``text`` field.
        token_limit: Exclusive upper bound on the token count of ``text`` as
            measured by ``num_tokens_from_string(text, encoding)``.

    Returns:
        ``None`` when ``text`` already appears in the ``indonesian`` column of
        ``pt_data_60k_new_bali_updated`` or has ``token_limit`` or more tokens;
        otherwise the unchanged ``data``.
    """
    if data["text"] in pt_60k_indonesian_texts:
        return None

    # Named n_tokens rather than ``len`` so the builtin is not shadowed.
    n_tokens = num_tokens_from_string(data["text"], encoding)
    if n_tokens < token_limit:
        return data
    return None

In [369]:
# NOTE(review): filter_dedup_data returns None for rows that should be dropped.
# Confirm the installed `datasets` version removes such rows under .map();
# Dataset.filter with a boolean predicate is the documented way to drop rows.
all_data_dedup_filtered = all_data_dedup.map(filter_dedup_data, num_proc=8)

Map (num_proc=8): 100%|██████████| 651856/651856 [2:49:35<00:00, 64.06 examples/s]  


In [371]:
all_data_dedup_filtered.save_to_disk("dataset/id_hq_data_dedup_filtered")

Saving the dataset (3/3 shards): 100%|██████████| 605224/605224 [00:03<00:00, 194424.09 examples/s]


In [372]:
all_data_dedup_filtered[0]

{'text': 'Kebebasan politik (juga dikenal sebagai otonomi politik atau kontrak politik) adalah konsep sentral dalam sejarah dan pemikiran politik Barat juga salah satu fitur terpenting dari masyarakat demokratis. Kebebasan politik digambarkan sebagai kebebasan dari penindasan atau paksaan, tidak adanya kondisi yang menjegal bagi individu dan membungkam situasi, atau tidak adanya keadaan paksaan pemenuhan kehidupan, misalnya paksaan ekonomi dalam suatu masyarakat. Meskipun kebebasan politik sering kali dimaknai negatif sebagai kebebasan dengan perilaku yang tidak masuk akal dari kendala eksternal, itu juga dapat merujuk pada pemenuhan hak, kapasitas dan kelayakan tindakan secara positif dan pelaksanaan hak-hak sosial atau kelompok. Konsep ini juga dapat mencakup kebebasan dari kendala internal semacam tindakan atau perkataan politik (misalnya kearifan sosial, konsistensi atau perilaku tidak terpuji). Konsep kebebasan politik erat kaitannya dengan konsep kebebasan sipil dan hak asasi man

In [375]:
import uuid

indo_bali_dict = load_dictionary("dict/idn_bali.json")
indo_cbn_dict = load_dictionary("dict/idn_cbn.json")

def get_translate_parallel_prompt_text(data):
    """Attach a fresh id and a combined Cirebonese + Balinese translation prompt.

    The prompt embeds the word-level hints that ``get_dict_translation``
    returns for the lower-cased text against each dictionary, followed by the
    original text.

    Args:
        data: Example with a ``text`` field.

    Returns:
        The same ``data`` object with ``id`` (a new UUID4 string, so the value
        differs on every call) and ``prompt_text`` set.
    """
    cbn_hints = get_dict_translation(data["text"].lower(), indo_cbn_dict)
    bali_hints = get_dict_translation(data["text"].lower(), indo_bali_dict)
    source_text = data["text"]

    prompt_text = f"""Translate the given Indonesian text in the <id_text> tag below into Cirebonese & Balinese with the help of some word-to-word translation provided below. For one word, there can be multiple translations, and you need to choose the right one based on the context. Not all word need to be translated such as named entities, therefore you need to properly choose which word need to be translated and which one is the right translation based on context. The translations are as follows:

<cbn_translation>
{cbn_hints}
</cbn_translation>

<bali_translation>
{bali_hints}
</bali_translation>

<id_text>
{source_text}
</id_text>

Return only the translated text in JSON format with key "cirebonese_text" for the Cirebonese translation and "balinese_text" for the Balinese translation."""

    data['id'] = str(uuid.uuid4())
    data["prompt_text"] = prompt_text

    return data

In [376]:
all_data_dedup_filtered_prompt = all_data_dedup_filtered.map(get_translate_parallel_prompt_text)

Map: 100%|██████████| 605224/605224 [04:34<00:00, 2201.28 examples/s]


In [378]:
def count_tokens_in_dataset_prompt(dataset, num_tokens_from_string, encoder):
    """Sum the token counts of every row's ``prompt_text``.

    Args:
        dataset: Iterable of rows with a ``prompt_text`` field.
        num_tokens_from_string: Callable ``(text, encoder) -> int``.
        encoder: Passed through unchanged as the callable's second argument.

    Returns:
        Total number of tokens over all rows (0 for an empty dataset).
    """
    return sum(
        num_tokens_from_string(row['prompt_text'], encoder)
        for row in dataset
    )

count_tokens_in_dataset_prompt(all_data_dedup_filtered_prompt, num_tokens_from_string, encoding)

2491046034

In [380]:
count_tokens_in_dataset(all_data_dedup_filtered_prompt, num_tokens_from_string, encoding)

279138374

In [379]:
2491 * 0.075

186.825

In [381]:
279 * 2 * 0.3

167.4

In [382]:
all_data_dedup_filtered_prompt = all_data_dedup_filtered_prompt.shuffle(seed=42)

In [383]:
all_data_dedup_filtered_prompt

Dataset({
    features: ['text', 'id', 'prompt_text'],
    num_rows: 605224
})

In [384]:
# Split the shuffled dataset into two parts: the first 300000 rows, and every
# remaining row from index 300000 to the end.
all_data_dedup_filtered_prompt_1 = all_data_dedup_filtered_prompt.select(range(300000))
all_data_dedup_filtered_prompt_2 = all_data_dedup_filtered_prompt.select(range(300000, len(all_data_dedup_filtered_prompt)))

In [387]:
all_data_dedup_filtered_prompt_1.save_to_disk("dataset/id_hq_data_prompt_300k")

Saving the dataset (8/8 shards): 100%|██████████| 300000/300000 [00:35<00:00, 8498.63 examples/s] 


In [388]:
all_data_dedup_filtered_prompt_2.save_to_disk("dataset/id_hq_data_prompt_305k")

Saving the dataset (9/9 shards): 100%|██████████| 305224/305224 [03:39<00:00, 1390.80 examples/s]


In [27]:
clean_305k = datasets.load_from_disk("dataset/id_hq/id_hq_data_prompt_305k_clean")

In [28]:
clean_305k[1]

{'text': 'Topik adalah acara berita televisi induk dari antv. Hadir pertama kali tayang pada tanggal 30 April 2006, Topik berisikan materi berita dari dalam dan luar negeri. Khusus untuk berita internasional, materi yang ditampilkan adalah informasi yang "memiliki kedekatan dengan masyarakat Indonesia". Sementara, kejadian-kejadian yang berlangsung di kawasan Timur Tengah, Asia, dan Asia Tenggara serta beberapa kawasan yang berdekatan dengan Indonesia akan menjadi "pilihan utama berita-berita dari luar negeri".\n\nProgram yang umumnya berdurasi tiga puluh menit ini awalnya disajikan dalam lima acara berita yang berbeda. Namun, sejak tahun 2018 banyak program Topik yang tidak disiarkan lagi di antv, menyisakan acara berita setiap jam yang bernama Topik Terkini.\n\nSejarah\n\n1994-2003\nAcara Topik di antv didahului oleh penayangan beberapa acara berita dengan nama-nama yang berubah.\n\nProgram berita pertama yang disiarkan di ANteve adalah Laporan ANteve. Acara ini ditayangkan selama ti

In [29]:
# Request settings for the combined Cirebonese + Balinese translation batch.
model = "gpt-4o-mini-2024-07-18"
temperature = 0
response_format = {"type": "json_object"}

# One chat-completion request per dataset row; the row id becomes the request's
# custom_id so responses can be matched back to rows.
batch_req_objects = []
for data in clean_305k:
  messages = [
      {"role": "user", "content": data["prompt_text"]}
  ]
  batch_req_object = create_batch_req_object(req_id=data["id"], model=model, messages=messages, response_format=response_format, temperature=temperature)
  
  batch_req_objects.append(batch_req_object)

In [32]:
save_batches_to_jsonl(batch_req_objects, 10000, "dataset/translate_batch/translate_paralel_305k/translate_paralel_305k")

In [34]:
# Run the llm_batch_api function with all .jsonl files and gather batch_info
# Only files named "translate_paralel_305k_batch*.jsonl" are submitted, which
# excludes the "translate_paralel_305k_output_*" result files in the same directory.
batch_files = [f for f in os.listdir("dataset/translate_batch/translate_paralel_305k") if f.startswith("translate_paralel_305k_batch") and f.endswith(".jsonl")]
all_batch_info = []
for batch_file in batch_files:
    # The batch number is the text between the last "_" and the first following "." in the file name.
    batch_number = batch_file.split('_')[-1].split('.')[0]
    desc = f"Batch {batch_number} of requests to generate Balinese & Cirebonese translation from Indonesian text with help of dictionary"
    # Uploads the file and creates the batch job; each call submits a new job.
    batch_info = llm_batch_api(f"dataset/translate_batch/translate_paralel_305k/{batch_file}", purpose="batch", desc=desc, completion_window="24h")
    all_batch_info.append(batch_info)

In [35]:
# Convert every submitted batch object to a plain dict so it can be serialised as JSON.
all_batch_info_dict = [b.to_dict() for b in all_batch_info]
# Save all batch_info to a file
with open("dataset/translate_batch/translate_paralel_305k/translate_paralel_305k_info.json", "w") as f:
    json.dump(all_batch_info_dict, f, indent=4)

In [38]:
# Poll each submitted batch and write every completed one to its own file.
for idx, batch_info_dict in enumerate(all_batch_info_dict):
    # The description was built as "Batch {n} of requests ...", so the second
    # space-separated token is the batch number (used for logging only).
    num_batch = batch_info_dict['metadata']['description'].split(" ")[1]
    updated_batch, output_file = llm_batch_check_retrieve_dict(batch_info_dict)
    print("Num batch:", num_batch, "->", updated_batch)
    if output_file:
        # The output file is numbered by idx (position in all_batch_info_dict),
        # which can differ from num_batch.
        write_jsonl(output_file.text, f"dataset/translate_batch/translate_paralel_305k/translate_paralel_305k_output_{idx}.jsonl")

Status of batch batch_67cff943a23881909ab6a5a25efaa39b is completed
Num batch: 10 -> Batch(id='batch_67cff943a23881909ab6a5a25efaa39b', completion_window='24h', created_at=1741683011, endpoint='/v1/chat/completions', input_file_id='file-3tevBRhQsaJkd6yEBHMFRa', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1741686380, error_file_id='file-8UhWbn7KxtQMZZYDYzrpiF', errors=None, expired_at=None, expires_at=1741769411, failed_at=None, finalizing_at=1741685397, in_progress_at=1741683016, metadata={'description': 'Batch 10 of requests to generate Balinese & Cirebonese translation from Indonesian text with help of dictionary'}, output_file_id='file-ReRqdXoW9bvx945mioo6Tz', request_counts=BatchRequestCounts(completed=9996, failed=4, total=10000))
Status of batch batch_67cff95398688190bb4c3f50096e01cc is completed
Num batch: 3 -> Batch(id='batch_67cff95398688190bb4c3f50096e01cc', completion_window='24h', created_at=1741683027, endpoint='/v1/chat/completi

In [44]:
# Create a mapping of translations from JSONL
# Only the "translate_paralel_305k_output*.jsonl" result files are read.
output_files = [f for f in os.listdir("dataset/translate_batch/translate_paralel_305k") if f.startswith("translate_paralel_305k_output") and f.endswith(".jsonl")]

# failed_id collects the custom_id of every row whose parsing raised;
# translations maps custom_id -> {"cirebonese": ..., "balinese": ...}.
failed_id = []
translations = {}
for batch_file in output_files:
    with jsonlines.open(f"dataset/translate_batch/translate_paralel_305k/{batch_file}") as reader:
        for obj in reader:
            custom_id = obj['custom_id']
            # The entry is created before parsing, so every id seen gets an
            # entry; it stays empty or partial when parsing fails part-way.
            translations[custom_id] = {}
            try:
                # Parsing happens before the finish_reason check, so output that
                # is not valid JSON ends up in the except branch below.
                translation = json.loads(obj['response']['body']['choices'][0]['message']['content'])
                # Responses cut off by the token limit are not stored.
                if obj['response']['body']['choices'][0]['finish_reason'] != "length":
                  # A list value is concatenated without a separator; any other
                  # non-string type leaves the key unset.
                  if isinstance(translation['cirebonese_text'], str):
                    translations[custom_id]["cirebonese"] = translation['cirebonese_text']
                  elif isinstance(translation['cirebonese_text'], list):
                    translations[custom_id]["cirebonese"] = "".join(translation['cirebonese_text'])
                  # If 'balinese_text' is missing, the KeyError is raised after
                  # "cirebonese" was already stored, leaving a partial entry.
                  if isinstance(translation['balinese_text'], str):
                    translations[custom_id]["balinese"] = translation['balinese_text']
                  elif isinstance(translation['balinese_text'], list):
                    translations[custom_id]["balinese"] = "".join(translation['balinese_text'])
            except Exception as e:
               # Any exception marks the row as failed; the error and the file
               # name are printed and processing continues with the next row.
               failed_id.append(custom_id)
               print(e)
               print(batch_file)
               continue

def add_translation_to_dataset(data):
    """Combine one dataset example with its parsed translations.

    Looks up ``data["id"]`` in the module-level ``translations`` mapping.
    When the entry holds both a ``"cirebonese"`` and a ``"balinese"`` value,
    a new record is returned with those two texts plus the Indonesian source
    taken from ``data["text"]``.

    Args:
        data: A mapping providing at least the ``"id"`` and ``"text"`` keys.

    Returns:
        A dict with keys ``id``, ``cirebonese``, ``balinese`` and
        ``indonesian``, or ``None`` when the ID is unknown or either
        translation is missing. An unknown ID is deliberately not an error.
    """
    example_id = data["id"]

    # Guard: no response was recorded for this example at all.
    if example_id not in translations:
        return None

    entry = translations[example_id]

    # Guard: the response was recorded but at least one language is missing.
    if not ("cirebonese" in entry and "balinese" in entry):
        return None

    return {
        "id": example_id,
        "cirebonese": entry["cirebonese"],
        "balinese": entry["balinese"],
        "indonesian": data["text"],
    }

Unterminated string starting at: line 2 column 2683 (char 2684)
translate_paralel_305k_output_12.jsonl
Unterminated string starting at: line 3 column 20 (char 12698)
translate_paralel_305k_output_12.jsonl
Unterminated string starting at: line 3 column 20 (char 14997)
translate_paralel_305k_output_12.jsonl
Unterminated string starting at: line 2 column 22 (char 23)
translate_paralel_305k_output_12.jsonl
'balinese_text'
translate_paralel_305k_output_12.jsonl
Unterminated string starting at: line 3 column 20 (char 4314)
translate_paralel_305k_output_12.jsonl
Unterminated string starting at: line 3 column 20 (char 26028)
translate_paralel_305k_output_12.jsonl
Unterminated string starting at: line 3 column 20 (char 10746)
translate_paralel_305k_output_12.jsonl
Unterminated string starting at: line 3 column 20 (char 15580)
translate_paralel_305k_output_12.jsonl
Unterminated string starting at: line 3 column 20 (char 16463)
translate_paralel_305k_output_12.jsonl
Unterminated string starting a

In [45]:
# Number of request IDs whose batch response raised an exception while being
# parsed in the loop that fills `failed_id`.
print(len(failed_id))

3494


In [46]:
# Apply add_translation_to_dataset to every example of clean_305k using 8
# worker processes.
# NOTE(review): the function returns None for examples without both
# translations; the recorded outputs (264,918 examples mapped, 261,287 rows
# afterwards) suggest such examples are dropped by `map` — confirm this is
# guaranteed by the installed `datasets` version rather than incidental.
paralel_305k = clean_305k.map(add_translation_to_dataset, num_proc=8)

Map (num_proc=8): 100%|██████████| 264918/264918 [01:09<00:00, 3834.49 examples/s] 


In [47]:
# Drop the source-only columns so that only id / cirebonese / balinese /
# indonesian remain.
# NOTE(review): this rebinds `paralel_305k` to its own result, so the cell is
# not idempotent — re-running it presumably fails because the columns are
# already gone; confirm, or re-run the `map` cell first.
paralel_305k = paralel_305k.remove_columns(["text", "prompt_text"])

In [48]:
# Display the dataset summary (features and row count) as a sanity check.
paralel_305k

Dataset({
    features: ['id', 'cirebonese', 'balinese', 'indonesian'],
    num_rows: 261287
})

In [49]:
# Persist the three-language dataset built above to disk.
paralel_305k.save_to_disk("dataset/paralel_3_lang/paralel_dataset_305k")

Saving the dataset (3/3 shards): 100%|██████████| 261287/261287 [00:14<00:00, 18177.47 examples/s]


In [52]:
# Load a previously saved dataset from disk; it is concatenated with
# `paralel_305k` in a later cell.
combined_400k = datasets.load_from_disk("dataset/paralel_3_lang/combined_paralel_dataset_400k_dedup_clean")

In [53]:
from datasets import concatenate_datasets

# Stack the rows of the loaded dataset and the newly translated one.
# NOTE(review): presumably both datasets must expose the same columns
# (id, cirebonese, balinese, indonesian) for this to succeed — confirm
# against the schema of `combined_400k`.
combined_705k = concatenate_datasets([combined_400k, paralel_305k])

In [55]:
# Persist the concatenated dataset. Despite the "705k" in the name, the save
# output recorded for this cell reports 607,580 rows.
combined_705k.save_to_disk("dataset/paralel_3_lang/combined_paralel_dataset_705k")

Saving the dataset (7/7 shards): 100%|██████████| 607580/607580 [00:34<00:00, 17766.21 examples/s] 
