In [71]:
import os
import json
import glob
import pickle

from dotenv import load_dotenv

load_dotenv()

True

In [72]:
from openai import OpenAI

client = OpenAI()

def llm_call(model, messages, **call_args):
    response = client.chat.completions.create(
        model=model,
        messages=messages,
        **call_args
    )
    return response

def create_batch_req_object(req_id, model, messages, response_format, temperature=0.0):
    return {
       "custom_id": req_id,
       "method": "POST",
       "url": "/v1/chat/completions",
       "body": {
          "model": model, 
          "messages": messages,
          "temperature": temperature,
          "response_format": response_format,
        }
    }

def llm_batch_api(batch_filepath, purpose="", desc="", completion_window="24h"):
    batch_input_file = client.files.create(
      file=open(batch_filepath, "rb"),
      purpose=purpose
    )

    batch_input_file_id = batch_input_file.id

    batch_info = client.batches.create(
        input_file_id=batch_input_file_id,
        endpoint="/v1/chat/completions",
        completion_window=completion_window,
        metadata={
          "description": desc
        }
    )

    return batch_info

def llm_batch_check_retrieve(batch_info):
    updated_batch = client.batches.retrieve(batch_info.id)
    print(f"Status of batch {updated_batch.id} is {updated_batch.status}")
    if updated_batch.status == "completed":
      output_file = client.files.content(updated_batch.output_file_id)
      return updated_batch, output_file
    else:
      return updated_batch, None

def llm_batch_check_retrieve_dict(batch_info):
    updated_batch = client.batches.retrieve(batch_info["id"])
    print(f"Status of batch {updated_batch.id} is {updated_batch.status}")
    if updated_batch.status == "completed":
      output_file = client.files.content(updated_batch.output_file_id)
      return updated_batch, output_file
    else:
      return updated_batch, None

In [73]:
import tiktoken

encoding = tiktoken.encoding_for_model("gpt-4o-mini")

def num_tokens_from_string(string: str, encoder) -> int:
    """Returns the number of tokens in a text string."""
    num_tokens = len(encoder.encode(string))
    return num_tokens

def count_tokens_in_dataset(dataset, num_tokens_from_string, encoder):
    total_tokens = 0
    
    for item in dataset:
        text = item['text']
        tokens = num_tokens_from_string(text, encoder)
        total_tokens += tokens
    
    return total_tokens

In [74]:
def write_jsonl(data_string, output_file):
    """
    Writes JSONL string to a file.
    
    Args:
        data_string (str): String containing JSONL data
        output_file (str): Path to output file
    """
    # Split the string into lines and filter out empty lines
    json_lines = [line.strip() for line in data_string.split('\n') if line.strip()]
    
    # Write each line to the file
    with open(output_file, 'w', encoding='utf-8') as f:
        for line in json_lines:
            json_obj = json.loads(line)  # Parse the JSON string
            f.write(json.dumps(json_obj) + '\n')  # Write formatted JSON

def read_jsonl(input_file):
    """
    Reads a JSONL file and returns a list of JSON objects.
    
    Args:
        input_file (str): Path to input JSONL file
        
    Returns:
        list: List of parsed JSON objects
    """
    data = []
    with open(input_file, 'r', encoding='utf-8') as f:
        for line in f:
            if line.strip():  # Skip empty lines
                data.append(json.loads(line))
    return data

# Function to save the batch object using pickle
def save_batch_to_pickle(batch_obj, output_file="batch_data.pkl"):
    with open(output_file, 'wb') as f:
        pickle.dump(batch_obj, f)

# Function to load the batch object from a pickle file
def load_batch_from_pickle(input_file="batch_data.pkl"):
    with open(input_file, 'rb') as f:
        return pickle.load(f)

def create_batches(dataset, batch_size=1000):
    return [dataset.select(range(i, min(i + batch_size, len(dataset)))) for i in range(0, len(dataset), batch_size)]

def load_dictionary(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)

## Translate Parallel (CBN)

In [134]:
def split_text_into_ngrams(text, n):
    words = text.split()
    ngrams = []
    for i in range(len(words) - n + 1):
        ngrams.append(tuple(words[i:i + n]))
    return ngrams

def get_dict_translation(text, dictionary):
    unigrams = split_text_into_ngrams(text, 1)
    bigrams = split_text_into_ngrams(text, 2)
    trigrams = split_text_into_ngrams(text, 3)

    word_translation = ""

    for ngram in (trigrams, bigrams, unigrams):
        for ngram_tuple in ngram:
            ngram_str = ' '.join(ngram_tuple)
            if ngram_str in dictionary:
                word_translation += f"- {ngram_str}: {', '.join(dictionary[ngram_str])}\n"

    return word_translation

In [None]:
from tqdm import tqdm
import datasets

pt_data = datasets.load_from_disk("dataset/id_hq_data_dedup")

# Load the dictionary
dictionary = load_dictionary("dict/idn_cbn.json")

def get_prompt_text(data):
    prompt_text = f"""Translate the given Indonesian text in the <id_text> tag below into Cirebonese with the help of some word-to-word translation provided below. For one word, there can be multiple translations, and you need to choose the right one based on the context. The translations are as follows:
{get_cbn_dict_translation(data["text"].lower(), dictionary)}
<id_text>
{data["text"]}
</id_text>

Return only the translated text in JSON format with key "translated_text"."""
    data["text"] = prompt_text
    return data

In [23]:
pt_data = pt_data.map(get_prompt_text)

In [24]:
pt_data.save_to_disk("id_hq_data_prompt")

Saving the dataset (9/9 shards): 100%|██████████| 651856/651856 [00:25<00:00, 26003.35 examples/s] 


In [11]:
pt_tokens = count_tokens_in_dataset(pt_data, num_tokens_from_string, encoding)

In [13]:
pt_tokens

1418349015

In [26]:
pt_data = pt_data.shuffle(seed=42)

In [27]:
pt_data_60k = pt_data.select(range(60000))

In [28]:
pt_tokens_60k = count_tokens_in_dataset(pt_data_60k, num_tokens_from_string, encoding)

In [29]:
pt_tokens_60k

133108166

In [30]:
133108166 / 4 / 1000000 * 0.3

9.98311245

In [31]:
pt_data_60k.save_to_disk("dataset/id_hq_data_prompt_60k")

Saving the dataset (1/1 shards): 100%|██████████| 60000/60000 [00:00<00:00, 137178.96 examples/s]


In [32]:
def batch_gen_translate_w_dict(prompt):
    sys_prompt = "Always answer in a valid JSON format and provide only the JSON answer without anything else."
    messages = [
        {"role": "system", "content": sys_prompt},
        {"role": "user", "content": prompt}
    ]
    model = "gpt-4o-mini-2024-07-18"
    temperature = 1
    max_tokens = 4096
    response_format = {"type": "json_object"}
    
    return messages, model, temperature, max_tokens, response_format

In [56]:
import uuid

batch_req_objects = []
def create_batch_and_update_data(data):
  random_uuid = uuid.uuid4()
  uuid_str = str(random_uuid)
  data["req_id"] = f"req-{uuid_str}"
  
  messages, model, temperature, max_tokens, response_format = batch_gen_translate_w_dict(data["text"])
  batch_req_object = create_batch_req_object(req_id=data["req_id"], model=model, messages=messages, response_format=response_format, temperature=temperature)
  
  batch_req_objects.append(batch_req_object)
  
  return data

In [57]:
def save_batches_to_jsonl(batch_req_objects, batch_size, base_filename):
    # Divide the batch_req_objects into smaller lists with the specified batch size
    for i in range(0, len(batch_req_objects), batch_size):
        batch = batch_req_objects[i:i + batch_size]
        batch_filename = f"{base_filename}_batch_{i // batch_size + 1}.jsonl"
        
        # Convert the batch to JSONL format
        batch_jsonl = "\n".join([json.dumps(obj) for obj in batch])
        
        # Write the batch to a .jsonl file
        write_jsonl(batch_jsonl, batch_filename)

In [58]:
# Write all the batch req objects into a jsonl file using write_jsonl
pt_data_60k = pt_data_60k.map(create_batch_and_update_data)

Map: 100%|██████████| 60000/60000 [00:03<00:00, 19475.29 examples/s]


In [59]:
save_batches_to_jsonl(batch_req_objects, 20000, "dataset/translate_batch/cbn_translate")

In [60]:
# Run the llm_batch_api function with all .jsonl files and gather batch_info
batch_files = [f for f in os.listdir("dataset/translate_batch/") if f.startswith("cbn_translate_batch_") and f.endswith(".jsonl")]
all_batch_info = []
for batch_file in batch_files:
    batch_number = batch_file.split('_')[-1].split('.')[0]
    desc = f"Batch {batch_number} of requests to generate translation from indonesian to Cirebonese with help of cirebonese word-by-word translation"
    batch_info = llm_batch_api(f"dataset/translate_batch/{batch_file}", purpose="batch", desc=desc, completion_window="24h")
    all_batch_info.append(batch_info)

In [61]:
all_batch_info_dict = [b.to_dict() for b in all_batch_info]
# Save all batch_info to a file
with open("dataset/translate_batch/cbn_translate_batch_info.json", "w") as f:
    json.dump(all_batch_info_dict, f, indent=4)

In [70]:
for batch_info_dict in all_batch_info_dict:
    updated_batch, output_file = llm_batch_check_retrieve_dict(batch_info_dict)
    print(updated_batch)
    if output_file:
        write_jsonl(output_file.text, f"translate_batch_output_{batch_info_dict['id']}.jsonl")

Status of batch batch_67b3ee9ae58881909226449f6d3f8202 is completed
Batch(id='batch_67b3ee9ae58881909226449f6d3f8202', completion_window='24h', created_at=1739845274, endpoint='/v1/chat/completions', input_file_id='file-Jo9oi5WKePKiKp3WrsLboS', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1739857274, error_file_id='file-XFbnGk3njZvCqoWqesPjCB', errors=None, expired_at=None, expires_at=1739931674, failed_at=None, finalizing_at=1739852857, in_progress_at=1739845280, metadata={'description': 'Batch 1 of requests to generate translation from indonesian to Cirebonese with help of cirebonese word-by-word translation'}, output_file_id='file-PxGzyXCYYvpu8Bn9dgsiW4', request_counts=BatchRequestCounts(completed=19991, failed=9, total=20000))
Status of batch batch_67b3eeac452c8190b9440e54c51c2ce4 is completed
Batch(id='batch_67b3eeac452c8190b9440e54c51c2ce4', completion_window='24h', created_at=1739845292, endpoint='/v1/chat/completions', input_file_id='

## Fill Empty ID Translation for Bali Dict

In [76]:
bali_dict = load_dictionary("dict/transformed_bali_dict.json")

In [81]:
from datasets import Dataset, DatasetDict
import json
import uuid

def create_translation_dataset(input_dict):
    data = {
        'id': [],
        'bali_word': [],
        'translation_english': [],
        'translation_indonesian': []
    }
    
    for bali_word, values in input_dict.items():
        # Skip if both translations are empty or non-existent
        if len(values["translation_english"]) == 0 and len(values["translation_indonesian"]) == 0:
            continue
            
        # Only process if at least translation_english exists and is not empty
        if  len(values['translation_english']) > 0 and len(values['translation_indonesian']) == 0:
            data['id'].append(str(uuid.uuid4()))
            data['bali_word'].append(bali_word)
            data['translation_english'].append(values['translation_english'])
            data['translation_indonesian'].append([])
    
    # Create HF Dataset
    dataset = Dataset.from_dict(data)
    return dataset

In [82]:
# Create the dataset
translation_dataset = create_translation_dataset(bali_dict)

In [83]:
translation_dataset

Dataset({
    features: ['id', 'bali_word', 'translation_english', 'translation_indonesian'],
    num_rows: 4545
})

In [87]:
def get_id_translation_prompt_text(data):
    prompt_text = f"""Translate the given Balinese word into Indonesian, with the help of its English translation:
- Balinese word: {data["bali_word"]}
- English translation: {data["translation_english"]}

The result must be a list of string, which is the Indonesian translation from the Balinese word, and since it is a list it can be more than one translation. Return only the Indonesian translation in JSON format with key "translation_indonesian"."""
    data["prompt"] = prompt_text
    return data

In [88]:
translation_dataset = translation_dataset.map(get_id_translation_prompt_text)

Map: 100%|██████████| 4545/4545 [00:00<00:00, 7166.05 examples/s]


In [89]:
translation_dataset[0]

{'id': '5338f62e-0155-4b45-9928-c1b496dbf153',
 'bali_word': 'abah-abah',
 'translation_english': ['tabiat', 'bakat'],
 'translation_indonesian': [],
 'prompt': 'Translate the given Balinese word into Indonesian, with the help of its English translation:\n- Balinese word: abah-abah\n- English translation: [\'tabiat\', \'bakat\']\n\nThe result must be a list of string, which is the Indonesian translation from the Balinese word, and since it is a list it can be more than one translation. Return only the Indonesian translation in JSON format with key "translation_indonesian".'}

In [107]:
model = "gpt-4o-2024-08-06"
temperature = 0
max_tokens = 512
response_format = {"type": "json_object"}

batch_req_objects = []
for data in translation_dataset:
  messages = [
      {"role": "user", "content": data["prompt"]}
  ]
  batch_req_object = create_batch_req_object(req_id=data["id"], model=model, messages=messages, response_format=response_format, temperature=temperature)
  
  batch_req_objects.append(batch_req_object)

In [110]:
save_batches_to_jsonl(batch_req_objects, 20000, "dataset/translate_batch/bali_dict")

In [111]:
# Run the llm_batch_api function with all .jsonl files and gather batch_info
batch_files = [f for f in os.listdir("dataset/translate_batch/") if f.startswith("bali_dict_batch") and f.endswith(".jsonl")]
all_batch_info = []
for batch_file in batch_files:
    batch_number = batch_file.split('_')[-1].split('.')[0]
    desc = f"Batch {batch_number} of requests to generate Indonesian translation from Balinese word with help of English translation"
    batch_info = llm_batch_api(f"dataset/translate_batch/{batch_file}", purpose="batch", desc=desc, completion_window="24h")
    all_batch_info.append(batch_info)

In [112]:
all_batch_info_dict = [b.to_dict() for b in all_batch_info]
# Save all batch_info to a file
with open("dataset/translate_batch/bali_dict_batch_info.json", "w") as f:
    json.dump(all_batch_info_dict, f, indent=4)

In [117]:
for batch_info_dict in all_batch_info_dict:
    updated_batch, output_file = llm_batch_check_retrieve_dict(batch_info_dict)
    print(updated_batch)
    if output_file:
        write_jsonl(output_file.text, f"bali_dict_batch_output.jsonl")

Status of batch batch_67b57facc8248190897fdd3631a9cdee is completed
Batch(id='batch_67b57facc8248190897fdd3631a9cdee', completion_window='24h', created_at=1739947948, endpoint='/v1/chat/completions', input_file_id='file-QWHGpJgFVv8Dob8eMy6EN3', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1739950437, error_file_id=None, errors=None, expired_at=None, expires_at=1740034348, failed_at=None, finalizing_at=1739949787, in_progress_at=1739947951, metadata={'description': 'Batch 1 of requests to generate Indonesian translation from Balinese word with help of English translation'}, output_file_id='file-4yRQdw1BtjLQMqf15nqL2t', request_counts=BatchRequestCounts(completed=4545, failed=0, total=4545))


In [114]:
len(bali_dict.keys())

20082

In [119]:
translation_dataset[0]

{'id': '5338f62e-0155-4b45-9928-c1b496dbf153',
 'bali_word': 'abah-abah',
 'translation_english': ['tabiat', 'bakat'],
 'translation_indonesian': [],
 'prompt': 'Translate the given Balinese word into Indonesian, with the help of its English translation:\n- Balinese word: abah-abah\n- English translation: [\'tabiat\', \'bakat\']\n\nThe result must be a list of string, which is the Indonesian translation from the Balinese word, and since it is a list it can be more than one translation. Return only the Indonesian translation in JSON format with key "translation_indonesian".'}

In [122]:
import jsonlines

def update_translations_from_jsonl(dataset, jsonl_path):
    # Create a mapping of custom_id to translations from JSONL
    translations = {}
    with jsonlines.open(jsonl_path) as reader:
        for obj in reader:
            custom_id = obj['custom_id']
            # Extract translation_indonesian from the response
            try:
                translation = json.loads(obj['response']['body']['choices'][0]['message']['content'])
                translations[custom_id] = translation['translation_indonesian']
            except (KeyError, json.JSONDecodeError):
                continue

    # Update dataset with translations
    def update_translation(example):
        if example['id'] in translations:
            example['translation_indonesian'] = translations[example['id']]
        return example

    # Apply updates to dataset
    updated_dataset = dataset.map(update_translation)
    
    return updated_dataset

In [123]:
translation_dataset = update_translations_from_jsonl(translation_dataset, "dataset/translate_batch/bali_dict/bali_dict_batch_output.jsonl")

Map: 100%|██████████| 4545/4545 [00:00<00:00, 21266.66 examples/s]


In [128]:
def update_dict_translations(bali_dict, dataset):
    # Convert dataset to dictionary for easier lookup
    updated_entries = 0
    
    for item in dataset:
        bali_word = item['bali_word']
        
        # Check if word exists in dictionary
        if bali_word in bali_dict:
            # Update translation_indonesian
            bali_dict[bali_word]['translation_indonesian'] = item['translation_indonesian']
            updated_entries += 1
    
    return bali_dict, updated_entries

In [129]:
# Update the dictionary with new translations
updated_dict, num_updated = update_dict_translations(bali_dict, translation_dataset)

In [131]:
with open('dict/transformed_bali_dict_.json', 'w', encoding='utf-8') as f:
    json.dump(updated_dict, f, ensure_ascii=False, indent=2)

In [132]:
from collections import defaultdict

def create_translation_dicts(bali_dict):
    # Initialize dictionaries
    bali_indo = {}
    indo_bali = defaultdict(list)
    
    # Process each entry
    for bali_word, values in bali_dict.items():
        # Skip entries without translations
        if not isinstance(values, dict):
            continue
        if 'translation_indonesian' not in values or not values['translation_indonesian']:
            continue
            
        # Add to Balinese-Indonesian dictionary
        bali_indo[bali_word] = values['translation_indonesian']
        
        # Add to Indonesian-Balinese dictionary
        for indo_word in values['translation_indonesian']:
            indo_bali[indo_word].append(bali_word)
    
    return bali_indo, dict(indo_bali)

In [133]:
# Create both dictionaries
bali_indo_dict, indo_bali_dict = create_translation_dicts(bali_dict)

# Save Balinese-Indonesian dictionary
with open('dict/bali_idn.json', 'w', encoding='utf-8') as f:
    json.dump(bali_indo_dict, f, ensure_ascii=False, indent=2)

# Save Indonesian-Balinese dictionary
with open('dict/idn_bali.json', 'w', encoding='utf-8') as f:
    json.dump(indo_bali_dict, f, ensure_ascii=False, indent=2)


## Translate Sentence Example for Bali Dict

In [152]:
import datasets

def create_sent_translation_dataset(input_dict):
    data = {
        'id': [],
        'bali_word': [],
        'example_idx': [],
        'balinese_text': [],
        'indonesian_text': []
    }
    
    for bali_word, values in input_dict.items():
        # Skip if no sentence examples
        if not isinstance(values, dict) or 'sentence_examples' not in values:
            continue
            
        for example_idx, example in enumerate(values['sentence_examples']):
            # Skip if both are empty or both exist
            has_bali = example.get('Balinese', '-') != '-'
            has_indo = example.get('Indonesian', '-') != '-'
            
            if has_bali == has_indo:  # both True or both False
                continue
                
            data['id'].append(str(uuid.uuid4()))
            data['bali_word'].append(bali_word)
            data['example_idx'].append(example_idx)
            data['balinese_text'].append(example.get('Balinese', '-'))
            data['indonesian_text'].append(example.get('Indonesian', '-'))
    
    # Create HF Dataset
    dataset = Dataset.from_dict(data)
    
    return dataset

In [157]:
sent_translation_dataset = create_sent_translation_dataset(bali_dict)

In [158]:
sent_translation_dataset[86]

{'id': 'd7b8db96-e304-47a2-8d7f-d326b85873ca',
 'bali_word': 'anteg',
 'example_idx': 0,
 'balinese_text': 'Anteg jani Yan Galung tusing ada teka',
 'indonesian_text': '-'}

In [159]:
bali_indo_dict = load_dictionary("dict/bali_idn.json")
indo_bali_dict = load_dictionary("dict/idn_bali.json")

def get_translate_sentence_prompt_text(data):
    if data["balinese_text"] == "-" and data["indonesian_text"] != "-":
      prompt_text = f"""Translate the given Indonesian text in the <id_text> tag below into Balinese with the help of some word-to-word translation provided below. For one word, there can be multiple translations, and you need to choose the right one based on the context. The translations are as follows:
{get_dict_translation(data["indonesian_text"].lower(), indo_bali_dict)}
<id_text>
{data["indonesian_text"]}
</id_text>

Return only the translated text in JSON format with key "translated_text"."""
    elif data["balinese_text"] != "-" and data["indonesian_text"] == "-":
      prompt_text = f"""Translate the given Balinese text in the <bali_text> tag below into Indonesian with the help of some word-to-word translation provided below. For one word, there can be multiple translations, and you need to choose the right one based on the context. The translations are as follows:
{get_dict_translation(data["balinese_text"].lower(), bali_indo_dict)}
<bali_text>
{data["balinese_text"]}
</bali_text>

Return only the translated text in JSON format with key "translated_text"."""
    
    data["prompt_text"] = prompt_text
    
    return data

In [160]:
sent_translation_dataset = sent_translation_dataset.map(get_translate_sentence_prompt_text)

Map: 100%|██████████| 6421/6421 [00:00<00:00, 19170.91 examples/s]


In [163]:
model = "gpt-4o-2024-08-06"
temperature = 0
max_tokens = 512
response_format = {"type": "json_object"}

batch_req_objects = []
for data in sent_translation_dataset:
  messages = [
      {"role": "user", "content": data["prompt_text"]}
  ]
  batch_req_object = create_batch_req_object(req_id=data["id"], model=model, messages=messages, response_format=response_format, temperature=temperature)
  
  batch_req_objects.append(batch_req_object)

In [None]:
save_batches_to_jsonl(batch_req_objects, 20000, "dataset/translate_batch/bali_sent/bali_sent")

In [168]:
# Run the llm_batch_api function with all .jsonl files and gather batch_info
batch_files = [f for f in os.listdir("dataset/translate_batch/bali_sent") if f.startswith("bali_sent_batch") and f.endswith(".jsonl")]
all_batch_info = []
for batch_file in batch_files:
    batch_number = batch_file.split('_')[-1].split('.')[0]
    desc = f"Batch {batch_number} of requests to generate Indonesian/Balinese translation from Indonesian/Balinese text with help of dictionary"
    batch_info = llm_batch_api(f"dataset/translate_batch/bali_sent/{batch_file}", purpose="batch", desc=desc, completion_window="24h")
    all_batch_info.append(batch_info)

In [169]:
all_batch_info_dict = [b.to_dict() for b in all_batch_info]
# Save all batch_info to a file
with open("dataset/translate_batch/bali_sent/bali_sent_batch_info.json", "w") as f:
    json.dump(all_batch_info_dict, f, indent=4)

In [172]:
for batch_info_dict in all_batch_info_dict:
    updated_batch, output_file = llm_batch_check_retrieve_dict(batch_info_dict)
    print(updated_batch)
    if output_file:
        write_jsonl(output_file.text, f"dataset/translate_batch/bali_sent/bali_sent_batch_output.jsonl")

Status of batch batch_67b5cfdf27d48190aea00909f1e03bbf is in_progress
Batch(id='batch_67b5cfdf27d48190aea00909f1e03bbf', completion_window='24h', created_at=1739968479, endpoint='/v1/chat/completions', input_file_id='file-JeYcdKJB7Duhy3QyGUuh1r', object='batch', status='in_progress', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1740054879, failed_at=None, finalizing_at=None, in_progress_at=1739968480, metadata={'description': 'Batch 1 of requests to generate Indonesian/Balinese translation from Indonesian/Balinese text with help of dictionary'}, output_file_id=None, request_counts=BatchRequestCounts(completed=1510, failed=0, total=6421))
