In [1]:
import os
import json
import glob
import pickle

from dotenv import load_dotenv

load_dotenv()

True

In [2]:
from openai import OpenAI

client = OpenAI()

def llm_call(model, messages, **call_args):
    response = client.chat.completions.create(
        model=model,
        messages=messages,
        **call_args
    )
    return response

def create_batch_req_object(req_id, model, messages, response_format, temperature=0.0):
    return {
       "custom_id": req_id,
       "method": "POST",
       "url": "/v1/chat/completions",
       "body": {
          "model": model, 
          "messages": messages,
          "temperature": temperature,
          "response_format": response_format,
        }
    }

def llm_batch_api(batch_filepath, purpose="", desc="", completion_window="24h"):
    batch_input_file = client.files.create(
      file=open(batch_filepath, "rb"),
      purpose=purpose
    )

    batch_input_file_id = batch_input_file.id

    batch_info = client.batches.create(
        input_file_id=batch_input_file_id,
        endpoint="/v1/chat/completions",
        completion_window=completion_window,
        metadata={
          "description": desc
        }
    )

    return batch_info

def llm_batch_check_retrieve(batch_info):
    updated_batch = client.batches.retrieve(batch_info.id)
    print(f"Status of batch {updated_batch.id} is {updated_batch.status}")
    if updated_batch.status == "completed":
      output_file = client.files.content(updated_batch.output_file_id)
      return updated_batch, output_file
    else:
      return updated_batch, None

def llm_batch_check_retrieve_dict(batch_info):
    updated_batch = client.batches.retrieve(batch_info["id"])
    print(f"Status of batch {updated_batch.id} is {updated_batch.status}")
    if updated_batch.status == "completed":
      output_file = client.files.content(updated_batch.output_file_id)
      return updated_batch, output_file
    else:
      return updated_batch, None

In [3]:
import tiktoken

encoding = tiktoken.encoding_for_model("gpt-4o-mini")

def num_tokens_from_string(string: str, encoder) -> int:
    """Returns the number of tokens in a text string."""
    num_tokens = len(encoder.encode(string))
    return num_tokens

def count_tokens_in_dataset(dataset, num_tokens_from_string, encoder):
    total_tokens = 0
    
    for item in dataset:
        text = item['text']
        tokens = num_tokens_from_string(text, encoder)
        total_tokens += tokens
    
    return total_tokens

In [4]:
def write_jsonl(data_string, output_file):
    """
    Writes JSONL string to a file.
    
    Args:
        data_string (str): String containing JSONL data
        output_file (str): Path to output file
    """
    # Split the string into lines and filter out empty lines
    json_lines = [line.strip() for line in data_string.split('\n') if line.strip()]
    
    # Write each line to the file
    with open(output_file, 'w', encoding='utf-8') as f:
        for line in json_lines:
            json_obj = json.loads(line)  # Parse the JSON string
            f.write(json.dumps(json_obj) + '\n')  # Write formatted JSON

def read_jsonl(input_file):
    """
    Reads a JSONL file and returns a list of JSON objects.
    
    Args:
        input_file (str): Path to input JSONL file
        
    Returns:
        list: List of parsed JSON objects
    """
    data = []
    with open(input_file, 'r', encoding='utf-8') as f:
        for line in f:
            if line.strip():  # Skip empty lines
                data.append(json.loads(line))
    return data

# Function to save the batch object using pickle
def save_batch_to_pickle(batch_obj, output_file="batch_data.pkl"):
    with open(output_file, 'wb') as f:
        pickle.dump(batch_obj, f)

# Function to load the batch object from a pickle file
def load_batch_from_pickle(input_file="batch_data.pkl"):
    with open(input_file, 'rb') as f:
        return pickle.load(f)

def create_batches(dataset, batch_size=1000):
    return [dataset.select(range(i, min(i + batch_size, len(dataset)))) for i in range(0, len(dataset), batch_size)]

In [5]:
def load_dictionary(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)

def split_text_into_ngrams(text, n):
    words = text.split()
    ngrams = []
    for i in range(len(words) - n + 1):
        ngrams.append(tuple(words[i:i + n]))
    return ngrams

def get_dict_translation(text, dictionary):
    unigrams = split_text_into_ngrams(text, 1)
    bigrams = split_text_into_ngrams(text, 2)
    trigrams = split_text_into_ngrams(text, 3)

    word_translation = ""

    for ngram in (trigrams, bigrams, unigrams):
        for ngram_tuple in ngram:
            ngram_str = ' '.join(ngram_tuple)
            if ngram_str in dictionary:
                word_translation += f"- {ngram_str}: {', '.join(dictionary[ngram_str])}\n"

    return word_translation

In [6]:
from tqdm import tqdm
import datasets

pt_data = datasets.load_from_disk("id_hq_data_dedup")

# Load the dictionary
dictionary = load_dictionary("dict/idn_cbn.json")

def get_prompt_text(data):
    text = data["text"].lower()
    prompt_text = f"""Translate the given Indonesian text in the <id_text> tag below into Cirebonese with the help of some word-to-word translation provided below. For one word, there can be multiple translations, and you need to choose the right one based on the context. The translations are as follows:
{get_dict_translation(text, dictionary)}
<id_text>
{text}
</id_text>

Return only the translated text in JSON format with key "translated_text"."""
    data["text"] = prompt_text
    return data

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
pt_data = pt_data.map(get_prompt_text)

Map: 100%|██████████| 651856/651856 [02:26<00:00, 4456.93 examples/s]


In [9]:
pt_data.save_to_disk("id_hq_data_prompt")

Saving the dataset (9/9 shards): 100%|██████████| 651856/651856 [00:07<00:00, 92440.44 examples/s] 


In [11]:
pt_tokens = count_tokens_in_dataset(pt_data, num_tokens_from_string, encoding)

In [13]:
pt_tokens

1418349015

In [16]:
1418349015 / 1000000 * 0.075

106.376176125

In [18]:
pt_data = pt_data.shuffle(seed=42)

In [32]:
pt_data_20k = pt_data.select(range(20000))

In [33]:
pt_tokens_20k = count_tokens_in_dataset(pt_data_20k, num_tokens_from_string, encoding)

In [34]:
pt_tokens_20k

43674330

In [36]:
43674330 / 1000000 * 0.3

13.102298999999999

In [37]:
pt_data_20k.save_to_disk("id_hq_data_prompt_20k")

Saving the dataset (1/1 shards): 100%|██████████| 20000/20000 [00:00<00:00, 141942.58 examples/s]


In [38]:
def batch_gen_translate_w_dict(prompt):
    sys_prompt = "Always answer in a valid JSON format and provide only the JSON answer without anything else."
    messages = [
        {"role": "system", "content": sys_prompt},
        {"role": "user", "content": prompt}
    ]
    model = "gpt-4o-mini-2024-07-18"
    temperature = 1
    max_tokens = 4096
    response_format = {"type": "json_object"}
    
    return messages, model, temperature, max_tokens, response_format

In [40]:
import uuid

# Generate a random UUID
random_uuid = uuid.uuid4()
print(f"Random UUID: {random_uuid}")

# Convert to string
uuid_str = str(random_uuid)
print(f"UUID as string: {uuid_str}")

Random UUID: 9027a724-6001-420b-8770-2f3b310b7557
UUID as string: 9027a724-6001-420b-8770-2f3b310b7557


In [41]:
import uuid

batch_req_objects = []
def create_batch_and_update_data(data):
  random_uuid = uuid.uuid4()
  uuid_str = str(random_uuid)
  data["req_id"] = f"req-{uuid_str}"
  
  messages, model, temperature, max_tokens, response_format = batch_gen_translate_w_dict(data["text"])
  batch_req_object = create_batch_req_object(req_id=data["req_id"], model=model, messages=messages, response_format=response_format, temperature=temperature)
  
  batch_req_objects.append(batch_req_object)
  
  return data

In [42]:
# Write all the batch req objects into a jsonl file using write_jsonl
pt_data_20k = pt_data_20k.map(create_batch_and_update_data)
batch_req_objects_jsonl = "\n".join([json.dumps(obj) for obj in batch_req_objects])
write_jsonl(batch_req_objects_jsonl, "gen_translate_batch.jsonl")

Map: 100%|██████████| 20000/20000 [00:01<00:00, 12636.88 examples/s]


In [43]:
batch_info = llm_batch_api("gen_translate_batch.jsonl", purpose="batch", desc="Batch of requests to generate translation from indonesian to Cirebonese with help of cirebonese word-by-word translation", completion_window="24h")

In [45]:
batch_info_dict = batch_info.to_dict()
# Save all batch_info to a file
with open("translate_batch_info.json", "w") as f:
    json.dump(batch_info_dict, f, indent=4)

In [48]:
updated_batch, output_file = llm_batch_check_retrieve_dict(batch_info_dict)
print(updated_batch)
if output_file:
    write_jsonl(output_file.text, f"translate_batch_output.jsonl")

Status of batch batch_67b3308d97208190ab2f142a7aa99026 is completed
Batch(id='batch_67b3308d97208190ab2f142a7aa99026', completion_window='24h', created_at=1739796621, endpoint='/v1/chat/completions', input_file_id='file-PK5wBtqUNSAua1fgxJguUx', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1739806397, error_file_id='file-KU92rTPq7AxWXvNEn7rjRe', errors=None, expired_at=None, expires_at=1739883021, failed_at=None, finalizing_at=1739803900, in_progress_at=1739796631, metadata={'description': 'Batch of requests to generate translation from indonesian to Cirebonese with help of cirebonese word-by-word translation'}, output_file_id='file-MUucbzkLMDmE4o8hbjs3Uw', request_counts=BatchRequestCounts(completed=19997, failed=3, total=20000))
