In [1]:
import os
import json
import glob
import pickle

from dotenv import load_dotenv

# Load settings from a local .env file (if present) into the process environment.
load_dotenv()

True

In [14]:
from openai import OpenAI

# Shared OpenAI client used by the API helper functions in this notebook.
# NOTE(review): presumably picks up its API key from the environment — confirm.
client = OpenAI()

def llm_call(model, messages, **call_args):
    """Send one chat-completion request through the shared ``client``.

    Args:
        model: Model identifier, forwarded as ``model``.
        messages: Chat messages, forwarded as ``messages``.
        **call_args: Extra keyword arguments passed through unchanged.

    Returns:
        The object returned by ``client.chat.completions.create``.
    """
    return client.chat.completions.create(model=model, messages=messages, **call_args)

def create_batch_req_object(req_id, model, messages, response_format, temperature=0.0, max_tokens=None):
    """Build one request entry for a chat-completions batch input file.

    Args:
        req_id: Value stored under ``custom_id``.
        model: Model identifier placed in the request body.
        messages: Chat messages placed in the request body.
        response_format: Value placed under ``response_format`` in the body.
        temperature: Sampling temperature placed in the body (default 0.0).
        max_tokens: Optional completion token limit. When ``None`` (the
            default) the key is omitted, so existing callers get exactly the
            same body as before.

    Returns:
        dict: Request object with ``custom_id``, ``method``, ``url`` and ``body``.
    """
    body = {
        "model": model,
        "messages": messages,
        "temperature": temperature,
        "response_format": response_format,
    }
    # Only emit the limit when the caller asked for one.
    if max_tokens is not None:
        body["max_tokens"] = max_tokens

    return {
        "custom_id": req_id,
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": body,
    }

def llm_batch_api(batch_filepath, purpose="", desc="", completion_window="24h"):
    """Upload a batch input file and create a chat-completions batch from it.

    Args:
        batch_filepath: Path of the JSONL file to upload.
        purpose: Forwarded as ``purpose`` to ``client.files.create``.
        desc: Stored under ``metadata["description"]`` of the batch.
        completion_window: Forwarded as ``completion_window`` to
            ``client.batches.create``.

    Returns:
        The object returned by ``client.batches.create``.
    """
    # The context manager closes the file handle even when the upload raises;
    # previously the handle was opened inline and never closed.
    with open(batch_filepath, "rb") as batch_file:
        batch_input_file = client.files.create(
            file=batch_file,
            purpose=purpose,
        )

    batch_info = client.batches.create(
        input_file_id=batch_input_file.id,
        endpoint="/v1/chat/completions",
        completion_window=completion_window,
        metadata={"description": desc},
    )

    return batch_info

def llm_batch_check_retrieve(batch_info):
    """Refresh a batch (given as an object with ``.id``) and fetch its output if done.

    Prints the current status. The output file content is downloaded only
    when the status is exactly ``"completed"``.

    Returns:
        tuple: ``(refreshed_batch, output_file)``; ``output_file`` is ``None``
        for any status other than ``"completed"``.
    """
    refreshed = client.batches.retrieve(batch_info.id)
    print(f"Status of batch {refreshed.id} is {refreshed.status}")
    if refreshed.status != "completed":
        return refreshed, None
    return refreshed, client.files.content(refreshed.output_file_id)

def llm_batch_check_retrieve_dict(batch_info):
    """Refresh a batch (given as a mapping with an ``"id"`` key) and fetch its output if done.

    Prints the current status. The output file content is downloaded only
    when the status is exactly ``"completed"``.

    Returns:
        tuple: ``(refreshed_batch, output_file)``; ``output_file`` is ``None``
        for any status other than ``"completed"``.
    """
    refreshed = client.batches.retrieve(batch_info["id"])
    print(f"Status of batch {refreshed.id} is {refreshed.status}")
    if refreshed.status != "completed":
        return refreshed, None
    return refreshed, client.files.content(refreshed.output_file_id)

In [3]:
import tiktoken

# Tokenizer looked up for the "gpt-4o-mini" model name; intended to be passed
# as the `encoder` argument of the token-counting helpers defined in this cell.
encoding = tiktoken.encoding_for_model("gpt-4o-mini")

def num_tokens_from_string(string: str, encoder) -> int:
    """Count the tokens that ``encoder.encode`` produces for ``string``."""
    return len(encoder.encode(string))

def count_tokens_in_dataset(dataset, num_tokens_from_string, encoder):
    """Sum the token counts of the ``'text'`` field over every item in ``dataset``.

    Args:
        dataset: Iterable of items indexable by ``'text'``.
        num_tokens_from_string: Callable ``(text, encoder) -> int`` used to
            count the tokens of one text.
        encoder: Passed through to ``num_tokens_from_string``.

    Returns:
        int: Total token count (0 for an empty dataset).
    """
    return sum(num_tokens_from_string(row['text'], encoder) for row in dataset)

In [4]:
def write_jsonl(data_string, output_file):
    """
    Re-serialize a JSONL string into a file, one JSON document per line.

    Blank (whitespace-only) lines are dropped; every remaining line is parsed
    and written back with ``json.dumps``, so invalid JSON raises while writing.

    Args:
        data_string (str): String containing JSONL data
        output_file (str): Path to output file
    """
    stripped_lines = (raw.strip() for raw in data_string.split('\n'))

    with open(output_file, 'w', encoding='utf-8') as sink:
        for record in stripped_lines:
            if not record:
                continue  # skip empty lines
            sink.write(json.dumps(json.loads(record)) + '\n')

def read_jsonl(input_file):
    """
    Parse a JSONL file into a list, skipping whitespace-only lines.

    Args:
        input_file (str): Path to input JSONL file

    Returns:
        list: One parsed JSON value per non-empty line, in file order
    """
    with open(input_file, 'r', encoding='utf-8') as source:
        return [json.loads(raw) for raw in source if raw.strip()]

def save_batch_to_pickle(batch_obj, output_file="batch_data.pkl"):
    """Pickle ``batch_obj`` into ``output_file``, overwriting any existing file.

    Args:
        batch_obj: Any picklable object.
        output_file (str): Destination path (default ``"batch_data.pkl"``).
    """
    with open(output_file, 'wb') as sink:
        pickle.dump(batch_obj, sink)

def load_batch_from_pickle(input_file="batch_data.pkl"):
    """Unpickle and return the object stored in ``input_file``.

    Unpickling can execute arbitrary code, so only use this on files that
    were written by this notebook (or another trusted source).

    Args:
        input_file (str): Source path (default ``"batch_data.pkl"``).

    Returns:
        The object that was pickled into the file.
    """
    with open(input_file, 'rb') as source:
        restored = pickle.load(source)
    return restored

def create_batches(dataset, batch_size=1000):
    """Split ``dataset`` into consecutive slices of at most ``batch_size`` rows.

    Args:
        dataset: Object supporting ``len()`` and ``select(range)``.
        batch_size (int): Maximum number of rows per slice (default 1000).

    Returns:
        list: The results of ``dataset.select`` for each index range, in order;
        the last slice may be shorter than ``batch_size``.
    """
    batches = []
    for start in range(0, len(dataset), batch_size):
        stop = min(start + batch_size, len(dataset))
        batches.append(dataset.select(range(start, stop)))
    return batches

In [5]:
def direct_gen_search_keyword(str_dict):
    """Ask the model for search keywords for one dictionary entry, synchronously.

    The prompt, model and sampling settings come from
    ``batch_gen_search_keyword`` so the direct call and the batch requests are
    built from a single definition instead of two copy-pasted prompts.

    Args:
        str_dict (str): Formatted word with its translations/definitions,
            inserted into the prompt between ``<cbn_word>`` tags.

    Returns:
        The object returned by ``client.chat.completions.create``.
    """
    messages, model, temperature, max_tokens, response_format = batch_gen_search_keyword(str_dict)

    response = client.chat.completions.create(
        model=model,
        messages=messages,
        max_tokens=max_tokens,
        temperature=temperature,
        response_format=response_format,
    )
    return response

def batch_gen_search_keyword(str_dict):
    """Assemble the request parameters for the search-keyword prompt.

    Nothing is sent to the API here; the values are returned so the caller
    can build a request from them.

    Args:
        str_dict (str): Formatted word with its translations/definitions,
            inserted into the prompt between ``<cbn_word>`` tags.

    Returns:
        tuple: ``(messages, model, temperature, max_tokens, response_format)``.
    """
    system_text = "Always answer in a valid JSON format following the user instructions, without any introduction, commentary, or anything else, only the JSON answer."
    # The prompt is assembled line by line so the whitespace-only line after
    # the closing tag stays visible in the source.
    user_text = (
        "You are a master in Indonesian and Javanese Language. Now, you are given a Cirebonese word with its translation or definition in Indonesian. The word is:\n"
        "\n"
        "<cbn_word>\n"
        f"{str_dict}\n"
        "</cbn_word>\n"
        "    \n"
        'What you need to do is to convert the word into list of searchable keywords to search on Google. At least one search keyword must contain the cirebonese word. The search keyword also need to be relevant to Indonesian and Cirebonese specifically to the word meaning, also surely will return some results when searching in the internet. Return in JSON format with key "search_keyword" and the list of search keyword as the value.'
    )

    return (
        [
            {"role": "system", "content": system_text},
            {"role": "user", "content": user_text},
        ],
        "gpt-4o-mini-2024-07-18",
        1,
        512,
        {"type": "json_object"},
    )

In [6]:
import json
import random
from collections import defaultdict

# Load the dictionary JSON from a path relative to the working directory.
# NOTE(review): presumably maps each Cirebonese word to a list of Indonesian
# translations/definitions (format_word iterates data[word]) — confirm.
with open('./dict/cbn_idn.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Per-word draw counter; words that have not been drawn yet read as 0.
counter = defaultdict(int)

def get_random_word(data, counter):
    """Pick a uniformly random key of ``data`` and record the draw.

    Args:
        data: Mapping whose keys are the candidate words.
        counter: Mapping of word -> draw count; the chosen word's entry is
            incremented in place (a ``defaultdict(int)`` works for unseen words).

    Returns:
        The chosen key.
    """
    candidates = list(data.keys())
    chosen = random.choice(candidates)
    counter[chosen] += 1
    return chosen

def format_word(word, data):
    """Render ``word`` and the items of ``data[word]`` as an indented bullet list.

    The first line is ``"<word>:"``; each item of ``data[word]`` follows on its
    own line prefixed with ``"  - "``. Every line ends with a newline.

    Returns:
        str: The formatted block.
    """
    pieces = [f"{word}:\n"]
    pieces.extend(f"  - {translation}\n" for translation in data[word])
    return "".join(pieces)

In [None]:
# Draw one random word and show how it will be presented to the model.
# Note: this draw also increments `counter` for the chosen word.
word = get_random_word(data, counter)
formatted_string = format_word(word, data)
print(formatted_string)

upaper:
  - cepat



In [42]:
# Synchronous test call for the sampled word; direct_gen_search_keyword is the
# helper defined in this notebook (no gen_search_keyword is defined here).
response = direct_gen_search_keyword(formatted_string)

In [43]:
# Show the text content of the first choice returned for the test call.
print(response.choices[0].message.content)

{
  "search_keyword": [
    "pamêg permainan",
    "permainan tradisional Cirebon",
    "permainan Cirebon",
    "jenis permainan di Indonesia",
    "permainan anak Cirebon",
    "permainan budaya Cirebon"
  ]
}


In [None]:
# Build the batch request list by sampling random words until every word in
# `data` has been drawn at least twice.
batch_req_objects = []
total_words = len(data)
# Requiring len(counter) to reach total_words keeps the loop running until every
# word has been drawn; checking only the counts already present in `counter`
# would allow an early exit as soon as the few words seen so far reach two draws.
# Assumes `counter` only receives keys through get_random_word(data, counter).
while len(counter) < total_words or any(count < 2 for count in counter.values()):
    word = get_random_word(data, counter)
    formatted_string = format_word(word, data)

    # Prompt, model and sampling settings for this word
    messages, model, temperature, max_tokens, response_format = batch_gen_search_keyword(formatted_string)

    # counter[word] keeps custom_id unique when the same word is drawn again
    batch_req_object = create_batch_req_object(req_id=f"req_{word}_{counter[word]}", model=model, messages=messages, response_format=response_format, temperature=temperature)
    # Carry the token limit into the request body; it was unpacked above but
    # previously never reached the request.
    batch_req_object["body"]["max_tokens"] = max_tokens

    # Gather the batch req object in a list
    batch_req_objects.append(batch_req_object)

# Write all the batch req objects into a jsonl file using write_jsonl
batch_req_objects_jsonl = "\n".join([json.dumps(obj) for obj in batch_req_objects])
write_jsonl(batch_req_objects_jsonl, "gen_search_batch.jsonl")

In [52]:
# Number of request objects produced by the sampling loop.
len(batch_req_objects)

192678

In [53]:
# Spot-check one request object (the index is arbitrary).
batch_req_objects[21378]

{'custom_id': 'req_gêring_2',
 'method': 'POST',
 'url': '/v1/chat/completions',
 'body': {'model': 'gpt-4o-mini-2024-07-18',
  'messages': [{'role': 'system',
    'content': 'Always answer in a valid JSON format following the user instructions, without any introduction, commentary, or anything else, only the JSON answer.'},
   {'role': 'user',
    'content': 'You are a master in Indonesian and Javanese Language. Now, you are given a Cirebonese word with its translation or definition in Indonesian. The word is:\n\n<cbn_word>\ngêring:\n  - sakit\n\n</cbn_word>\n    \nWhat you need to do is to convert the word into list of searchable keywords to search on Google. At least one search keyword must contain the cirebonese word. The search keyword also need to be relevant to Indonesian and Cirebonese specifically to the word meaning, also surely will return some results when searching in the internet. Return in JSON format with key "search_keyword" and the list of search keyword as the value.

In [8]:
# Submit the whole request file as one batch.
# NOTE(review): the status check in the following cell reports this batch as "failed";
# presumably the single file exceeds the Batch API per-file request limit — confirm
# against the batch's error details.
batch_info = llm_batch_api("gen_search_batch.jsonl", purpose="batch", desc="Batch of requests to generate search keywords for Cirebonese words", completion_window="24h")

In [10]:
# llm_batch_check_retrieve returns (refreshed batch, output file or None), so the
# second value is bound to `output_file` rather than to a batch-like name.
batch_info, output_file = llm_batch_check_retrieve(batch_info)

Status of batch batch_67ac193a60d8819098380df4bc7ff5a6 is failed


In [15]:
# Reload the request objects from disk so they can be re-split into smaller files.
batch_objects = read_jsonl("gen_search_batch.jsonl")

In [17]:
# Number of request objects read back from gen_search_batch.jsonl.
len(batch_objects)

192678

In [18]:
def save_batches_to_jsonl(batch_req_objects, batch_size, base_filename):
    """Write ``batch_req_objects`` to numbered JSONL files of at most ``batch_size`` items.

    Files are named ``"<base_filename>_batch_<k>.jsonl"`` with ``k`` starting
    at 1, and are written through ``write_jsonl``.

    Args:
        batch_req_objects (list): JSON-serializable request objects.
        batch_size (int): Maximum number of objects per file.
        base_filename (str): Prefix of every output file name.
    """
    chunk_starts = range(0, len(batch_req_objects), batch_size)
    for chunk_number, start in enumerate(chunk_starts, start=1):
        chunk = batch_req_objects[start:start + batch_size]
        serialized = "\n".join(json.dumps(item) for item in chunk)
        write_jsonl(serialized, f"{base_filename}_batch_{chunk_number}.jsonl")

In [19]:
# Split the requests into files of at most 50000 entries each
# (batch_requests_batch_1.jsonl, batch_requests_batch_2.jsonl, ...).
# NOTE(review): 50000 presumably matches the Batch API per-file request limit — confirm.
save_batches_to_jsonl(batch_objects, 50000, "batch_requests")

In [None]:
# Run the llm_batch_api function with all .jsonl files and gather batch_info
# NOTE(review): os.listdir() lists the current working directory in arbitrary order,
# and any matching file left over from an earlier run is submitted as well.
batch_files = [f for f in os.listdir() if f.startswith("batch_requests_batch_") and f.endswith(".jsonl")]
all_batch_info = []
for batch_file in batch_files:
    # e.g. "batch_requests_batch_3.jsonl" -> "3"
    batch_number = batch_file.split('_')[-1].split('.')[0]
    # The batch number is embedded in the description; the polling cell later
    # parses it back out to name the output file.
    desc = f"Batch {batch_number} of requests to generate search keywords for Cirebonese words"
    batch_info = llm_batch_api(batch_file, purpose="batch", desc=desc, completion_window="24h")
    all_batch_info.append(batch_info)

In [21]:
# Convert batch_info objects to a JSON-serializable format
# (one plain dict per submitted batch, via each object's to_dict()).
all_batch_info_serializable = [batch_info.to_dict() for batch_info in all_batch_info]


In [22]:
# Save all batch_info to a file so the batches can be polled again from a fresh kernel
with open("all_batch_info.json", "w") as f:
    json.dump(all_batch_info_serializable, f, indent=4)

In [7]:
# Reload the saved batch metadata. From here on `all_batch_info` is a list of
# plain dicts (not API objects), which is why the polling loop uses
# llm_batch_check_retrieve_dict.
with open('all_batch_info.json', 'r', encoding='utf-8') as file:
    all_batch_info = json.load(file)

In [30]:
# Poll every saved batch; when one is completed, store its output as JSONL.
for batch in all_batch_info:
    updated_batch, output_file = llm_batch_check_retrieve_dict(batch)
    print(updated_batch)
    if output_file:
        # The second whitespace-separated token of the description is the batch
        # number (e.g. "Batch 1 of requests ..." -> "1"); it names the output file.
        write_jsonl(output_file.text, f"batch_output_{updated_batch.metadata['description'].split()[1]}.jsonl")

Status of batch batch_67ac37dbdfb081909467ebe322b86904 is in_progress
Batch(id='batch_67ac37dbdfb081909467ebe322b86904', completion_window='24h', created_at=1739339739, endpoint='/v1/chat/completions', input_file_id='file-3j5YFNshfg66LDAsGczR6f', object='batch', status='in_progress', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1739426139, failed_at=None, finalizing_at=None, in_progress_at=1739339751, metadata={'description': 'Batch 1 of requests to generate search keywords for Cirebonese words'}, output_file_id=None, request_counts=BatchRequestCounts(completed=49153, failed=0, total=50000))
Status of batch batch_67ac37e5a400819085905be69807282f is in_progress
Batch(id='batch_67ac37e5a400819085905be69807282f', completion_window='24h', created_at=1739339749, endpoint='/v1/chat/completions', input_file_id='file-QySTAxjN34tDT77gxLCcbT', object='batch', status='in_progress', cancelled_at=None, cancelling_at=None, com

In [22]:
# Check which batch number is parsed from the description of the last polled batch.
updated_batch.metadata['description'].split()[1]

'4'