In [18]:
import pandas as pd
from dotenv import load_dotenv
import json

# 2. Prepare and Create the Batch Input file for the Batch API

In [19]:
# Load the dataset
df = pd.read_csv('./data/processed/PostHistoryFirstVersion.csv')

# Make individual request for the batch file JSON style (example of a batch entry: https://platform.openai.com/docs/guides/batch#1-prepare-your-batch-file)
def make_batch_entry(request_id, instructions, prompt_text):
    return {
        "custom_id": f"request-{request_id}",
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": "gpt-4.1-nano",        # CHANGE THE MODEL HERE. THIS MODEL IS THE CHEAPEST
            "messages": [
                {
                    "role": "system", 
                    "content": instructions
                },
                {
                    "role": "developer",
                    "content": "Response like you are a senior developer writing for a question on Stack Overflow. With clear and concise Title, Body, Tags"
                },
                {
                    "role": "user", 
                    "content": prompt_text
                }
            ],
            "n": 1,
            "max_tokens": 1000,
            "temperature": 0.3
        }
    }

# Create a batch request per question
batch_requests = []
for index, row in df.iterrows():
    # Construct the instruction for dev role
    instructions = f"""
    # Identity
    You are a developer assistant that improves Stack Overflow questions.
    
    # Instructions
    * Rewrite the Title to be more clear and professional.
    * Rewrite the Body to be clearer and more detailed if needed.
    * Add or Adjust Tags if necessary.
    
    * Format your response as:
    _ Title: your improved title\n
    _ Body: your improved body\n
    _ Tags: your improved tags separated <>, like this <tag1><tag2><tag3>\n
    
    """
    
    # Construct the input for the user role
    prompt = f"""
    Could you rewrite this Stackoverflow question given 3 information below:
    _ Original Title:
    {row['Title']}
    
    _ Original Body:
    {row['Body']}
    
    _ Original Tags:
    {row['Tags']}
    """
    
    # append each batch request in a list
    batch_requests.append(make_batch_entry(index, instructions, prompt))

print(batch_requests)

# write "batch_requests" to a json file
with open("./data/processed/batch_input.jsonl", "w", encoding="utf-8") as json_file: 
    for item in batch_requests:
        json_file.write(json.dumps(item, ensure_ascii = False) + "\n") # , indent = 4

[{'custom_id': 'request-0', 'method': 'POST', 'url': '/v1/chat/completions', 'body': {'model': 'gpt-4.1-nano', 'messages': [{'role': 'system', 'content': '\n    # Identity\n    You are a developer assistant that improves Stack Overflow questions.\n    \n    # Instructions\n    * Rewrite the Title to be more clear and professional.\n    * Rewrite the Body to be clearer and more detailed if needed.\n    * Add or Adjust Tags if necessary.\n    \n    * Format your response as:\n    _ Title: your improved title\n\n    _ Body: your improved body\n\n    _ Tags: your improved tags separated <>, like this <tag1><tag2><tag3>\n\n    \n    '}, {'role': 'developer', 'content': 'Response like you are a senior developer writing for a question on Stack Overflow. With clear and concise Title, Body, Tags'}, {'role': 'user', 'content': '\n    Could you rewrite this Stackoverflow question given 3 information below:\n    _ Original Title:\n    Make a div fill the remaining screen space\n    \n    _ Origina