# 3. Upload the Batch input file for the LLM to process

In [11]:
import pandas as pd
import os
from dotenv import load_dotenv
from openai import OpenAI
import json

In [None]:
# Read the variables defined in the local env file into the process
# environment, then build the OpenAI client from the key found there.
ENV_FILE = "./config/api_key.env"
load_dotenv(ENV_FILE)

# os.getenv returns None when OPENAI_API_KEY is not set
api_key = os.getenv("OPENAI_API_KEY")
client = OpenAI(api_key=api_key)

In [None]:
# Upload the batch input file.
# The file is opened in a `with` block so the handle is closed as soon as the
# upload call returns (the bare open() used before was never closed).
with open("./data/processed/batch_input.jsonl", "rb") as batch_input_fh:
    batch_input_file = client.files.create(
        file=batch_input_fh,
        purpose="batch"
    )

print(batch_input_file)

# Create the batch job from the ID of the file that was just uploaded
batch_input_file_id = batch_input_file.id

# Left as the last expression so the created Batch object is shown as the
# cell output; its `id` is what the status check needs.
client.batches.create(
    input_file_id=batch_input_file_id,
    endpoint="/v1/chat/completions",
    completion_window="24h",
    metadata={
        "description": "Testing Batches of Stackoverflow Questions for OpenAI to process"
    }
)

FileObject(id='file-AJs7U3z5fiV1KUTbJRVXVH', bytes=15138, created_at=1747014684, filename='batch_input.jsonl', object='file', purpose='batch', status='processed', status_details=None, expires_at=None)


Batch(id='batch_6821541dc1d881908ddcb473b7a01d13', completion_window='24h', created_at=1747014685, endpoint='/v1/chat/completions', input_file_id='file-AJs7U3z5fiV1KUTbJRVXVH', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1747101085, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'Testing Batches of Stackoverflow Questions for OpenAI to process'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))

## Check the status of the Batch and Retrieve the result when done

In [None]:
# Look up the current state of the batch by its ID (looks like "batch_...");
# the printed object includes `status` and, once available, `output_file_id`.
BATCH_ID = "batch_6821541dc1d881908ddcb473b7a01d13"
batch = client.batches.retrieve(BATCH_ID)
print(batch)

Batch(id='batch_6821541dc1d881908ddcb473b7a01d13', completion_window='24h', created_at=1747014685, endpoint='/v1/chat/completions', input_file_id='file-AJs7U3z5fiV1KUTbJRVXVH', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1747014850, error_file_id=None, errors=None, expired_at=None, expires_at=1747101085, failed_at=None, finalizing_at=1747014848, in_progress_at=1747014687, metadata={'description': 'Testing Batches of Stackoverflow Questions for OpenAI to process'}, output_file_id='file-P1JCB4sx51YmhXW9ywRY4A', request_counts=BatchRequestCounts(completed=10, failed=0, total=10))


In [None]:
# Retrieve the result file of the batch fetched in the previous cell.
# The ID is read from `batch.output_file_id` (looks like "file-...") instead of
# being pasted in by hand, so this cell always matches the batch being checked.
output_file_id = batch.output_file_id
if output_file_id is None:
    # No output file is attached to the batch object yet
    raise RuntimeError(
        f"Batch {batch.id} has no output file yet (status: {batch.status}); "
        "re-run the status cell once the batch has completed."
    )

file_response = client.files.content(output_file_id)
print(file_response.text)

{"id": "batch_req_682154c0e248819092d483e5f61fbbb8", "custom_id": "request-0", "response": {"status_code": 200, "request_id": "4bb348cdba775632f82ca07b6142d9ac", "body": {"id": "chatcmpl-BWCcLqBSbxByUMK3CkrGcTdTOaXnY", "object": "chat.completion", "created": 1747014697, "model": "gpt-4.1-nano-2025-04-14", "choices": [{"index": 0, "message": {"role": "assistant", "content": "Title: How to Make a Div Fill Remaining Screen Space Without Using Tables\n\nBody:  \nI'm developing a web application and want the main content area to fill the remaining vertical space of the viewport, below a header. Currently, I am using a table-based layout with the following structure:\n\nCSS:\n```css\n#page {\n  height: 100%;\n}\n#tdcontent {\n  height: 100%;\n}\n```\n\nHTML:\n```html\n<table id=\"page\">\n  <tr>\n    <td id=\"tdheader\">\n      <div id=\"header\">...</div>\n    </td>\n  </tr>\n  <tr>\n    <td id=\"tdcontent\">\n      <div id=\"content\">...</div>\n    </td>\n  </tr>\n</table>\n```\n\nThis se

## Output the responses (put them in a list, CSV, JSON file, ...)

In [None]:
# Full response data as one string, one JSON object per line
json_data = file_response.text

# Parse every non-blank line into a Python dict and collect them in a list.
# Split on "\n" only: str.splitlines() would also break on characters such as
# U+2028, which may appear unescaped inside a JSON string value.
json_list = [
    json.loads(line)
    for line in json_data.strip().split("\n")
    if line.strip()
]

# (Optional) Save the parsed records to a batch output file.
# They are written as a single JSON array so the .json file is valid JSON and
# can be read back with json.load(); writing several pretty-printed objects
# one after another produces a file no standard JSON parser accepts.
with open("./data/processed/batch_output.json", "w", encoding="utf-8") as json_file:
    json.dump(json_list, json_file, ensure_ascii=False, indent=4)
    json_file.write("\n")


In [23]:
# Debug: show the third parsed record to check the structure of a response
sample_record = json_list[2]
print(sample_record)

{'id': 'batch_req_682154c105bc81908b4b0813a09014be', 'custom_id': 'request-2', 'response': {'status_code': 200, 'request_id': 'cb6ea85e32447153d18f8ef7535c035a', 'body': {'id': 'chatcmpl-BWCcMZV33GjL8fvT5juvBzwgE8UES', 'object': 'chat.completion', 'created': 1747014698, 'model': 'gpt-4.1-nano-2025-04-14', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': 'Title: Why is it discouraged to use "for...in" loops for Array iteration in JavaScript?\n\nBody:  \nI\'ve heard that using "for...in" loops to iterate over arrays in JavaScript is not recommended, but I haven\'t come across a clear explanation as to why. Could someone explain the potential issues or drawbacks of using "for...in" with arrays? Are there better alternatives for array iteration?\n\nTags: <javascript><iteration><best-practices>', 'refusal': None, 'annotations': []}, 'logprobs': None, 'finish_reason': 'stop'}], 'usage': {'prompt_tokens': 240, 'completion_tokens': 96, 'total_tokens': 336, 'prompt_tokens_de

## Extract only the Title, Body and Tags from each response's content

In [None]:
def _request_index(record):
    """Return the integer suffix of a record's custom_id ("request-7" -> 7)."""
    # assumes custom_ids look like "<prefix>-<int>", as in the outputs shown
    # in this notebook — TODO confirm against the cell that builds the batch file
    return int(record["custom_id"].rsplit("-", 1)[-1])


def _split_sections(content):
    """Split one model reply into its (title, body, tags) texts.

    The reply is expected to contain the markers "Title:", "Body:" and
    "Tags:" in that order. The body runs up to the LAST "Tags:" marker, so a
    body that itself mentions "Tags:" is not cut short.

    Raises:
        ValueError: if any of the three markers is missing.
    """
    _, title_marker, after_title = content.partition("Title:")
    title, body_marker, after_body = after_title.partition("Body:")
    body, tags_marker, tags = after_body.rpartition("Tags:")
    if not (title_marker and body_marker and tags_marker):
        raise ValueError("response is missing a 'Title:', 'Body:' or 'Tags:' marker")
    return title.strip(), body.strip(), tags.strip()


# Lists to store the improved version of title, body and tags
new_titles = []
new_bodies = []
new_tags = []

# Batch output lines are not guaranteed to come back in input order, so the
# records are put back in request order via custom_id before their position
# is used to line them up with other data.
for data in sorted(json_list, key=_request_index):
    # Text generated by the model for this request
    response = data["response"]["body"]["choices"][0]["message"]["content"]

    try:
        title_only, body_only, tags_only = _split_sections(response)
    except ValueError as exc:
        # Say which request produced the malformed reply
        raise ValueError(f"{data['custom_id']}: {exc}") from exc

    # Append each new Title, Body and Tags to its list
    new_titles.append(title_only)
    new_bodies.append(body_only)
    new_tags.append(tags_only)

In [None]:
# Read the first-version posts
df = pd.read_csv('./data/processed/PostHistoryFirstVersion.csv')

# Attach the improved fields as three new columns. The lists are matched to
# the rows by position, and pandas raises if a list length differs from the
# number of rows.
df = df.assign(
    Improved_Title=new_titles,
    Improved_Body=new_bodies,
    Improved_Tags=new_tags,
)

# Write the combined table to a new CSV, without the index column
df.to_csv('./data/processed/ImprovedQuestions.csv', index=False)