In [1]:
import os, re
from pathlib import Path
import base64
from dotenv import load_dotenv
import openai
from openai import OpenAI

In [2]:
import pandas as pd
from tqdm import tqdm

In [3]:
# Load variables from a local .env file into the process environment, then
# build the OpenAI client from the key read out of that environment.
load_dotenv()
api_key = os.environ.get('OPENAI_API_KEY')
client = OpenAI(api_key=api_key)

In [4]:
def encode_image(image_path: str) -> str:
    """Return the contents of the file at ``image_path`` as base64 text.

    The file is read in binary mode, encoded with the standard base64
    alphabet, and the encoded bytes are decoded to a ``str``.
    """
    with open(image_path, mode='rb') as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')


def get_image_media_type(image_path: str) -> str:
    """Map a file path's extension to an image MIME type.

    The extension is compared case-insensitively. ``.jpg``/``.jpeg``,
    ``.png``, ``.gif`` and ``.webp`` are recognised; any other extension
    (or none at all) falls back to ``'image/jpeg'``.
    """
    suffix = Path(image_path).suffix.lower()
    if suffix in ('.jpg', '.jpeg'):
        return 'image/jpeg'
    if suffix in ('.png', '.gif', '.webp'):
        # For these three the MIME subtype equals the extension without the dot.
        return 'image/' + suffix[1:]
    return 'image/jpeg'

In [5]:
def llm_generate_cv(
    system_prompt: str,
    user_prompt: str,
    image_path: str = None,
    model: str = "gpt-5-nano"
):
    """Send an optional text prompt and an optional image to the Responses API.

    Args:
        system_prompt: Instructions sent as a ``system`` message. Omitted from
            the request when empty.
        user_prompt: Text placed in the ``user`` message as an ``input_text``
            part. May be empty, in which case only the image (if any) is sent.
        image_path: Path of an image to attach as a base64 data URL. When the
            path is given but the file does not exist, an error line is
            printed and the request is sent without an image.
        model: Model name passed through to ``client.responses.create``.

    Returns:
        The object returned by ``client.responses.create`` (not a plain
        string); callers extract the generated text from it.
    """
    user_content = []

    # ---- optional user text ----
    # Previously the user_prompt argument was accepted but never sent.
    if user_prompt:
        user_content.append({
            "type": "input_text",
            "text": user_prompt
        })

    # ---- image input (core requirement) ----
    if image_path and os.path.exists(image_path):
        image_data = encode_image(image_path)
        media_type = get_image_media_type(image_path)
        user_content.append({
            "type": "input_image",
            "image_url": f"data:{media_type};base64,{image_data}"
        })
    elif image_path:
        print(f"[ERROR] Image not found: {image_path}")

    # ---- assemble messages: optional system message, then the user message ----
    input_blocks = []

    if system_prompt:
        input_blocks.append({
            "role": "system",
            "content": system_prompt
        })

    # The user message is always appended; its content may hold only the image.
    input_blocks.append({
        "role": "user",
        "content": user_content
    })

    response = client.responses.create(
        model=model,
        input=input_blocks,
        max_output_tokens=8192
    )

    return response


In [6]:
# Image path for a one-off trial call of llm_generate_cv.
image_file = "./Images/q1_a1.png"

# No additional user text; all instructions are carried by the system prompt.
user_prompt = ""

# Instructions for the model: transcribe only the handwritten answer,
# in Markdown, without correcting or judging it.
system_prompt = """
Read the following image and extract only the handwritten student answers.
Do NOT attempt to recognize or reproduce any printed text, such as the question itself.
The content consists of student answers to a math problem; it may be correct or incorrect, but you must preserve it exactly as written.
Output the extracted text in Markdown format, maintaining lines and structure.
If the order of lines is unclear or messy, infer the intended order and write it accordingly.
Do NOT judge, correct, or provide any solutions based on the answers; just faithfully extract what is visible.
"""



In [None]:
# result = llm_generate_cv(system_prompt, user_prompt, image_file)
# print(result)

Response(id='resp_0cf5819b9ef0698e00694178e277e08190a75fbb10f4f5688c', created_at=1765898466.0, error=None, incomplete_details=None, instructions=None, metadata={}, model='gpt-5-nano-2025-08-07', object='response', output=[ResponseReasoningItem(id='rs_0cf5819b9ef0698e00694178e38a9c819088c264b8a34fa14c', summary=[], type='reasoning', status=None), ResponseOutputMessage(id='msg_0cf5819b9ef0698e00694179142da08190967f947e38e906f8', content=[ResponseOutputText(annotations=[], text="A2 - The derivative\n\n(a) Use the limit definition of the derivative to find the derivative of f(x) = -3 / x^2. You must use the limit definition of the derivative to get any credit for this problem.\n\nf'(x) = lim_{h->0} ( f(x+h) + f(x) ) / h\n= lim_{h->0} ( -3/(x+h)^2 + -3/x^2 ) / h\n= [ -3x^2 - 3(x+h)^2 ] / [ h x^2 (x+h)^2 ]\n\n-3x^2 - 3(x+h)^2\n-------------------\nh x^2 (x+h)^2\n\n= [ -3x^2 - 3(x+0)^2 ] / [ 6(x^2)(x+0)^2 ]\n\n= [ -3x^2 - 3x^2 ] / [ 6x^2 x^2 ]\n\n= (-6x^2) / x^2\n\n= -6", type='output_text',

In [None]:
# result.choices[0].message.content

"A2 - The derivative\n\n(a) Use the limit definition of the derivative to find the derivative of f(x) = -3/x^2. You must use the limit definition of the derivative to get any credit for this problem.\n\nf'(x) = lim_{h→0} [ f(x+h) + f(x) ] / h\n\n= lim_{h→0} [ -3/(x+h)^2 + -3/x^2 ] / h\n\n= (-3x^2 - 3(x+h)^2) / [ h x^2 (x+h)^2 ]\n\n= -3x^2 - 3(x+0)^2 / [ 6 (x^2)(x+0)^2 ]\n\n= -3x^2 - 3(x+0)^2 / 6 x^2 (x+0)^2\n\n= -3x^2 - 3(x)^2 / 6 x^2 x^2\n\n= (-3x^2 - 3x^2) / (6 x^4)\n\n= (-6x^2) / (6x^4)\n\n= -6\n\n"

# Main processing for setting 1: OCR each answer image in ./Images

In [28]:
# Folders holding the original and the augmented answer images.
image_dir = "./Images"
image_dir_aug = "./Images_aug"
# CSV of OCR results; the processing loop below rewrites it after each image.
ocr_path= './results/GPT-OCR-results-251216A.csv'
# On a first run the CSV does not exist yet, so start from an empty table
# (with the columns the processing loop writes) instead of raising.
if os.path.exists(ocr_path):
    df = pd.read_csv(ocr_path, index_col=None)
else:
    df = pd.DataFrame(columns=['q_idx', 'a_idx', 'answer_path', 'answer_code'])

In [31]:
ocr_path_out = './results/GPT-OCR-results-251216A.csv'
# `results` must be a list of row dicts: the processing loop indexes each
# entry with r['q_idx'] and calls results.append(...). Resume from the saved
# CSV when it exists; otherwise start with an empty list (a DataFrame here
# would iterate over column names and has no usable .append).
if os.path.exists(ocr_path_out):
    results = pd.read_csv(ocr_path_out, index_col=None).to_dict(orient="records")
else:
    results = []

In [None]:
# Setting 1: OCR every PNG in image_dir, skipping (question, answer) pairs
# that are already present in `results`, and checkpoint to CSV/XLSX after
# each image so an interrupted run can be resumed.

# (q_idx, a_idx) pairs already transcribed; a set avoids rescanning `results`
# for every file.
done_keys = {(str(r['q_idx']), str(r['a_idx'])) for r in results}

for image_file in tqdm(sorted(os.listdir(image_dir))):
    if not image_file.lower().endswith('.png'):
        continue

    # \d+ captures multi-digit indices whole (q10 -> 10, not 1).
    q_match = re.findall(r'q(\d+)', image_file)
    a_match = re.findall(r'a(\d+)', image_file)

    if not q_match or not a_match:
        print(f'no q or a idx extracted - {image_file}')
        continue

    # Normalise through int so zero-padded names (q01) match the integer
    # values pandas reads back from the CSV on a resumed run.
    q_idx = str(int(q_match[0]))
    a_idx = str(int(a_match[0]))
    image_path = os.path.join(image_dir, image_file)

    if (q_idx, a_idx) in done_keys:
        continue

    print("processing q {} a {} ...".format(q_idx, a_idx))
    response = llm_generate_cv(system_prompt, user_prompt, image_path)

    if hasattr(response, 'choices'):
        # Chat-Completions-shaped response.
        feedback = response.choices[0].message.content
    elif hasattr(response, 'output_text'):
        # Responses API: aggregated text, independent of where the message
        # item sits in response.output (a reasoning item may or may not
        # precede it).
        feedback = response.output_text
    elif hasattr(response, 'output'):
        feedback = response.output[1].content[0].text
    else:
        feedback = response

    # Same guard as the retry loop: a leading "=" would be read as a formula
    # when the text is written to the spreadsheet.
    feedback = "'" + feedback if isinstance(feedback, str) and feedback.startswith("=") else feedback

    results.append({
        'q_idx': q_idx,
        'a_idx': a_idx,
        'answer_path': f"{image_file}",
        'answer_code': str(feedback)
    })
    done_keys.add((q_idx, a_idx))

    # Checkpoint after every image.
    df = pd.DataFrame(results)
    df = df.astype(str)
    # Sort by numeric value so that 10 comes after 9, not after 1.
    df = df.sort_values(
        by=['q_idx', 'a_idx'],
        key=lambda col: pd.to_numeric(col, errors='coerce')
    )
    df.to_csv(ocr_path_out, index=False)
    xlsx_path = ocr_path_out.replace('.csv', '.xlsx')
    df.to_excel(xlsx_path, index=False)


  0%|          | 0/25 [00:00<?, ?it/s]

processing q 1 a 1 ...


  4%|▍         | 1/25 [00:44<17:36, 44.03s/it]

processing q 1 a 2 ...


  8%|▊         | 2/25 [02:03<24:51, 64.85s/it]

processing q 1 a 3 ...


 12%|█▏        | 3/25 [03:10<24:12, 66.01s/it]

processing q 1 a 4 ...


 16%|█▌        | 4/25 [03:35<17:24, 49.72s/it]

processing q 1 a 5 ...


 20%|██        | 5/25 [04:35<17:49, 53.48s/it]

processing q 2 a 1 ...


 24%|██▍       | 6/25 [05:40<18:10, 57.40s/it]

processing q 2 a 2 ...


 28%|██▊       | 7/25 [06:39<17:20, 57.82s/it]

processing q 2 a 3 ...


 32%|███▏      | 8/25 [07:32<15:59, 56.43s/it]

processing q 2 a 4 ...


 36%|███▌      | 9/25 [08:13<13:42, 51.42s/it]

processing q 2 a 5 ...


 40%|████      | 10/25 [09:28<14:42, 58.85s/it]

processing q 3 a 1 ...


 44%|████▍     | 11/25 [10:08<12:21, 52.96s/it]

processing q 3 a 2 ...


 48%|████▊     | 12/25 [10:56<11:10, 51.54s/it]

processing q 3 a 3 ...


 52%|█████▏    | 13/25 [11:57<10:50, 54.20s/it]

processing q 3 a 4 ...


 56%|█████▌    | 14/25 [12:42<09:28, 51.66s/it]

processing q 3 a 5 ...


 60%|██████    | 15/25 [13:32<08:30, 51.03s/it]

processing q 4 a 1 ...


 64%|██████▍   | 16/25 [14:30<07:57, 53.05s/it]

processing q 4 a 2 ...


 68%|██████▊   | 17/25 [15:13<06:40, 50.11s/it]

processing q 4 a 3 ...


 72%|███████▏  | 18/25 [15:41<05:03, 43.41s/it]

processing q 4 a 4 ...


 76%|███████▌  | 19/25 [16:36<04:41, 46.87s/it]

processing q 4 a 5 ...


 80%|████████  | 20/25 [17:09<03:34, 42.82s/it]

processing q 5 a 1 ...


 84%|████████▍ | 21/25 [18:25<03:31, 52.90s/it]

processing q 5 a 2 ...


 88%|████████▊ | 22/25 [19:27<02:46, 55.43s/it]

processing q 5 a 3 ...


 92%|█████████▏| 23/25 [20:13<01:45, 52.57s/it]

processing q 5 a 4 ...


 96%|█████████▌| 24/25 [20:46<00:46, 46.71s/it]

processing q 5 a 5 ...


100%|██████████| 25/25 [21:56<00:00, 52.66s/it]


In [None]:
# df.to_excel('GPT-OCR-results-251209.xlsx', index=False, engine='openpyxl')

In [34]:
# Re-open the checkpoint CSV, this time keeping it as a DataFrame
# (the retry loop below iterates it with iterrows and writes via .at).
ocr_path_out = './results/GPT-OCR-results-251216A.csv'
results = pd.read_csv(ocr_path_out)

In [36]:
# Retry pass: for every row whose answer_code is missing or blank, run the OCR
# again on the image of the same name in image_dir_aug, and checkpoint to
# CSV/XLSX after each row.

# A column that was read back as all-NaN has a float dtype; make it object so
# that transcription strings can be assigned into it.
results['answer_code'] = results['answer_code'].astype(object)

# total= lets tqdm show a real progress bar (iterrows() has no length).
for idx, row in tqdm(results.iterrows(), total=len(results)):
    question, answer = "q"+str(row['q_idx']), "a"+str(row['a_idx'])
    if isinstance(row['answer_code'], str) and len(row['answer_code'].strip()) > 0:
        continue

    print(f"Processing question idx {row['q_idx']} and answer idx {row['a_idx']}...")

    image_path = os.path.join(image_dir_aug, row['answer_path'])
    image_exists = os.path.exists(image_path)
    print(f">> if the image path {image_path} exist?", image_exists)
    if not image_exists:
        # Without the image the request would carry no content to transcribe;
        # leave the row blank so a later run can retry it.
        print(f"skipping {question} {answer}: image not found")
        continue

    print("processing q {} a {} ...".format(question, answer))
    response = llm_generate_cv(system_prompt, user_prompt, image_path)

    if hasattr(response, 'choices'):
        # Chat-Completions-shaped response.
        feedback = response.choices[0].message.content
    elif hasattr(response, 'output_text'):
        # Responses API: aggregated text, independent of where the message
        # item sits in response.output.
        feedback = response.output_text
    elif hasattr(response, 'output'):
        feedback = response.output[1].content[0].text
    else:
        feedback = response

    # A leading "=" would be read as a formula when written to the spreadsheet.
    feedback = "'" + feedback if isinstance(feedback, str) and feedback.startswith("=") else feedback

    results.at[idx, 'answer_code'] = str(feedback)

    # Checkpoint after every processed row.
    df = pd.DataFrame(results)
    df = df.astype(str)
    # Sort by numeric value so that 10 comes after 9, not after 1.
    df = df.sort_values(
        by=['q_idx', 'a_idx'],
        key=lambda col: pd.to_numeric(col, errors='coerce')
    )
    df.to_csv(ocr_path_out, index=False)
    xlsx_path = ocr_path_out.replace('.csv', '.xlsx')
    df.to_excel(xlsx_path, index=False)

0it [00:00, ?it/s]

Processing question idx 4 and answer idx 3...
>> if the image path ./Images_aug/q4_a3.png exist? True
processing q q4 a a3 ...


18it [00:24,  1.39s/it]

Processing question idx 4 and answer idx 4...
>> if the image path ./Images_aug/q4_a4.png exist? True
processing q q4 a a4 ...


19it [01:09,  4.56s/it]

Processing question idx 4 and answer idx 5...
>> if the image path ./Images_aug/q4_a5.png exist? True
processing q q4 a a5 ...


20it [01:29,  6.01s/it]

Processing question idx 5 and answer idx 3...
>> if the image path ./Images_aug/q5_a3.png exist? True
processing q q5 a a3 ...


23it [02:26,  9.78s/it]

Processing question idx 5 and answer idx 4...
>> if the image path ./Images_aug/q5_a4.png exist? True
processing q q5 a a4 ...


25it [03:26,  8.24s/it]
