In [16]:
import os
import base64
import pandas as pd
from openai import OpenAI
from dotenv import load_dotenv

# Load API key from .env if present
load_dotenv()
client = OpenAI()

image_folder = "contest_images"
output = []

# Path to existing captions (to avoid duplicates)
existing_csv_path = "/Users/chivo/Downloads/data_studio/project4/new_yorker_contest/ai_captions_chatgpt_textonly.csv"

# Load already-captioned contest IDs
if os.path.exists(existing_csv_path):
    existing_df = pd.read_csv(existing_csv_path)
    already_captioned_ids = set(existing_df["contest"].astype(str))
    print(f"📄 Found existing captions for {len(already_captioned_ids)} contests.")
else:
    already_captioned_ids = set()
    print("📄 No existing captions found. Starting fresh.")

# Loop through images
image_files = [f for f in os.listdir(image_folder) if f.endswith('.jpg')]

for image_name in image_files:
    contest_id = os.path.splitext(image_name)[0]

    if contest_id in already_captioned_ids:
        print(f"⏩ Skipping {contest_id} (already captioned)")
        continue

    image_path = os.path.join(image_folder, image_name)

    # Encode the image as base64 and convert to image_url format
    with open(image_path, "rb") as img_file:
        encoded_image = base64.b64encode(img_file.read()).decode("utf-8")
        image_url_data = {
            "type": "image_url",
            "image_url": {
                "url": f"data:image/jpeg;base64,{encoded_image}"
            }
        }

    try:
        response = client.chat.completions.create(
            model="gpt-4o-2024-05-13",  # or "gpt-4o-mini" if available to you
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": "Write the funniest possible caption for this cartoon. Think like The New Yorker — clever, absurd, or dry humor welcome."
                        },
                        image_url_data
                    ],
                }
            ],
            max_tokens=100,
        )

        caption = response.choices[0].message.content.strip()
        print(f"✅ [{contest_id}] Caption: {caption}")

        output.append({
            "contest": contest_id,
            "model": "ChatGPT (gpt-4o)",
            "caption": caption
        })

    except Exception as e:
        print(f"❌ [{contest_id}] Error: {e}")

# Save updated captions
if output:
    new_df = pd.DataFrame(output)
    if os.path.exists(existing_csv_path):
        full_df = pd.concat([existing_df, new_df], ignore_index=True)
    else:
        full_df = new_df
    full_df.to_csv(chatgpt, index=False)
    print(f"💾 Saved {len(new_df)} new captions to: {existing_csv_path}")
else:
    print("✅ All captions already exist. Nothing new to save.")


📄 No existing captions found. Starting fresh.
✅ [807_Dashboard] Caption: "Turns out, our maze is so hard, even the cheese applied for early retirement."
✅ [689_Dashboard] Caption: "I'm afraid the DMV doesn't recognize 'valor and chivalry' as valid forms of insurance."
✅ [538_Dashboard] Caption: "As a tailor, Mr. Johnson learned the hard way that some projects simply required a mouse-click escape route."
✅ [634_Dashboard] Caption: “This is why I never book the 'Scenic Pyramid View' on Expedia.”
✅ [764_Dashboard] Caption: "And there, at the summit, lays the elusive Zenfish—sought by monks and aquarium enthusiasts alike."
✅ [585_Dashboard] Caption: "First wish granted: an endless supply of toast. Next?"
✅ [623_Dashboard] Caption: "I'm telling you, officer, he thought 'dog park' and 'kite park' were the same thing."
✅ [773_Dashboard] Caption: "Darling, I think we've finally discovered where all the water in the apartment is going."
✅ [592_Dashboard] Caption: "Sure, they can type — but wait

NameError: name 'chatgpt' is not defined

In [21]:
import pandas as pd

# Load your CSV
df = pd.read_csv("ai_captions_chatGPT.csv")

# Clean up captions (remove outer quotes + whitespace)
df["caption"] = df["caption"].astype(str).str.strip().str.replace(r'^["“”]+|["“”]+$', '', regex=True)

# ✅ Detect question mark
df["has_question_mark"] = df["caption"].str.contains(r"\?")

# ✅ Detect ending punctuation (., !, ?)
df["ends_with_punctuation"] = df["caption"].str.extract(r"([\.\!\?])$").notnull()

# ✅ Caption length
df["caption_length"] = df["caption"].str.len()

# Save
df.to_csv("ai_captions_chatGPT_enriched.csv", index=False)
print("✅ Cleaned and saved as 'ai_captions_chatGPT_enriched.csv'")


✅ Cleaned and saved as 'ai_captions_chatGPT_enriched.csv'


In [22]:
df = pd.read_csv("ai_captions_chatGPT_enriched.csv")

# Remove '_Dashboard' from 'contest' column
df["contest"] = df["contest"].astype(str).str.replace("_Dashboard", "", regex=False)

# Save cleaned version
df.to_csv("ai_captions_chatGPT_enriched.csv", index=False)
print("✅ Cleaned 'contest' column and saved.")


✅ Cleaned 'contest' column and saved.
