# Setting up

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

## Fetch the Web Page

In [3]:
url = 'https://nextml.github.io/caption-contest-data/'
response = requests.get(url)
html = response.text

## Soup

In [10]:
soup = BeautifulSoup(html, 'html.parser')
table = soup.find('table', attrs={'style': 'width:80%'})
#table

## Loop

In [11]:
data = []

for row in table.find_all('tr')[1:]:  # skip header
    tds = row.find_all('td')


## Extract

In [22]:
data = []

# Loop through all table rows, skipping the header row
for row in table.find_all('tr')[1:]:
    tds = row.find_all('td')

    if len(tds) >= 6:  # just a safety check
        # First td
        link_tag = tds[0].find('a')
        contest = link_tag.text.strip()
        link = url + link_tag['href']

        # Second td (img)
        img_tag = tds[1].find('img')
        photo = url + img_tag['src'] if img_tag else None

        # Sixth td (vote count), clean commas
        raw_votes = tds[5].text.strip().replace(',', '')
        try:
            total_votes = int(raw_votes)
        except ValueError:
            total_votes = None  # fallback if it's not a number

        image_folder = 'contest_images'
        os.makedirs(image_folder, exist_ok=True)

        img_filename = None
        if photo:
        # Get the image content
            img_response = requests.get(photo)
        if img_response.status_code == 200:
            
            # Use contest name or some unique identifier to name the file
            img_filename = f"{contest.replace(' ', '_')}.jpg"
            img_path = os.path.join(image_folder, img_filename)

        # Save the image
        with open(img_path, 'wb') as f:
            f.write(img_response.content)

        
        data.append({
            'contest': contest,
            'link': link,
            'photo_url': photo,
            'photo_file': img_filename,
            'total votes': total_votes
        })

df = pd.DataFrame(data)

df.to_csv('caption_contest_data.csv', index=False)


In [31]:
import requests
from bs4 import BeautifulSoup

page_url = 'https://nextml.github.io/caption-contest-data/dashboards/895.html'
response = requests.get(page_url)
soup = BeautifulSoup(response.text, 'html.parser')

# Print out all <a> tags to see what’s there
for a in soup.find_all('a'):
    print("TEXT:", a.text.strip())
    print("HREF:", a.get('href'))
    print("-" * 40)


TEXT: 
HREF: ../cartoons/895.jpg
----------------------------------------
TEXT: 895.csv
HREF: ../summaries/895.csv
----------------------------------------
TEXT: standard error
                  of the mean
HREF: https://en.wikipedia.org/wiki/Standard_error
----------------------------------------
TEXT: 68-95-99.7 rule
HREF: https://en.wikipedia.org/wiki/68–95–99.7_rule
----------------------------------------


# debugging links, they didn't load before 
df = pd.read_csv("caption_contest_data.csv")

for _, row in df.iterrows():
    contest_name = row['contest']
    dashboard_url = row['link']  # This already has the full link like https://.../dashboards/895.html

    print(f"🔍 [{contest_name}] Dashboard URL: {dashboard_url}")

    response = requests.get(dashboard_url)
    
    if response.status_code != 200:
        print(f"❌ [{contest_name}] Dashboard not found (status {response.status_code})")
        continue

    soup = BeautifulSoup(response.text, 'html.parser')
    link_tag = soup.select_one('div.row ul li a[href$=".csv"]')

    if link_tag:
        csv_url = urljoin(dashboard_url, link_tag['href'])
        print(f"✅ [{contest_name}] CSV found: {csv_url}")
    else:
        print(f"❌ [{contest_name}] CSV link not found in HTML")


In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import os

# Load the table with all dashboard links
df = pd.read_csv("caption_contest_data.csv")

# Make folders to store output
os.makedirs("dashboards/top_captions", exist_ok=True)
os.makedirs("dashboards/updated_data", exist_ok=True)

for _, row in df.iterrows():
    contest = row['contest']
    dashboard_url = row['link']

    print(f"\n🔍 Processing Contest {contest}...")

    try:
        response = requests.get(dashboard_url)
        if response.status_code != 200:
            print(f"[{contest}] Dashboard not found (Status {response.status_code})")
            continue

        soup = BeautifulSoup(response.text, 'html.parser')
        link_tag = soup.select_one('div.row ul li a[href$=".csv"]')

        if not link_tag:
            print(f"[{contest}] CSV link not found in page")
            continue

        csv_url = urljoin(dashboard_url, link_tag['href'])
        df_contest = pd.read_csv(csv_url)

        # -- Top 5 Funny & Unfunny --
        top_funny = df_contest.nlargest(5, 'funny').copy()
        top_unfunny = df_contest.nlargest(5, 'not_funny').copy()

        top_funny['category'] = 'Top 5 Funny'
        top_unfunny['category'] = 'Top 5 Unfunny'

        top_captions = pd.concat([top_funny, top_unfunny])
        top_captions = top_captions[['caption', 'funny', 'not_funny', 'votes', 'category']]
        top_captions.rename(columns={'funny': 'funny_votes', 'not_funny': 'unfunny_votes'}, inplace=True)
        top_captions.insert(0, 'contest', contest)

        top_path = f"dashboards/top_captions/top_captions_{contest}.csv"
        top_captions.to_csv(top_path, index=False)

        # -- Add extra features to full data --
        df_contest['has_question_mark'] = df_contest['caption'].str.contains(r'\?', regex=True)
        df_contest['ends_with_punctuation'] = df_contest['caption'].str.endswith(('.', '!', '?'))
        df_contest['caption_length'] = df_contest['caption'].str.len()

        updated_path = f"dashboards/updated_data/updated_{contest}.csv"
        df_contest.to_csv(updated_path, index=False)

        #print(f"✅ [{contest}] Dashboard processed and saved.")

    except Exception as e:
        #print(f"[{contest}] Error: {e}")



🔍 Processing Contest 895 Dashboard...
✅ [895 Dashboard] Dashboard processed and saved.

🔍 Processing Contest 894 Dashboard...
✅ [894 Dashboard] Dashboard processed and saved.

🔍 Processing Contest 893 Dashboard...
✅ [893 Dashboard] Dashboard processed and saved.

🔍 Processing Contest 892 Dashboard...
✅ [892 Dashboard] Dashboard processed and saved.

🔍 Processing Contest 891 Dashboard...
✅ [891 Dashboard] Dashboard processed and saved.

🔍 Processing Contest 890 Dashboard...
✅ [890 Dashboard] Dashboard processed and saved.

🔍 Processing Contest 889 Dashboard...
✅ [889 Dashboard] Dashboard processed and saved.

🔍 Processing Contest 888 Dashboard...
✅ [888 Dashboard] Dashboard processed and saved.

🔍 Processing Contest 887 Dashboard...
✅ [887 Dashboard] Dashboard processed and saved.

🔍 Processing Contest 886 Dashboard...
✅ [886 Dashboard] Dashboard processed and saved.

🔍 Processing Contest 885 Dashboard...
✅ [885 Dashboard] Dashboard processed and saved.

🔍 Processing Contest 884 Dashbo

## test with one csv 

In [5]:
import pandas as pd 

# Selecting top 5 captions with highest "funny" votes and top 5 with highest "not_funny" votes
top_funny = df_895.nlargest(5, 'funny').copy()
top_unfunny = df_895.nlargest(5, 'not_funny').copy()

# Adding a category column
top_funny['category'] = 'Top 5 Funny'
top_unfunny['category'] = 'Top 5 Unfunny'

# Creating a new dataframe for these captions
contest_name = "895"
top_captions_df = pd.concat([top_funny, top_unfunny]).copy()

# Selecting required columns and renaming them
top_captions_df = top_captions_df[['caption', 'funny', 'not_funny', 'votes', 'category']]
top_captions_df.rename(columns={'funny': 'funny_votes', 'not_funny': 'unfunny_votes'}, inplace=True)

# Adding 'contest' column
top_captions_df.insert(0, 'contest', contest_name)

# Saving this as a new CSV file
top_captions_file_path = "top_captions_895.csv"
top_captions_df.to_csv(top_captions_file_path, index=False)

# Extracting additional information for the original dataset
df_895['has_question_mark'] = df_895['caption'].str.contains(r'\?', regex=True)
df_895['ends_with_punctuation'] = df_895['caption'].str.endswith(('.', '!', '?'))
df_895['caption_length'] = df_895['caption'].str.len()

# Saving the updated dataset back
updated_file_path = "updated_895.csv"
df_895.to_csv(updated_file_path, index=False)

# Providing the file paths
top_captions_file_path, updated_file_path


('top_captions_895.csv', 'updated_895.csv')

## AI caption 

In [74]:
import os

image_folder = "contest_images"
image_files = [f for f in os.listdir(image_folder) if f.endswith('.jpg')]

for image_name in image_files:
    image_path = os.path.join(image_folder, image_name)

    # We'll send this image to each model
    #print(f"Processing: {image_name}")


## CHAT GPT 

from dotenv import load_dotenv
import os

load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")

In [None]:
Moved to a different notebook 
import os
import base64
import pandas as pd
from openai import OpenAI
from dotenv import load_dotenv

# Load API key from .env if present
load_dotenv()
client = OpenAI()

image_folder = "contest_images"
output = []

# Path to existing captions (to avoid duplicates)
existing_csv_path = "/Users/chivo/Downloads/data_studio/project4/new_yorker_contest/ai_captions_chatgpt_textonly.csv"

# Load already-captioned contest IDs
if os.path.exists(existing_csv_path):
    existing_df = pd.read_csv(existing_csv_path)
    already_captioned_ids = set(existing_df["contest"].astype(str))
    print(f"📄 Found existing captions for {len(already_captioned_ids)} contests.")
else:
    already_captioned_ids = set()
    print("📄 No existing captions found. Starting fresh.")

# Loop through images
image_files = [f for f in os.listdir(image_folder) if f.endswith('.jpg')]

for image_name in image_files:
    contest_id = os.path.splitext(image_name)[0]

    if contest_id in already_captioned_ids:
        print(f"⏩ Skipping {contest_id} (already captioned)")
        continue

    image_path = os.path.join(image_folder, image_name)

    # Encode the image as base64 and convert to image_url format
    with open(image_path, "rb") as img_file:
        encoded_image = base64.b64encode(img_file.read()).decode("utf-8")
        image_url_data = {
            "type": "image_url",
            "image_url": {
                "url": f"data:image/jpeg;base64,{encoded_image}"
            }
        }

    try:
        response = client.chat.completions.create(
            model="gpt-4o",  # or "gpt-4o-mini" if available to you
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": "Write the funniest possible caption for this cartoon. Think like The New Yorker — clever, absurd, or dry humor welcome."
                        },
                        image_url_data
                    ],
                }
            ],
            max_tokens=100,
        )

        caption = response.choices[0].message.content.strip()
        print(f"✅ [{contest_id}] Caption: {caption}")

        output.append({
            "contest": contest_id,
            "model": "ChatGPT (gpt-4o)",
            "caption": caption
        })

    except Exception as e:
        print(f"❌ [{contest_id}] Error: {e}")

# Save updated captions
if output:
    new_df = pd.DataFrame(output)
    if os.path.exists(existing_csv_path):
        full_df = pd.concat([existing_df, new_df], ignore_index=True)
    else:
        full_df = new_df
    full_df.to_csv(existing_csv_path, index=False)
    print(f"💾 Saved {len(new_df)} new captions to: {existing_csv_path}")
else:
    print("✅ All captions already exist. Nothing new to save.")


In [72]:
print("Using key:", os.getenv("OPENAI_API_KEY")[:10], "...")
print("Project ID:", os.getenv("OPENAI_PROJECT_ID"))
print("Org ID:", os.getenv("OPENAI_ORG_ID"))


Using key: sk-proj-T4 ...
Project ID: proj_mbMtq2w8hn6Cqz2zOlFx9dTD
Org ID: org-R4jrwuaXg4bAXyoNr9Up4ltF


## Deepseek 

In [49]:
!pip install deepseek_vl



In [51]:
import deepseek_vl
print(deepseek_vl.__version__)

ModuleNotFoundError: No module named 'deepseek_vl'

In [53]:
import os
import requests
import base64
import pandas as pd
import time
from dotenv import load_dotenv
import torch
from transformers import AutoModelForCausalLM
from deepseek_vl.models import DeepseekVLV2Processor, DeepseekVLV2ForCausalLM
from deepseek_vl.utils.io import load_pil_images

# Load DeepSeek API key
load_dotenv()
deepseek_key = os.getenv("DEEPSEEK_API_KEY")

deepseek_headers = {
    "Authorization": f"Bearer {deepseek_key}",
    "Content-Type": "application/json"
}

# Folder where images live
image_folder = "contest_images"
output = []

def call_deepseek(img_path, contest_id):
    # Read and encode the image file
    with open(img_path, "rb") as img_file:
        encoded = base64.b64encode(img_file.read()).decode("utf-8")

    # Construct the request body
    body = {
        "model": "deepseek-ai/deepseek-vl2-small",  # Replace with the correct model name
        "messages": [
            {
                "role": "user",
                "content": "Write a hilarious caption for this cartoon."
            }
        ],
        "image": f"data:image/jpeg;base64,{encoded}",  # Send image data separately
        "max_tokens": 100
    }

    try:
        # Send the request to the DeepSeek API
        response = requests.post("https://api.deepseek.com/v1/chat/completions", headers=deepseek_headers, json=body)
        response.raise_for_status()
        caption = response.json()["choices"][0]["message"]["content"].strip()
        print(f"✅ [DeepSeek - {contest_id}] Caption: {caption}")
        return caption
    except requests.exceptions.HTTPError as e:
        print(f"❌ [DeepSeek - {contest_id}] HTTP Error: {e}")
        print(f"Response: {response.text}")  # Log the response body for more details
        return None
    except Exception as e:
        print(f"❌ [DeepSeek - {contest_id}] Error: {e}")
        return None

# Loop over images and collect captions
for image_name in os.listdir(image_folder):
    if image_name.lower().endswith(".jpg"):
        contest_id = os.path.splitext(image_name)[0]
        image_path = os.path.join(image_folder, image_name)

        caption = call_deepseek(image_path, contest_id)
        if caption:
            output.append({
                "contest": contest_id,
                "model": "DeepSeek",
                "caption": caption
            })

        # Add a delay between requests to avoid rate limiting
        time.sleep(10)  # Wait for 10 seconds before the next request

# Save to CSV
df = pd.DataFrame(output)
df.to_csv("deepseek_captions.csv", index=False)
print("✅ All captions saved to deepseek_captions.csv")

ModuleNotFoundError: No module named 'deepseek_vl'

In [28]:
body = {
    "model": "deepseek-vision",  # Replace with the correct model name
    "messages": [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Describe this image."},
                {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{encoded}"}}
            ]
        }
    ],
    "max_tokens": 100
}

NameError: name 'encoded' is not defined

## Claude

In [17]:
#!pip install requests



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [54]:
import os
import requests
import base64
from dotenv import load_dotenv
import pandas as pd

load_dotenv()
anthropic_key = os.getenv("ANTHROPIC_API_KEY")

headers = {
    "x-api-key": anthropic_key,
    "anthropic-version": "2023-06-01"
}

image_folder = "contest_images"
output = []

for image_name in os.listdir(image_folder):
    if not image_name.lower().endswith(".jpg"):
        continue

    image_path = os.path.join(image_folder, image_name)
    contest_id = os.path.splitext(image_name)[0]

    with open(image_path, "rb") as img_file:
        img_bytes = img_file.read()
        img_base64 = base64.b64encode(img_bytes).decode("utf-8")

    try:
        response = requests.post(
            "https://api.anthropic.com/v1/messages",
            headers=headers,
            json={
                "model": "claude-3-opus-20240229",
                "max_tokens": 100,
                "messages": [
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "image",
                                "source": {
                                    "type": "base64",
                                    "media_type": "image/jpeg",
                                    "data": img_base64
                                }
                            },
                            {
                                "type": "text",
                                "text": "Write the funniest possible caption for this cartoon. Be clever, witty, or absurd like The New Yorker."
                            }
                        ]
                    }
                ]
            }
        )

        response.raise_for_status()
        caption = response.json()["content"][0]["text"].strip()
        print(f"✅ [Claude - {contest_id}] Caption: {caption}")

        output.append({
            "contest": contest_id,
            "model": "Claude",
            "caption": caption
        })

    except Exception as e:
        print(f"❌ [Claude - {contest_id}] Error: {e}")

# Save Claude captions
pd.DataFrame(output).to_csv("claude_captions.csv", index=False)


✅ [Claude - 807_Dashboard] Caption: "After years of selective breeding, Dr. Corntassel unveils his prized creation: the infinitely recursive cornfield. Nobel Prize nomination pending."

The cartoon depicts two scientists overlooking an intricate cornfield that appears to repeat endlessly into the distance, suggesting an absurdly recursive agricultural experiment. The witty caption plays on this visual absurdity, jokingly presenting it as the groundbreaking work of a fictional scientist seeking accolades for his outlan
✅ [Claude - 689_Dashboard] Caption: The knight on his checkered steed sternly commands the ticketing officer, "As you can plainly see, officer, this horse has a chess set painted on it. I'm only transporting my game board, not evading carpool regulations. Checkmate!"
✅ [Claude - 538_Dashboard] Caption: "I asked for yarn, not yawn! This hobby is putting me to sleep."

The cartoon humorously depicts a giant cat who appears bored and unimpressed by the tiny human offering it

In [11]:
import os
import requests
import base64
from dotenv import load_dotenv
import pandas as pd

load_dotenv()
anthropic_key = os.getenv("ANTHROPIC_API_KEY")

headers = {
    "x-api-key": anthropic_key,
    "anthropic-version": "2023-06-01"
}

image_folder = "contest_images"
output = []

# ✅ Load existing captions to skip duplicates
existing_file = "claude_captions.csv"
if os.path.exists(existing_file):
    existing_df = pd.read_csv(existing_file)
    existing_contests = set(existing_df["contest"].astype(str))
    print(f"📄 Found {len(existing_contests)} existing Claude captions.")
else:
    existing_contests = set()
    print("📄 No existing Claude captions found. Starting fresh.")

# 🔁 Loop through images
for image_name in os.listdir(image_folder):
    if not image_name.lower().endswith(".jpg"):
        continue

    contest_id = os.path.splitext(image_name)[0]
    if contest_id in existing_contests:
        print(f"⏩ [Claude - {contest_id}] Skipped (already captioned)")
        continue

    image_path = os.path.join(image_folder, image_name)

    with open(image_path, "rb") as img_file:
        img_bytes = img_file.read()
        img_base64 = base64.b64encode(img_bytes).decode("utf-8")

    try:
        response = requests.post(
            "https://api.anthropic.com/v1/messages",
            headers=headers,
            json={
                "model": "claude-3-opus-20240229",
                "max_tokens": 100,
                "messages": [
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "image",
                                "source": {
                                    "type": "base64",
                                    "media_type": "image/jpeg",
                                    "data": img_base64
                                }
                            },
                            {
                                "type": "text",
                                "text": "Write the funniest possible caption for this cartoon. Be clever, witty, or absurd like The New Yorker. Please DO NOT describe the cartoon, and don't say depicts something. Don't Explain yourself. Just the caption is enough! Thanks."
                            }
                        ]
                    }
                ]
            }
        )

        response.raise_for_status()
        caption = response.json()["content"][0]["text"].strip()
        print(f"✅ [Claude - {contest_id}] Caption: {caption}")

        output.append({
            "contest": contest_id,
            "model": "Claude",
            "caption": caption
        })

    except Exception as e:
        print(f"❌ [Claude - {contest_id}] Error: {e}")

# 💾 Save new captions (merge with existing if needed)
if output:
    new_df = pd.DataFrame(output)

    # ➕ Add new analysis columns
    new_df["has_question_mark"] = new_df["caption"].str.contains(r"\?")
    new_df["ends_with_punctuation"] = new_df["caption"].str.endswith((".", "!", "?"))
    new_df["caption_length"] = new_df["caption"].str.len()

    if os.path.exists(existing_file):
        combined_df = pd.concat([existing_df, new_df], ignore_index=True)
    else:
        combined_df = new_df

    combined_df.to_csv(existing_file, index=False)
    print(f"💾 Saved {len(new_df)} new captions → {existing_file}")



📄 Found 376 existing Claude captions.
⏩ [Claude - 807_Dashboard] Skipped (already captioned)
⏩ [Claude - 689_Dashboard] Skipped (already captioned)
⏩ [Claude - 538_Dashboard] Skipped (already captioned)
⏩ [Claude - 634_Dashboard] Skipped (already captioned)
⏩ [Claude - 764_Dashboard] Skipped (already captioned)
⏩ [Claude - 585_Dashboard] Skipped (already captioned)
⏩ [Claude - 623_Dashboard] Skipped (already captioned)
⏩ [Claude - 773_Dashboard] Skipped (already captioned)
⏩ [Claude - 592_Dashboard] Skipped (already captioned)
⏩ [Claude - 810_Dashboard] Skipped (already captioned)
⏩ [Claude - 883_Dashboard] Skipped (already captioned)
⏩ [Claude - 829_Dashboard] Skipped (already captioned)
⏩ [Claude - 516_Dashboard] Skipped (already captioned)
❌ [Claude - 894_Dashboard] Error: 400 Client Error: Bad Request for url: https://api.anthropic.com/v1/messages
⏩ [Claude - 814_Dashboard] Skipped (already captioned)
⏩ [Claude - 777_Dashboard] Skipped (already captioned)
⏩ [Claude - 627_Dashboard]

In [82]:
import pandas as pd

# Load your CSV
df = pd.read_csv("claude_captions.csv")

# Clean up captions (remove quotes + whitespace)
df["caption"] = df["caption"].astype(str).str.strip().str.strip('"')

# ✅ Detect question mark
df["has_question_mark"] = df["caption"].str.contains(r"\?")

# ✅ Detect ending punctuation (., !, ?)
df["ends_with_punctuation"] = df["caption"].str.extract(r"([\.\!\?])$").notnull()

# ✅ Caption length
df["caption_length"] = df["caption"].str.len()

# Save
df.to_csv("claude_captions_enriched.csv", index=False)
print("✅ Cleaned and saved as 'claude_captions_enriched.csv'")


✅ Cleaned and saved as 'claude_captions_enriched.csv'


In [83]:
df = pd.read_csv("claude_captions_enriched.csv")

# Remove '_Dashboard' from 'contest' column
df["contest"] = df["contest"].astype(str).str.replace("_Dashboard", "", regex=False)

# Save cleaned version
df.to_csv("claude_captions_enriched.csv", index=False)
print("✅ Cleaned 'contest' column and saved.")


✅ Cleaned 'contest' column and saved.
