In [None]:
import pandas as pd
import json
import os
from google import genai

# --- CONFIG ---
# Prefer the GEMINI_API_KEY environment variable so a real key never has to be
# saved inside the notebook. The placeholder is kept as a fallback for anyone
# who still pastes a key in by hand.
API_KEY = os.environ.get("GEMINI_API_KEY", "ENTER API KEY")
client = genai.Client(api_key=API_KEY)

# CSV that receives the labeled rows.
output_file = 'labeled_dataset.csv'

In [None]:
# Closed set of labels the classifier is allowed to pick from.
EMOTIONS = [
    "Contentment",
    "Tenderness",
    "Melancholy",
    "Bleakness",
    "Nostalgia",
    "Isolation",
    "Terror",
    "Excitement",
    "Euphoria",
    "Anxiety",
    "Despair",
    "Grief",
]

def get_emotions(input_dict):
    """Ask the model to assign one allowed emotion to every text in a batch.

    Args:
        input_dict: Mapping of row ID -> text to classify.

    Returns:
        dict: The JSON object returned by the model, expected to map each
        input ID (as a string, because JSON object keys are always strings)
        to a predicted emotion. An empty ``input_dict`` yields ``{}`` without
        calling the API.

    Raises:
        ValueError: If the model returns no text, text that is not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError`` subclass), or JSON
            that is not an object.
    """
    # Nothing to classify: skip the network round-trip entirely.
    if not input_dict:
        return {}

    # Serialize as real JSON instead of a Python repr so that the keys the
    # model sees are exactly the string keys it is asked to send back.
    # default=str keeps non-JSON-native values from aborting the batch.
    payload = json.dumps(
        {str(key): value for key, value in input_dict.items()},
        ensure_ascii=False,
        default=str,
    )
    allowed = json.dumps(EMOTIONS)

    prompt = f"""
    You are an emotion classifier.
    Allowed emotions: {allowed}

    Input Dictionary: {payload}

    Task: Return a raw JSON dictionary where the keys are the input IDs and the values are the predicted emotions.
    Do not add Markdown formatting (like ```json). Just the raw JSON string.
    """

    response = client.models.generate_content(
        model="gemini-2.0-flash",
        contents=prompt,
        config={ "response_mime_type": "application/json" }
    )

    # NOTE(review): response.text is presumably empty/None when the model
    # returns no text part (e.g. a blocked response) — fail with a clear message.
    raw_text = response.text
    if not raw_text:
        raise ValueError("Model returned an empty response")

    predictions = json.loads(raw_text)
    if not isinstance(predictions, dict):
        raise ValueError(
            f"Expected a JSON object from the model, got {type(predictions).__name__}"
        )
    return predictions

In [None]:
# --- MAIN LOOP ---
BATCH_SIZE = 200  # number of rows sent to the model per request

# header=0 consumes the manifest's own header line and replaces it with
# `names`. With header=None that line was read as a data row and ended up in
# the labeled output as id "photo_id" with emotion "Unknown".
df = pd.read_csv('dataset_manifest.csv', header=0, names=['id', 'text', 'image_path'])

# Resume logic: the number of rows already saved is the offset to restart from.
# This relies on exactly one output row being written per manifest row.
# NOTE: an output file produced before the header fix contains one extra row;
# delete it before re-running so the offsets line up.
start_index = 0
if os.path.exists(output_file):
    start_index = len(pd.read_csv(output_file))
    print(f"Resuming from index {start_index}")

for i in range(start_index, len(df), BATCH_SIZE):
    batch = df.iloc[i : i + BATCH_SIZE]
    batch_dict = pd.Series(batch.text.values, index=batch.id).to_dict()

    try:
        emotions_dict = get_emotions(batch_dict)
        if not isinstance(emotions_dict, dict):
            raise TypeError(
                f"expected a dict of predictions, got {type(emotions_dict).__name__}"
            )
    except Exception as e:
        # Stop at the first failure; completed batches are already on disk,
        # so the next run resumes from here.
        print(f"Error at index {i}: {e}")
        break

    # JSON object keys are always strings, so look predictions up by str(id).
    emotions_by_id = {str(key): value for key, value in emotions_dict.items()}

    # Emit one result per manifest row (not per unique id) so that duplicate
    # ids cannot shrink the output and desynchronize the resume offset.
    batch_results = [
        {
            'id': row_id,
            'emotion': emotions_by_id.get(str(row_id), "Unknown"),
            'image_path': image_path,
        }
        for row_id, image_path in zip(batch['id'], batch['image_path'])
    ]

    # Save immediately
    # If file doesn't exist, write header. If it does, append without header.
    file_exists = os.path.exists(output_file)
    pd.DataFrame(batch_results).to_csv(
        output_file,
        mode='a',
        header=not file_exists,
        index=False
    )

    print(f"Processed batch {i}")

Processed batch 0
Processed batch 200
Processed batch 400
Processed batch 600
Processed batch 800
Processed batch 1000
Processed batch 1200
Processed batch 1400
Processed batch 1600
Processed batch 1800
Processed batch 2000
Processed batch 2200
Processed batch 2400
Processed batch 2600
Processed batch 2800
Processed batch 3000
Processed batch 3200
Processed batch 3400
Processed batch 3600
Processed batch 3800
Processed batch 4000
Processed batch 4200
Processed batch 4400
Processed batch 4600
Processed batch 4800


In [None]:
# Reload the labeled CSV from disk and check that it has as many rows as the
# manifest frame `df`.
df_2 = pd.read_csv("labeled_dataset.csv")
df_2.shape[0] == df.shape[0]

True

In [None]:
# Preview the first 100 labeled rows.
df_2.iloc[:100]

Unnamed: 0,id,emotion,image_path
0,photo_id,Unknown,image_path
1,oSf8ePoG9NU,Contentment,adobe-colour-correction/dataset/images/oSf8ePo...
2,DlsOa5moK4w,Tenderness,adobe-colour-correction/dataset/images/DlsOa5m...
3,XBGacbT3vXI,Contentment,adobe-colour-correction/dataset/images/XBGacbT...
4,FjikPptEbZg,Contentment,adobe-colour-correction/dataset/images/FjikPpt...
...,...,...,...
95,fWb-lvqWjqI,Melancholy,adobe-colour-correction/dataset/images/fWb-lvq...
96,Yh6K2eTr_FY,Contentment,adobe-colour-correction/dataset/images/Yh6K2eT...
97,CTdSlskmvmU,Melancholy,adobe-colour-correction/dataset/images/CTdSlsk...
98,YXhV27bQ3dY,Contentment,adobe-colour-correction/dataset/images/YXhV27b...
