In [2]:
#!pip install transformers torch pandas

In [3]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import pipeline
import torch
from tqdm import tqdm

# Load data
df = pd.read_csv("combined_captions_detailed.csv")

# Load HuggingFace sentiment pipeline
model_name = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Use GPU if available
device = 0 if torch.cuda.is_available() else -1
classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, device=device)

# Function to get sentiment label + score
def analyze_sentiment(text):
    if pd.isna(text) or text.strip() == "":
        return {"label": None, "score": None}
    try:
        result = classifier(text[:512])[0]  # Truncate to max 512 tokens
        return result
    except Exception as e:
        return {"label": None, "score": None}

# Run for each caption type
for col in ["caption_human", "caption_chatgpt", "caption_claude"]:
    tqdm.pandas(desc=f"Analyzing {col}")
    sentiment_results = df[col].progress_apply(analyze_sentiment)
    df[f"sentiment_label_{col}"] = sentiment_results.apply(lambda x: x["label"])
    df[f"sentiment_score_{col}"] = sentiment_results.apply(lambda x: x["score"])

# Save
df.to_csv("combined_captions_with_hf_sentiment.csv", index=False)
print("✅ Sentiment saved to combined_captions_with_hf_sentiment.csv")


config.json:   0%|          | 0.00/747 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

Device set to use cpu
Analyzing caption_human:   0%|               | 4/3850 [00:00<02:00, 31.98it/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Analyzing caption_human: 100%|████████████| 3850/3850 [01:44<00:00, 36.77it/s]
Analyzing caption_chatgpt: 100%|██████████| 3850/3850 [01:40<00:00, 38.17it/s]
Analyzing caption_claude: 100%|███████████| 3850/3850 [01:48<00:00, 35.62it/s]


✅ Sentiment saved to combined_captions_with_hf_sentiment.csv


## Sarcasm detection 
https://huggingface.co/mrm8488/t5-base-finetuned-sarcasm-twitter

In [8]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

model_name = "mrm8488/t5-base-finetuned-sarcasm-twitter"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

sarcasm_pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer)

def detect_sarcasm_t5(text):
    input_text = f"sarcasm detection: {text}"
    result = sarcasm_pipe(input_text)[0]["generated_text"]
    return result

# Apply to your captions
df["sarcasm_label_caption_human"] = df["caption_human"].astype(str).apply(detect_sarcasm_t5)


Device set to use mps:0


KeyboardInterrupt: 

In [7]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from tqdm import tqdm

# Load data
df = pd.read_csv("combined_captions_with_hf_sentiment.csv")  # Use your actual file

# Load sarcasm model
model_name = "mrm8488/t5-base-finetuned-sarcasm-twitter"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
sarcasm_pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer)

# Sarcasm detection function
def detect_sarcasm(text):
    try:
        input_text = f"sarcasm detection: {text}"
        result = sarcasm_pipe(input_text, max_length=10)[0]["generated_text"]
        return result
    except Exception as e:
        return "error"

# Apply per caption (with progress bar)
tqdm.pandas()

df["sarcasm_caption_human"] = df["caption_human"].astype(str).progress_apply(detect_sarcasm)
df["sarcasm_caption_chatgpt"] = df["caption_chatgpt"].astype(str).progress_apply(detect_sarcasm)
df["sarcasm_caption_claude"] = df["caption_claude"].astype(str).progress_apply(detect_sarcasm)

# Save enriched file
df.to_csv("captions_with_sarcasm_labels.csv", index=False)
print("✅ Saved with sarcasm labels!")



Device set to use mps:0
100%|█████████████████████████████████████| 3850/3850 [19:37<00:00,  3.27it/s]
100%|█████████████████████████████████████| 3850/3850 [17:10<00:00,  3.74it/s]
100%|█████████████████████████████████████| 3850/3850 [20:35<00:00,  3.12it/s]

✅ Saved with sarcasm labels!





# This did not work! Try again when time allows 