## Compute embeddings to find similarity

In [3]:
#!pip install sentence_transformers

In [12]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

# Compare every de-duplicated AI caption with one human caption from the same
# contest and write the cosine similarity of their sentence embeddings to CSV.

# Load dataset
df = pd.read_csv("combined_captions_long_format.csv")

# Load model
model = SentenceTransformer("all-MiniLM-L6-v2")


def _embed_all(texts):
    """Embed a list of strings with a single batched ``model.encode`` call.

    Returns a list holding one embedding per input string. An empty input
    returns an empty list without calling the model.
    """
    return list(model.encode(texts)) if texts else []


# Match the "human" label regardless of case or surrounding whitespace, without
# altering the values stored in the `model` column (they are written to the
# output as `ai_model`).
is_human = df["model"].astype(str).str.strip().str.lower() == "human"

# STEP 1: Get one human caption per contest.
# NOTE(review): `.first()` takes the first non-null caption in file order;
# whether that row is really the "top" caption depends on how the CSV is sorted.
top_humans = (
    df[is_human]
    .groupby("contest")
    .first()
    .reset_index()[["contest", "caption"]]
    .rename(columns={"caption": "human_caption"})
)

# Compute embeddings for human captions in one batch instead of one
# `model.encode` call per row.
top_humans["human_embedding"] = _embed_all(
    top_humans["human_caption"].astype(str).tolist()
)

# STEP 2: Drop duplicate AI captions
ai_df = (
    df[~is_human]
    .drop_duplicates(subset=["contest", "caption", "model"])
    .rename(columns={"caption": "ai_caption"})
)

# Compute embeddings for AI captions (batched as well).
ai_df["ai_embedding"] = _embed_all(ai_df["ai_caption"].astype(str).tolist())

# STEP 3: Compute cosine similarity
# One inner merge replaces the per-row lookup into `top_humans`: AI rows whose
# contest has no human caption are dropped, and the order of the AI rows is kept.
# `top_humans` has exactly one row per contest, so no AI row is duplicated.
pairs = ai_df.merge(top_humans, on="contest", how="inner")

results = [
    {
        "contest": contest_id,
        "ai_model": ai_model,
        "ai_caption": ai_caption,
        "human_caption": human_caption,
        "semantic_similarity": cosine_similarity([ai_emb], [human_emb])[0][0],
    }
    for contest_id, ai_model, ai_caption, human_caption, ai_emb, human_emb in zip(
        pairs["contest"],
        pairs["model"],
        pairs["ai_caption"],
        pairs["human_caption"],
        pairs["ai_embedding"],
        pairs["human_embedding"],
    )
]

# Save to CSV
sim_df = pd.DataFrame(results)
sim_df.to_csv("semantic_similarity_cleaned.csv", index=False)
print("✅ Semantic similarity results saved to 'semantic_similarity_cleaned.csv'")


KeyboardInterrupt: 

In [11]:
#!pip install sentence-transformers
#!pip install pandas scikit-learn sentence-transformers


In [13]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

# For every contest, compare each AI caption with every human caption of the
# same contest and store one row per (AI caption, human caption) pair.

# Load your data
df = pd.read_csv("combined_captions_long_format.csv")

# Normalize model names
df["model"] = df["model"].astype(str).str.strip().str.lower()

# Normalize contest IDs (remove trailing labels like "_Dashboard") by keeping
# the first run of digits. `expand=False` returns a Series rather than a
# one-column DataFrame, which is what a column assignment expects.
df["contest"] = df["contest"].astype(str).str.extract(r"(\d+)", expand=False)

# Drop rows with missing captions. `.copy()` makes the result an independent
# frame so the column assignment below cannot raise SettingWithCopyWarning.
df = df.dropna(subset=["caption"]).copy()
df["caption"] = df["caption"].astype(str)

# Load embedding model
model = SentenceTransformer("all-MiniLM-L6-v2")

# Separate human and AI captions
human_captions = df[df["model"] == "human"]
ai_captions = df[df["model"] != "human"]

# Sanity check
print(f"✅ Loaded rows: {len(df)}")
print(f"👤 Human captions: {len(human_captions)}")
print(f"🤖 AI captions: {len(ai_captions)}")
print(f"🧪 Contests to evaluate: {ai_captions['contest'].nunique()}")

results = []
skipped = []

# Loop over each contest
for contest_id in ai_captions["contest"].unique():
    ai_group = ai_captions[ai_captions["contest"] == contest_id]
    human_group = human_captions[human_captions["contest"] == contest_id]

    # A contest needs captions on both sides; otherwise record it and move on.
    if ai_group.empty or human_group.empty:
        skipped.append(contest_id)
        continue

    human_texts = human_group["caption"].tolist()
    ai_texts = ai_group["caption"].tolist()

    # Encode each side once per contest (batched) instead of calling
    # `model.encode` separately for every AI caption.
    human_embeddings = model.encode(human_texts)
    ai_embeddings = model.encode(ai_texts)

    # Row i holds the similarities of AI caption i to every human caption.
    similarity_matrix = cosine_similarity(ai_embeddings, human_embeddings)

    for ai_caption, ai_model, similarities in zip(
        ai_texts, ai_group["model"], similarity_matrix
    ):
        for human_caption, sim in zip(human_texts, similarities):
            results.append({
                "contest": contest_id,
                "ai_model": ai_model,
                "ai_caption": ai_caption,
                "human_caption": human_caption,
                "semantic_similarity": sim
            })

# Save results. Passing the column names explicitly keeps the CSV header even
# when no pair was produced, so the file can still be read back.
results_df = pd.DataFrame(
    results,
    columns=["contest", "ai_model", "ai_caption", "human_caption", "semantic_similarity"],
)
results_df.to_csv("semantic_similarity_detailed.csv", index=False)
print(f"✅ Saved to 'semantic_similarity_detailed.csv' with {len(results_df)} rows.")

# Optionally save skipped contests
if skipped:
    pd.Series(skipped, name="skipped_contests").to_csv("skipped_contests.csv", index=False)
    print(f"⚠️ Skipped {len(skipped)} contests with missing data. See 'skipped_contests.csv'.")


✅ Loaded rows: 4617
👤 Human captions: 3850
🤖 AI captions: 767
🧪 Contests to evaluate: 385
✅ Saved to 'semantic_similarity_detailed.csv' with 7670 rows.


## Remove duplicates 

In [17]:

# Where the de-duplicated similarity table is written.
output_path = "semantic_similarity_deduplicated.csv"

# Read the detailed pairwise results produced by the similarity cell.
df = pd.read_csv("semantic_similarity_detailed.csv")

# Keep a single copy of every fully identical row.
df_deduplicated = df.drop_duplicates()

# Persist the result without the index column.
df_deduplicated.to_csv(output_path, index=False)

# Show the written path as the cell output.
output_path


'semantic_similarity_deduplicated.csv'

## Matching unfunny or funny category

In [7]:
# Attach the category of the matched human caption to every similarity row.

# Load the previous semantic similarity results that contain the matched human captions
semantic_df = pd.read_csv("semantic_similarity_deduplicated.csv")
captions_df = pd.read_csv("captions_long_format.csv")

# Ensure text columns are clean
semantic_df["human_caption"] = semantic_df["human_caption"].astype(str).str.strip()
captions_df["caption"] = captions_df["caption"].astype(str).str.strip()


def _contest_key(contest):
    """Return the first run of digits of each contest ID as a string Series.

    The similarity table was built from contest IDs reduced to their digits, and
    reading it back from CSV may give a numeric column. Building the key the same
    way on both sides keeps the merge from failing on mismatched dtypes or on
    labels such as "_Dashboard".
    """
    return contest.astype(str).str.extract(r"(\d+)", expand=False)


# Only keep human captions with non-null category. The model label is compared
# case-insensitively and with surrounding whitespace ignored.
is_human = captions_df["model"].astype(str).str.strip().str.lower() == "human"
human_captions_with_cat = (
    captions_df.loc[
        is_human & captions_df["category"].notna(),
        ["contest", "caption", "category"],
    ]
    .assign(contest_key=lambda frame: _contest_key(frame["contest"]))
    # Rows without a usable contest number can never be matched reliably.
    .dropna(subset=["contest_key"])
    [["contest_key", "caption", "category"]]
    .drop_duplicates()
)

# Merge on both contest + caption to get category from the human reference.
# The helper key and the duplicate 'caption' column are dropped afterwards, so
# the output keeps the original `contest` values of the similarity table.
merged = (
    semantic_df
    .assign(contest_key=_contest_key(semantic_df["contest"]))
    .merge(
        human_captions_with_cat,
        left_on=["contest_key", "human_caption"],
        right_on=["contest_key", "caption"],
        how="left",
    )
    .drop(columns=["contest_key", "caption"])
)

# Save the updated file
final_path = "semantic_similarity_with_matched_category.csv"
merged.to_csv(final_path, index=False)

final_path


FileNotFoundError: [Errno 2] No such file or directory: 'semantic_similarity_deduplicated.csv'