## Compute embeddings to find similarity

In [3]:
#!pip install sentence_transformers

In [6]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

# Purpose: score every distinct AI caption against one reference human caption
# from the same contest and write the scores to semantic_similarity_cleaned.csv.

# Load long-format dataset
df = pd.read_csv("captions_long_format.csv")

# Rows with no caption have nothing to embed. Dropping them here (instead of
# letting astype(str) turn them into the literal text "nan") also guarantees
# that every remaining caption stays paired with its own embedding below.
df = df.dropna(subset=["caption"]).reset_index(drop=True)
df["caption"] = df["caption"].astype(str)

# Load HuggingFace sentence embedding model
model = SentenceTransformer("all-MiniLM-L6-v2")

# Compute embeddings in one batched call rather than one model call per row;
# list(...) splits the returned 2-D array into one vector per DataFrame row.
df["embedding"] = list(model.encode(df["caption"].tolist()))

# STEP 1: Get one top human caption per contest.
# drop_duplicates keeps the whole first row of each contest, so the caption and
# the embedding always come from the same row (groupby().first() picks the
# first non-null value column by column and could mix rows).
top_humans = (
    df[df["model"] == "Human"]
    .dropna(subset=["contest"])
    .drop_duplicates(subset="contest", keep="first")
    [["contest", "caption", "embedding"]]
    .rename(columns={"caption": "human_caption", "embedding": "human_embedding"})
)

# STEP 2: Drop duplicate AI captions per contest/model/caption
ai_df = (
    df[df["model"] != "Human"]
    .drop_duplicates(subset=["contest", "caption", "model"])
    .rename(columns={"caption": "ai_caption", "embedding": "ai_embedding"})
)

# STEP 3: Compute cosine similarity between AI and top human caption.
# The inner join attaches the contest's human reference to each AI row, keeps
# the AI row order, and skips AI rows whose contest has no human caption.
paired = ai_df.merge(top_humans, on="contest", how="inner")
paired["semantic_similarity"] = [
    cosine_similarity([ai_emb], [human_emb])[0][0]
    for ai_emb, human_emb in zip(paired["ai_embedding"], paired["human_embedding"])
]

# Export results (embeddings are left out of the CSV)
sim_df = paired.rename(columns={"model": "ai_model"})[
    ["contest", "ai_model", "ai_caption", "human_caption", "semantic_similarity"]
]
sim_df.to_csv("semantic_similarity_cleaned.csv", index=False)
print("✅ Fixed version saved: semantic_similarity_cleaned.csv")


✅ Fixed version saved: semantic_similarity_cleaned.csv


In [11]:
#!pip install sentence-transformers
#!pip install pandas scikit-learn sentence-transformers


In [16]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

# Purpose: for every contest, score each AI caption against every human caption
# of that contest and write one row per (AI caption, human caption) pair to
# semantic_similarity_detailed.csv.

# Load your data (wide format: human, ChatGPT and Claude captions side by side)
df = pd.read_csv("combined_captions_detailed.csv")

# Load embedding model
model = SentenceTransformer("all-MiniLM-L6-v2")

# Reshape each caption source to the common (contest, caption, model) layout.
# assign() builds a new frame instead of adding a column to a derived one.
human_df = (
    df[["contest", "caption_human"]]
    .rename(columns={"caption_human": "caption"})
    .assign(model="Human")
)
ai_chatgpt = df[["contest", "caption_chatgpt", "model_chatgpt"]].rename(
    columns={"caption_chatgpt": "caption", "model_chatgpt": "model"}
)
ai_claude = df[["contest", "caption_claude", "model_claude"]].rename(
    columns={"caption_claude": "caption", "model_claude": "model"}
)

# Combine into one long df
long_df = pd.concat([human_df, ai_chatgpt, ai_claude], ignore_index=True)
long_df = long_df.dropna(subset=["caption"])  # Drop missing
long_df["caption"] = long_df["caption"].astype(str)

# The wide layout repeats captions across rows. Keeping one copy per
# contest/model/caption means each text is encoded once and each pairing is
# scored once, so the output has no repeated rows to begin with.
long_df = long_df.drop_duplicates(subset=["contest", "model", "caption"])

# Separate
human_captions = long_df[long_df["model"] == "Human"]
ai_captions = long_df[long_df["model"] != "Human"]

results = []

# sort=False keeps contests in order of first appearance.
for contest_id, ai_group in ai_captions.groupby("contest", sort=False):
    human_texts = human_captions.loc[
        human_captions["contest"] == contest_id, "caption"
    ].tolist()
    if not human_texts:
        continue

    ai_texts = ai_group["caption"].tolist()

    # One batched encode per side and one similarity call per contest:
    # similarity_matrix[i][j] scores ai_texts[i] against human_texts[j].
    similarity_matrix = cosine_similarity(
        model.encode(ai_texts), model.encode(human_texts)
    )

    for ai_model, ai_caption, similarities in zip(
        ai_group["model"], ai_texts, similarity_matrix
    ):
        # One row per human caption
        for human_caption, sim in zip(human_texts, similarities):
            results.append({
                "contest": contest_id,
                "ai_model": ai_model,
                "ai_caption": ai_caption,
                "human_caption": human_caption,
                "semantic_similarity": sim
            })

# Save
results_df = pd.DataFrame(results)
results_df.to_csv("semantic_similarity_detailed.csv", index=False)
print("✅ Saved to semantic_similarity_detailed.csv")


✅ Saved to semantic_similarity_detailed.csv


## Remove duplicates 

In [17]:

# Load the detailed semantic similarity results
df = pd.read_csv("semantic_similarity_detailed.csv")

# A pairing is identified by its contest, AI model and the two caption texts.
# De-duplicating on those columns (rather than on the whole row) means two
# copies of the same pairing collapse even if their float similarity scores
# differ by rounding noise; the first occurrence is kept.
pair_keys = ["contest", "ai_model", "ai_caption", "human_caption"]
df_deduplicated = df.drop_duplicates(subset=pair_keys)

# Save to new file
output_path = "semantic_similarity_deduplicated.csv"
df_deduplicated.to_csv(output_path, index=False)

output_path


'semantic_similarity_deduplicated.csv'

## Matching unfunny or funny category

In [21]:
# Inputs: the de-duplicated similarity table (which carries the matched human
# captions) and the long-format captions table (which carries the categories).
semantic_df = pd.read_csv("semantic_similarity_deduplicated.csv")
captions_df = pd.read_csv("captions_long_format.csv")

# Normalise the caption text on both sides so the join keys compare equal
semantic_df["human_caption"] = semantic_df["human_caption"].astype(str).str.strip()
captions_df["caption"] = captions_df["caption"].astype(str).str.strip()

# Human captions that actually have a category, one row per distinct combination
is_human = captions_df["model"] == "Human"
has_category = captions_df["category"].notna()
human_captions_with_cat = captions_df.loc[
    is_human & has_category, ["contest", "caption", "category"]
].drop_duplicates()

# Left-join the category onto each similarity row by contest + human caption.
# Renaming the lookup's caption column to the join key first means the result
# carries no redundant 'caption' column that would need dropping afterwards.
merged = semantic_df.merge(
    human_captions_with_cat.rename(columns={"caption": "human_caption"}),
    on=["contest", "human_caption"],
    how="left",
)

# Save the updated file
final_path = "semantic_similarity_with_matched_category.csv"
merged.to_csv(final_path, index=False)

final_path


'semantic_similarity_with_matched_category.csv'