## sentence transformers + Cluster the embeddings

In [2]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

# Load your data
df = pd.read_csv("semantic_similarity_with_matched_category.csv")

# Combine captions into one column
all_captions = pd.concat([
    df["human_caption"],
    df["ai_caption"]
]).dropna().unique().tolist()

# Load sentence embedding model
model = SentenceTransformer("all-MiniLM-L6-v2")
caption_embeddings = model.encode(all_captions, show_progress_bar=True)

# Use PCA to reduce dimension for faster clustering (optional)
pca = PCA(n_components=50)
reduced_embeddings = pca.fit_transform(caption_embeddings)

# Cluster using KMeans
NUM_CLUSTERS = 10  # you can adjust this!
kmeans = KMeans(n_clusters=NUM_CLUSTERS, random_state=42)
labels = kmeans.fit_predict(reduced_embeddings)

# Create DataFrame with results
theme_df = pd.DataFrame({
    "caption": all_captions,
    "theme_cluster": labels
})

# Extract keywords for each theme
vectorizer = TfidfVectorizer(stop_words="english", max_features=1000)
tfidf_matrix = vectorizer.fit_transform(theme_df["caption"])

# Map cluster numbers to top keywords (for human-readable themes)
keywords_per_theme = []
for i in range(NUM_CLUSTERS):
    cluster_texts = tfidf_matrix[theme_df["theme_cluster"] == i]
    mean_tfidf = cluster_texts.mean(axis=0)
    top_keywords = [
        vectorizer.get_feature_names_out()[j]
        for j in mean_tfidf.argsort()[0, -5:].tolist()[0][::-1]
    ]
    keywords_per_theme.append(", ".join(top_keywords))

# Map cluster to theme name
theme_df["theme_name"] = theme_df["theme_cluster"].map({
    i: f"Theme {i+1}: {keywords_per_theme[i]}" for i in range(NUM_CLUSTERS)
})

# Merge themes back into original df
df_with_theme = df.merge(theme_df, how="left", left_on="ai_caption", right_on="caption")
df_with_theme = df_with_theme.drop(columns=["caption"])

# Save
df_with_theme.to_csv("semantic_similarity_with_themes.csv", index=False)
print("✅ Saved: semantic_similarity_with_themes.csv")


Batches:   0%|          | 0/106 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


✅ Saved: semantic_similarity_with_themes.csv


Did not work in a sense that themes aren't relevant 

# zero-shot classification from Hugging Face

In [2]:
import pandas as pd
from transformers import pipeline
from tqdm import tqdm

# Load your data
df = pd.read_csv("semantic_similarity_detailed.csv")

# Combine human + AI captions
all_captions = pd.concat([df["human_caption"], df["ai_caption"]]).dropna().unique().tolist()

# Define candidate labels (themes)
candidate_labels = [
    "religion", "politics", "family", "love", "work", "money", 
    "animals", "technology", "psychology", "health", "death", 
    "relationships", "law", "parenting", "marriage", "social media", "sports","others"
]

# Load zero-shot classifier
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# Classify each caption
theme_results = []
for caption in tqdm(all_captions):
    output = classifier(caption, candidate_labels)
    top_theme = output["labels"][0]
    theme_results.append({"caption": caption, "theme": top_theme})

# Merge themes back to original dataframe
theme_df = pd.DataFrame(theme_results)
df_with_themes = df.merge(theme_df, how="left", left_on="ai_caption", right_on="caption")
df_with_themes = df_with_themes.drop(columns=["caption"])  # drop temp join column

# Save to CSV
df_with_themes.to_csv("semantic_similarity_with_clean_themes.csv", index=False)
print("✅ Saved: semantic_similarity_with_clean_themes.csv")


Device set to use mps:0
  1%|          | 22/3390 [00:13<33:25,  1.68it/s]


KeyboardInterrupt: 