# Similarity Analysis

This notebook contains the code we used to measure how similar two input prompts are — as the cosine similarity of their CLIP text-encoder embeddings — for our negation attack analysis.

In [14]:
from transformers import CLIPTextModel, CLIPTokenizer,pipeline, AutoTokenizer, AutoModelForSequenceClassification
import torch
import numpy as np
import torch.nn.functional as F

from torch.nn import CosineSimilarity
from torchmetrics.multimodal import CLIPScore

In [15]:
import re

In [16]:
# Single source of truth for the CLIP checkpoint so the tokenizer, text encoder
# and CLIPScore metric are always loaded from the same weights.
CLIP_MODEL_NAME = "openai/clip-vit-base-patch32"

# Run on the GPU when one is available, otherwise fall back to the CPU.
torch_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = CLIPTokenizer.from_pretrained(CLIP_MODEL_NAME)
text_encoder = CLIPTextModel.from_pretrained(CLIP_MODEL_NAME).to(torch_device)
metric = CLIPScore(model_name_or_path=CLIP_MODEL_NAME).to(torch_device)

# dim=0 because the similarity is taken between two 1-D embedding vectors.
cossim = CosineSimilarity(dim=0, eps=1e-6)

Some weights of the model checkpoint at openai/clip-vit-base-patch32 were not used when initializing CLIPTextModel: ['vision_model.encoder.layers.6.mlp.fc2.weight', 'vision_model.encoder.layers.3.layer_norm2.weight', 'vision_model.encoder.layers.10.self_attn.k_proj.weight', 'vision_model.encoder.layers.1.layer_norm2.weight', 'vision_model.encoder.layers.0.layer_norm2.weight', 'vision_model.encoder.layers.8.layer_norm2.bias', 'vision_model.encoder.layers.11.mlp.fc1.weight', 'vision_model.post_layernorm.weight', 'vision_model.encoder.layers.8.layer_norm1.bias', 'vision_model.encoder.layers.2.self_attn.k_proj.weight', 'vision_model.encoder.layers.2.self_attn.q_proj.bias', 'vision_model.encoder.layers.2.layer_norm1.bias', 'vision_model.encoder.layers.7.self_attn.k_proj.weight', 'vision_model.encoder.layers.10.mlp.fc2.weight', 'vision_model.encoder.layers.4.layer_norm1.bias', 'vision_model.encoder.layers.9.layer_norm2.bias', 'vision_model.encoder.layers.7.mlp.fc1.bias', 'vision_model.encode

In [17]:
def get_embedding(prompt):
    """Return a mean-pooled text-encoder embedding for ``prompt``.

    The prompt is tokenized, padded/truncated to ``tokenizer.model_max_length``
    and passed through ``text_encoder``. The first element of the encoder
    output is averaged over dimension 1 and the leading dimension is squeezed
    away when it has size 1.

    Note: the input is padded to the maximum length and no attention mask is
    applied, so padded positions take part in the average.
    """
    encoded = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=tokenizer.model_max_length,
    )
    input_ids = encoded.input_ids.to(torch_device)

    # Inference only: no autograd graph is needed for the forward pass.
    with torch.no_grad():
        hidden_states = text_encoder(input_ids)[0]

    # Collapse the token dimension into one vector per prompt.
    pooled = hidden_states.mean(dim=1)
    return pooled.squeeze(0)

def get_embedding_flat(prompt):
    """Return the flattened ``last_hidden_state`` of the text encoder for ``prompt``.

    The prompt is padded to ``tokenizer.model_max_length`` and, like in
    ``get_embedding``, truncated to that length so that over-long prompts do
    not produce a sequence longer than the encoder accepts. The encoder's
    ``last_hidden_state`` is flattened from dimension 1 onwards and the first
    row is returned, i.e. one 1-D vector for the (first) prompt.

    The forward pass runs under ``torch.no_grad()`` because the result is only
    used for similarity measurements, matching ``get_embedding``.
    """
    text_inputs = tokenizer(
        prompt,
        padding="max_length",
        max_length=tokenizer.model_max_length,
        truncation=True,
        return_tensors="pt",
    )
    input_ids = text_inputs.input_ids.to(torch_device)

    with torch.no_grad():
        last_hidden_state = text_encoder(input_ids)['last_hidden_state']

    # Concatenate every per-token vector into a single long vector.
    text_embeddings = torch.flatten(last_hidden_state, 1, -1)[0]

    return text_embeddings



In [22]:
# Compare a reference prompt against perturbed variants of it using the cosine
# similarity of their flattened text-encoder embeddings.
# p1 = "A very cute dog"
# p2 = "A cute dog"

ref = "A photograph of the politician who is Justin Trudeau."

prompts = [
    "A photograph of the politician who is not Justin Trudeau.",
    "A photograph of the politician who is Justin Trudeeau.",
    # "A portrait of a politician most recently the president of the United States.",
]

# ref = "Justin Trudeau"
# prompts = ["Justin Trudeauu"]

# The reference embedding does not change between comparisons, so compute it once.
ref_embedding = get_embedding_flat(ref)

print('---')
print(f'ref: {ref}')
print('---')
# Numbering starts at 2 because the reference prompt is implicitly "p1".
for idx, prompt in enumerate(prompts, start=2):

    # Tolerate multi-line / indented prompt literals.
    prompt = prompt.strip()

    sim = cossim(ref_embedding, get_embedding_flat(prompt))

    print(f'p{idx}: {prompt} (sim={sim:.4f})')
    print('---')

---
ref: A photograph of the politician who is Justin Trudeau.
---
p2: A photograph of the politician who is not Justin Trudeau. (sim=0.9564)
---
p3: A photograph of the politician who is Justin Trudeeau. (sim=0.9353)
---


In [12]:
# Same CosineSimilarity configuration as in the setup cell, redefined here so
# this cell can be run on its own; dim=0 compares two 1-D vectors.
cossim = CosineSimilarity(dim=0, eps=1e-6)

def sentence_importance(prompt: str):
    """Score each unique word of ``prompt`` by the similarity lost when it is removed.

    For every distinct whitespace-separated word, all occurrences of that
    exact word are dropped from the prompt, the shortened sentence is embedded
    with ``get_embedding_flat`` and compared to the embedding of the full
    prompt. The score is ``1 - cosine_similarity``; a larger score means the
    word contributed more to the embedding.

    Words are matched as whole tokens, so removing "a" does not also strip the
    letter "a" out of other words. Words are processed in order of first
    appearance, which keeps the printed report stable between runs.

    Args:
        prompt: The sentence to analyse.

    Returns:
        A dict mapping each unique word to its similarity loss (as returned by
        ``1 - cossim(...)``). Empty if the prompt contains no words.
    """
    # split() without arguments ignores repeated/leading/trailing whitespace,
    # so no empty-string "word" can appear.
    tokens = prompt.split()
    # dict.fromkeys de-duplicates while preserving first-occurrence order.
    unique_words = list(dict.fromkeys(tokens))
    word_scores = {}

    # Get a reference embedding
    ref = get_embedding_flat(prompt)

    # Remove every unique word
    for word in unique_words:
        # Drop whole-token matches only; joining with single spaces leaves the
        # sentence already normalised.
        sentence = ' '.join(token for token in tokens if token != word)

        sentence_embed = get_embedding_flat(sentence)
        sim = cossim(ref, sentence_embed)
        sim_loss = 1 - sim

        # Store details
        word_scores[word] = sim_loss
        print(f'Sentence: {sentence} | Word: {word} | Sim loss: {sim_loss:.2%}')

    return word_scores


In [15]:
# Rank the words of a sample prompt by the similarity lost when each is removed,
# then report the word with the largest loss.
scores = sentence_importance('hello world.')
most_important = max(scores, key=lambda word: scores[word])
print('=========')
print('Most important word:', most_important)

Sentence: hello | Word: world. | Sim loss: 12.10%
Sentence: world. | Word: hello | Sim loss: 14.08%
Most important word: hello


In [None]:
# Measure how similar each blocklisted term is to a mutated spelling of it,
# then summarise the similarities with their mean and median.
blocklist = """
joe biden
donald trump
""".strip().splitlines()

blocklist_mutate = """
joe bidenn
d0nald 1rump
""".strip().splitlines()

# The two lists are paired line by line; zip() would silently drop the surplus
# entries if they ever got out of sync.
if len(blocklist) != len(blocklist_mutate):
    raise ValueError(
        f"blocklist has {len(blocklist)} entries but blocklist_mutate has "
        f"{len(blocklist_mutate)}; every term needs exactly one mutation"
    )

similarities = []

for word, mutated in zip(blocklist, blocklist_mutate):
    sim = cossim(get_embedding_flat(word), get_embedding_flat(mutated))
    print(word)
    print(sim)
    similarities.append(sim)

# Stack tensors to form a single tensor
tensor_stack = torch.stack(similarities)

# Calculate the mean
mean_tensor = torch.mean(tensor_stack)

# Calculate the median. torch.median returns the lower of the two middle
# values when the element count is even, so use the interpolating 0.5-quantile
# instead to get the conventional median. quantile needs float/double input.
median_tensor = torch.quantile(tensor_stack.float(), 0.5)

print(f"Mean: {mean_tensor}")
print(f"Median: {median_tensor}")