In [1]:
import torch
from PIL import Image
import clip

In [2]:
def load_clip_model():
    """Load the OpenAI CLIP "ViT-B/32" checkpoint on the best available device.

    Returns:
        tuple: ``(model, preprocess, device)`` -- the two values returned by
        ``clip.load`` followed by the device string, which is ``"cuda"`` when
        ``torch.cuda.is_available()`` is true and ``"cpu"`` otherwise.
    """
    if torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"

    loaded = clip.load("ViT-B/32", device=device)
    model, preprocess = loaded
    return model, preprocess, device

def calculate_similarity(model, text1, text2, image_path, device, preprocess_fn=None):
    """Score two texts against one image with CLIP cosine similarity.

    Args:
        model: Object exposing ``encode_text`` and ``encode_image``.
        text1: First text; a single string, because the result is reduced
            with ``.item()`` and must therefore be a one-element tensor.
        text2: Second text; same constraint as ``text1``.
        image_path: Path of the image file to compare against.
        device: Device the inputs are moved to before encoding.
        preprocess_fn: Optional image transform turning a PIL image into a
            tensor. When omitted, the notebook-level ``preprocess`` name is
            used, which keeps existing five-argument calls working.

    Returns:
        tuple[float, float]: ``(similarity(text1, image), similarity(text2, image))``.
    """
    if preprocess_fn is None:
        # Backward-compatible fallback: this global must already have been
        # defined by the cell that calls load_clip_model().
        preprocess_fn = preprocess

    # Close the file handle as soon as the tensor has been materialised.
    with Image.open(image_path) as pil_image:
        image = preprocess_fn(pil_image).unsqueeze(0).to(device)

    # Pure inference: skip autograd bookkeeping, as the later cells do.
    with torch.no_grad():
        text_features1 = model.encode_text(clip.tokenize(text1).to(device))
        text_features2 = model.encode_text(clip.tokenize(text2).to(device))
        image_features = model.encode_image(image)

        sim1 = torch.cosine_similarity(text_features1, image_features)
        sim2 = torch.cosine_similarity(text_features2, image_features)

    return sim1.item(), sim2.item()

In [3]:
# Inputs shared by the experiments below.
# The anomalous guess under investigation and a sensible guess for the same image.
suspicious_guess = "[-h]"
legitimate_guess = "Cat sanctuary with woman wearing snoopy sweater"

# Relative path to the round-0 target frame.
round0_image_path = "../rounds/round0/targetframe20250223_133057EST.jpg"

# Candidate texts: short meaningless strings next to real descriptions, so the
# scores of the junk strings can be compared with those of plausible captions.
test_cases = [
    suspicious_guess,
    "[--]",
    "[-]",
    "[h]",
    "h",
    "-",
    legitimate_guess,
    "A cat shelter",
    "Empty room",
    "Random text",
]

In [4]:
# Initialize model and get preprocessing function.
# The notebook-level names bound here (`model`, `preprocess`, `device`) are read
# by the scoring cells below, and `calculate_similarity` looks up `preprocess`
# as a global, so this cell has to run before any of them.
model, preprocess, device = load_clip_model()
print(f"Using device: {device}")

Using device: cpu


In [5]:
# Score the anomalous guess and the sensible guess against the same round-0 frame.
sim_suspicious, sim_legitimate = calculate_similarity(
    model, suspicious_guess, legitimate_guess, round0_image_path, device
)

# Print both cosine similarities in the order they were computed.
print(f"Similarity scores:")
for guess, score in (
    (suspicious_guess, sim_suspicious),
    (legitimate_guess, sim_legitimate),
):
    print(f"'{guess}': {score:.3f}")

# How close the meaningless string gets to the real description.
print(f"\nRatio of suspicious to legitimate score: {sim_suspicious/sim_legitimate:.3f}")

Similarity scores:
'[-h]': 0.180
'Cat sanctuary with woman wearing snoopy sweater': 0.278

Ratio of suspicious to legitimate score: 0.649


In [6]:
# Test all cases against the image
with torch.no_grad():
    # The image embedding does not depend on the text, so open, preprocess,
    # encode and normalize the image once instead of once per test case.
    with Image.open(round0_image_path) as round0_image:
        image_input = preprocess(round0_image).unsqueeze(0).to(device)
    image_features = model.encode_image(image_input)
    image_features /= image_features.norm(dim=-1, keepdim=True)

    results = []
    for text in test_cases:
        text_features = model.encode_text(clip.tokenize(text).to(device))

        # Unit-normalize so the dot product below is a cosine similarity.
        text_features /= text_features.norm(dim=-1, keepdim=True)

        # Scaled by 100 so scores read as percentages.
        similarity = (100.0 * text_features @ image_features.T).item()
        results.append((text, similarity))

# Sort by similarity score, best match first
results.sort(key=lambda x: x[1], reverse=True)

print("All test cases ranked by similarity:")
print("-" * 50)
for text, sim in results:
    print(f"'{text}': {sim:.3f}")

All test cases ranked by similarity:
--------------------------------------------------
'Cat sanctuary with woman wearing snoopy sweater': 27.824
'A cat shelter': 21.721
'[h]': 18.952
'[-]': 18.207
'Empty room': 18.168
'[--]': 18.156
'-': 18.068
'[-h]': 18.047
'h': 17.729
'Random text': 17.467


In [13]:
import open_clip

def compare_clip_implementations(image_path, text):
    """Score one text against one image with OpenAI CLIP and with OpenCLIP.

    Both models are the ViT-B/32 architecture; the OpenCLIP one uses the
    ``laion2b_s34b_b79k`` pretrained weights. Each model gets its own
    preprocessing transform and tokenizer.

    Args:
        image_path: Path of the image file to score against.
        text: Text to compare with the image; a single string, because each
            similarity is reduced with ``.item()``.

    Returns:
        dict: ``{'Original CLIP': score, 'OpenCLIP (LAION)': score}`` where
        each score is the cosine similarity multiplied by 100.
    """
    # Original CLIP (already placed on `device` by load_clip_model)
    clip_model, clip_preprocess, device = load_clip_model()

    # OpenCLIP (ViT-B/32)
    openclip_model, _, openclip_preprocess = open_clip.create_model_and_transforms(
        'ViT-B-32', pretrained='laion2b_s34b_b79k'
    )
    # The inputs below are moved to `device`, so the model must live there too;
    # otherwise the forward pass fails with a device mismatch on CUDA.
    # eval() switches the freshly created model out of training mode.
    openclip_model = openclip_model.to(device).eval()
    openclip_tokenizer = open_clip.get_tokenizer('ViT-B-32')

    # Prepare image inputs for both models, then release the file handle.
    with Image.open(image_path) as image:
        clip_image = clip_preprocess(image).unsqueeze(0).to(device)
        openclip_image = openclip_preprocess(image).unsqueeze(0).to(device)

    # Original CLIP
    clip_text = clip.tokenize(text).to(device)
    with torch.no_grad():
        clip_image_features = clip_model.encode_image(clip_image)
        clip_text_features = clip_model.encode_text(clip_text)
        clip_sim = torch.cosine_similarity(clip_text_features, clip_image_features).item()

    # OpenCLIP
    openclip_text = openclip_tokenizer(text).to(device)
    with torch.no_grad():
        openclip_image_features = openclip_model.encode_image(openclip_image)
        openclip_text_features = openclip_model.encode_text(openclip_text)
        openclip_sim = torch.cosine_similarity(openclip_text_features, openclip_image_features).item()

    return {
        'Original CLIP': clip_sim * 100,
        'OpenCLIP (LAION)': openclip_sim * 100
    }

# Run the implementation comparison on the anomalous guess.
# Note: this rebinds `results` to a dict of model name -> score.
results = compare_clip_implementations(image_path=round0_image_path, text="[-h]")

print(f"Comparison for '[-h]':")
for model_name in results:
    score = results[model_name]
    print(f"{model_name}: {score:.3f}")
    

Comparison for '[-h]':
Original CLIP: 18.047
OpenCLIP (LAION): 22.719


In [12]:
# New function to calculate baseline
def calculate_baseline(model, device, baseline_text="[UNUSED]"):
    """Return the unit-normalized text embedding of ``baseline_text``.

    Args:
        model: Object exposing ``encode_text``.
        device: Device the tokenized text is moved to before encoding.
        baseline_text: Text whose embedding serves as the baseline.

    Returns:
        The ``encode_text`` output divided by its norm along the last
        dimension, computed with autograd disabled.
    """
    tokens = clip.tokenize(baseline_text).to(device)
    with torch.no_grad():
        embedding = model.encode_text(tokens)
        unit_embedding = embedding / embedding.norm(dim=-1, keepdim=True)
    return unit_embedding

# Modified test function with baseline adjustment
def run_tests_with_baseline(image_path, test_cases):
    """Score each text against an image and rescale by a baseline text's score.

    The adjusted score is ``(raw - baseline) / (1 - baseline)``, where ``raw``
    is the text/image cosine similarity and ``baseline`` is the similarity
    between the image and the embedding returned by ``calculate_baseline``.

    Args:
        image_path: Path of the image file to score against.
        test_cases: Iterable of text strings to evaluate.

    Returns:
        list[tuple[str, float, float]]: ``(text, raw * 100, adjusted * 100)``
        tuples sorted by adjusted score, highest first.
    """
    model, preprocess, device = load_clip_model()
    baseline = calculate_baseline(model, device)

    # Close the file handle as soon as the tensor has been materialised.
    with Image.open(image_path) as pil_image:
        image = preprocess(pil_image).unsqueeze(0).to(device)

    results = []
    # Pure inference: no autograd graph is needed for any of the encodings.
    with torch.no_grad():
        image_features = model.encode_image(image)
        image_features /= image_features.norm(dim=-1, keepdim=True)

        # Depends only on the image, so compute it once rather than per text.
        baseline_score = (baseline @ image_features.T).item()

        for text in test_cases:
            text_features = model.encode_text(clip.tokenize(text).to(device))
            text_features /= text_features.norm(dim=-1, keepdim=True)

            raw_score = (text_features @ image_features.T).item()
            adjusted_score = (raw_score - baseline_score) / (1 - baseline_score)

            results.append((text, raw_score*100, adjusted_score*100))

    return sorted(results, key=lambda x: x[2], reverse=True)

# Score every test case with the baseline adjustment applied.
adjusted_results = run_tests_with_baseline(round0_image_path, test_cases)

# Fixed-width table: text (truncated to 48 characters), raw score, adjusted score.
header = f"{'Text':<50} | {'Raw Score':>10} | {'Adjusted Score':>14}"
print(header)
print("-" * 80)
for text, raw, adj in adjusted_results:
    print(f"{text[:48]:<50} | {raw:10.3f} | {adj:14.3f}")

Text                                               |  Raw Score | Adjusted Score
--------------------------------------------------------------------------------
Cat sanctuary with woman wearing snoopy sweater    |     27.824 |         12.362
A cat shelter                                      |     21.721 |          4.951
[h]                                                |     18.952 |          1.588
[-]                                                |     18.207 |          0.684
Empty room                                         |     18.168 |          0.637
[--]                                               |     18.156 |          0.622
-                                                  |     18.068 |          0.516
[-h]                                               |     18.047 |          0.490
h                                                  |     17.729 |          0.104
Random text                                        |     17.467 |         -0.214
