In [None]:
import os
import torch
from PIL import Image
from transformers import CLIPProcessor, CLIPModel, pipeline
import pandas as pd
%pip install open-clip-torch
import open_clip
import torch
from torchvision import transforms
from diffusers import DiffusionPipeline


In [None]:
# Load the SDXL base text-to-image pipeline.
# Half precision is only usable on a CUDA GPU: a diffusers pipeline loaded in float16
# cannot run on the CPU, so fall back to float32 when no GPU is available.
pipe = DiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    use_safetensors=True,
    variant="fp16",  # fp16 checkpoint files; weights are cast to `torch_dtype` on load
)
# The pipeline is moved to the selected device in a later cell.

# if using torch < 2.0
# pipe.enable_xformers_memory_efficient_attention()


In [None]:
# CLIP checkpoint: https://huggingface.co/laion/CLIP-ViT-B-32-laion2B-s34B-b79K
model, _, preprocess = open_clip.create_model_and_transforms('ViT-B-32', pretrained='laion2b_s34b_b79k')
# open_clip returns the model in train mode; switch to eval mode because the
# notebook only uses it for inference.
model.eval()
tokenizer = open_clip.get_tokenizer('ViT-B-32')

# Run both the diffusion pipeline and CLIP on the GPU when one is available.
device = "cuda" if torch.cuda.is_available() else "cpu"
pipe.to(device)
model = model.to(device)

In [None]:
# Load the source table from a parquet file.
# NOTE(review): hard-coded absolute local path (with an empty path segment after
# /Users) — this only resolves on the author's machine; consider a configurable
# data directory.
df = pd.read_parquet("/Users//SPUND/2025/hux/tabelle_rnd.parquet")

In [None]:
# Preview the first two rows of the freshly loaded table.
df.head(2)

In [None]:
# (rows, columns) of the loaded table.
df.shape

In [None]:
# Collapse each row's sequence of chunk strings into one space-separated string.
df["text_chunk"] = df["list_of_cunks"].map(" ".join)

In [None]:
# Preview the first two rows again, now including the new "text_chunk" column.
df.head(2)

In [None]:
# Spot-check one value: row position 1000, last column.
# NOTE(review): presumably the last column is the "text_chunk" column assigned in an
# earlier cell — the lookup is positional, so verify if the column layout changes.
df.iloc[1000,-1]

In [None]:
def get_clips_scores(text, n_images=25):
    """Generate images for ``text`` and score each one against the prompt with CLIP.

    For every image produced by the diffusion pipeline, the CLIP image embedding
    and the CLIP text embedding of ``text`` are unit-normalised and multiplied,
    which yields their cosine similarity.

    Relies on the notebook-level ``tokenizer``, ``model``, ``preprocess``,
    ``pipe`` and ``device`` objects.

    Args:
        text: Prompt used both for image generation and as the CLIP text input.
        n_images: Number of images to generate and score (default 25).

    Returns:
        list[float]: One cosine-similarity score per generated image, in
        generation order.
    """
    text_tokens = tokenizer(text).to(device)

    # The prompt embedding is identical for every generated image, so encode and
    # normalise it once, and do it without building an autograd graph.
    with torch.no_grad():
        text_features = model.encode_text(text_tokens)
        text_features /= text_features.norm(dim=-1, keepdim=True)

    clips = []

    for _ in range(n_images):
        result = pipe(prompt=text)
        image = result.images[0]
        # Add a leading batch dimension before handing the image to CLIP.
        image_tensor = preprocess(image).unsqueeze(0).to(device)

        with torch.no_grad():
            image_features = model.encode_image(image_tensor)
            image_features /= image_features.norm(dim=-1, keepdim=True)
            # Both vectors are unit length, so the dot product is the cosine similarity.
            clip_score = (image_features @ text_features.T).item()

        clips.append(clip_score)

    return clips
        

The following line calculates the **cosine similarity** between the image and text embeddings in CLIP's shared vector space:

```python
clip_score = (image_features @ text_features.T).item()
```

Breaking it down:

1. **`image_features @ text_features.T`** — The `@` operator performs matrix multiplication (dot product) between:
   - `image_features`: The normalized embedding vector of the generated image
   - `text_features.T`: The transposed (normalized) embedding vector of the text prompt

2. **Normalized vectors** — Since both vectors are unit-normalized (lines above: `image_features /= image_features.norm(...)` and `text_features /= text_features.norm(...)`), their dot product equals their cosine similarity.

3. **`.item()`** — Extracts the scalar value from the resulting single-element tensor.

**Result**: A similarity score between **-1 and 1**, where:
- **1** = perfect alignment (image matches text perfectly)
- **0** = no correlation
- **-1** = complete opposition

In this notebook, the score measures how well each image generated by the Stable Diffusion pipeline matches its text prompt — essentially a per-image quality metric for the text-to-image generation.

In [None]:
from tqdm import tqdm
# Register tqdm's pandas integration, which adds `progress_apply` to pandas objects.
tqdm.pandas()

In [None]:
# Score every text chunk. `progress_apply` (made available by `tqdm.pandas()`) behaves
# like `apply` but shows a progress bar — each row triggers a whole series of image
# generations, so this cell runs for a long time.
df["text_chunk_clips"] = df["text_chunk"].progress_apply(get_clips_scores)
# Persist the table, including the per-chunk lists of CLIP scores.
df.to_parquet("tabelle_2.parquet")