# CLIP image-text matching test

In [None]:
# import test
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import torch
import torch.nn.functional as F

In [3]:
# Load the pretrained CLIP model and its matching processor.

# Fall back to the CPU when no CUDA device is available so the notebook
# still runs on machines without a GPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [None]:
# Candle inside a vacuum chamber: compare two opposite captions against the image.

image = Image.open("image/A candle is placed inside a sealed glass vacuum chamber.png")
texts = ["The light is on inside the vacuum chamber.", "The light is off inside the vacuum chamber."]

inputs = processor(text=texts, images=image, return_tensors="pt", padding=True)
# Move the inputs to whatever device the model lives on instead of assuming CUDA.
inputs = {k: v.to(model.device) for k, v in inputs.items()}

# Inference only: disable autograd so no computation graph is built or retained.
with torch.no_grad():
    outputs = model(**inputs)

logits_per_image = outputs.logits_per_image
# Softmax over the caption axis turns the image-text logits into probabilities.
probs = logits_per_image.softmax(dim=1)
print(probs)  # probability of each caption being the best match for the image


tensor([[0.5003, 0.4997]], device='cuda:0', grad_fn=<SoftmaxBackward0>)


In [35]:
# Candle in a vacuum chamber: the image plus the candidate captions to score.
# Caption labels follow the first entries' "(logically ..., physically ...)" form;
# the abbreviated "true/false" pairs presumably keep that same order.
image = Image.open(
    "image/A candle is placed inside a sealed glass vacuum chamber.png"
)

# Multiple text descriptions
texts = [
    # the prompt itself
    "A candle is placed inside a sealed glass vacuum chamber",
    # logically true, physically true
    "A candle is burning in a glass container.",
    # true, true
    "A lit candle is placed inside a transparent glass dome.",
    # logically false, physically true
    "A flame continues to burn without any air.",
    # logically false, physically true
    "A candle is on fire inside a chamber with no oxygen.",
    # true, false
    "A flame can not continue to burn without any air.",
    # true, false
    "A candle is on fire inside a chamber with oxygen.",
    # true, false
    "It is cold inside the vacuum chamber",
    # false, true
    "It is hot inside the vacuum chamber",
    # false, false
    "There is a person inside the vacuum chamber",
]

In [28]:
# Dancing in front of a mirror: the image plus the candidate captions to score.
image = Image.open(
    "image/A person is dancing in front of a mirror.png"
)

# Multiple text descriptions (labels are the author's logical / physical judgement)
texts = [
    # the prompt itself
    "A person is dancing in front of a mirror",
    # logically true, physically true
    "A dancer is facing a mirror during practice",
    # logically false, physically true
    "A person is doing something different from a mirror.",
    # logically true, physically false
    "A person is doing same thing from a mirror.",
    # logically true, physically false
    "Two people are dancing face to face.",
    # unlabeled captions
    "Two people are dancing.",
    "The image of a chicken",
]

In [None]:
# An elephant and a mouse on a seesaw -- but the seesaw tilts toward the mouse.
image = Image.open(
    "image/An elephant and a mouse stand on either side of a seesaw.png"
)

# Multiple text descriptions (true/false labels are the author's annotations)
texts = [
    # false
    "a mouse heavier than an elephant",
    # true
    "an elephant heavier than a mouse",
    # false
    "The seesaw tilted toward the mouse",
    # true
    "The seesaw tilted toward the elephant",
    # the prompt itself
    "An elephant and a mouse stand on either side of a seesaw",
    # false, false
    "A elephant and a snake are making a swing.",
]

In [61]:
# Accuracy check: cosine similarity between the current image and every caption.
#
# NOTE: this cell does not define `image` or `texts`; it scores whatever the most
# recently executed scenario cell assigned to them.

# Preprocess inputs. The image is passed once rather than duplicated per caption,
# so it is preprocessed and encoded a single time.
inputs = processor(text=texts, images=image, return_tensors="pt", padding=True)
# Move the inputs to whatever device the model lives on instead of assuming CUDA.
inputs = {k: v.to(model.device) for k, v in inputs.items()}

# Get features (inference only, so autograd is disabled)
with torch.no_grad():
    image_features = model.get_image_features(pixel_values=inputs["pixel_values"])  # expected shape: (1, D)
    text_features = model.get_text_features(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
    )  # expected shape: (N, D), one row per caption

# L2-normalize so the dot product below is the cosine similarity
image_features = F.normalize(image_features, p=2, dim=-1)
text_features = F.normalize(text_features, p=2, dim=-1)

# Cosine similarity of the single image against each caption; the one image row
# broadcasts across all caption rows before summing over the feature axis.
similarities = torch.sum(image_features * text_features, dim=1)

# Print result
for text, sim in zip(texts, similarities):
    print(f"Similarity with \"{text}\": {sim.item():.4f}")

Similarity with "a mouse heavier than an elephant": 0.3025
Similarity with "an elephant heavier than a mouse": 0.2958
Similarity with "The seesaw tilted toward the mouse": 0.3105
Similarity with "The seesaw tilted toward the elephant": 0.3353
Similarity with "An elephant and a mouse stand on either side of a seesaw": 0.3706
Similarity with "A elephant and a snake are making a car.": 0.2796


In [10]:
# Rock-paper-scissors with a mirror (the reflection differs from the person).

image = Image.open("image/A person is playing rock-paper-scissors with a mirror.png")
texts = ["There is a person ", "The seesaw tilted toward the mouse."]

inputs = processor(text=texts, images=image, return_tensors="pt", padding=True)
# Move the inputs to whatever device the model lives on instead of assuming CUDA.
inputs = {k: v.to(model.device) for k, v in inputs.items()}

# Inference only: disable autograd so no computation graph is built or retained.
with torch.no_grad():
    outputs = model(**inputs)

logits_per_image = outputs.logits_per_image
# Softmax over the caption axis turns the image-text logits into probabilities.
probs = logits_per_image.softmax(dim=1)
print(probs)  # probability of each caption being the best match for the image
print(logits_per_image)  # raw image-text logits, before the softmax

tensor([[0.9974, 0.0026]], device='cuda:0', grad_fn=<SoftmaxBackward0>)
tensor([[21.8407, 15.8862]], device='cuda:0', grad_fn=<TBackward0>)
