In [44]:
import torch
from PIL import Image
import requests

In [2]:
import transformers

In [45]:
device = "cuda"
torch_dtype = torch.float16

In [59]:
model = transformers.CLIPModel.from_pretrained("openai/clip-vit-base-patch32", device_map=device, torch_dtype=torch_dtype)
processor = transformers.CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

In [47]:
tokenizer = transformers.CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")

In [48]:
tokenizer.batch_decode([[49406,   320,  1125,   539,   320,  2368, 49407],[49406,   320,  1125,   539,   320,  1929, 49407]])

['<|startoftext|>a photo of a cat <|endoftext|>',
 '<|startoftext|>a photo of a dog <|endoftext|>']

In [49]:
url = "http://images.cocodataset.org/val2017/000000039769.jpg"

In [50]:
image = Image.open(requests.get(url, stream=True).raw)

In [51]:
inputs = processor(text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True)

In [52]:
inputs['input_ids']

tensor([[49406,   320,  1125,   539,   320,  2368, 49407],
        [49406,   320,  1125,   539,   320,  1929, 49407]])

In [53]:
inputs.to(device)

{'input_ids': tensor([[49406,   320,  1125,   539,   320,  2368, 49407],
        [49406,   320,  1125,   539,   320,  1929, 49407]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1]], device='cuda:0'), 'pixel_values': tensor([[[[ 0.5873,  0.5873,  0.6165,  ...,  0.0617,  0.0471, -0.0259],
          [ 0.5727,  0.5727,  0.6603,  ...,  0.1201,  0.0763,  0.0909],
          [ 0.5873,  0.5435,  0.6165,  ...,  0.0325,  0.1201,  0.0617],
          ...,
          [ 1.8719,  1.8573,  1.8719,  ...,  1.3902,  1.4340,  1.4194],
          [ 1.8281,  1.8719,  1.8427,  ...,  1.4486,  1.4340,  1.5070],
          [ 1.8573,  1.9011,  1.8281,  ...,  1.3756,  1.3610,  1.4486]],

         [[-1.3169, -1.3019, -1.3169,  ..., -1.4970, -1.4369, -1.4820],
          [-1.2418, -1.2718, -1.2268,  ..., -1.4369, -1.4669, -1.4519],
          [-1.2568, -1.3169, -1.2268,  ..., -1.4669, -1.4069, -1.4519],
          ...,
          [ 0.1239,  0.1089,  0.1239,  ..., -0.7016, -

In [54]:
with torch.no_grad():
    with torch.autocast(device):
        outputs = model(**inputs)

In [55]:
logits_per_image = outputs.logits_per_image

In [56]:
probs = logits_per_image.softmax(dim=1)

In [57]:
probs

tensor([[0.9946, 0.0051]], device='cuda:0')

# FineTuning From Pretrained Vision and Language Models

In [62]:
import datasets

In [66]:
!wget -P ../data/ https://github.com/jbrownlee/Datasets/releases/download/Flickr8k/Flickr8k_text.zip
!wget -P ../data/ https://github.com/jbrownlee/Datasets/releases/download/Flickr8k/Flickr8k_Dataset.zip

--2024-08-18 13:34:45--  https://github.com/jbrownlee/Datasets/releases/download/Flickr8k/Flickr8k_text.zip
Resolving github.com (github.com)... 140.82.116.3
Connecting to github.com (github.com)|140.82.116.3|:443... connected.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


HTTP request sent, awaiting response... 302 Found
Location: https://objects.githubusercontent.com/github-production-release-asset-2e65be/124585957/47f52b80-3501-11e9-8d2e-dd69a21a4362?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=releaseassetproduction%2F20240818%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20240818T203446Z&X-Amz-Expires=300&X-Amz-Signature=01cd74764068186326da800b7366adfcb6bed8fe383d1de51045e73f57237193&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=124585957&response-content-disposition=attachment%3B%20filename%3DFlickr8k_text.zip&response-content-type=application%2Foctet-stream [following]
--2024-08-18 13:34:46--  https://objects.githubusercontent.com/github-production-release-asset-2e65be/124585957/47f52b80-3501-11e9-8d2e-dd69a21a4362?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=releaseassetproduction%2F20240818%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20240818T203446Z&X-Amz-Expires=300&X-Amz-Signature=01cd74764068186326da800b7366adfcb6bed8fe383d1de5

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


HTTP request sent, awaiting response... 302 Found
Location: https://objects.githubusercontent.com/github-production-release-asset-2e65be/124585957/47f52b80-3501-11e9-8f49-4515a2a3339b?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=releaseassetproduction%2F20240818%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20240818T203447Z&X-Amz-Expires=300&X-Amz-Signature=b4bfa3bdb14854df501192ffc86e1e2bb89a60fefa059abd99b43157d62769de&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=124585957&response-content-disposition=attachment%3B%20filename%3DFlickr8k_Dataset.zip&response-content-type=application%2Foctet-stream [following]
--2024-08-18 13:34:47--  https://objects.githubusercontent.com/github-production-release-asset-2e65be/124585957/47f52b80-3501-11e9-8f49-4515a2a3339b?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=releaseassetproduction%2F20240818%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20240818T203447Z&X-Amz-Expires=300&X-Amz-Signature=b4bfa3bdb14854df501192ffc86e1e2bb89a60fefa059

In [69]:
ds = datasets.load_dataset("atasoglu/flickr8k-dataset", data_dir="../data")

FileNotFoundError: Couldn't find file at https://huggingface.co/datasets/atasoglu/flickr8k-dataset/resolve/main/../data/Flickr8k_Dataset.zip