### ViT默认配置

```python
    def __init__(
        self,
        hidden_size=768,
        num_hidden_layers=12,
        num_attention_heads=12,
        intermediate_size=3072,
        hidden_act="gelu",
        hidden_dropout_prob=0.0,
        attention_probs_dropout_prob=0.0,
        initializer_range=0.02,
        layer_norm_eps=1e-12,
        image_size=224,
        patch_size=16,
        num_channels=3,
        qkv_bias=True,
        encoder_stride=16,
        **kwargs,
    ):
```

In [2]:
from transformers import ViTConfig, ViTModel

# Create a model with randomly initialised weights from the default
# (vit-base-patch16-224 style) configuration.
model = ViTModel(ViTConfig())

# Read the configuration back from the model and display it.
configuration = model.config
print(configuration)

ViTConfig {
  "_attn_implementation_autoset": true,
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "image_size": 224,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "qkv_bias": true,
  "transformers_version": "4.49.0"
}



In [5]:
# !pip install datasets

使用预训练的ViT模型处理图像数据，并提取图像的最后一层隐藏状态。这种隐藏状态可以用于各种任务，如图像分类、特征提取等。

In [None]:
from transformers import AutoImageProcessor, ViTModel
import torch
from datasets import load_dataset

# Fetch the demo dataset and take the first image of its "test" split.
dataset = load_dataset("huggingface/cats-image", trust_remote_code=True)
image = dataset["test"]["image"][0]

# The processor and the model are loaded from the same pretrained checkpoint.
checkpoint = "google/vit-base-patch16-224-in21k"
image_processor = AutoImageProcessor.from_pretrained(checkpoint)
model = ViTModel.from_pretrained(checkpoint)

# Convert the image into PyTorch tensors the model accepts.
inputs = image_processor(image, return_tensors="pt")

# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    outputs = model(**inputs)

# Kept as the final bare expression so the notebook echoes the shape.
last_hidden_states = outputs.last_hidden_state
list(last_hidden_states.shape)

Downloading data: 100%|██████████| 173k/173k [00:00<00:00, 1.27MB/s]
Generating test split: 1 examples [00:00, 30.01 examples/s]
Fast image processor class <class 'transformers.models.vit.image_processing_vit_fast.ViTImageProcessorFast'> is available for this model. Using slow image processor class. To use the fast image processor class set `use_fast=True`.
Error while downloading from https://cdn-lfs.hf.co/google/vit-base-patch16-224-in21k/fd4e1169c7aa6c2dbfa8a6448be13b35abc0ee256190857c90009d12c094619b?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27model.safetensors%3B+filename%3D%22model.safetensors%22%3B&Expires=1740626380&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTc0MDYyNjM4MH19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5oZi5jby9nb29nbGUvdml0LWJhc2UtcGF0Y2gxNi0yMjQtaW4yMWsvZmQ0ZTExNjljN2FhNmMyZGJmYThhNjQ0OGJlMTNiMzVhYmMwZWUyNTYxOTA4NTdjOTAwMDlkMTJjMDk0NjE5Yj9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSoifV19&Signature=UabnbSYRBjfU

[1, 197, 768]

加载一个预训练的视觉Transformer（ViT）模型，并对一张图像进行掩码图像建模。

In [20]:
from transformers import AutoImageProcessor, ViTForMaskedImageModeling
import torch
from PIL import Image
import requests

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

# Load the pretrained image processor.
image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k")
# Load the pretrained ViT with a masked-image-modeling head.
# NOTE: the recorded run warns that the decoder and mask_token weights are
# newly initialised for this checkpoint, so the reconstruction is only a shape
# demonstration until the model is fine-tuned.
model = ViTForMaskedImageModeling.from_pretrained("google/vit-base-patch16-224-in21k")

# The patch count follows from the model's configured image size and patch size.
print("原本的图片大小: ", image.size)
print("patch大小: ", model.config.patch_size)

num_patches = (model.config.image_size // model.config.patch_size) ** 2
print("patch数量: ", num_patches)

# Turn the image into the pixel-value tensor the model expects.
pixel_values = image_processor(images=image, return_tensors="pt").pixel_values
# Random boolean mask with one entry per patch; True marks a patch to be masked.
bool_masked_pos = torch.randint(low=0, high=2, size=(1, num_patches)).bool()

# Inference only, so gradient tracking is switched off (as in the other cells).
with torch.no_grad():
    outputs = model(pixel_values, bool_masked_pos=bool_masked_pos)
loss, reconstructed_pixel_values = outputs.loss, outputs.reconstruction

# Print the shape explicitly: a bare expression in the middle of a cell is
# evaluated and discarded, so the shape was never displayed before.
print("重建像素值的形状: ", list(reconstructed_pixel_values.shape))

print("1表示批量大小，这里只处理一张图像。3表示图像的通道数，通常是RGB图像的三个通道。224*224表示重建像素值的形状。")

Fast image processor class <class 'transformers.models.vit.image_processing_vit_fast.ViTImageProcessorFast'> is available for this model. Using slow image processor class. To use the fast image processor class set `use_fast=True`.
Some weights of ViTForMaskedImageModeling were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['decoder.0.bias', 'decoder.0.weight', 'embeddings.mask_token']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


原本的图片大小:  (640, 480)
patch大小:  16
patch数量:  196
1表示批量大小，这里只处理一张图像。3表示图像的通道数，通常是RGB图像的三个通道。224*224表示重建像素值的形状。


加载一个预训练的视觉Transformer（ViT）模型，并对一张猫的图像进行分类。

In [None]:
from transformers import AutoImageProcessor, ViTForImageClassification
import torch
from datasets import load_dataset

# Fetch the demo dataset and take the first image of its "test" split.
dataset = load_dataset("huggingface/cats-image", trust_remote_code=True)
image = dataset["test"]["image"][0]

image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224")
model = ViTForImageClassification.from_pretrained("google/vit-base-patch16-224")

inputs = image_processor(image, return_tensors="pt")

with torch.no_grad():
    # Logits are the raw, unnormalised per-class scores produced by the model.
    logits = model(**inputs).logits

# The model predicts one of the 1000 ImageNet classes; argmax picks the index
# of the highest-scoring class.
predicted_label = logits.argmax(-1).item()
print(predicted_label)  # fixed: a stray trailing "1" made this line a SyntaxError
print(model.config.id2label[predicted_label])

# Print the five highest-scoring classes together with their scores.
result = logits.topk(5, dim=-1)
for score, class_index in zip(result.values[0].tolist(), result.indices[0].tolist()):
    print(f"{model.config.id2label[class_index]}: {score}")

Fast image processor class <class 'transformers.models.vit.image_processing_vit_fast.ViTImageProcessorFast'> is available for this model. Using slow image processor class. To use the fast image processor class set `use_fast=True`.


285
Egyptian cat
Egyptian cat: 12.4185791015625
tabby, tabby cat: 9.224590301513672
tiger cat: 8.243441581726074
lynx, catamount: 6.76153564453125
Siamese cat, Siamese: 5.189163684844971
