# Image Classification
# ViTForImageClassification

In [7]:
from transformers import ViTFeatureExtractor, ViTForImageClassification
from PIL import Image
import requests
import torch



In [20]:
url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
image = Image.open(requests.get(url, stream=True).raw)
feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224")
inputs = feature_extractor(image, return_tensors="pt")
inputs


{'pixel_values': tensor([[[[ 0.1137,  0.1686,  0.1843,  ..., -0.1922, -0.1843, -0.1843],
          [ 0.1373,  0.1686,  0.1843,  ..., -0.1922, -0.1922, -0.2078],
          [ 0.1137,  0.1529,  0.1608,  ..., -0.2314, -0.2235, -0.2157],
          ...,
          [ 0.8353,  0.7882,  0.7333,  ...,  0.7020,  0.6471,  0.6157],
          [ 0.8275,  0.7961,  0.7725,  ...,  0.5843,  0.4667,  0.3961],
          [ 0.8196,  0.7569,  0.7569,  ...,  0.0745, -0.0510, -0.1922]],

         [[-0.8039, -0.8118, -0.8118,  ..., -0.8902, -0.8902, -0.8980],
          [-0.7882, -0.7882, -0.7882,  ..., -0.8745, -0.8745, -0.8824],
          [-0.8118, -0.8039, -0.7882,  ..., -0.8902, -0.8902, -0.8902],
          ...,
          [-0.2706, -0.3176, -0.3647,  ..., -0.4275, -0.4588, -0.4824],
          [-0.2706, -0.2941, -0.3412,  ..., -0.4824, -0.5451, -0.5765],
          [-0.2784, -0.3412, -0.3490,  ..., -0.7333, -0.7804, -0.8353]],

         [[-0.5451, -0.4667, -0.4824,  ..., -0.7412, -0.6941, -0.7176],
          [-0

In [21]:
inputs["pixel_values"].shape

torch.Size([1, 3, 224, 224])

In [22]:

model = ViTForImageClassification.from_pretrained("google/vit-base-patch16-224")
outputs = model(**inputs)
outputs

SequenceClassifierOutput(loss=None, logits=tensor([[-2.7440e-01,  8.2152e-01, -8.3649e-02,  4.1587e-01,  5.6233e-01,
          1.8593e-01, -5.7729e-01, -4.6004e-01, -5.3389e-01,  2.4016e-01,
         -3.1957e-01, -5.9910e-01, -6.6403e-01, -4.9756e-01, -6.2448e-01,
         -1.3501e+00, -1.0016e-01, -6.2171e-01,  1.1087e-01, -1.1060e+00,
         -2.0846e-01,  3.1696e-01, -9.3153e-01, -3.0693e-01, -1.0124e+00,
         -1.8751e-01,  5.8825e-01, -3.6161e-01, -7.4697e-01,  7.4134e-01,
         -3.6653e-01, -2.7586e-01,  3.6595e-01, -1.1206e+00, -8.8848e-02,
         -1.1328e+00,  1.5458e-01, -1.0399e+00,  1.0136e+00, -1.0395e+00,
         -2.4214e+00,  5.1124e-01,  4.9458e-01, -7.4005e-01, -1.5815e+00,
         -3.2452e-01, -2.0448e+00, -4.8128e-01, -6.3616e-01, -1.1355e+00,
         -1.0902e+00, -4.5298e-02, -6.4045e-01, -2.3987e-01,  1.3110e-01,
         -1.2665e+00, -4.7161e-01, -4.3717e-01, -9.5664e-01, -5.9686e-01,
          5.0885e-01, -8.4840e-02,  2.6987e-01, -1.5028e-03, -5.3330e

In [23]:
outputs.logits.shape

torch.Size([1, 1000])

In [49]:
probabilities = outputs.logits.softmax(-1)[0]
topk = 10
scores, ids = probabilities.topk(topk)
scores.tolist(), ids.tolist()

([0.9374412894248962,
  0.038442566990852356,
  0.014411398209631443,
  0.003274323185905814,
  0.0006795922527089715,
  0.00011758987966459244,
  0.00010496059985598549,
  0.00010180875688092783,
  9.21403625397943e-05,
  6.921341264387593e-05],
 [285, 281, 282, 287, 284, 283, 289, 293, 785, 292])

In [51]:
predictions = [{"score": score, "label": model.config.id2label[_id.item()]} for score, _id in zip(scores, ids)]
predictions

[{'score': tensor(0.9374, grad_fn=<UnbindBackward0>), 'label': 'Egyptian cat'},
 {'score': tensor(0.0384, grad_fn=<UnbindBackward0>),
  'label': 'tabby, tabby cat'},
 {'score': tensor(0.0144, grad_fn=<UnbindBackward0>), 'label': 'tiger cat'},
 {'score': tensor(0.0033, grad_fn=<UnbindBackward0>),
  'label': 'lynx, catamount'},
 {'score': tensor(0.0007, grad_fn=<UnbindBackward0>),
  'label': 'Siamese cat, Siamese'},
 {'score': tensor(0.0001, grad_fn=<UnbindBackward0>), 'label': 'Persian cat'},
 {'score': tensor(0.0001, grad_fn=<UnbindBackward0>),
  'label': 'snow leopard, ounce, Panthera uncia'},
 {'score': tensor(0.0001, grad_fn=<UnbindBackward0>),
  'label': 'cheetah, chetah, Acinonyx jubatus'},
 {'score': tensor(9.2140e-05, grad_fn=<UnbindBackward0>),
  'label': 'seat belt, seatbelt'},
 {'score': tensor(6.9213e-05, grad_fn=<UnbindBackward0>),
  'label': 'tiger, Panthera tigris'}]

# Image Classification using a Pipeline

In [1]:
from transformers import pipeline
from PIL import Image
import requests

clf = pipeline("image-classification")

2022-02-19 08:54:19.815157: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-02-19 08:54:19.815194: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
No model was supplied, defaulted to google/vit-base-patch16-224 (https://huggingface.co/google/vit-base-patch16-224)


In [2]:
url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
image = Image.open(requests.get(url, stream=True).raw)

In [6]:
clf(url)

[{'score': 0.9374412894248962, 'label': 'Egyptian cat'},
 {'score': 0.038442566990852356, 'label': 'tabby, tabby cat'},
 {'score': 0.014411398209631443, 'label': 'tiger cat'},
 {'score': 0.003274323185905814, 'label': 'lynx, catamount'},
 {'score': 0.0006795922527089715, 'label': 'Siamese cat, Siamese'}]