In [9]:
from transformers import SegformerImageProcessor, ResNetForImageClassification
import torch
from datasets import load_dataset

dataset = load_dataset("huggingface/cats-image")
# Index the row first and the column second, so only this one image needs to be decoded.
image = dataset["test"][0]["image"]

# The image processor does the basic preprocessing (resize, rescale, normalize),
# usually to match the image size and value range the model was trained on.
image_processor = SegformerImageProcessor.from_pretrained(
    "microsoft/resnet-18",
    # Skip resizing: in your own pipeline the transformations already resize
    # the image to the appropriate size (224, 224).
    do_resize=False,
    # Rescaling is the division by 255 that maps pixel values to [0, 1].
    # Note the spelling: the option is `do_rescale`; a misspelled keyword is not
    # recognised by the processor and leaves rescaling at its default.
    # It stays on here because no cell in this notebook scales the example image
    # beforehand. Set it to False only when your own transformations already
    # produce values in [0, 1], otherwise the image would be scaled twice.
    do_rescale=True,
    # Keep this on: the image is normalized with the ImageNet mean and std.
    do_normalize=True,
)


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [38]:
# `from_pretrained` plays the same role as loading from a checkpoint in the previous example.
# The Hugging Face Hub hosts pretrained models, much like GitHub hosts code:
# given a model name, the weights are downloaded from the Hub and cached locally,
# so later calls can reuse the cached copy.
# A path to a local directory can be passed instead of a Hub model name.
classifier_overrides = {
    # we only have 2 labels
    "num_labels": 2,
    # the checkpoint was trained to predict 1000 classes but we only need 2,
    # so the mismatched classifier weights are skipped instead of raising an error
    "ignore_mismatched_sizes": True,
}
model = ResNetForImageClassification.from_pretrained("microsoft/resnet-18", **classifier_overrides)
# A warning about the mismatched classifier weights is expected here and can be ignored:
# the new 2-class head is freshly initialized and still has to be trained.
                                                     
                                                     

Some weights of ResNetForImageClassification were not initialized from the model checkpoint at microsoft/resnet-18 and are newly initialized because the shapes did not match:
- classifier.1.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([2]) in the model instantiated
- classifier.1.weight: found shape torch.Size([1000, 512]) in the checkpoint and torch.Size([2, 512]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [39]:
# Inspect the model layer by layer: output shapes and parameter counts.
import torchinfo

single_image_batch = (1, 3, 224, 224)  # one 3-channel 224x224 image
torchinfo.summary(model, input_size=single_image_batch)

Layer (type:depth-idx)                                            Output Shape              Param #
ResNetForImageClassification                                      [1, 2]                    --
├─ResNetModel: 1-1                                                [1, 512, 1, 1]            --
│    └─ResNetEmbeddings: 2-1                                      [1, 64, 56, 56]           --
│    │    └─ResNetConvLayer: 3-1                                  [1, 64, 112, 112]         9,536
│    │    └─MaxPool2d: 3-2                                        [1, 64, 56, 56]           --
│    └─ResNetEncoder: 2-2                                         [1, 512, 7, 7]            --
│    │    └─ModuleList: 3-3                                       --                        11,166,976
│    └─AdaptiveAvgPool2d: 2-3                                     [1, 512, 1, 1]            --
├─Sequential: 1-2                                                 [1, 2]                    --
│    └─Flatten: 2-4               

In [12]:
# As the summary shows, `Sequential: 1-2` is the classification head (a Flatten followed by the final classifier layer), similar to the classifier head in the previous example.

In [42]:
import numpy as np

# `X` plays the role of the image in the kaggle code and `y` is its class label.
# The dataset loaded earlier is a cat image dataset, so the label is class 0.
X = np.array(image)
y = np.array([[0]])  # label of this image: class 0

# The image has to be preprocessed before it can be passed to the model.
# The label is handed to the processor as its `segmentation_maps` argument.
inputs = image_processor(X, segmentation_maps=y, return_tensors="pt")
outputs = model(**inputs)

# Logits are unnormalized scores: real values that can be negative or positive,
# whereas probabilities lie between 0 and 1.
# Normalizing the logits with the softmax function turns them into probabilities.
logits = outputs.logits

# Some models in the transformers library compute the loss themselves when labels (y) are provided.
# Calling backward on this loss computes the gradients used to update the model weights.
loss = outputs.loss

probabilities = torch.softmax(logits, dim=-1)
predicted_class = probabilities.argmax(dim=-1)
# taking the argmax of the logits directly gives the same result

print(f"Predicted class: {predicted_class.item()}")
print(f"logits: {logits}")
print(f"Loss: {loss.item()}")


Predicted class: 1
logits: tensor([[-32.6756,  -9.8944]], grad_fn=<AddmmBackward0>)
Loss: 22.78120231628418
