OVERALL From : https://www.youtube.com/watch?v=mn5QDKQ54dQ

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms, models
from collections import Counter
import os

following about data augmentation: https://pytorch.org/vision/main/transforms.html

Torchvision supports common computer vision transformations in the torchvision.transforms and torchvision.transforms.v2 modules. Transforms can be used to transform or augment data for training or inference of different tasks (image classification, detection, segmentation, video classification).

In [3]:
# Define data transformations for data augmentation and normalization
# https://pytorch.org/vision/0.9/transforms.html

# Link: https://neptune.ai/blog/data-augmentation-in-python
# Data augmentation is a technique that can be used to artificially
# expand the size of a training set by creating modified data from the
# existing one. It is a good practice to use DA if you want to prevent 
# overfitting, or the initial dataset is too small to train on, or even 
# if you want to squeeze better performance from your model.

# Benefits of data augmenting:
# 1. prevent overfitting
# 2. improving the performance of the model 

# Techniques of data augmentation used here (for images):
# 1. geometric transformations: randomly flip, crop, rotate images 
# 2. color space transformations: change RGB color channels

transforms = {
    'train': transforms.Compose([
        # ADDED IN FROM https://pytorch.org/vision/0.9/transforms.html
    
        # horizontally flip the image randomly with given probability
        transforms.RandomHorizontalFlip(p=0.5),
        # randomly resize and crop all the input images to 224 pixel size
        transforms.RandomResizedCrop(size=(224, 224), antialias=True),
        # rotate the image by angle
        transforms.RandomRotation(degrees=90),
        # randomly convert image to grayscale with probability of p
        # (default 0.1)
        transforms.RandomGrayscale(p=0.1),
        # performs a random persepctive transformation of the image with
        # given probability 
        transforms.RandomPerspective(distortion_scale=0.5, p=0.5),
        # vertically flip the image randomly with given probability
        transforms.RandomVerticalFlip(p=0.5),
        # converting all the image data to a tensor BECAUSE PyTorch accepts
        # the data in the form of tensor
        # when image is transformed into PyTorch tensor, the pixel values 
        # are scaled between 0.0 and 1.0
        # converts the PIL image with a pixel range of [0, 255] to a PyTorch
        # FloatTensor of shape (C, H, W) with range [0.0, 1.0]
        # this is a MANDATORY step
        transforms.ToTensor(),
        # Link : https://www.geeksforgeeks.org/how-to-normalize-images-in-pytorch/
        # normalizing images means transforming them into such values that
        # the means and std dev of the image become 0.0 and 1.0 respectively
        # normalization helps get data within a range and reduces the 
        # skewness which helps learn faster and better, and can also tackle
        # the diminishing and exploding gradients problem
        # first parameter is the mean, second is the std
        # the mean and std of ImageNet are: mean = [0.485, 0.465, 0.406]
        # and std = [0.229, 0.224, 0.225]
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ]),
    # repeat steps from training data to testing (validation) data 
    'test': transforms.Compose([    
        transforms.RandomHorizontalFlip(p=0.5),
        transforms.RandomResizedCrop(size=(224, 224), antialias=True),
        transforms.RandomRotation(degrees=90),
        transforms.RandomGrayscale(p=0.1),
        transforms.RandomPerspective(distortion_scale=0.5, p=0.5),
        transforms.RandomVerticalFlip(p=0.5),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ]),
}

# performing this augmentation allows us to provide a variety of different scenarios to a model so that our model will 
# learn to generalize in a better way



In [4]:
# Define the data directory
# image_classification notebook is in the same source/repos directory in local and Jupyter Notebook directory
# this Notebook has to be in the same location as the dataset folder
dataset_directory = 'dataset'

# # Create data loaders
# # data loaders are responsible to load the data so we are loading the training and testing data 
# # inside dataset folder, there are two folders called train and test containing all the folders of the images
# datasets_images = {img: datasets.ImageFolder(os.path.join(dataset_directory, img), transforms[img]) 
#                    for img in dataset_directory}

# print(datasets_images)

# torch.utils.data.random_split(dataset, lengths)


# Create data loaders
# data loaders are responsible to load the data so we are loading the training and testing data 
# inside dataset folder, there are two folders called train and test containing all the folders of the images
datasets_images = {img: datasets.ImageFolder(os.path.join(dataset_directory, img), 
                                          transforms[img]) for img in ['train', 'test']}
print(datasets_images)

# # 80% of the d's go into train
train_size = int(0.8 * len(datasets_images['train']))
test_size = len(datasets_images['train']) - train_size
# datasets_images['train'] contains all original images in their respective folders
train_dataset, test_dataset = torch.utils.data.random_split(datasets_images['train'], [train_size, test_size])
print(len(train_dataset))
print(len(test_dataset))

{'train': Dataset ImageFolder
    Number of datapoints: 700
    Root location: dataset\train
    StandardTransform
Transform: Compose(
               RandomHorizontalFlip(p=0.5)
               RandomResizedCrop(size=(224, 224), scale=(0.08, 1.0), ratio=(0.75, 1.3333), interpolation=bilinear, antialias=True)
               RandomRotation(degrees=[-90.0, 90.0], interpolation=nearest, expand=False, fill=0)
               RandomGrayscale(p=0.1)
               RandomPerspective(p=0.5)
               RandomVerticalFlip(p=0.5)
               ToTensor()
               Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
           ), 'test': Dataset ImageFolder
    Number of datapoints: 700
    Root location: dataset\test
    StandardTransform
Transform: Compose(
               RandomHorizontalFlip(p=0.5)
               RandomResizedCrop(size=(224, 224), scale=(0.08, 1.0), ratio=(0.75, 1.3333), interpolation=bilinear, antialias=True)
               RandomRotation(degrees=[-90.0, 90

In [5]:
# # first parameter: training_data
# # shuffle means while training the data it will shuffle the data
# # num_workers means parallelizing the process (4 different processes can work at the same time)
# dataloaders = {img: DataLoader(datasets_images[img], batch_size=4, shuffle=True, num_workers=4) for img in ['train', 'test']}
# datasets_sizes = {img: len(datasets_images[img]) for img in ['train', 'test']}
# # number of images per "train" and "test" folder
# print(datasets_sizes)

# # number of folders and their names in the "train" and "test" folders
# # these are the letters we are working with
# class_names = datasets_images['train'].classes
# print(class_names)

# first parameter: training_data
# shuffle means while training the data it will shuffle the data
# num_workers means parallelizing the process (4 different processes can work at the same time)
dataloaders = {img: DataLoader(datasets_images[img], batch_size=4, shuffle=True, num_workers=4) for img in ['train', 'test']}
datasets_sizes = {img: len(datasets_images[img]) for img in ['train', 'test']}
# number of images per "train" and "test" folder
print(datasets_sizes)

# number of folders and their names in the "train" and "test" folders
# these are the letters we are working with
class_names = datasets_images['train'].classes
print(class_names)

print(train_dataset.classes)





{'train': 700, 'test': 700}
['d', 'e', 'h', 'l', 'o', 'r', 'w']


AttributeError: 'Subset' object has no attribute 'classes'

In [5]:
# Load the pre-trained ResNet-18 model
# pretrained = true means we are using a pretrained model
# picking the model from torch vision model zoo
model = models.resnet18(pretrained=True)

# Freeze all layers except the final classification layer and then fine tune this model on our custom data set 
# to detect whether the given image to a model is what letter
# freezing all layers except the final classification layers which is responsible for performing classfication
for name, param in model.named_parameters():
    # if the paramater contains this FC (FC means fully connected layer), then set the required grads equal to 
    # true
    if "fc" in name:  # Unfreeze the final classification layer
        param.requires_grad = True
    # if FC is not in the parameter, then set the required grads equal to false
    else:
        param.requires_grad = False
    # what happens is wherever fc parameter is present, all those layers will be trained because we are setting
    # the value true over there and wherever we have written false, all those layers will be freeze. this is how
    # we freeze all the layers and we can only fine tune the final layer

# Define the loss function and optimizer
# whenever working on image classification, this is most commonly method to calculate loss. 
criterion = nn.CrossEntropyLoss()
# optimizer is a stochastic gradient descent optimizer (lr is learning range and momentum)
# CAN CHANGE THESE VALUES to test how the model performs
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)  # Use all parameters


# Move the model to the GPU if available
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# sending all the models here to device
model = model.to(device)




In [6]:
# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    for phase in ['train', 'test']:
        if phase == 'train':
            model.train()
        else:
            model.eval()

        # these two variables are used to store the loss and the correct 
        # predictions inside every epoch
        running_loss = 0.0
        running_corrects = 0

        for inputs, labels in dataloaders[phase]:
            # inputs means the image 
            inputs = inputs.to(device)
            # labels means the output label (the class name)
            labels = labels.to(device)
            # all being sent to device because the model is on the device.
            # which device depends on whether this is running on CPU or GPU
            
            # clear the gradients from the previous iterations 
            optimizer.zero_grad()

            with torch.set_grad_enabled(phase == 'train'):
                # if phase is train, then we are using the model to make predictions 
                # and providing the inputs which are the images 
                outputs = model(inputs)
                # prediction of the model is stored in outputs
                # this line is responsible for showing you the predictions 
                _, preds = torch.max(outputs, 1)
                # now providing the outputs which are the predictions 
                # "outputs" is the predictions of the model and "labels" is the actual labels
                # so we are comparing the output label and our current label, on the basis
                # of that we are getting the loss value 
                loss = criterion(outputs, labels)

                # how deep learning/CNN works: 
                # there is backward pass and forward pass. first we go the forward pass. 
                # forward pass means you provide input to the model, the model processes
                # the image and provide you the corresponding label. then in back propogation, 
                # we have backward pass. during backward pass, we calculate the gradients and 
                # then update the weights and after that we again use the forward pass to work
                # on the model again, train it with the updated weights. 
                
                # if the phase is train, we will perform backward pass. 
                if phase == 'train':
                    # in backward pass, we are calculating the gradients
                    loss.backward()
                    # and then we are updating the weights on the basis of calculated gradients 
                    optimizer.step()

            # storing all the losses and all the correct predictions 
            running_loss += loss.item() * inputs.size(0)
            running_corrects += torch.sum(preds == labels.data)
            
        # these two lines helping you to see the epoch loss and epoch accuracy
        epoch_loss = running_loss / dataset_sizes[phase]
        epoch_acc = running_corrects.double() / dataset_sizes[phase]

        print(f'{phase} Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f}')

print("Training complete!")


train Loss: 3.3654 Acc: 0.1524
test Loss: 1.8379 Acc: 0.3476
train Loss: 1.8918 Acc: 0.3405
test Loss: 1.0831 Acc: 0.5833
train Loss: 1.6896 Acc: 0.4167
test Loss: 0.8927 Acc: 0.7238
train Loss: 1.6497 Acc: 0.4429
test Loss: 0.6354 Acc: 0.8238
train Loss: 1.4886 Acc: 0.4595
test Loss: 0.5863 Acc: 0.8048
train Loss: 1.3913 Acc: 0.5167
test Loss: 0.5167 Acc: 0.8405
train Loss: 1.2768 Acc: 0.5476
test Loss: 0.4021 Acc: 0.8762
train Loss: 1.2814 Acc: 0.5357
test Loss: 0.4010 Acc: 0.8762
train Loss: 1.3050 Acc: 0.5238
test Loss: 0.4085 Acc: 0.8571
train Loss: 1.3541 Acc: 0.5119
test Loss: 0.4406 Acc: 0.8381
Training complete!


In [7]:
# Save the model
torch.save(model.state_dict(), 'letter_classification_model.pth')

# Classification on Unseen Image

To use the saved model to classify unseen images, you need to load the model and then apply it to the new images for inference. 

In [8]:
import torch
from torchvision import models, transforms
from PIL import Image

# Load the saved model
model = models.resnet18(pretrained=True)
# the pretrained model is trained on imageNet dataset with a thousand classes 
# freezing all the layers except for the last layer means we are using transfer learning which
# means the model is already trained on some data 
# that means this model is ready to extract the features so we can use that knowledge from the 
# pretrained model to extract the feature and in the final layer we are only using the two 
# neurons which are responsible for telling us what letter class it is
model.fc = nn.Linear(model.fc.in_features, 1000)  # Adjust to match the original model's output units
model.load_state_dict(torch.load('letter_classification_model.pth'))
model.eval()

# Create a new model with the correct final layer
new_model = models.resnet18(pretrained=True)
# 7 for the number of classes
new_model.fc = nn.Linear(new_model.fc.in_features, 7)  # Adjust to match the desired output units

# Copy the weights and biases from the loaded model to the new model
new_model.fc.weight.data = model.fc.weight.data[0:2]  # Copy only the first 2 output units
new_model.fc.bias.data = model.fc.bias.data[0:2]

Prepare your new image for classification. You should use the same data transformations you used during training. Here's an example of how to prepare an image for inference:

In [9]:
# REPEAT steps from training data also on this testing data 

# Load and preprocess the unseen image
image_path = 'E_test_image.jpg'  # Replace with the path to your image
image = Image.open(image_path)
preprocess = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])
# all these tasks are in preprocess variable
# in preprocess, we want to preprocess the image
input_tensor = preprocess(image)
# adding a batch dimension
input_batch = input_tensor.unsqueeze(0)  


Perform inference using the model:

In [10]:
# Perform inference
with torch.no_grad():
    # providing the input with input_batch
    output = model(input_batch)

# Get the predicted class
_, predicted_class = output.max(1)

# Map the predicted class to the class name
class_names = ['d', 'e', 'h', 'l', 'o', 'r', 'w']  # Make sure these class names match your training data
predicted_class_name = class_names[predicted_class.item()]

print(f'The predicted class is: {predicted_class_name}')


The predicted class is: e


In [1]:
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image

# Display the image with the predicted class name
image = np.array(image)
plt.imshow(image)
plt.axis('off')
plt.text(10, 10, f'Predicted: {predicted_class_name}', fontsize=12, color='white', backgroundcolor='red')
plt.show()

NameError: name 'image' is not defined