# Optical Character Recognition with PyTorch

## Import Libraries
### The code uses PyTorch's utils.data and torchvision for dataset handling and transformations.


In [20]:
import torch
from torch.utils.data import DataLoader, ConcatDataset, random_split
from torchvision import datasets, transforms

## Applies Transformations: 
   ### First we convert the images to grayscale (EMNIST, the dataset we train on, contains only grayscale images)
   ### Normalizes the pixel values (to help the model train more effectively).

In [21]:
# Preprocessing applied to every image, in order:
#   1. force a single (grayscale) channel,
#   2. convert the image to a tensor,
#   3. normalize with mean 0.5 and std 0.5.
preprocessing_steps = [
    transforms.Grayscale(),
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.5,), std=(0.5,)),
]
transform = transforms.Compose(preprocessing_steps)

## Loads the EMNIST Dataset:
   ### Downloads the EMNIST dataset, which includes images of letters.
   ### Combines the train and test parts to form the usable dataset

In [22]:
# Arguments shared by both EMNIST "letters" splits.
emnist_kwargs = dict(root='./data', split='letters', download=True, transform=transform)

train_dataset = datasets.EMNIST(train=True, **emnist_kwargs)
test_dataset = datasets.EMNIST(train=False, **emnist_kwargs)

# Build the full dataset by joining train_dataset and test_dataset;
# it is re-split further down the notebook.
full_dataset = ConcatDataset([train_dataset, test_dataset])
total_len = len(full_dataset)

## Extracts a Subset
### Takes 30% of the combined dataset for quick training (don't want to spend all day here).

In [23]:
# Sizes of the portion we keep (30%) and of the remainder we discard
portion_size = int(0.3 * total_len)
unused_size = total_len - portion_size

# A fixed seed makes sure the same examples land in the 30% subset on every run
subset_generator = torch.Generator().manual_seed(42)

# Draw the 30% subset; the second split (the unused 70%) is thrown away
subset_30_percent, _ = random_split(
    full_dataset,
    [portion_size, unused_size],
    generator=subset_generator,
)

print(f"Subset size (30% of total): {len(subset_30_percent)}")

Subset size (30% of total): 43680


## Splits the Subset: 
### Divides the 30% subset into:
   - ### 90% for training.
   - ### 10% for testing.

In [24]:
# Within the 30% subset: 90% goes to training, whatever is left goes to testing
subset_len = len(subset_30_percent)
train_size = int(0.9 * subset_len)
test_size = subset_len - train_size

# Seeded generator so the train/test membership is the same on every run
train_test_generator = torch.Generator().manual_seed(42)
new_train_subset, new_test_subset = random_split(
    subset_30_percent,
    [train_size, test_size],
    generator=train_test_generator,
)

# Report how many examples ended up in each split
print(f"New training size (90% of 30% subset): {len(new_train_subset)}")
print(f"New testing size (10% of 30% subset): {len(new_test_subset)}")

New training size (90% of 30% subset): 39312
New testing size (10% of 30% subset): 4368


## Creates DataLoaders

### Converts the training and testing subsets into DataLoaders for easier batch processing during training.

In [25]:
# Wrap the subsets in DataLoaders so they can be consumed in mini-batches.
# Training batches are reshuffled every epoch; test batches keep a fixed order.
BATCH_SIZE = 64
train_loader = DataLoader(new_train_subset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(new_test_subset, batch_size=BATCH_SIZE, shuffle=False)

## Once again, setting up some important external stuff
### We are using the torch.nn library to define our Neural Network

In [26]:
import torch.nn as nn
import torch.nn.functional as F

## Don't Worry About the fancy stuff!

### `class` --> Our Blueprint, under here we define our Neural Network
### `__init__` --> Sets up the layers (a constructor, for those of you who paid attention in OOP class)
### `super()` --> Gives access to properties of the parent (nn.Module)
### `forward()` --> Defines how the input is processed: one forward-propagation pass

### Convolution -> https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html

### Max Pool -> https://pytorch.org/docs/stable/generated/torch.nn.functional.max_pool2d.html

### ReLU (Rectified Linear Unit) -> https://pytorch.org/docs/stable/generated/torch.nn.functional.relu.html

### Fully Connected (FC Linear Layer) -> https://pytorch.org/docs/stable/generated/torch.nn.Linear.html

### Flatten has been done for you through view(), do it before you start putting FC layers

In [27]:
class SimpleCNN(nn.Module):
    """Minimal CNN: one conv layer, 2x2 max-pooling, two fully connected layers.

    The first linear layer expects 8 * 14 * 14 features, so the network is
    sized for inputs of shape (batch_size, 1, 28, 28). The output is a
    (batch_size, 27) tensor of raw class scores (logits); no softmax is applied.
    """

    def __init__(self):
        super().__init__()
        # Convolution: 1 input channel -> 8 feature maps. A 3x3 kernel with
        # padding 1 and stride 1 preserves height and width: (8, 28, 28).
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=8, kernel_size=3, stride=1, padding=1)
        # Fully connected head: flattened pooled maps -> 64 hidden units -> 27 logits.
        self.fc1 = nn.Linear(in_features=8 * 14 * 14, out_features=64)
        self.fc2 = nn.Linear(in_features=64, out_features=27)  # 27 classes for EMNIST (letters)

    def forward(self, x):
        """Run one forward pass and return the (batch_size, 27) logits."""
        # Convolve, then halve height and width with 2x2 max-pooling: (8, 14, 14)
        pooled = F.max_pool2d(self.conv1(x), 2)
        activated = F.relu(pooled)
        # Flatten everything except the batch dimension: (batch_size, 8*14*14)
        flat = activated.view(activated.size(0), -1)
        # Fully connected layers
        hidden = F.relu(self.fc1(flat))
        return self.fc2(hidden)

## Let's count the number of trainable parameters we have

#### we define our `model` to be the `SimpleCNN()` object we wrote down
#### the `p.numel() for p in model.parameters() if p.requires_grad` only picks those params which undergo training  
#### i.e. it only picks trainable parameters

In [28]:
# Instantiate the network and count only the parameters that will be trained
# (those with requires_grad set).
model = SimpleCNN()
trainable_params = [p for p in model.parameters() if p.requires_grad]
total_params = sum(p.numel() for p in trainable_params)
print(f"Total Parameters: {total_params}")


Total Parameters: 102251


## Now we define the loss function we are going to use!
### We will use the CrossEntropyLoss function
### We also use an optimizer function called Adam to make our Gradient Descent quicker

In [29]:
import torch.optim as optim

# Loss: cross-entropy between the model's raw class scores and the integer labels.
criterion = nn.CrossEntropyLoss()

# Optimizer: Adam over all model parameters.
LEARNING_RATE = 0.001
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

## Set the Device

### `torch.device("cuda" if torch.cuda.is_available() else "cpu")` checks if a GPU (CUDA) is available and sets the computation device accordingly. If no GPU is available, it defaults to the CPU.
    
### `model.to(device)` moves the model to the selected device (GPU or CPU).

In [30]:
# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

# Move the model's parameters to the chosen device (the cell displays the model).
model.to(device)


SimpleCNN(
  (conv1): Conv2d(1, 8, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (fc1): Linear(in_features=1568, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=27, bias=True)
)

## The training loop

### We define the number of epochs; `running_loss` accumulates the loss over each epoch
### The inner loop moves each batch of images and their labels (what's written on them... the answers) to the device and passes the images through the model

In [31]:
num_epochs = 5
for epoch in range(num_epochs):
    model.train()  # put the model in training mode
    running_loss = 0.0

    for images, labels in train_loader:
        # Move the current batch to the same device as the model
        images = images.to(device)
        labels = labels.to(device)

        # Clear the gradients left over from the previous step
        optimizer.zero_grad()

        # Forward pass: compare the model's outputs with the true labels
        outputs = model(images)
        loss = criterion(outputs, labels)

        # Backward pass, then update the weights
        loss.backward()
        optimizer.step()

        # Accumulate this batch's loss into the epoch total
        running_loss += loss.item()

    # Report the mean per-batch loss for this epoch
    mean_batch_loss = running_loss / len(train_loader)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {mean_batch_loss:.4f}")

Epoch [1/5], Loss: 1.1540
Epoch [2/5], Loss: 0.5775
Epoch [3/5], Loss: 0.4597
Epoch [4/5], Loss: 0.4019
Epoch [5/5], Loss: 0.3609


## Let's see how accurate it is!

In [34]:
# Measure classification accuracy on the held-out test split.
model.eval()  # switch the model to evaluation mode
correct = 0
total = 0

# Evaluation only needs forward passes. Disabling gradient tracking avoids
# building the autograd graph for every batch, which saves memory and time.
with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        # The predicted class is the index of the largest score in each row
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print(f"Test Accuracy: {100 * correct / total:.2f}%")


Test Accuracy: 87.25%


## Let's save our trained Neural Network

In [33]:
# Persist only the learned weights (the state_dict), not the whole model object.
weights_path = "pytorch_ocr.pth"
torch.save(model.state_dict(), weights_path)