<h1>Imports</h1>

In [5]:
import os
import numpy as np
from PIL import Image
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
from torchvision import datasets, transforms
from tqdm import tqdm 
from collections import defaultdict
import random

<h1> Dataloader </h1>

In [19]:
class CustomDataset(Dataset):
    """Image-classification dataset driven by a tab-separated labels file.

    Every non-blank line of ``labels_file`` must hold exactly two
    tab-separated fields: an image path and its label.  Only the last
    ``/``-separated component of the image path is kept; it is later joined
    onto ``dataset_folder`` to locate the file on disk.

    Attributes:
        data: image file names, in file order.
        labels: raw label strings, aligned with ``data``.
        label_dict: label string -> integer class index (first-seen order).
        label_to_idx: integer class index -> label string.
    """

    def __init__(self, dataset_folder, labels_file, transform=None):
        """Parse the labels file and build the class-index tables.

        Args:
            dataset_folder: directory that contains the image files.
            labels_file: path to the UTF-8 labels file described on the class.
            transform: optional callable applied to each loaded PIL image.

        Raises:
            ValueError: if a non-blank line does not contain exactly two
                tab-separated fields.
        """
        self.dataset_folder = dataset_folder
        self.labels_file = labels_file
        self.transform = transform
        self.data = []
        self.labels = []
        self.label_dict = {}
        # NOTE: despite its name this maps class index -> label string
        # (the inverse of ``label_dict``); the name is kept for compatibility.
        self.label_to_idx = {}

        with open(labels_file, 'r', encoding='utf-8') as f:
            for line_no, line in enumerate(f, start=1):
                record = line.strip()
                if not record:
                    # Blank lines (e.g. a trailing empty line) carry no sample;
                    # skip them instead of failing on the unpack below.
                    continue

                fields = record.split('\t')
                if len(fields) != 2:
                    raise ValueError(
                        f"{labels_file}:{line_no}: expected '<image path>\\t<label>', "
                        f"got {len(fields)} tab-separated field(s)"
                    )
                img_path, label = fields

                # Keep only the file name; the directory part is replaced by
                # ``dataset_folder`` in ``__getitem__``.
                img_name = img_path.split('/')[-1]
                self.data.append(img_name)
                self.labels.append(label)

                if label not in self.label_dict:
                    class_idx = len(self.label_dict)
                    self.label_dict[label] = class_idx
                    self.label_to_idx[class_idx] = label

    def __len__(self):
        """Return the number of samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Load sample ``idx``.

        Returns:
            A ``(image, class_index)`` pair.  ``image`` is the grayscale
            (mode ``"L"``) PIL image, passed through ``transform`` when one
            was supplied; ``class_index`` is the integer from ``label_dict``.
        """
        img_name = self.data[idx]
        label = self.labels[idx]
        img_path = os.path.join(self.dataset_folder, img_name)
        # ``convert`` returns a new, fully loaded image, so the underlying
        # file can be closed as soon as the ``with`` block exits.
        with Image.open(img_path) as img:
            image = img.convert("L")
        label_encoded = self.label_dict[label]
        if self.transform is not None:
            image = self.transform(image)
        return image, label_encoded

# Preprocessing applied to every image: resize to 28x28, force a single
# grayscale channel, convert to a tensor, then normalise with mean 0.5 / std 0.5.
transform = transforms.Compose(
    [
        transforms.Resize((28, 28)),
        transforms.Grayscale(num_output_channels=1),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5], std=[0.5]),
    ]
)

# NOTE(review): absolute, machine-specific paths; adjust before running elsewhere.
dataset_folder = r'D:\405 FOUND\Comp-Vision\OCR-With-CNN\CNN-TextRecognition\Dataset'
labels_file = r'D:\405 FOUND\Comp-Vision\OCR-With-CNN\CNN-TextRecognition\Dataset\Labels.txt'

# Full dataset: used as the source of (file name, label) pairs and of the
# label -> index table shared by both splits.
dataset = CustomDataset(dataset_folder, labels_file, transform=transform)

# Reproducible 80/20 split over the (file name, label) pairs.
train_data, val_data = train_test_split(
    list(zip(dataset.data, dataset.labels)),
    test_size=0.2,
    random_state=42,
)

# Each split is a fresh CustomDataset (so it carries the complete label table
# built from the whole labels file) whose sample lists are then overwritten
# with that split's pairs.
train_dataset = CustomDataset(dataset_folder, labels_file, transform=transform)
train_dataset.data, train_dataset.labels = zip(*train_data)

val_dataset = CustomDataset(dataset_folder, labels_file, transform=transform)
val_dataset.data, val_dataset.labels = zip(*val_data)

# Only the training loader shuffles; validation order stays fixed.
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=128, shuffle=False)

num_classes = len(dataset.label_dict)

print(f"Train dataset size: {len(train_loader.dataset)}")
print(f"Validation dataset size: {len(val_loader.dataset)}")
print(f"Number of classes: {num_classes}")


Train dataset size: 30936
Validation dataset size: 7734
Number of classes: 3867


<h1> Architecture </h1>
CNN + bidirectional LSTM + fully connected classifier

In [25]:
class CRNNForOCR(nn.Module):
    """Convolutional-recurrent classifier for single-channel images.

    Pipeline: three Conv-BatchNorm-ReLU-MaxPool stages (each halving height
    and width), a 1x1 convolution to 256 channels, a 3-layer bidirectional
    LSTM that reads the feature map column by column, and a linear layer
    that maps the LSTM output at the last column to ``num_classes`` logits.

    Args:
        num_classes: number of output classes.
        dropout: dropout probability between stacked LSTM layers.
        input_height: height in pixels of the images the model will receive.
            It fixes the LSTM input size (256 channels times the pooled
            height).  The default of 28 reproduces the original
            ``input_size=768``.

    Raises:
        ValueError: if ``input_height`` is below 8, because the three 2x2
            poolings would leave no rows.
    """

    def __init__(self, num_classes, dropout=0.3, input_height=28):
        super().__init__()

        # Three 2x2 max-pools with floor rounding reduce the height to
        # input_height // 8 (28 -> 14 -> 7 -> 3).
        pooled_height = input_height // 8
        if pooled_height < 1:
            raise ValueError(
                f"input_height must be at least 8 to survive three 2x2 poolings, got {input_height}"
            )

        self.conv_layers = nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=3, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
        )
        self.conv1x1 = nn.Conv2d(128, 256, kernel_size=1)
        self.lstm = nn.LSTM(
            input_size=256 * pooled_height,
            hidden_size=256,
            num_layers=3,
            batch_first=True,
            bidirectional=True,
            dropout=dropout,
        )
        # Bidirectional output concatenates both directions: 2 * hidden_size.
        self.fc = nn.Linear(256 * 2, num_classes)

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)``.

        ``x`` is a ``(batch, 1, height, width)`` tensor whose height matches
        the ``input_height`` given at construction.
        """
        x = self.conv_layers(x)
        x = self.conv1x1(x)
        batch_size, channels, height, width = x.size()
        # Turn the feature map into a sequence over width: each time step is
        # one column, flattened across channels and rows.
        x = x.permute(0, 3, 1, 2).contiguous()
        x = x.view(batch_size, width, channels * height)
        x, _ = self.lstm(x)
        # NOTE(review): at the last time step the backward direction has only
        # processed that single column; combining x[:, -1, :256] with
        # x[:, 0, 256:] would use both directions' full context, but would
        # change behaviour for weights trained with this readout.
        x = x[:, -1, :]
        x = self.fc(x)
        return x


# Training Configuration
# Use the GPU when one is available and fall back to the CPU otherwise, so the
# notebook still runs on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CRNNForOCR(num_classes=len(dataset.label_dict), dropout=0.4).to(device)
# Multi-class classification over the label indices produced by the dataset.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001, weight_decay=1e-5)
num_epochs = 200

<h1> Training </h1>

In [26]:
for epoch in range(1, num_epochs + 1):
    # ---- training pass over the whole training set ----
    model.train()
    running_loss, correct_train, total_train = 0.0, 0, 0

    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # Accumulate the batch-mean loss and the number of correct predictions.
        running_loss += loss.item()
        predicted = outputs.max(dim=1).indices
        total_train += labels.size(0)
        correct_train += predicted.eq(labels).sum().item()

    # Loss is averaged over batches, accuracy over samples.
    avg_train_loss = running_loss / len(train_loader)
    train_accuracy = 100 * correct_train / total_train

    print(f"Epoch {epoch}/{num_epochs}, Train Loss: {avg_train_loss:.4f}, Train Accuracy: {train_accuracy:.2f}%")

    # Validation runs only on every fifth epoch.
    if epoch % 5 != 0:
        continue

    # ---- validation pass (no gradients, eval-mode layers) ----
    model.eval()
    correct_val, total_val, val_loss = 0, 0, 0.0

    with torch.no_grad():
        for images, labels in val_loader:
            images, labels = images.to(device), labels.to(device)

            outputs = model(images)
            loss = criterion(outputs, labels)
            val_loss += loss.item()

            predicted = outputs.max(dim=1).indices
            total_val += labels.size(0)
            correct_val += predicted.eq(labels).sum().item()

    avg_val_loss = val_loss / len(val_loader)
    val_accuracy = 100 * correct_val / total_val

    print(f"Epoch {epoch}/{num_epochs}, Val Loss: {avg_val_loss:.4f}, Val Accuracy: {val_accuracy:.2f}%")


Epoch 1/250, Train Loss: 8.2586, Train Accuracy: 0.04%
Epoch 2/250, Train Loss: 7.8901, Train Accuracy: 0.15%
Epoch 3/250, Train Loss: 7.0415, Train Accuracy: 0.50%
Epoch 4/250, Train Loss: 6.5259, Train Accuracy: 1.05%
Epoch 5/250, Train Loss: 6.1594, Train Accuracy: 1.84%
Epoch 5/250, Val Loss: 6.1917, Val Accuracy: 1.12%
Epoch 6/250, Train Loss: 5.8593, Train Accuracy: 2.92%
Epoch 7/250, Train Loss: 5.5828, Train Accuracy: 4.61%
Epoch 8/250, Train Loss: 5.3212, Train Accuracy: 6.63%
Epoch 9/250, Train Loss: 5.0805, Train Accuracy: 9.20%
Epoch 10/250, Train Loss: 4.8450, Train Accuracy: 11.91%
Epoch 10/250, Val Loss: 4.8915, Val Accuracy: 8.30%
Epoch 11/250, Train Loss: 4.6152, Train Accuracy: 15.42%
Epoch 12/250, Train Loss: 4.3963, Train Accuracy: 19.18%
Epoch 13/250, Train Loss: 4.1734, Train Accuracy: 23.42%
Epoch 14/250, Train Loss: 3.9730, Train Accuracy: 27.52%
Epoch 15/250, Train Loss: 3.7661, Train Accuracy: 31.54%
Epoch 15/250, Val Loss: 3.8597, Val Accuracy: 24.71%
Epoch 1

In [27]:
# Persist the trained parameters (state dict only, not the module object)
# under ./weights, creating the directory on first use.
os.makedirs(name='weights', exist_ok=True)
torch.save(obj=model.state_dict(), f='weights/model_final.pth')