<h1>Imports</h1>

In [9]:
import torch
from torch.utils.data import DataLoader, random_split
from torchvision import datasets, transforms
import os
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
from PIL import Image
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

<h1> Dataloader </h1>

In [14]:
class CustomDataset(Dataset):
    """Image/label dataset described by a tab-separated labels file.

    Every non-blank line of the labels file must contain exactly two
    tab-separated fields: an image path (joined onto ``dataset_folder``) and
    the label text. All labels found in the file are used to fit
    ``self.label_encoder``, which maps label text to an integer class index.

    Args:
        labels_file: Path to the UTF-8 encoded labels file.
        dataset_folder: Directory that the image paths in the labels file are
            joined onto.
        transform: Optional callable applied to the RGB ``PIL.Image`` before
            it is returned.

    Raises:
        ValueError: If a non-blank line does not have exactly two
            tab-separated fields.
    """

    def __init__(self, labels_file, dataset_folder, transform=None):
        self.dataset_folder = dataset_folder
        self.transform = transform
        self.data = []
        self.label_encoder = LabelEncoder()
        with open(labels_file, 'r', encoding='utf-8') as f:
            for line_no, line in enumerate(f, start=1):
                record = line.strip()
                if not record:
                    # Blank lines (e.g. a trailing empty line) carry no sample.
                    continue
                fields = record.split('\t')
                if len(fields) != 2:
                    raise ValueError(
                        f"{labels_file}:{line_no}: expected "
                        f"'<image path><TAB><label>', got {record!r}"
                    )
                img_path, label = fields
                full_img_path = os.path.join(dataset_folder, img_path)
                self.data.append((full_img_path, label))

        self.labels = [label for _, label in self.data]
        self.label_encoder.fit(self.labels)

    def __len__(self):
        """Return the number of (image path, label) samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Load sample ``idx`` and return ``(image, label_index)``.

        ``image`` is the output of ``self.transform`` when a transform was
        given, otherwise the RGB ``PIL.Image``. ``label_index`` is the class
        index assigned by ``self.label_encoder``, as a plain Python ``int``.
        """
        img_path, label = self.data[idx]
        # convert() returns an independent, fully loaded image, so the file
        # handle can be released immediately instead of waiting for GC.
        with Image.open(img_path) as raw_image:
            image = raw_image.convert("RGB")
        # A plain int collates to an int64 tensor regardless of NumPy's
        # platform-dependent default integer width.
        label_encoded = int(self.label_encoder.transform([label])[0])
        if self.transform:
            image = self.transform(image)
        return image, label_encoded

# Preprocessing applied to every image: shrink to 28x28, convert to a tensor,
# collapse to one grayscale channel, then normalise that single channel.
transform = transforms.Compose([
    transforms.Resize((28, 28)),
    transforms.ToTensor(),
    transforms.Grayscale(num_output_channels=1),
    # The tensor has ONE channel at this point, so Normalize must be given one
    # mean/std. The three ImageNet RGB statistics cannot broadcast onto a
    # (1, H, W) tensor and raise a RuntimeError on the first sample.
    # 0.5/0.5 maps [0, 1] to [-1, 1]; replace with statistics measured on the
    # training images if they are available.
    transforms.Normalize(mean=[0.5], std=[0.5]),
])

# Build the paths with os.path.join so they resolve on any OS (on Windows this
# yields the same backslash-separated strings as before).
dataset_folder = os.path.join('CNN-TextRecognition', 'Dataset')
labels_file = os.path.join(dataset_folder, 'Labels.txt')

# The full dataset provides the complete sample list and the label vocabulary.
dataset = CustomDataset(labels_file, dataset_folder, transform=transform)

# Fixed random_state keeps the 80/20 split identical across runs.
train_data, val_data = train_test_split(dataset.data, test_size=0.2, random_state=42)

# Each split is a fresh CustomDataset (so its encoder is fitted on the whole
# labels file and the class indices agree) whose sample list is then narrowed.
train_dataset = CustomDataset(labels_file, dataset_folder, transform=transform)
train_dataset.data = train_data
val_dataset = CustomDataset(labels_file, dataset_folder, transform=transform)
val_dataset.data = val_data

# Shuffle only the training batches; validation order stays fixed.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

print(f"Train dataset size: {len(train_loader.dataset)}")
print(f"Validation dataset size: {len(val_loader.dataset)}")


Train dataset size: 30936
Validation dataset size: 7734


<h1> Architecture </h1>
A convolutional feature extractor followed by fully connected classification layers.

In [18]:
class CNNForOCR(nn.Module):
    """Convolutional classifier for single-channel character/text images.

    Four 3x3 conv + ReLU + 2x2 max-pool stages followed by a 1x1 conv produce
    a 256-channel feature map. The map is resampled to 7x7, flattened, and
    classified by two fully connected layers with dropout.

    Args:
        num_classes: Number of output logits.
        dropout: Drop probability used before and after the first fully
            connected layer.

    Input:  float tensor of shape (batch, 1, H, W); H and W must be at least
            16 so that the four 2x2 poolings leave a non-empty feature map.
    Output: float tensor of shape (batch, num_classes) holding raw logits.
    """

    def __init__(self, num_classes, dropout=0.5):
        super().__init__()
        self.conv_layers = nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(128, 256, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(256, 256, kernel_size=1),
            nn.ReLU(),
        )
        # Four 2x2 poolings divide the spatial size by 16, so only a 112x112
        # input naturally yields the 7x7 map fc1 expects (a 28x28 input yields
        # 1x1). Resampling to 7x7 makes fc1's input size independent of the
        # image size; it is an identity for 112x112 inputs and has no weights,
        # so the parameter shapes are unchanged.
        self.pool = nn.AdaptiveAvgPool2d((7, 7))
        self.fc1 = nn.Linear(256 * 7 * 7, 1024)
        self.fc2 = nn.Linear(1024, num_classes)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return class logits of shape (batch, num_classes) for images ``x``."""
        x = self.conv_layers(x)
        x = self.pool(x)
        x = x.view(x.size(0), -1)
        x = self.dropout(x)
        # Without a non-linearity here fc1 and fc2 would compose into a single
        # linear map, making the 1024-unit hidden layer pointless.
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)
        return x

# One output logit per distinct label seen by the dataset's encoder.
num_classes = len(dataset.label_encoder.classes_)

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Build the network and move its parameters to the selected device.
model = CNNForOCR(num_classes=num_classes, dropout=0.5)
model.to(device)

# Multi-class classification loss over raw logits, optimised with Adam.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)


cuda


<h1> Training </h1>