<h1>Imports</h1>

In [14]:
import torch
from torch.utils.data import DataLoader, random_split
from torchvision import datasets, transforms
import os
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
from PIL import Image
from sklearn.model_selection import train_test_split

<h1> Dataloader </h1>

In [None]:
import os
from PIL import Image
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import torchvision.transforms as transforms
from sklearn.preprocessing import LabelEncoder

class CustomDataset(Dataset):
    def __init__(self, labels_file, dataset_folder, transform=None):
        self.dataset_folder = dataset_folder
        self.transform = transform
        self.data = []
        self.label_encoder = LabelEncoder()
        with open(labels_file, 'r', encoding='utf-8') as f:
            for line in f:
                img_path, label = line.strip().split('\t')
                full_img_path = os.path.join(dataset_folder, img_path)
                self.data.append((full_img_path, label))
        
        self.labels = [label for _, label in self.data]
        self.label_encoder.fit(self.labels)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        img_path, label = self.data[idx]
        image = Image.open(img_path).convert("RGB")
        label_encoded = self.label_encoder.transform([label])[0]
        if self.transform:
            image = self.transform(image)
        return image, label_encoded

transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

dataset_folder = r'CNN-TextRecognition\Datasetr'
labels_file = 'Labels.txt'

dataset = CustomDataset(labels_file, dataset_folder, transform=transform)
train_data, val_data = train_test_split(dataset.data, test_size=0.2, random_state=42)

train_dataset = CustomDataset(labels_file, dataset_folder, transform=transform)
train_dataset.data = train_data
val_dataset = CustomDataset(labels_file, dataset_folder, transform=transform)
val_dataset.data = val_data

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

print(f"Train dataset size: {len(train_loader.dataset)}")
print(f"Validation dataset size: {len(val_loader.dataset)}")


<h1> Architecture </h1>
CNN-RNN Layers

In [47]:
class CNNRNNForOCR(nn.Module):
    def __init__(self, vocab_size, embed_size=128, hidden_size=256, num_layers=2, dropout=0.5):
        super(CNNRNNForOCR, self).__init__()
        self.conv_layers = nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2)
        )
        self.lstm = nn.LSTM(input_size=128, hidden_size=hidden_size, num_layers=num_layers, batch_first=True, dropout=dropout)
        self.fc1 = nn.Linear(hidden_size, 5)
    def forward(self, x):
        x = self.conv_layers(x)
        batch_size, channels, height, width = x.size()
        x = x.view(batch_size, channels, -1).permute(0, 2, 1)
        lstm_out, _ = self.lstm(x)
        output = self.fc1(lstm_out)
        return output

vocab_size = len(dataset.word_to_idx)
model = CNNRNNForOCR(vocab_size=vocab_size, embed_size=128, hidden_size=256, num_layers=2, dropout=0.5)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


<h1> Training </h1>

In [52]:
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for images, labels in train_loader:
        optimizer.zero_grad()
        output = model(images)
        print(output)
        # output = output.view(-1, 5)
        # labels = torch.cat([label for label in labels], dim=0)
        # print
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {total_loss / len(train_loader):.4f}")


tensor([[[-0.0294, -0.0519, -0.0130,  0.0154,  0.0400],
         [-0.0260, -0.0490, -0.0080,  0.0144,  0.0386],
         [-0.0195, -0.0401, -0.0030,  0.0141,  0.0314],
         [-0.0180, -0.0397, -0.0065,  0.0175,  0.0351],
         [-0.0171, -0.0403, -0.0085,  0.0165,  0.0436],
         [-0.0148, -0.0374, -0.0100,  0.0138,  0.0397],
         [-0.0168, -0.0383, -0.0102,  0.0146,  0.0382],
         [-0.0184, -0.0366, -0.0114,  0.0111,  0.0378],
         [-0.0205, -0.0395, -0.0062,  0.0092,  0.0397]]],
       grad_fn=<ViewBackward0>)


IndexError: Target 21106 is out of bounds.