In [1]:
import os
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import torch.utils.data as data
import torchvision
from torchvision import transforms
import sklearn.model_selection

## Dataset and Variables

The dataset is a combined dataset from a natural image dataset and an image with text dataset.
The first consist of 6,899 images where the classes are: Airplanes, Cars, Cats, Dogs, Flowers, Fruits, Motorcycles and Persons. 

Load dataset and set training variables

In [2]:
!git clone https://github.com/dank100/ML-A2.git
TRAIN_DATA_PATH = "ML-A2/ImageTextDataSet/train/"
TEST_DATA_PATH = "ML-A2/ImageTextDataSet/test/"


TRANSFORM_IMG = transforms.Compose(
                   [transforms.Resize((32,32)),
                    transforms.ToTensor(),
                    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

classes = ('NoText', 'Text')
CLASSES_NUM = 2

Cloning into 'ML-A2'...
remote: Enumerating objects: 31, done.[K
remote: Counting objects: 100% (31/31), done.[K
remote: Compressing objects: 100% (27/27), done.[K
remote: Total 13664 (delta 5), reused 26 (delta 3), pack-reused 13633[K
Receiving objects: 100% (13664/13664), 658.63 MiB | 11.88 MiB/s, done.
Resolving deltas: 100% (8/8), done.
Checking out files: 100% (12358/12358), done.


Load train/test data and create dataloader

In [3]:
use_cuda = torch.cuda.is_available()
# device = torch.device("cuda:0" if use_cuda else "cpu")

train_data = torchvision.datasets.ImageFolder(root=TRAIN_DATA_PATH, transform=TRANSFORM_IMG)
train_data_loader  = data.DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True, num_workers=4)
test_data = torchvision.datasets.ImageFolder(root=TEST_DATA_PATH, transform=TRANSFORM_IMG)
test_data_loader  = data.DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=True, num_workers=4)



Define the Convolutional Neural Network

In [5]:
class CNN(nn.Module):
    def __init__(self):
        super(CNN, self).__init__()
        self.conv1 = nn.Conv2d(3, 6, kernel_size = 3)
        self.pool = nn.MaxPool2d(stride = 2, kernel_size = 2)
        self.conv2 = nn.Conv2d(6, 16, kernel_size = 5)
        self.fc1 = nn.Linear(2704, 120)
        self.fc2 = nn.Linear(120, 84)
        #Reduced to number of labels
        self.fc3 = nn.Linear(84, CLASSES_NUM)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = self.pool(x)
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

Training, use the training variables to train the network

In [6]:
print("Number of train samples: ", len(train_data))
print("Number of test samples: ", len(test_data))
print("Detected Classes are: ", train_data.class_to_idx)

EPOCHS = 5
BATCH_SIZE = 4
LEARNING_RATE = 0.001

model = CNN()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
loss_func = nn.CrossEntropyLoss()

# Training
for epoch in range(EPOCHS):
    for i, data in enumerate(train_data_loader, 0):

        inputs, labels = data
        inputs, labels = inputs.to(device), labels.to(device)
        
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = loss_func(outputs, labels)
        loss.backward()
        optimizer.step()

print("Done")

Number of train samples:  9948
Number of test samples:  2407
Detected Classes are:  {'notext': 0, 'text': 1}
Training finished


# Evaluation
Test the model on test data

In [7]:
correct = 0
total = 0

class_correct = list(0. for i in range(CLASSES_NUM))
class_total = list(0. for i in range(CLASSES_NUM))

with torch.no_grad():
    for data in test_data_loader:
        inputs, labels = data
        outputs = model(inputs)
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        c = (predicted == labels).squeeze()  
        for i in range(BATCH_SIZE):
            try:
              label = labels[i]
              class_correct[label] += c[i].item()
              class_total[label] += 1
            except:
              print(labels)


for i in range(CLASSES_NUM):
  print('Accuracy of %5s : %2d %%' % (classes[i], 100 * class_correct[i] / class_total[i]))
    
print('Accuracy of the network on the test images: %d %%' % (
  100 * correct / total))    

tensor([0, 0, 1, 0, 1, 1, 1])
Accuracy of NoText : 90 %
Accuracy of  Text : 76 %
Accuracy of the network on the test images: 83 %
