# Small Project

## Architecture

Data
|
DataLoader
|
Resize (224x224)
|


## Inspecting Datasets

In [1]:
import os

# Walk the dataset tree and report how many files each non-empty directory holds.
for root, dirs, files in os.walk("dataset"):
    if not files:
        # Directories that only contain sub-directories are skipped.
        continue
    print("Dataset directory: {:30}\tCount: {}".format(root, len(files)))

Dataset directory: dataset/train/normal          	Count: 1341
Dataset directory: dataset/train/infected/non-covid	Count: 2530
Dataset directory: dataset/train/infected/covid  	Count: 1345
Dataset directory: dataset/val/normal            	Count: 8
Dataset directory: dataset/val/infected/non-covid	Count: 8
Dataset directory: dataset/val/infected/covid    	Count: 9
Dataset directory: dataset/test/normal           	Count: 234
Dataset directory: dataset/test/infected/non-covid	Count: 242
Dataset directory: dataset/test/infected/covid   	Count: 139


## DataLoader

### Imports

In [2]:
import os
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

### Global Dataset Object

In [85]:
# Custom lung dataset. It can be built for the train, test, or validation split, and configured for one of two binary tasks (Normal-Infected or Covid-NonCovid), depending on which binary classifier is being trained.
class LungDataset(Dataset):
    """Map-style dataset over the lung image folders of a single split.

    One instance covers one split (``group``) and one binary task (``mode``):

    * ``mode == 0`` (Normal-Infected): all three classes are indexed; 'normal'
      samples get the label ``[1, 0]`` and both infected classes ``[0, 1]``.
    * ``mode == 1`` (Covid-NonCovid): only the two infected classes are
      indexed; 'infected_covid' gets ``[1, 0]`` and 'infected_non_covid'
      ``[0, 1]``.

    Samples are addressed by one flat index that runs through the active
    classes in the order of ``self.classes``. Image files are read from
    ``dataset/<group>/<class sub-path>/<index>.jpg``.
    """

    def __init__(self, group, mode=0):
        """Create the dataset for one split and one binary task.

        Args:
            group: Split name, one of 'train', 'val' or 'test'.
            mode: 0 for Normal-Infected, 1 for Covid-NonCovid.

        Raises:
            ValueError: If ``mode`` or ``group`` is not one of the values above.
        """
        # Select the mode for Dataset
        if mode not in [0, 1]:
            raise ValueError("Please input a mode as integers: 0: [Normal-Infected], 1: [Covid-NonCovid]")
        # Reject unknown splits up front; otherwise the dataset would silently
        # report a length of 0 and only fail later, far from the cause.
        if group not in ("train", "val", "test"):
            raise ValueError("Please input a group as one of: 'train', 'val', 'test'. Got: {}".format(group))
        self.mode = mode

        # Nominal image size reported by describe(); images are not resized here.
        self.img_size = (150, 150)

        # Three image classes exist on disk; the mode decides which of them
        # are used and how they map onto the two labels.
        self.classes = {0: 'normal', 1: 'infected_covid', 2: 'infected_non_covid'}

        # The split handled by this instance, kept as a one-element list.
        self.groups = [group]

        # Number of images in each part of the dataset, keyed '<group>_<class>'.
        self.dataset_numbers = {
            'train_normal': 1341,
            'train_infected_covid': 1345,
            'train_infected_non_covid': 2530,
            'val_normal': 8,
            'val_infected_covid': 9,
            'val_infected_non_covid': 8,
            'test_normal': 234,
            'test_infected_covid': 139,
            'test_infected_non_covid': 242,
        }

    def _active_classes(self):
        """Return the class names used by the current mode, in index order."""
        names = list(self.classes.values())
        if self.mode == 1:
            # Covid-NonCovid ignores the 'normal' images.
            names = [name for name in names if "normal" not in name]
        return names

    def _locate(self, index):
        """Translate a flat, in-range index into a per-class position.

        Returns:
            A tuple ``(class_name, index_within_class, class_position)`` where
            ``class_position`` is the position of the class among the active
            classes of the current mode.
        """
        group = self.groups[0]
        offset = index
        for position, _class in enumerate(self._active_classes()):
            count = self.dataset_numbers['{}_{}'.format(group, _class)]
            if offset < count:
                return _class, offset, position
            offset -= count
        raise IndexError("Index out of range! Should be (0 ~ {}) but got {}".format(len(self) - 1, index))

    def get_dataset_path(self, _class):
        """Return the folder that holds the images of ``_class`` for this split.

        Any class name other than the two infected classes resolves to the
        'normal' folder.
        """
        sub_paths = {
            self.classes[1]: os.path.join("infected", "covid"),
            self.classes[2]: os.path.join("infected", "non-covid"),
        }
        return os.path.join("dataset", self.groups[0], sub_paths.get(_class, "normal"))

    def filter_dataset_numbers(self):
        """Return the entries of ``dataset_numbers`` that belong to this split."""
        # Match on the '<group>_' prefix rather than on a substring.
        prefix = self.groups[0] + "_"
        return {key: value for key, value in self.dataset_numbers.items() if key.startswith(prefix)}

    def describe(self):
        """Print a human-readable summary of the dataset."""
        group = self.groups[0]
        mode_str = "Normal-Infected" if self.mode == 0 else "Covid-NonCovid"
        # Generate description
        parts = [
            "This is the Lung {} {} Dataset used for the Small Project Demo in the 50.039 Deep Learning class".format(group.upper(), mode_str),
            " in Feb-March 2021. \n",
            "It contains a total of {} images, ".format(len(self)),
            "of size {} by {}.\n".format(self.img_size[0], self.img_size[1]),
            "Images have been split in three groups: training, testing and validation sets.\n",
            "The images are stored in the following locations ",
            "and each one contains the following number of images:\n",
        ]
        for _class in self._active_classes():
            label = "{}_{}".format(group, _class)
            path = self.get_dataset_path(_class)
            parts.append(" - {}, in folder {}: {} images.\n".format(label, path, self.dataset_numbers[label]))
        print("".join(parts))

    def open_img(self, _class, index):
        """Load one image of ``_class`` as a NumPy array divided by 255.

        Args:
            _class: One of the names in ``self.classes``.
            index: Position of the image within its class folder; the file
                read is ``<index>.jpg``.

        Raises:
            ValueError: If ``_class`` is unknown or ``index`` is outside the
                recorded image count for that class.
        """
        group = self.groups[0]
        if _class not in self.classes.values():
            raise ValueError("Input class not found! Please input: {}. Got: {}".format(list(self.classes.values()), _class))
        max_val = self.dataset_numbers['{}_{}'.format(group, _class)]
        if index < 0 or index >= max_val:
            raise ValueError("Index out of range! Should be (0 ~ {}) but got {}".format(max_val-1, index))
        path_to_file = os.path.join(self.get_dataset_path(_class), "{}.jpg".format(index))
        # The array is materialised inside the `with` block, which also closes
        # the file; no explicit close() is needed afterwards.
        with open(path_to_file, 'rb') as f:
            im = np.asarray(Image.open(f)) / 255    # Normalize
        return im

    def show_img(self, _class, index):
        """Display one image of ``_class`` with matplotlib."""
        # Open image
        im = self.open_img(_class, index)

        # Display
        plt.imshow(im)

    def __len__(self):
        """Return the number of images indexed under the current mode."""
        group = self.groups[0]
        return sum(self.dataset_numbers['{}_{}'.format(group, name)] for name in self._active_classes())

    def __getitem__(self, index):
        """Return ``(image_tensor, label)`` for the flat ``index``.

        The label is ``[1, 0]`` for the first active class of the mode and
        ``[0, 1]`` for every other active class.

        Raises:
            IndexError: If ``index`` is outside ``0 .. len(self) - 1``. Raising
                IndexError (not ValueError) lets plain ``for`` iteration over
                the dataset terminate normally.
        """
        if index < 0 or index >= len(self):
            raise IndexError("Index out of range! Should be (0 ~ {}) but got {}".format(len(self) - 1, index))
        _class, index, position = self._locate(index)
        if position == 0:
            label = torch.Tensor([1, 0])
        else:
            label = torch.Tensor([0, 1])
        im = self.open_img(_class, index)
        im = transforms.functional.to_tensor(np.array(im)).float()
        return im, label


#### To get the training set for Normal-Infected:

In [86]:
# Build the training split for the Normal-Infected task and print its summary.
trainset = LungDataset(group="train", mode=0)
trainset.describe()
train_total = len(trainset)
print("This train set contains {} images in total.".format(train_total))

# Fetch one sample by flat index and show its image tensor.
im, label = trainset[5215]
print(im)

This is the Lung TRAIN Normal-Infected Dataset used for the Small Project Demo in the 50.039 Deep Learning class in Feb-March 2021. 
It contains a total of 5216 images, of size 150 by 150.
Images have been split in three groups: training, testing and validation sets.
The images are stored in the following locations and each one contains the following number of images:
 - train_normal, in folder dataset/train/normal: 1341 images.
 - train_infected_covid, in folder dataset/train/infected/covid: 1345 images.
 - train_infected_non_covid, in folder dataset/train/infected/non-covid: 2530 images.

This train set contains 5216 images in total.
tensor([[[0.0275, 0.0510, 0.0667,  ..., 0.0980, 0.1333, 0.1647],
         [0.0510, 0.0431, 0.0314,  ..., 0.1020, 0.1255, 0.1412],
         [0.0392, 0.0275, 0.0157,  ..., 0.0902, 0.1098, 0.1137],
         ...,
         [0.0471, 0.0275, 0.0235,  ..., 0.0510, 0.0627, 0.0784],
         [0.0392, 0.0275, 0.0314,  ..., 0.0510, 0.0627, 0.0784],
         [0.0314,

#### To get the testing set for Covid-NonCovid:

In [87]:
# Build the testing split for the Covid-NonCovid task and print its summary.
testset = LungDataset(group="test", mode=1)
testset.describe()
test_total = len(testset)
print("This test set contains {} images in total.".format(test_total))

# Fetch one sample by flat index and show its image tensor.
im, label = testset[380]
print(im)

This is the Lung TEST Covid-NonCovid Dataset used for the Small Project Demo in the 50.039 Deep Learning class in Feb-March 2021. 
It contains a total of 381 images, of size 150 by 150.
Images have been split in three groups: training, testing and validation sets.
The images are stored in the following locations and each one contains the following number of images:
 - test_infected_covid, in folder dataset/test/infected/covid: 139 images.
 - test_infected_non_covid, in folder dataset/test/infected/non-covid: 242 images.

This test set contains 381 images in total.
tensor([[[0.0471, 0.0588, 0.0784,  ..., 0.3020, 0.2941, 0.2902],
         [0.0627, 0.0824, 0.1020,  ..., 0.3255, 0.3176, 0.3098],
         [0.0941, 0.1137, 0.1451,  ..., 0.3333, 0.3294, 0.3216],
         ...,
         [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]]])


### Global DataLoader Object

#### Create DataLoaders for Normal-Infected

In [88]:
# One Normal-Infected (mode=0) dataset per split, created in train/test/val order.
# NOTE: the misspelled name `testset_nomral_infected` is kept unchanged so any
# code that refers to it keeps working.
trainset_normal_infected, testset_nomral_infected, valset_normal_infected = (
    LungDataset(group=split, mode=0) for split in ("train", "test", "val")
)

# Wrap each dataset in a shuffling DataLoader that yields batches of 4.
train_loader, test_loader, val_loader = (
    DataLoader(split_dataset, batch_size=4, shuffle=True)
    for split_dataset in (trainset_normal_infected, testset_nomral_infected, valset_normal_infected)
)
print(train_loader, test_loader, val_loader)

<torch.utils.data.dataloader.DataLoader object at 0x7fb1c2595520> <torch.utils.data.dataloader.DataLoader object at 0x7fb1c2585640> <torch.utils.data.dataloader.DataLoader object at 0x7fb1c2595880>


In [89]:
# Count the batches produced by one full pass over train_loader.
count = 0
for k, v in enumerate(train_loader):
    # k is the zero-based batch position, so k + 1 batches have been seen.
    count = k + 1
print(count)

1304


#### Create DataLoaders for Covid-NonCovid

In [90]:
# One Covid-NonCovid (mode=1) dataset per split, created in train/test/val order.
trainset_covid_noncovid, testset_covid_noncovid, valset_covid_noncovid = (
    LungDataset(group=split, mode=1) for split in ("train", "test", "val")
)

# Wrap each dataset in a shuffling DataLoader that yields batches of 4.
# NOTE: this rebinds train_loader / test_loader / val_loader.
train_loader, test_loader, val_loader = (
    DataLoader(split_dataset, batch_size=4, shuffle=True)
    for split_dataset in (trainset_covid_noncovid, testset_covid_noncovid, valset_covid_noncovid)
)
print(train_loader, test_loader, val_loader)

<torch.utils.data.dataloader.DataLoader object at 0x7fb1c25dbca0> <torch.utils.data.dataloader.DataLoader object at 0x7fb1c257c970> <torch.utils.data.dataloader.DataLoader object at 0x7fb1c257caf0>


In [91]:
# Count the batches produced by one full pass over train_loader.
count = 0
for k, v in enumerate(train_loader):
    # k is the zero-based batch position, so k + 1 batches have been seen.
    count = k + 1
print(count)

969
